diff options
Diffstat (limited to 'kernel')
115 files changed, 7042 insertions, 3218 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b62..dc5c775 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,12 +3,11 @@ # obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o smpboot.o @@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o +obj-$(CONFIG_NET) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ @@ -105,27 +105,11 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into diff --git a/kernel/acct.c b/kernel/acct.c index 808a86f..5179352 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -141,12 +141,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -261,6 +261,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@ -376,7 +377,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +390,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -420,7 +421,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -429,8 +430,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -458,9 +460,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, acct_t ac; mm_segment_t fs; unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; + u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred; @@ -484,22 +484,21 @@ static void do_acct_process(struct bsd_acct_struct *acct, strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; + run_time = ktime_get_ns(); + run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@ -509,15 +508,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -578,6 +577,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e..ba2ff5a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) audit_log_format(ab, " %s=", prefix); CAP_FOR_EACH_U32(i) { audit_log_format(ab, "%08x", - cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + cap->cap[CAP_LAST_U32 - i]); } } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c..c447cd9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246..e1d1d195 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> -#include <linux/page_cgroup.h> #include <linux/log2.h> #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 0000000..6a71145 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1 @@ +obj-y := core.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 0000000..7f0dbcb --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,534 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ +#include <linux/filter.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> + +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + + return NULL; +} + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +/** + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions + * + * Decode and execute eBPF instructions. + */ +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... */ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + }; + void *ptr; + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. */ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) + return 0; + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} + +/** + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via BPF_PROG_RUN() macro + */ +void bpf_prog_select_runtime(struct bpf_prog *fp) +{ + fp->bpf_func = (void *) __bpf_prog_run; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); + +/* free internal BPF program */ +void bpf_prog_free(struct bpf_prog *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/capability.c b/kernel/capability.c index a5cf13c..989f5bf 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) i++; } + effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + new = prepare_creds(); if (!new) return -ENOMEM; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70776ae..7dc8788 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. + */ +static bool cgroup_legacy_files_on_dfl; + /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG - | (1 << debug_cgrp_id) -#endif - ; +static unsigned int cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp) } /** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function determines which subsystems need to be enabled given the + * current @cgrp->subtree_control and records it in + * @cgrp->child_subsys_mask. The resulting mask is always a superset of + * @cgrp->subtree_control and follows the usual hierarchy rules. + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + unsigned int cur_ss_mask = cgrp->subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + if (!cgroup_on_dfl(cgrp)) { + cgrp->child_subsys_mask = cur_ss_mask; + return; + } + + while (true) { + unsigned int new_ss_mask = cur_ss_mask; + + for_each_subsys(ss, ssid) + if (cur_ss_mask & (1 << ssid)) + new_ss_mask |= ss->depends_on; + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. + */ + if (parent) + new_ss_mask &= parent->child_subsys_mask; + else + new_ss_mask &= cgrp->root->subsys_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + cgrp->child_subsys_mask = cur_ss_mask; +} + +/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + src_root->cgrp.subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(&src_root->cgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) { + dst_root->cgrp.subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + } if (ss->bind) ss->bind(css); @@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq, for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) - seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) bool all_ss = false, one_ss = false; unsigned int mask = -1U; struct cgroup_subsys *ss; + int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS @@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + if (!*token) return -EINVAL; if (!strcmp(token, "none")) { @@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* Consistency checks */ - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || - opts->cpuset_clone_children || opts->release_agent || - opts->name) { - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + if (nr_opts != 1) { + pr_err("sane_behavior: no other mount options allowed\n"); return -EINVAL; } - } else { - /* - * If the 'all' option was specified select all the - * subsystems, otherwise if 'none', 'name=' and a subsystem - * name options were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So - * all empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + return 0; } /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. @@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; - /* Can't specify "none" and some subsystems */ if (opts->subsys_mask && opts->none) return -EINVAL; @@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) struct cgroup_sb_opts opts; unsigned int added_mask, removed_mask; - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: remount is not allowed\n"); + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); return -EINVAL; } @@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || + if ((opts.flags ^ root->flags) || (opts.name && strcmp(opts.name, root->name))) { pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", - root->flags & CGRP_ROOT_OPTION_MASK, root->name); + opts.flags, opts.name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (root == &cgrp_dfl_root) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(root_cgrp, base_files, true); if (ret) goto destroy_root; @@ -1638,7 +1698,7 @@ destroy_root: exit_root_id: cgroup_exit_root_id(root); cancel_ref: - percpu_ref_cancel_init(&root_cgrp->self.refcnt); + percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; /* look for a matching existing root */ - if (!opts.subsys_mask && !opts.none && !opts.name) { + if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { cgrp_dfl_root_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); @@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto out_unlock; - } else { - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - } - } + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); /* * We want to reuse @root whose lifetime is governed by its @@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_puts(seq, "0\n"); return 0; } @@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); return 0; } @@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } @@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; + unsigned int css_enable, css_disable, old_ctrl, new_ctrl; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { - if (cgrp->child_subsys_mask & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * @ss is already enabled through dependency and + * we'll just make it visible. Skip draining. + */ + if (cgrp->child_subsys_mask & (1 << ssid)) + continue; + /* * Because css offlining is asynchronous, userland * might try to re-enable the same controller while @@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { - ret = -ENOENT; - goto out_unlock; - } } else if (disable & (1 << ssid)) { - if (!(cgrp->child_subsys_mask & (1 << ssid))) { + if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { - if (child->child_subsys_mask & (1 << ssid)) { + if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } @@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Create csses for enables and update child_subsys_mask. This - * changes cgroup_e_css() results which in turn makes the - * subsequent cgroup_update_dfl_csses() associate all tasks in the - * subtree to the updated csses. + * Update subsys masks and calculate what needs to be done. More + * subsystems than specified may need to be enabled or disabled + * depending on subsystem dependencies. + */ + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; + + old_ctrl = cgrp->child_subsys_mask; + cgroup_refresh_child_subsys_mask(cgrp); + new_ctrl = cgrp->child_subsys_mask; + + css_enable = ~old_ctrl & new_ctrl; + css_disable = old_ctrl & ~new_ctrl; + enable |= css_enable; + disable |= css_disable; + + /* + * Create new csses or make the existing ones visible. A css is + * created invisible if it's being implicitly enabled through + * dependency. An invisible css is made visible when the userland + * explicitly enables it. */ for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) continue; cgroup_for_each_live_child(child, cgrp) { - ret = create_css(child, ss); + if (css_enable & (1 << ssid)) + ret = create_css(child, ss, + cgrp->subtree_control & (1 << ssid)); + else + ret = cgroup_populate_dir(child, 1 << ssid); if (ret) goto err_undo_css; } } - cgrp->child_subsys_mask |= enable; - cgrp->child_subsys_mask &= ~disable; - + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ ret = cgroup_update_dfl_csses(cgrp); if (ret) goto err_undo_css; - /* all tasks are now migrated away from the old csses, kill them */ + /* + * All tasks are migrated out of disabled csses. Kill or hide + * them. A css is hidden when the userland requests it to be + * disabled while other subsystems are still depending on it. The + * css must not actively control resources and be in the vanilla + * state if it's made visible again later. Controllers which may + * be depended upon should provide ->css_reset() for this purpose. + */ for_each_subsys(ss, ssid) { if (!(disable & (1 << ssid))) continue; - cgroup_for_each_live_child(child, cgrp) - kill_css(cgroup_css(child, ss)); + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + + if (css_disable & (1 << ssid)) { + kill_css(css); + } else { + cgroup_clear_dir(child, 1 << ssid); + if (ss->css_reset) + ss->css_reset(css); + } + } } kernfs_activate(cgrp->kn); @@ -2755,8 +2853,9 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->child_subsys_mask &= ~enable; - cgrp->child_subsys_mask |= disable; + cgrp->subtree_control &= ~enable; + cgrp->subtree_control |= disable; + cgroup_refresh_child_subsys_mask(cgrp); for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -2764,8 +2863,14 @@ err_undo_css: cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); - if (css) + + if (!css) + continue; + + if (css_enable & (1 << ssid)) kill_css(css); + else + cgroup_clear_dir(child, 1 << ssid); } } goto out_unlock; @@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * This isn't a proper migration and its usefulness is very - * limited. Disallow if sane_behavior. + * limited. Disallow on the default hierarchy. */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return -EPERM; /* @@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ - if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; - if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; @@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; + + /* revert flags set by cgroup core while adding @cfts */ + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); } } @@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. + */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_ONLY_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. + */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3699,8 +3841,9 @@ after: * * All this extra complexity was caused by the original implementation * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if - * sane_behavior so that no such expectation exists in the new interface. + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. * * Scrambling is done by swapping every two consecutive bits, which is * non-identity one-to-one mapping which disturbs sort order sufficiently. @@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid) static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return pid_fry(pid); else return pid; @@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) sort(array, length, sizeof(pid_t), fried_cmppid, NULL); else sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, return 0; } -static struct cftype cgroup_base_files[] = { +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = { .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.clone_children", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_root_controllers_show, }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_populated_show, }, + { } /* terminate */ +}; - /* - * Historical crazy stuff. These don't have "cgroup." prefix and - * don't exist if sane_behavior. If you're depending on these, be - * prepared to be burned. - */ +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, { .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, @@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = { }, { .name = "notify_on_release", - .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", - .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, @@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + percpu_ref_exit(&css->refcnt); + if (css->ss) { /* css free path */ if (css->parent) @@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css) * create_css - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css + * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created. - * Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp with all interface files created if + * @visible. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_percpu_ref; css->id = err; - err = cgroup_populate_dir(cgrp, 1 << ss->id); - if (err) - goto err_free_id; + if (visible) { + err = cgroup_populate_dir(cgrp, 1 << ss->id); + if (err) + goto err_free_id; + } /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); @@ -4372,7 +4527,7 @@ err_list_del: err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: - percpu_ref_cancel_init(&css->refcnt); + percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return err; @@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + struct cftype *base_files; int ssid, ret; parent = cgroup_kn_lock_live(parent_kn); @@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (cgroup_on_dfl(cgrp)) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(cgrp, base_files, true); if (ret) goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss); + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); if (ret) goto out_destroy; } @@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * On the default hierarchy, a child doesn't automatically inherit - * child_subsys_mask from the parent. Each is configured manually. + * subtree_control from the parent. Each is configured manually. */ - if (!cgroup_on_dfl(cgrp)) - cgrp->child_subsys_mask = parent->child_subsys_mask; + if (!cgroup_on_dfl(cgrp)) { + cgrp->subtree_control = parent->subtree_control; + cgroup_refresh_child_subsys_mask(cgrp); + } kernfs_activate(kn); @@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: - percpu_ref_cancel_init(&cgrp->self.refcnt); + percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = - { .flags = CGRP_ROOT_SANE_BEHAVIOR }; + static struct cgroup_sb_opts __initdata opts; struct cgroup_subsys *ss; int i; @@ -4775,7 +4938,8 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); mutex_lock(&cgroup_mutex); @@ -4807,9 +4971,22 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (!ss->disabled) { - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + if (ss->disabled) + continue; + + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + + if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) + ss->dfl_cftypes = ss->legacy_cftypes; + + if (!ss->dfl_cftypes) + cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + + if (ss->dfl_cftypes == ss->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); + } else { + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } } @@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_set_legacy_files_on_dfl(char *str) +{ + printk("cgroup: using legacy files on the default hierarchy\n"); + cgroup_legacy_files_on_dfl = true; + return 0; +} +__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .base_cftypes = debug_files, + .legacy_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a79e40f..92b98cc 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = { .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, - .base_cftypes = files, + .legacy_cftypes = files, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index a343bde..81e2a38 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int cpu) +static inline void check_for_tasks(int dead_cpu) { - struct task_struct *p; - cputime_t utime, stime; + struct task_struct *g, *p; - write_lock_irq(&tasklist_lock); - for_each_process(p) { - task_cputime(p, &utime, &stime); - if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (utime || stime)) - pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); - } - write_unlock_irq(&tasklist_lock); + read_lock_irq(&tasklist_lock); + do_each_thread(g, p) { + if (!p->on_rq) + continue; + /* + * We do the check with unlocked task_rq(p)->lock. + * Order the reading to do not warn about a task, + * which was running on this cpu in the past, and + * it's just been woken on another cpu. + */ + rmb(); + if (task_cpu(p) != dead_cpu) + continue; + + pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", + p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); + } while_each_thread(g, p); + read_unlock_irq(&tasklist_lock); } struct take_cpu_down_param { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a416..22874d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierachy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; /* * This is old Memory Nodes tasks took on. @@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) cs = parent_cs(cs); - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); + cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } /* @@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) + while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); + nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /* @@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { - kfree(trial); - return NULL; - } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) + goto free_cpus; + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; + +free_cpus: + free_cpumask_var(trial->cpus_allowed); +free_cs: + kfree(trial); + return NULL; } /** @@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->effective_cpus); free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* We must be a subset of our parent cpuset */ + /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!is_cpuset_subset(trial, par)) + if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) goto out; /* @@ -480,11 +514,11 @@ out: #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void @@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.effective_cpus); goto done; } @@ -705,7 +739,7 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpumask_or(dp, dp, b->cpus_allowed); + cpumask_or(dp, dp, b->effective_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void) * passing doms with offlined cpu to partition_sched_domains(). * Anyways, hotplug work item will rebuild sched domains. */ - if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) goto out; /* Generate domain masks and attrs */ @@ -781,45 +815,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - * if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ - while (cpumask_empty(cs->cpus_allowed)) - cs = parent_cs(cs); - return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - * if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ - while (nodes_empty(cs->mems_allowed)) - cs = parent_cs(cs); - return cs; -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) */ static void update_tasks_cpumask(struct cpuset *cs) { - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); } /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated. * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; + bool need_rebuild_sched_domains = false; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs. + */ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent->effective_cpus); + + /* Skip the whole subtree if the cpumask remains the same. */ + if (cpumask_equal(new_cpus, cp->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cpumask_copy(cp->effective_cpus, new_cpus); + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); + update_tasks_cpumask(cp); + /* + * If the effective cpumask of any non-empty cpuset is changed, + * we need to rebuild sched domains. + */ + if (!cpumask_empty(cp->cpus_allowed) && + is_sched_load_balance(cp)) + need_rebuild_sched_domains = true; + rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); + + if (need_rebuild_sched_domains) + rebuild_sched_domains_locked(); } /** @@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, + top_cpuset.cpus_allowed)) return -EINVAL; } @@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - is_load_balanced = is_sched_load_balance(trialcs); - mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true); - - if (is_load_balanced) - rebuild_sched_domains_locked(); + /* use trialcs->cpus_allowed as a temp variable */ + update_cpumasks_hier(cs, trialcs->cpus_allowed); return 0; } @@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; - struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &tsk->mems_allowed); + guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); rcu_read_unlock(); } @@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - guarantee_online_mems(mems_cs, &newmems); + guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs) } /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hiearchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some MEMs. + */ + if (nodes_empty(*new_mems)) + *new_mems = parent->effective_mems; + + /* Skip the whole subtree if the nodemask remains the same. */ + if (nodes_equal(*new_mems, cp->effective_mems)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cp->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); + update_tasks_nodemask(cp); rcu_read_lock(); @@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_MEMORY])) { - retval = -EINVAL; + top_cpuset.mems_allowed)) { + retval = -EINVAL; goto done; } } @@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true); + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &cs->mems_allowed); done: return retval; } @@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, mutex_lock(&cpuset_mutex); - /* - * We allow to move tasks into an empty cpuset if sane_behavior - * flag is set. - */ + /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_sane_behavior(css->cgroup) && + if (!cgroup_on_dfl(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *leader = cgroup_taskset_first(tset); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cpus_cs, cpus_attach); + guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, tset) { /* @@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_to = cs->mems_allowed; + cpuset_attach_nodemask_to = cs->effective_mems; mm = get_task_mm(leader); if (mm) { - struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* @@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * mm from. */ if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); } mmput(mm); @@ -1516,6 +1541,8 @@ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_MEMLIST: s += nodelist_scnprintf(s, count, cs->mems_allowed); break; + case FILE_EFFECTIVE_CPULIST: + s += cpulist_scnprintf(s, count, cs->effective_cpus); + break; + case FILE_EFFECTIVE_MEMLIST: + s += nodelist_scnprintf(s, count, cs->effective_mems); + break; default: ret = -EINVAL; goto out_unlock; @@ -1779,6 +1812,18 @@ static struct cftype files[] = { }, { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, @@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) + goto free_cpus; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); + cpumask_clear(cs->effective_cpus); + nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; + +free_cpus: + free_cpumask_var(cs->cpus_allowed); +free_cs: + kfree(cs); + return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); + mutex_lock(&callback_mutex); + if (cgroup_on_dfl(cs->css.cgroup)) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + } + mutex_unlock(&callback_mutex); + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); kfree(cs); } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ + mutex_lock(&cpuset_mutex); + mutex_lock(&callback_mutex); + + if (cgroup_on_dfl(root_css->cgroup)) { + cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + top_cpuset.mems_allowed = node_possible_map; + } else { + cpumask_copy(top_cpuset.cpus_allowed, + top_cpuset.effective_cpus); + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + + mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); +} + struct cgroup_subsys cpuset_cgrp_subsys = { - .css_alloc = cpuset_css_alloc, - .css_online = cpuset_css_online, - .css_offline = cpuset_css_offline, - .css_free = cpuset_css_free, - .can_attach = cpuset_can_attach, - .cancel_attach = cpuset_cancel_attach, - .attach = cpuset_attach, - .base_cftypes = files, - .early_init = 1, + .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, + .css_free = cpuset_css_free, + .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, + .attach = cpuset_attach, + .bind = cpuset_bind, + .legacy_cftypes = files, + .early_init = 1, }; /** @@ -1990,9 +2070,13 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); + cpumask_setall(top_cpuset.effective_cpus); + nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migratecd to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + update_tasks_cpumask(cs); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + if (cpus_updated) + update_tasks_cpumask(cs); + if (mems_updated) + update_tasks_nodemask(cs); +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void cpuset_hotplug_update_tasks(struct cpuset *cs) { - static cpumask_t off_cpus; - static nodemask_t off_mems; - bool is_empty; - bool sane = cgroup_sane_behavior(cs->css.cgroup); - + static cpumask_t new_cpus; + static nodemask_t new_mems; + bool cpus_updated; + bool mems_updated; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2207,20 @@ retry: goto retry; } - cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); - nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' cpumask - * for empty cpuset to take on ancestor's cpumask. Otherwise, don't - * call update_tasks_cpumask() if the cpuset becomes empty, as - * the tasks in it will be migrated to an ancestor. - */ - if ((sane && cpumask_empty(cs->cpus_allowed)) || - (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs); + cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' nodemask - * for empty cpuset to take on ancestor's nodemask. Otherwise, don't - * call update_tasks_nodemask() if the cpuset becomes empty, as - * the tasks in it will be migratd to an ancestor. - */ - if ((sane && nodes_empty(cs->mems_allowed)) || - (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs); + cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); + mems_updated = !nodes_equal(new_mems, cs->effective_mems); - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); + if (cgroup_on_dfl(cs->css.cgroup)) + hotplug_update_tasks(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + else + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); mutex_unlock(&cpuset_mutex); - - /* - * If sane_behavior flag is set, we'll keep tasks in empty cpusets. - * - * Otherwise move tasks to the nearest ancestor with execution - * resources. This is full cgroup operation which will - * also call back into cpuset. Should be done outside any lock. - */ - if (!sane && is_empty) - remove_tasks_in_empty_cpuset(cs); } /** @@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; + bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); mutex_lock(&cpuset_mutex); @@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; - cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); mutex_unlock(&callback_mutex); /* we don't mess with cpumasks of tasks in top_cpuset */ } @@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = new_mems; + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; mutex_unlock(&callback_mutex); update_tasks_nodemask(&top_cpuset); } @@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; + cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); + top_cpuset.effective_mems = node_states[N_MEMORY]; + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); } @@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - struct cpuset *cpus_cs; - mutex_lock(&callback_mutex); rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - guarantee_online_cpus(cpus_cs, pmask); + guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); mutex_unlock(&callback_mutex); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - struct cpuset *cpus_cs; - rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* @@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *mems_cs; nodemask_t mask; mutex_lock(&callback_mutex); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &mask); + guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); mutex_unlock(&callback_mutex); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2f7c760..379650b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) static void kdb_sysinfo(struct sysinfo *val) { struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); memset(val, 0, sizeof(*val)); val->uptime = uptime.tv_sec; val->loads[0] = avenrun[0]; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 54996b7..ef90b04 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk) } /* - * Start accounting for a delay statistic using - * its starting timestamp (@start) + * Finish delay accounting for a statistic using its timestamps (@start), + * accumalator (@total) and @count */ - -static inline void delayacct_start(struct timespec *start) +static void delayacct_end(u64 *start, u64 *total, u32 *count) { - do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) -{ - struct timespec ts; - s64 ns; + s64 ns = ktime_get_ns() - *start; unsigned long flags; - do_posix_clock_monotonic_gettime(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(¤t->delays->lock, flags); - *total += ns; - (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + if (ns > 0) { + spin_lock_irqsave(¤t->delays->lock, flags); + *total += ns; + (*count)++; + spin_unlock_irqrestore(¤t->delays->lock, flags); + } } void __delayacct_blkio_start(void) { - delayacct_start(¤t->delays->blkio_start); + current->delays->blkio_start = ktime_get_ns(); } void __delayacct_blkio_end(void) @@ -89,35 +72,29 @@ void __delayacct_blkio_end(void) if (current->delays->flags & DELAYACCT_PF_SWAPIN) /* Swapin block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->swapin_delay, ¤t->delays->swapin_count); else /* Other block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; + unsigned long long t2, t3; + unsigned long flags, t1; + s64 tmp; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); - cputime_to_timespec(utime + stime, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_run_real_total; + tmp += cputime_to_nsecs(utime + stime); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - tmp = (s64)d->cpu_scaled_run_real_total; task_cputime_scaled(tsk, &utimescaled, &stimescaled); - cputime_to_timespec(utimescaled + stimescaled, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_scaled_run_real_total; + tmp += cputime_to_nsecs(utimescaled + stimescaled); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; @@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - delayacct_start(¤t->delays->freepages_start); + current->delays->freepages_start = ktime_get_ns(); } void __delayacct_freepages_end(void) { delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_end, ¤t->delays->freepages_delay, ¤t->delays->freepages_count); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b17ac1..1cf24b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { + if (vma->vm_ops && vma->vm_ops->name) { + name = (char *) vma->vm_ops->name(vma); + if (name) + goto cpy_name; + } + name = (char *)arch_vma_name(vma); if (name) goto cpy_name; @@ -7804,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e..1d0af8a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668..32c58f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* @@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) @@ -648,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -691,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -716,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -836,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 6a13c46..1380d8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) @@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -851,15 +875,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1095,17 +1143,9 @@ static void rt_mutex_init_task(struct task_struct *p) p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; - p->pi_top_task = NULL; #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@ -1196,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1262,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) @@ -1307,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1328,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1437,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(¤t->sighand->siglock); /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@ -1873,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --git a/kernel/futex.c b/kernel/futex.c index b632b5f..d3a9d94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr) * [10] There is no transient state which leaves owner and user space * TID out of sync. */ -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. + */ +static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, + struct futex_pi_state **ps) { - struct futex_pi_state *pi_state = NULL; - struct futex_q *this, *next; - struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (match_futex(&this->key, key)) { - /* - * Sanity check the waiter before increasing - * the refcount and attaching to it. - */ - pi_state = this->pi_state; - /* - * Userspace might have messed up non-PI and - * PI futexes [3] - */ - if (unlikely(!pi_state)) - return -EINVAL; + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!atomic_read(&pi_state->refcount)); + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { /* - * Handle the owner died case: + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] */ - if (uval & FUTEX_OWNER_DIED) { - /* - * exit_pi_state_list sets owner to NULL and - * wakes the topmost waiter. The task which - * acquires the pi_state->rt_mutex will fixup - * owner. - */ - if (!pi_state->owner) { - /* - * No pi state owner, but the user - * space TID is not 0. Inconsistent - * state. [5] - */ - if (pid) - return -EINVAL; - /* - * Take a ref on the state and - * return. [4] - */ - goto out_state; - } - - /* - * If TID is 0, then either the dying owner - * has not yet executed exit_pi_state_list() - * or some waiter acquired the rtmutex in the - * pi state, but did not yet fixup the TID in - * user space. - * - * Take a ref on the state and return. [6] - */ - if (!pid) - goto out_state; - } else { - /* - * If the owner died bit is not set, - * then the pi_state must have an - * owner. [7] - */ - if (!pi_state->owner) - return -EINVAL; - } - + if (pid) + return -EINVAL; /* - * Bail out if user space manipulated the - * futex value. If pi state exists then the - * owner TID must be the same as the user - * space TID. [9/10] + * Take a ref on the state and return success. [4] */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - - out_state: - atomic_inc(&pi_state->refcount); - *ps = pi_state; - return 0; + goto out_state; } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_state; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + return -EINVAL; } /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; +out_state: + atomic_inc(&pi_state->refcount); + *ps = pi_state; + return 0; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. + */ +static int attach_to_pi_owner(u32 uval, union futex_key *key, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; + struct task_struct *p; + + /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] */ @@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, pi_state = alloc_pi_state(); /* - * Initialize the pi_mutex in locked state and make 'p' + * Initialize the pi_mutex in locked state and make @p * the owner of it: */ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); @@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) +{ + struct futex_q *match = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. + */ + return attach_to_pi_owner(uval, key, ps); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 uninitialized_var(curval); + + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) + return -EFAULT; + + /*If user space value changed, let the caller retry */ + return curval != uval ? -EAGAIN : 0; +} + /** * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address @@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, force_take = 0; - u32 uval, newval, curval, vpid = task_pid_vnr(task); - -retry: - ret = lock_taken = 0; + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *match; + int ret; /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. + * Read the user space value first so we can validate a few + * things before proceeding further. */ - newval = vpid; - if (set_waiters) - newval |= FUTEX_WAITERS; - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) + if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* - * Surprise - we got the lock, but we do not trust user space at all. - */ - if (unlikely(!curval)) { - /* - * We verify whether there is kernel state for this - * futex. If not, we can safely assume, that the 0 -> - * TID transition is correct. If state exists, we do - * not bother to fixup the user space state as it was - * corrupted already. - */ - return futex_top_waiter(hb, key) ? -EINVAL : 1; - } - - uval = curval; - - /* - * Set the FUTEX_WAITERS flag, so the owner will know it has someone - * to wake at the next unlock. + * Lookup existing state first. If it exists, try to attach to + * its pi_state. */ - newval = curval | FUTEX_WAITERS; + match = futex_top_waiter(hb, key); + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); /* - * Should we force take the futex? See below. + * No waiter and user TID is 0. We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. */ - if (unlikely(force_take)) { + if (!(uval & FUTEX_TID_MASK)) { /* - * Keep the OWNER_DIED and the WAITERS bit and set the - * new TID value. + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. */ - newval = (curval & ~FUTEX_TID_MASK) | vpid; - force_take = 0; - lock_taken = 1; - } + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; - if (unlikely(curval != uval)) - goto retry; + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + /* If the take over worked, return 1 */ + return ret < 0 ? ret : 1; + } /* - * We took the lock due to forced take over. + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. */ - if (unlikely(lock_taken)) - return 1; - + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. */ - ret = lookup_pi_state(uval, hb, key, ps); - - if (unlikely(ret)) { - switch (ret) { - case -ESRCH: - /* - * We failed to find an owner for this - * futex. So we have no pi_state to block - * on. This can happen in two cases: - * - * 1) The owner died - * 2) A stale FUTEX_WAITERS bit - * - * Re-read the futex value. - */ - if (get_futex_value_locked(&curval, uaddr)) - return -EFAULT; - - /* - * If the owner died or we have a stale - * WAITERS bit the owner TID in the user space - * futex is 0. - */ - if (!(curval & FUTEX_TID_MASK)) { - force_take = 1; - goto retry; - } - default: - break; - } - } - - return ret; + return attach_to_pi_owner(uval, key, ps); } /** @@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) return 0; } -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ - u32 uninitialized_var(oldval); - - /* - * There is no waiter, so we unlock the futex. The owner died - * bit has not to be preserved here. We are the owner: - */ - if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) - return -EFAULT; - if (oldval != uval) - return -EAGAIN; - - return 0; -} - /* * Express the locking dependencies for lockdep: */ @@ -1659,7 +1626,12 @@ retry_private: goto retry; goto out; case -EAGAIN: - /* The owner was exiting, try again. */ + /* + * Two reasons for this: + * - Owner is exiting and we just wait for the + * exit to complete. + * - The user space value changed. + */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1718,7 +1690,7 @@ retry_private: this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, - this->task, 1); + this->task); if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); @@ -2316,8 +2288,10 @@ retry_private: goto uaddr_faulted; case -EAGAIN: /* - * Task is exiting and we just wait for the - * exit to complete. + * Two reasons for this: + * - Task is exiting and we just wait for the + * exit to complete. + * - The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); @@ -2337,9 +2311,9 @@ retry_private: /* * Block on the PI mutex: */ - if (!trylock) - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); - else { + if (!trylock) { + ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); + } else { ret = rt_mutex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; @@ -2401,10 +2375,10 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - struct futex_hash_bucket *hb; - struct futex_q *this, *next; + u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - u32 uval, vpid = task_pid_vnr(current); + struct futex_hash_bucket *hb; + struct futex_q *match; int ret; retry: @@ -2417,57 +2391,47 @@ retry: return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; + if (ret) + return ret; hb = hash_futex(&key); spin_lock(&hb->lock); /* - * To avoid races, try to do the TID -> 0 atomic transition - * again. If it succeeds then we can return without waking - * anyone else up. We only try this if neither the waiters nor - * the owner died bit are set. - */ - if (!(uval & ~FUTEX_TID_MASK) && - cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) - goto pi_faulted; - /* - * Rare case: we managed to release the lock atomically, - * no need to wake anyone else up: - */ - if (unlikely(uval == vpid)) - goto out_unlock; - - /* - * Ok, other tasks may need to be woken up - check waiters - * and do the wakeup if necessary: + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. */ - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (!match_futex (&this->key, &key)) - continue; - ret = wake_futex_pi(uaddr, uval, this); + match = futex_top_waiter(hb, &key); + if (match) { + ret = wake_futex_pi(uaddr, uval, match); /* - * The atomic access to the futex value - * generated a pagefault, so retry the - * user-access and the wakeup: + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; goto out_unlock; } + /* - * No waiters - kernel unlocks the futex: + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We do neither + * preserve the WAITERS bit not the OWNER_DIED one. We are the + * owner. */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) goto pi_faulted; + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 0 : -EAGAIN; + out_unlock: spin_unlock(&hb->lock); put_futex_key(&key); - -out: return ret; pi_faulted: @@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter); spin_lock(q.lock_ptr); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a..edf67c4 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2..cf80e7b 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain */ -static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw_irq) +int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain_chip_generic *dgc = d->gc; @@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } +EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e..6534ff6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index a82170e..e6bcbe7 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -16,11 +16,12 @@ #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/smp.h> #include <asm/processor.h> -static DEFINE_PER_CPU(struct llist_head, irq_work_list); -static DEFINE_PER_CPU(int, irq_work_raised); +static DEFINE_PER_CPU(struct llist_head, raised_list); +static DEFINE_PER_CPU(struct llist_head, lazy_list); /* * Claim the entry so that no one else will poke at it. @@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void) */ } +#ifdef CONFIG_SMP /* - * Enqueue the irq_work @entry unless it's already pending + * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ +bool irq_work_queue_on(struct irq_work *work, int cpu) +{ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return false; + + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); + + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue_on); +#endif + +/* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ @@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - - /* - * If the work is not "lazy" or the tick is stopped, raise the irq - * work interrupt (if supported by the arch), otherwise, just wait - * for the next tick. - */ - if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { - if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) arch_irq_work_raise(); } @@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue); bool irq_work_needs_cpu(void) { - struct llist_head *this_list; + struct llist_head *raised, *lazy; - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) + raised = &__get_cpu_var(raised_list); + lazy = &__get_cpu_var(lazy_list); + if (llist_empty(raised) && llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ @@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void) return true; } -static void __irq_work_run(void) +static void irq_work_run_list(struct llist_head *list) { unsigned long flags; struct irq_work *work; - struct llist_head *this_list; struct llist_node *llnode; + BUG_ON(!irqs_disabled()); - /* - * Reset the "raised" state right before we check the list because - * an NMI may enqueue after we find the list empty from the runner. - */ - __this_cpu_write(irq_work_raised, 0); - barrier(); - - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) + if (llist_empty(list)) return; - BUG_ON(!irqs_disabled()); - - llnode = llist_del_all(this_list); + llnode = llist_del_all(list); while (llnode != NULL) { work = llist_entry(llnode, struct irq_work, llnode); @@ -149,13 +161,13 @@ static void __irq_work_run(void) } /* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. + * hotplug calls this through: + * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { - BUG_ON(!in_irq()); - __irq_work_run(); + irq_work_run_list(&__get_cpu_var(raised_list)); + irq_work_run_list(&__get_cpu_var(lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); @@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); - -#ifdef CONFIG_HOTPLUG_CPU -static int irq_work_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_DYING: - /* Called from stop_machine */ - if (WARN_ON_ONCE(cpu != smp_processor_id())) - break; - __irq_work_run(); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_notify; - -static __init int irq_work_init_cpu_notifier(void) -{ - cpu_notify.notifier_call = irq_work_cpu_notify; - cpu_notify.priority = 0; - register_cpu_notifier(&cpu_notify); - return 0; -} -device_initcall(irq_work_init_cpu_notifier); - -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37..ae51670 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c9..0b49a0a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -40,6 +42,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; } + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) @@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kthread.c b/kernel/kthread.c index c2390f4..ef48322 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker, list_add_tail(&work->node, pos); work->worker = worker; - if (likely(worker->task)) + if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d24e433..88d0d44 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +#ifdef CONFIG_LOCK_STAT printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +#endif } static int save_trace(struct stack_trace *trace) diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index be9ee15..9887a90 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -1,6 +1,4 @@ - #include <linux/percpu.h> -#include <linux/mutex.h> #include <linux/sched.h> #include "mcs_spinlock.h" @@ -79,7 +77,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return next; @@ -120,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) if (need_resched()) goto unqueue; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return true; @@ -146,7 +144,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 74356dc..23e89c5 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -27,7 +27,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - arch_mutex_cpu_relax(); \ + cpu_relax_lowlatency(); \ } while (0) #endif @@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index acca2c1..ae712b2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,12 +46,6 @@ # include <asm/mutex.h> #endif -/* - * A negative mutex count indicates that waiters are sleeping waiting for the - * mutex. - */ -#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) - void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (need_resched()) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } rcu_read_unlock(); @@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * Optimistic spinning. * - * We try to spin for acquisition when we find that there are no - * pending waiters and the lock owner is currently running on a - * (different) CPU. - * - * The rationale is that if the lock owner is running, it is likely to - * release the lock soon. + * We try to spin for acquisition when we find that the lock owner + * is currently running on a (different) CPU and while we don't + * need to reschedule. The rationale is that if the lock owner is + * running, it is likely to release the lock soon. * * Since this needs the lock owner, and this mutex implementation * doesn't track the owner atomically in the lock field, we need to @@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (owner && !mutex_spin_on_owner(lock, owner)) break; - if ((atomic_read(&lock->count) == 1) && + /* Try to acquire the mutex if it is unlocked. */ + if (!mutex_is_locked(lock) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } osq_unlock(&lock->osq); slowpath: @@ -485,8 +478,11 @@ slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); - /* once more, can we acquire the lock? */ - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + /* + * Once more, try to acquire the lock. Only try-lock the mutex if + * it is unlocked to reduce unnecessary xchg() operations. + */ + if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -506,9 +502,10 @@ slowpath: * it's unlocked. Later on, if we sleep, this is the * operation that gives us the lock. We xchg it to -1, so * that when we release the lock, we properly wake up the - * other waiters: + * other waiters. We only attempt the xchg if the count is + * non-negative in order to avoid unnecessary xchg operations: */ - if (MUTEX_SHOW_NO_WAITER(lock) && + if (atomic_read(&lock->count) >= 0 && (atomic_xchg(&lock->count, -1) == 1)) break; @@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) unsigned long flags; int prev; + /* No need to trylock if the mutex is locked. */ + if (mutex_is_locked(lock)) + return 0; + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fb5b8ac..f956ede 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,7 +20,6 @@ #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/mutex.h> #include <asm/qrwlock.h> /** @@ -35,7 +34,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = smp_load_acquire((u32 *)&lock->cnts); } } @@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock) * to make sure that the write lock isn't taken. */ while (atomic_read(&lock->cnts) & _QW_WMASK) - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); @@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) cnts | _QW_WAITING) == cnts)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } /* When no more readers, set the locked flag */ @@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } unlock: arch_spin_unlock(&lock->lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 49b2ed3..62b6cee 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task) * the deadlock. We print when we return. act_waiter can be NULL in * case of a remove waiter operation. */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, +void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *act_waiter, struct rt_mutex *lock) { struct task_struct *task; - if (!debug_locks || detect || !act_waiter) + if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index ab29b6a..d0519c3 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, +extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); # define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk walk) { return (waiter != NULL); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fc60594..a0ea2a1 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task) } /* + * Deadlock detection is conditional: + * + * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted + * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. + * + * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always + * conducted independent of the detect argument. + * + * If the waiter argument is NULL this indicates the deboost path and + * deadlock detection is disabled independent of the detect argument + * and the config settings. + */ +static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) +{ + /* + * This is just a wrapper function for the following call, + * because debug_rt_mutex_detect_deadlock() smells like a magic + * debug feature and I wanted to keep the cond function in the + * main source file along with the comments instead of having + * two of the same in the headers. + */ + return debug_rt_mutex_detect_deadlock(waiter, chwalk); +} + +/* * Max number of times we'll walk the boosting chain: */ int max_lock_depth = 1024; @@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * @top_task: the current top waiter * * Returns 0 or -EDEADLK. + * + * Chain walk basics and protection scope + * + * [R] refcount on task + * [P] task->pi_lock held + * [L] rtmutex->wait_lock held + * + * Step Description Protected by + * function arguments: + * @task [R] + * @orig_lock if != NULL @top_task is blocked on it + * @next_lock Unprotected. Cannot be + * dereferenced. Only used for + * comparison. + * @orig_waiter if != NULL @top_task is blocked on it + * @top_task current, or in case of proxy + * locking protected by calling + * code + * again: + * loop_sanity_check(); + * retry: + * [1] lock(task->pi_lock); [R] acquire [P] + * [2] waiter = task->pi_blocked_on; [P] + * [3] check_exit_conditions_1(); [P] + * [4] lock = waiter->lock; [P] + * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] + * unlock(task->pi_lock); release [P] + * goto retry; + * } + * [6] check_exit_conditions_2(); [P] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P] + [L] + * [8] unlock(task->pi_lock); release [P] + * put_task_struct(task); release [R] + * [9] check_exit_conditions_3(); [L] + * [10] task = owner(lock); [L] + * get_task_struct(task); [L] acquire [R] + * lock(task->pi_lock); [L] acquire [P] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] + * [12] check_exit_conditions_4(); [P] + [L] + * [13] unlock(task->pi_lock); release [P] + * unlock(lock->wait_lock); release [L] + * goto again; */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, + enum rtmutex_chainwalk chwalk, struct rt_mutex *orig_lock, struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { - struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; + struct rt_mutex_waiter *prerequeue_top_waiter; + int ret = 0, depth = 0; + struct rt_mutex *lock; + bool detect_deadlock; unsigned long flags; + bool requeue = true; - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); + detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); /* * The (de)boosting is a step by step approach with a lot of @@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * carefully whether things change under us. */ again: + /* + * We limit the lock chain length for each invocation. + */ if (++depth > max_lock_depth) { static int prev_max; @@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return -EDEADLK; } + + /* + * We are fully preemptible here and only hold the refcount on + * @task. So everything can have changed under us since the + * caller or our own code below (goto retry/again) dropped all + * locks. + */ retry: /* - * Task can not go away as we did a get_task() before ! + * [1] Task cannot go away as we did a get_task() before ! */ raw_spin_lock_irqsave(&task->pi_lock, flags); + /* + * [2] Get the waiter on which @task is blocked on. + */ waiter = task->pi_blocked_on; + + /* + * [3] check_exit_conditions_1() protected by task->pi_lock. + */ + /* * Check whether the end of the boosting chain has been * reached or the state of the chain has changed while we @@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* * If deadlock detection is off, we stop here if we - * are not the top pi waiter of the task. + * are not the top pi waiter of the task. If deadlock + * detection is enabled we continue, but stop the + * requeueing in the chain walk. */ - if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) - goto out_unlock_pi; + if (top_waiter != task_top_pi_waiter(task)) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } } /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. + * If the waiter priority is the same as the task priority + * then there is no further priority adjustment necessary. If + * deadlock detection is off, we stop the chain walk. If its + * enabled we continue, but stop the requeueing in the chain + * walk. */ - if (!detect_deadlock && waiter->prio == task->prio) - goto out_unlock_pi; + if (waiter->prio == task->prio) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } + /* + * [4] Get the next lock + */ lock = waiter->lock; + /* + * [5] We need to trylock here as we are holding task->pi_lock, + * which is the reverse lock order versus the other rtmutex + * operations. + */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); @@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* + * [6] check_exit_conditions_2() protected by task->pi_lock and + * lock->wait_lock. + * * Deadlock detection. If the lock is the same as the original * lock which caused us to walk the lock chain or if the * current lock is owned by the task which initiated the chain * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; goto out_unlock_pi; } - top_waiter = rt_mutex_top_waiter(lock); + /* + * If we just follow the lock chain for deadlock detection, no + * need to do all the requeue operations. To avoid a truckload + * of conditionals around the various places below, just do the + * minimum chain walk checks. + */ + if (!requeue) { + /* + * No requeue[7] here. Just release @task [8] + */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * If there is no owner of the lock, end of chain. + */ + if (!rt_mutex_owner(lock)) { + raw_spin_unlock(&lock->wait_lock); + return 0; + } + + /* [10] Grab the next task, i.e. owner of @lock */ + task = rt_mutex_owner(lock); + get_task_struct(task); + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * No requeue [11] here. We just do deadlock detection. + * + * [12] Store whether owner is blocked + * itself. Decision is made after dropping the locks + */ + next_lock = task_blocked_on_lock(task); + /* + * Get the top waiter for the next iteration + */ + top_waiter = rt_mutex_top_waiter(lock); + + /* [13] Drop locks */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&lock->wait_lock); + + /* If owner is not blocked, end of chain. */ + if (!next_lock) + goto out_put_task; + goto again; + } - /* Requeue the waiter */ + /* + * Store the current top waiter before doing the requeue + * operation on @lock. We need it for the boost/deboost + * decision below. + */ + prerequeue_top_waiter = rt_mutex_top_waiter(lock); + + /* [7] Requeue the waiter in the lock waiter list. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); - /* Release the task */ + /* [8] Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * + * We must abort the chain walk if there is no lock owner even + * in the dead lock detection case, as we have nothing to + * follow here. This is the end of the chain we are walking. + */ if (!rt_mutex_owner(lock)) { /* - * If the requeue above changed the top waiter, then we need - * to wake the new top waiter up to try to get the lock. + * If the requeue [7] above changed the top waiter, + * then we need to wake the new top waiter up to try + * to get the lock. */ - - if (top_waiter != rt_mutex_top_waiter(lock)) + if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); raw_spin_unlock(&lock->wait_lock); - goto out_put_task; + return 0; } - put_task_struct(task); - /* Grab the next task */ + /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); raw_spin_lock_irqsave(&task->pi_lock, flags); + /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - rt_mutex_dequeue_pi(task, top_waiter); + /* + * The waiter became the new top (highest priority) + * waiter on the lock. Replace the previous top waiter + * in the owner tasks pi waiters list with this waiter + * and adjust the priority of the owner. + */ + rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); - } else if (top_waiter == waiter) { - /* Deboost the owner */ + } else if (prerequeue_top_waiter == waiter) { + /* + * The waiter was the top waiter on the lock, but is + * no longer the top prority waiter. Replace waiter in + * the owner tasks pi waiters list with the new top + * (highest priority) waiter and adjust the priority + * of the owner. + * The new top waiter is stored in @waiter so that + * @waiter == @top_waiter evaluates to true below and + * we continue to deboost the rest of the chain. + */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); + } else { + /* + * Nothing changed. No need to do any priority + * adjustment. + */ } /* + * [12] check_exit_conditions_4() protected by task->pi_lock + * and lock->wait_lock. The actual decisions are made after we + * dropped the locks. + * * Check whether the task which owns the current lock is pi * blocked itself. If yes we store a pointer to the lock for * the lock chain change detection above. After we dropped * task->pi_lock next_lock cannot be dereferenced anymore. */ next_lock = task_blocked_on_lock(task); + /* + * Store the top waiter of @lock for the end of chain walk + * decision below. + */ + top_waiter = rt_mutex_top_waiter(lock); + /* [13] Drop the locks */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); /* + * Make the actual exit decisions [12], based on the stored + * values. + * * We reached the end of the lock chain. Stop right here. No * point to go back just to figure that out. */ if (!next_lock) goto out_put_task; + /* + * If the current waiter is not the top waiter on the lock, + * then we can stop the chain walk here if we are not in full + * deadlock detection mode. + */ if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * * Must be called with lock->wait_lock held. * - * @lock: the lock to be acquired. - * @task: the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + * @lock: The lock to be acquired. + * @task: The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait list if the + * callsite called task_blocked_on_lock(), otherwise NULL */ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter) { + unsigned long flags; + /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * Before testing whether we can acquire @lock, we set the + * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all + * other tasks which try to modify @lock into the slow path + * and they serialize on @lock->wait_lock. + * + * The RT_MUTEX_HAS_WAITERS bit can have a transitional state + * as explained at the top of this file if and only if: * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. + * - There is a lock owner. The caller must fixup the + * transient state if it does a trylock or leaves the lock + * function due to a signal or timeout. * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. + * - @task acquires the lock and there are no other + * waiters. This is undone in rt_mutex_set_owner(@task) at + * the end of this function. */ mark_rt_mutex_waiters(lock); + /* + * If @lock has an owner, give up. + */ if (rt_mutex_owner(lock)) return 0; /* - * It will get the lock because of one of these conditions: - * 1) there is no waiter - * 2) higher priority than waiters - * 3) it is top waiter + * If @waiter != NULL, @task has already enqueued the waiter + * into @lock waiter list. If @waiter == NULL then this is a + * trylock attempt. */ - if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } - } - - if (waiter || rt_mutex_has_waiters(lock)) { - unsigned long flags; - struct rt_mutex_waiter *top; - - raw_spin_lock_irqsave(&task->pi_lock, flags); + if (waiter) { + /* + * If waiter is not the highest priority waiter of + * @lock, give up. + */ + if (waiter != rt_mutex_top_waiter(lock)) + return 0; - /* remove the queued waiter. */ - if (waiter) { - rt_mutex_dequeue(lock, waiter); - task->pi_blocked_on = NULL; - } + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters list. + */ + rt_mutex_dequeue(lock, waiter); + } else { /* - * We have to enqueue the top waiter(if it exists) into - * task->pi_waiters list. + * If the lock has waiters already we check whether @task is + * eligible to take over the lock. + * + * If there are no other waiters, @task can acquire + * the lock. @task->pi_blocked_on is NULL, so it does + * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - top = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(task, top); + /* + * If @task->prio is greater than or equal to + * the top waiter priority (kernel view), + * @task lost. + */ + if (task->prio >= rt_mutex_top_waiter(lock)->prio) + return 0; + + /* + * The current top waiter stays enqueued. We + * don't have to change anything in the lock + * waiters order. + */ + } else { + /* + * No waiters. Take the lock without the + * pi_lock dance.@task->pi_blocked_on is NULL + * and we have no waiters to enqueue in @task + * pi waiters list. + */ + goto takeit; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); } + /* + * Clear @task->pi_blocked_on. Requires protection by + * @task->pi_lock. Redundant operation for the @waiter == NULL + * case, but conditionals are more expensive than a redundant + * store. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + task->pi_blocked_on = NULL; + /* + * Finish the lock acquisition. @task is the new owner. If + * other waiters exist we have to insert the highest priority + * waiter into @task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +takeit: /* We got the lock. */ debug_rt_mutex_lock(lock); + /* + * This either preserves the RT_MUTEX_HAS_WAITERS bit if there + * are still waiters or clears it. + */ rt_mutex_set_owner(lock, task); rt_mutex_deadlock_account_lock(lock, task); @@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; } @@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock) static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - int first = (waiter == rt_mutex_top_waiter(lock)); + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock = NULL; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); @@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock, current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (!owner) + /* + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. + */ + if (!owner || !is_top_waiter) return; - if (first) { - - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); - rt_mutex_dequeue_pi(owner, waiter); + rt_mutex_dequeue_pi(owner, waiter); - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - next = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(owner, next); - } - __rt_mutex_adjust_prio(owner); + __rt_mutex_adjust_prio(owner); - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Don't walk the chain, if the owner task is not blocked + * itself. + */ if (!next_lock) return; @@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); } /** @@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; int ret = 0; @@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); @@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + rt_mutex_handle_deadlock(ret, chwalk, &waiter); } /* @@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* * Slow path try-lock function: */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { - int ret = 0; + int ret; + + /* + * If the lock already has an owner we fail to get the lock. + * This can be done without taking the @lock->wait_lock as + * it is only being read, and this is a trylock anyway. + */ + if (rt_mutex_owner(lock)) + return 0; + /* + * The mutex has currently no owner. Lock the wait lock and + * try to acquire the lock. + */ raw_spin_lock(&lock->wait_lock); - if (likely(rt_mutex_owner(lock) != current)) { + ret = try_to_take_rt_mutex(lock, current, NULL); - ret = try_to_take_rt_mutex(lock, current, NULL); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - } + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock(&lock->wait_lock); @@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock); + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (chwalk == RT_MUTEX_MIN_CHAINWALK && + likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock); + return slowfn(lock, state, timeout, chwalk); } static inline int @@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off + * @lock: the rt_mutex to be locked * * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) + * 0 on success + * -EINTR when interrupted by a signal */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/* + * Futex variant with full deadlock detection. + */ +int rt_mutex_timed_futex_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + RT_MUTEX_FULL_CHAINWALK, + rt_mutex_slowlock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { might_sleep(); return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); + RT_MUTEX_MIN_CHAINWALK, + rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, * @lock: the rt_mutex to take * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Returns: * 0 - task blocked on lock @@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, int detect_deadlock) + struct task_struct *task) { int ret; @@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, } /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); if (ret && !rt_mutex_owner(lock)) { /* @@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * rt_mutex_finish_proxy_lock() - Complete lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have - * been started. + * been started. * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Complete the lock acquisition started our behalf by another thread. * * Returns: * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * <0 - error, one of -EINTR, -ETIMEDOUT * * Special API call for PI-futex requeue support */ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret; diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index f6a1f3c..c406058 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -22,10 +22,15 @@ #define debug_rt_mutex_init(m, n) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) { WARN(1, "rtmutex deadlock detected\n"); } + +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, + enum rtmutex_chainwalk walk) +{ + return walk == RT_MUTEX_FULL_CHAINWALK; +} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7431a9c..8552125 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) } /* + * Constants for rt mutex functions which have a selectable deadlock + * detection. + * + * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are + * no further PI adjustments to be made. + * + * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full + * walk of the lock chain. + */ +enum rtmutex_chainwalk { + RT_MUTEX_MIN_CHAINWALK, + RT_MUTEX_FULL_CHAINWALK, +}; + +/* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock); + struct task_struct *task); extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock); + struct rt_mutex_waiter *waiter); +extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a2391ac..d6203fa 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) if (need_resched()) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } rcu_read_unlock(); @@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } osq_unlock(&sem->osq); done: diff --git a/kernel/module.c b/kernel/module.c index 81e727c..ae79ce6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,7 +60,6 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> -#include <linux/fips.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info) } /* Not having a signature is only an error if we're strict. */ - if (err < 0 && fips_enabled) - panic("Module verification failed with error %d in FIPS mode\n", - err); if (err == -ENOKEY && !sig_enforce) err = 0; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e78110..ef42d0a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. - * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/panic.c b/kernel/panic.c index 62e16ce..d09dc5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/params.c b/kernel/params.c index 1e52ca2..34f5270 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d78..e4e4121 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -253,9 +253,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config ARCH_HAS_OPP - bool - config PM_OPP bool ---help--- diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fcc2611..a9dfa79 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - ftrace_start(); resume_console(); dpm_complete(msg); @@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); - ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -535,7 +531,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -579,7 +574,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - ftrace_start(); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f33..9a59d04 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (pm_states[i].state) - s += sprintf(s,"%s ", pm_states[i].label); + if (pm_states[i]) + s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) @@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_MIN; - struct pm_sleep_state *s; + suspend_state_t state; #endif char *p; int len; @@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (s->state && len == strlen(s->label) - && !strncmp(buf, s->label, len)) - return s->state; + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } #endif return PM_SUSPEND_ON; @@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state].state ? - pm_states[state].label : "error"); + return sprintf(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); @@ -615,7 +616,6 @@ static struct attribute_group attr_group = { .attrs = g, }; -#ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); @@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void) return pm_wq ? 0 : -ENOMEM; } -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif static int __init pm_init(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b..5d49dca 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -struct pm_sleep_state { - const char *label; - suspend_state_t state; -}; - /* kernel/power/suspend.c */ -extern struct pm_sleep_state pm_states[]; +extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328a..4fc5c32 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * information is stored (in the form of a block of bitmap) * It also contains the pfns that correspond to the start and end of * the represented memory area. + * + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; }; -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. + */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct bm_block *block; - int bit; + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + int node_bit; }; struct memory_bitmap { - struct list_head blocks; /* list of bitmap blocks */ + struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -284,38 +312,178 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static void memory_bm_position_reset(struct memory_bitmap *bm) +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/* + * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) { - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; -} + struct rtree_node *node; -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; -/** - * create_bm_block_list - create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} + +/* + * add_rtree_block - Add a new leave node to the radix tree + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) { - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; - while (nr_blocks-- > 0) { - struct bm_block *bb; + block_nr = zone->blocks; + levels_needed = 0; - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) + /* How many levels do we need for this block nr? */ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } + + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) return -ENOMEM; - list_add(&bb->hook, list); + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; + } + + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; } + zone->blocks += 1; + *dst = block; + return 0; } +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/* + * create_zone_bm_rtree - create a radix tree for one zone + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. + */ +static struct mem_zone_bm_rtree * +create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + unsigned long start, unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/* + * free_zone_bm_rtree - Free the memory of the radix tree + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + struct mem_extent { struct list_head hook; unsigned long start; @@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) int error; chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); if (error) return error; list_for_each_entry(ext, &mem_extents, hook) { - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; - - bb = list_entry(bm->blocks.prev, struct bm_block, hook); + struct mem_zone_bm_rtree *zone; - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) + zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) { + error = -ENOMEM; goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; } + list_add_tail(&zone->list, &bm->zones); } bm->p_list = ca.chain; @@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct bm_block *bb; + struct mem_zone_bm_rtree *zone; - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); free_list_of_pages(bm->p_list, clear_nosave_free); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); } /** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. + * memory_bm_find_bit - Find the bit for pfn in the memory + * bitmap + * + * Find the bit in the bitmap @bm that corresponds to given pfn. + * The cur.zone, cur.block and cur.node_pfn member of @bm are + * updated. + * It walks the radix tree to find the page which contains the bit for + * pfn and returns the bit position in **addr and *bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) + void **addr, unsigned int *bit_nr) { - struct bm_block *bb; + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + + if (!zone) + return -EFAULT; + +zone_found: /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. + * We have a zone. Now walk the radix tree to find the leave + * node for our pfn. */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; - if (&bb->hook == &bm->blocks) - return -EFAULT; + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; return 0; } @@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); + return error; } @@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) return !memory_bm_find_bit(bm, pfn, &addr, &bit); } -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. +/* + * rtree_next_node - Jumps to the next leave node + * + * Sets the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. * - * It is required to run memory_bm_position_reset() before the first call to - * this function. + * Returns true if there is a next node, false otherwise. */ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + touch_softlockup_watchdog(); + return true; + } + + /* No more nodes, goto next zone */ + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + /* No more zones */ + return false; +} + +/** + * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * + * Starting from the last returned position this function searches + * for the next set bit in the memory bitmap and returns its + * number. If no more bit is set BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the + * first call to this function. + */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct bm_block *bb; + unsigned long bits, pfn, pages; int bit; - bb = bm->cur.block; do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + return pfn; + } + } while (rtree_next_node(bm)); - memory_bm_position_reset(bm); return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; } /** @@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void) unsigned int snapshot_additional_pages(struct zone *zone) { - unsigned int res; + unsigned int rtree, nodes; + + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); - return 2 * res; + return 2 * rtree; } #ifdef CONFIG_HIGHMEM @@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed35a47..6dadb25 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,20 +31,11 @@ #include "power.h" -struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { .label = "standby", }, - [PM_SUSPEND_MEM] = { .label = "mem", }, -}; +static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; - -static bool need_suspend_ops(suspend_state_t state) -{ - return state > PM_SUSPEND_FREEZE; -} - static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; @@ -97,10 +88,7 @@ static bool relative_states; static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - if (relative_states) { - pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; - pm_states[PM_SUSPEND_FREEZE].state = 0; - } + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; return 1; } @@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup); void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; - int j = PM_SUSPEND_MAX - 1; + int j = 0; lock_system_sleep(); suspend_ops = ops; for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) - pm_states[j--].state = i; - else if (!relative_states) - pm_states[j--].state = 0; + if (valid_state(i)) { + pm_states[i] = pm_labels[j++]; + } else if (!relative_states) { + pm_states[i] = NULL; + j++; + } - pm_states[j--].state = PM_SUSPEND_FREEZE; - while (j >= PM_SUSPEND_MIN) - pm_states[j--].state = 0; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; unlock_system_sleep(); } @@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state) } EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? + suspend_ops->prepare_late() : 0; +} + +static void platform_suspend_wake(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + suspend_ops->wake(); +} + +static void platform_suspend_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + return freeze_ops->begin(); + else if (suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_suspend_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + else if (suspend_ops->end) + suspend_ops->end(); +} + +static void platform_suspend_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? + suspend_ops->suspend_again() : false; +} + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG @@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state) { int error; - if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); @@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (need_suspend_ops(state) && suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; } - - if (need_suspend_ops(state) && suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; - } + error = platform_suspend_prepare_late(state); + if (error) + goto Platform_wake; if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -248,7 +290,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - ftrace_stop(); error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -275,18 +316,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Enable_cpus: enable_nonboot_cpus(); - ftrace_start(); Platform_wake: - if (need_suspend_ops(state) && suspend_ops->wake) - suspend_ops->wake(); - + platform_suspend_wake(state); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (need_suspend_ops(state) && suspend_ops->finish) - suspend_ops->finish(); - + platform_suspend_finish(state); return error; } @@ -299,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (need_suspend_ops(state) && !suspend_ops) + if (!sleep_state_supported(state)) return -ENOSYS; - if (need_suspend_ops(state) && suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { - error = freeze_ops->begin(); - if (error) - goto Close; - } + error = platform_suspend_begin(state); + if (error) + goto Close; + suspend_console(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); @@ -324,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup && need_suspend_ops(state) - && suspend_ops->suspend_again && suspend_ops->suspend_again()); + } while (!error && !wakeup && platform_suspend_again(state)); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); - Close: - if (need_suspend_ops(state) && suspend_ops->end) - suspend_ops->end(); - else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + Close: + platform_suspend_end(state); return error; Recover_platform: - if (need_suspend_ops(state) && suspend_ops->recover) - suspend_ops->recover(); + platform_suspend_recover(state); goto Resume_devices; } @@ -395,7 +421,7 @@ static int enter_state(suspend_state_t state) printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); error = suspend_prepare(state); if (error) goto Unlock; @@ -404,7 +430,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state].label); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097..2f524928 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); } if (status < 0) @@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value) /* "=mem" ==> "mem" */ value++; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i].label, value)) { - test_state = pm_states[i].state; + if (!strcmp(pm_states[i], value)) { + test_state = i; return 0; } @@ -162,8 +162,8 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!pm_states[test_state].state) { - printk(warn_bad_state, pm_states[test_state].label); + if (!pm_states[test_state]) { + printk(warn_bad_state, pm_states[test_state]); goto done; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839d..de1a6bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,6 +267,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -344,7 +346,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +455,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -828,34 +826,74 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +910,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -881,7 +919,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -947,11 +985,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1344,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1416,10 +1450,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1432,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1476,7 +1511,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1608,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1716,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -1802,7 +1847,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1881,11 +1926,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1948,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1958,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1950,7 +1997,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -1959,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2091,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2618,14 +2664,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index adf9862..54e7522 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,12 +28,6 @@ #include <linux/compat.h> -static int ptrace_trapping_sleep_fn(void *flags) -{ - schedule(); - return 0; -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -371,7 +365,7 @@ unlock_creds: out: if (!retval) { wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); proc_ptrace_connector(task, PTRACE_ATTACH); } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda272..ff1a6de 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) void kfree(const void *); +/* + * Reclaim the specified callback, either by invoking it (non-lazy case) + * or freeing it directly (lazy case). Return true if lazy, false otherwise. + */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; @@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); - return 1; + return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); rcu_lock_release(&rcu_callback_map); - return 0; + return false; } } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556..e037f3e 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->seq[idx]); preempt_enable(); return idx; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 625d0b0..1b70cb6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. + * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) + else rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ @@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) @@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * believe that a grace period is in progress, then we must wait * for the one following, which is in "c". Because our request * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. + * need to explicitly start one. We only do the lockless check + * of rnp_root's fields if the current rcu_node structure thinks + * there is no grace period in flight, and because we hold rnp->lock, + * the only possible change is when rnp_root's two fields are + * equal, in which case rnp_root->gpnum might be concurrently + * incremented. But that is OK, as it will just result in our + * doing some extra useless work. */ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; @@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && - system_state == SYSTEM_RUNNING) - udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } @@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) -= count; + ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; return; } rnp_old = rnp; @@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } @@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ ACCESS_ONCE(head->func) = rcu_leak_callback; @@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), local_irq_restore(flags); return; } - ACCESS_ONCE(rdp->qlen)++; + ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) rdp->qlen_lazy++; else @@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * ACCESS_ONCE() to prevent the compiler from speculating * the increment to precede the early-exit check. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ @@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Increment ->n_barrier_done to prevent duplicate work. */ smp_mb(); /* Keep increment after above mechanism. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ @@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = { + "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static const char * const fqs[] = { + "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ static u8 fl_mask = 0x1; int cpustride = 1; int i; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79..71e64c7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,6 +172,14 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ + struct completion boost_completion; + /* Used to ensure that the rt_mutex used */ + /* to carry out the boosting is fully */ + /* released with no future boostee accesses */ + /* before that rt_mutex is re-initialized. */ + struct rt_mutex boost_mtx; + /* Used only for the priority-boosting */ + /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ struct task_struct *boost_kthread_task; @@ -334,11 +342,29 @@ struct rcu_data { struct rcu_head **nocb_tail; atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ atomic_long_t nocb_q_count_lazy; /* (approximate). */ + struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ + struct rcu_head **nocb_follower_tail; + atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ + atomic_long_t nocb_follower_count_lazy; /* (approximate). */ int nocb_p_count; /* # CBs being invoked by kthread */ int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + + /* The following fields are used by the leader, hence own cacheline. */ + struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; + /* CBs waiting for GP. */ + struct rcu_head **nocb_gp_tail; + long nocb_gp_count; + long nocb_gp_count_lazy; + bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + struct rcu_data *nocb_next_follower; + /* Next follower in wakeup chain. */ + + /* The following fields are used by the follower, hence new cachline. */ + struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; + /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); /* Sum up queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; + *ql = atomic_long_read(&rdp->nocb_q_count) + + rdp->nocb_p_count + + atomic_long_read(&rdp->nocb_follower_count) + + rdp->nocb_p_count + rdp->nocb_gp_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + + rdp->nocb_p_count_lazy + + atomic_long_read(&rdp->nocb_follower_count_lazy) + + rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb..00dc411 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -33,6 +33,7 @@ #define RCU_KTHREAD_PRIO 1 #ifdef CONFIG_RCU_BOOST +#include "../locking/rtmutex_common.h" #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO #else #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO @@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; + bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ - if (t->rcu_boost_mutex) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - } + /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (rbmp) - rt_mutex_unlock(rbmp); + if (drop_boost_mutex) { + rt_mutex_unlock(&rnp->boost_mtx); + complete(&rnp->boost_completion); + } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status) static int rcu_boost(struct rcu_node *rnp) { unsigned long flags; - struct rt_mutex mtx; struct task_struct *t; struct list_head *tb; @@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; + rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); + init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); - rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + /* Lock only for side effect: boosts task t's priority. */ + rt_mutex_lock(&rnp->boost_mtx); + rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ + + /* Wait for boostee to be done w/boost_mtx before reinitializing. */ + wait_for_completion(&rnp->boost_completion); return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; @@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg) * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { struct task_struct *t; @@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu) #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu) #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* + * Kick the leader kthread for this NOCB group. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + struct rcu_data *rdp_leader = rdp->nocb_leader; + + if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + return; + if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + /* Prior xchg orders against prior callback enqueue. */ + ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + wake_up(&rdp_leader->nocb_wq); + } +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { - wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + /* ... if queue was empty ... */ + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { @@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { - wake_up_process(t); /* ... or if many callbacks queued. */ + /* ... or if many callbacks queued. */ + wake_nocb_leader(rdp, true); rdp->qlen_last_fqs_check = LONG_MAX / 2; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { @@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) } /* + * Leaders come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_leader_wait(struct rcu_data *my_rdp) +{ + bool firsttime = true; + bool gotcbs; + struct rcu_data *rdp; + struct rcu_head **tail; + +wait_again: + + /* Wait for callbacks to appear. */ + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + wait_event_interruptible(my_rdp->nocb_wq, + ACCESS_ONCE(my_rdp->nocb_leader_wake)); + /* Memory barrier handled by smp_mb() calls below and repoll. */ + } else if (firsttime) { + firsttime = false; /* Don't drown trace log with "Poll"! */ + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + } + + /* + * Each pass through the following loop checks a follower for CBs. + * We are our own first follower. Any CBs found are moved to + * nocb_gp_head, where they await a grace period. + */ + gotcbs = false; + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + if (!rdp->nocb_gp_head) + continue; /* No CBs here, try next follower. */ + + /* Move callbacks to wait-for-GP list, which is empty. */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); + rdp->nocb_gp_count_lazy = + atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + gotcbs = true; + } + + /* + * If there were no callbacks, sleep a bit, rescan after a + * memory barrier, and go retry. + */ + if (unlikely(!gotcbs)) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + + /* Rescan in case we were a victim of memory ordering. */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) + if (ACCESS_ONCE(rdp->nocb_head)) { + /* Found CB, so short-circuit next wait. */ + my_rdp->nocb_leader_wake = true; + break; + } + goto wait_again; + } + + /* Wait for one grace period. */ + rcu_nocb_wait_gp(my_rdp); + + /* + * We left ->nocb_leader_wake set to reduce cache thrashing. + * We clear it now, but recheck for new callbacks while + * traversing our follower list. + */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + + /* Each pass through the following loop wakes a follower, if needed. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + if (ACCESS_ONCE(rdp->nocb_head)) + my_rdp->nocb_leader_wake = true; /* No need to wait. */ + if (!rdp->nocb_gp_head) + continue; /* No CBs, so no need to wake follower. */ + + /* Append callbacks to follower's "done" list. */ + tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + *tail = rdp->nocb_gp_head; + atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); + atomic_long_add(rdp->nocb_gp_count_lazy, + &rdp->nocb_follower_count_lazy); + if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { + /* + * List was empty, wake up the follower. + * Memory barriers supplied by atomic_long_add(). + */ + wake_up(&rdp->nocb_wq); + } + } + + /* If we (the leader) don't have CBs, go wait some more. */ + if (!my_rdp->nocb_follower_head) + goto wait_again; +} + +/* + * Followers come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_follower_wait(struct rcu_data *rdp) +{ + bool firsttime = true; + + for (;;) { + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "FollowerSleep"); + wait_event_interruptible(rdp->nocb_wq, + ACCESS_ONCE(rdp->nocb_follower_head)); + } else if (firsttime) { + /* Don't drown trace log with "Poll"! */ + firsttime = false; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); + } + if (smp_load_acquire(&rdp->nocb_follower_head)) { + /* ^^^ Ensure CB invocation follows _head test. */ + return; + } + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + } +} + +/* * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. + * callbacks queued by the corresponding no-CBs CPU, however, there is + * an optional leader-follower relationship so that the grace-period + * kthreads don't have to do quite so many wakeups. */ static int rcu_nocb_kthread(void *arg) { int c, cl; - bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { - /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); - wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - /* Memory barrier provide by xchg() below. */ - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } - list = ACCESS_ONCE(rdp->nocb_head); - if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); - schedule_timeout_interruptible(1); - flush_signals(current); - continue; - } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); - - /* - * Extract queued callbacks, update counts, and wait - * for a grace period to elapse. - */ - ACCESS_ONCE(rdp->nocb_head) = NULL; - tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - c = atomic_long_xchg(&rdp->nocb_q_count, 0); - cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); - ACCESS_ONCE(rdp->nocb_p_count) += c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - rcu_nocb_wait_gp(rdp); + /* Wait for callbacks. */ + if (rdp->nocb_leader == rdp) + nocb_leader_wait(rdp); + else + nocb_follower_wait(rdp); + + /* Pull the ready-to-invoke callbacks onto local list. */ + list = ACCESS_ONCE(rdp->nocb_follower_head); + BUG_ON(!list); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + c = atomic_long_xchg(&rdp->nocb_follower_count, 0); + cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); + rdp->nocb_p_count += c; + rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_up(&rdp->nocb_wq); + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); } @@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; init_waitqueue_head(&rdp->nocb_wq); + rdp->nocb_follower_tail = &rdp->nocb_follower_head; } -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_leader_stride = -1; +module_param(rcu_nocb_leader_stride, int, 0444); + +/* + * Create a kthread for each RCU flavor for each no-CBs CPU. + * Also initialize leader-follower relationships. + */ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { int cpu; + int ls = rcu_nocb_leader_stride; + int nl = 0; /* Next leader. */ struct rcu_data *rdp; + struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_prev = NULL; struct task_struct *t; if (rcu_nocb_mask == NULL) return; +#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ + if (ls == -1) { + ls = int_sqrt(nr_cpu_ids); + rcu_nocb_leader_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure and + * spawns one rcu_nocb_kthread(). + */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->cpu >= nl) { + /* New leader, set up for followers & next leader. */ + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_leader = rdp; + rdp_leader = rdp; + } else { + /* Another follower, link to previous leader. */ + rdp->nocb_leader = rdp_leader; + rdp_prev->nocb_next_follower = rdp; + } + rdp_prev = rdp; + + /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); @@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { -#ifdef CONFIG_NO_HZ_FULL - int cpu = ACCESS_ONCE(tick_do_timer_cpu); + int __maybe_unused cpu; - if (cpu < 0 || cpu >= nr_cpu_ids) + if (!tick_nohz_full_enabled()) return; - if (raw_smp_processor_id() != cpu) +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + cpu = tick_do_timer_cpu; + if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + if (!is_housekeeping_cpu(raw_smp_processor_id())) + housekeeping_affine(current); +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bc78835..4056d79 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -90,9 +90,6 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY - udelay(10); /* Make preemption more probable. */ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237a..da14b8d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b..1211575 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; rq->clock += delta; update_rq_clock_task(rq, delta); } @@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp; int i; + struct inode *inode; if (cnt > 63) cnt = 63; @@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; cmp = strstrip(buf); + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + mutex_lock(&inode->i_mutex); i = sched_feat_set(cmp); + mutex_unlock(&inode->i_mutex); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p) #endif /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq) { + struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&rq->lock); - if (test_tsk_need_resched(p)) + if (test_tsk_need_resched(curr)) return; - cpu = task_cpu(p); + cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(p); + set_tsk_need_resched(curr); set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(p)) + if (set_nr_and_not_polling(curr)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -623,7 +631,7 @@ void resched_cpu(int cpu) if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; - resched_task(cpu_curr(cpu)); + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu) static bool wake_up_full_nohz_cpu(int cpu) { + /* + * We just need the target to call irq_exit() and re-evaluate + * the next tick. The nohz full kick at least implies that. + * If needed we can still optimize that later with an + * empty IRQ. + */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) - smp_send_reschedule(cpu); + tick_nohz_full_kick_cpu(cpu); return true; } @@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void) #ifdef CONFIG_NO_HZ_FULL bool sched_can_stop_tick(void) { - struct rq *rq; - - rq = this_rq(); - - /* Make sure rq->nr_running update is visible after the IPI */ - smp_rmb(); - - /* More than one running task need preemption */ - if (rq->nr_running > 1) - return false; + /* + * More than one running task need preemption. + * nr_running update is assumed to be visible + * after IPI is sent from wakers. + */ + if (this_rq()->nr_running > 1) + return false; - return true; + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (class == rq->curr->sched_class) break; if (class == p->sched_class) { - resched_task(rq->curr); + resched_curr(rq); break; } } @@ -1568,9 +1579,7 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) - && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -1587,7 +1596,6 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { u64 ns = 0; - if (task_current(rq, p)) { + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && p->on_rq) { update_rq_clock(rq); ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) @@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we race with it leaving cpu, we'll take a lock. So we're correct. * If we race with it entering cpu, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. */ - if (!p->on_cpu) + if (!p->on_cpu || !p->on_rq) return p->se.sum_exec_runtime; #endif @@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); - p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || (p->pi_top_task && - dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + if (!dl_prio(p->normal_prio) || + (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; p->dl.dl_throttled = 0; enqueue_flag = ENQUEUE_REPLENISH; @@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_curr(rq); } out_unlock: task_rq_unlock(rq, p, &flags); @@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_yielded = 0; } +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) { int policy = attr->sched_policy; - if (policy == -1) /* setparam */ + if (policy == SETPARAM_POLICY) policy = p->policy; p->policy = policy; @@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, .sched_nice = PRIO_TO_NICE(p->static_prio), }; - /* - * Fixup the legacy SCHED_RESET_ON_FORK hack - */ - if (policy & SCHED_RESET_ON_FORK) { + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; @@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { - return do_sched_setscheduler(pid, -1, param); + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); } /** @@ -4285,7 +4304,7 @@ again: * fairness. */ if (preempt && rq != p_rq) - resched_task(p_rq->curr); + resched_curr(p_rq); } out_unlock: @@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + } set_domain_attribute(sd, attr); @@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); - resched_task(rq->curr); + resched_curr(rq); } check_class_changed(rq, p, prev_class, old_prio); @@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Prevent race between setting of cfs_rq->runtime_enabled and + * unthrottle_offline_cfs_rqs(). + */ + get_online_cpus(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) } raw_spin_unlock_irq(&cfs_b->lock); - for_each_possible_cpu(i) { + for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; @@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); + put_online_cpus(); return ret; } @@ -8083,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .base_cftypes = cpu_files, + .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c..dd7cbb5 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .base_cftypes = files, + .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1..255ce13 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * the overrunning entity can't interfere with other entity in the system and * can't make them miss their deadlines. Reasons why this kind of overruns * could happen are, typically, a entity voluntarily trying to overcome its - * runtime, or it just underestimated it during sched_setscheduler_ex(). + * runtime, or it just underestimated it during sched_setattr(). */ static void replenish_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -535,7 +535,7 @@ again: if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else - resched_task(rq->curr); + resched_curr(rq); #ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, @@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) - resched_task(curr); + resched_curr(rq); } /* @@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; - resched_task(rq->curr); + resched_curr(rq); } static int pull_dl_task(struct rq *this_rq); @@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1333,7 +1333,7 @@ retry: if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1373,7 +1373,7 @@ retry: set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); - resched_task(later_rq->curr); + resched_curr(later_rq); double_unlock_balance(rq, later_rq); @@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* * Again, we don't know if p has a earlier * or later deadline, so let's blindly set a * (maybe not needed) rescheduling point. */ - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else switched_to_dl(rq, p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d33..bfa3c86 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); @@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { long imb, old_imb; + long orig_src_load, orig_dst_load; + long src_capacity, dst_capacity; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ if (dst_load < src_load) swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; @@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, * The imbalance is above the allowed threshold. * Compare it with the old imbalance. */ + orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + if (orig_dst_load < orig_src_load) swap(orig_dst_load, orig_src_load); - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; /* Would this change make things worse? */ return (imb > old_imb); @@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) imp += group_weight(cur, env->src_nid) - group_weight(cur, env->dst_nid); @@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env, } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { @@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* * In the overloaded case, try and keep the load balanced. */ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; if (cur) { load = task_h_load(cur); @@ -1225,8 +1249,7 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; assign: @@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ if (env.best_cpu == -1) { @@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p) } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an @@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of @@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = max_group_nid; } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } @@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3448,7 +3469,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. */ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } @@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} + static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * there's some valid quota amount */ cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here. + */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } @@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) /* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; @@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; } @@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; @@ -6049,6 +6091,13 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) @@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb..11e7bc4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -79,7 +79,7 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool broadcast; + unsigned int broadcast; /* * Check if the idle task must be rescheduled. If it is the @@ -135,7 +135,7 @@ use_default: goto exit_idle; } - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; /* * Tell the time framework to switch to a broadcast timer @@ -147,8 +147,6 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; - trace_cpu_idle_rcuidle(next_state, dev->cpu); - /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -156,8 +154,6 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); - if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b7..67ad4e7 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) */ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->idle); + resched_curr(rq); } static struct task_struct * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a490831..5f6edca 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; @@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + resched_curr(rq); } } @@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) return; enqueue_top_rt_rq(rt_rq); - resched_task(rq->curr); + resched_curr(rq); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -740,6 +741,9 @@ balanced: rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); + + /* Make rt_rq available for pick_next_task() */ + sched_rt_rq_enqueue(rt_rq); } } @@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq) raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * to try and push current away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } #endif /* CONFIG_SMP */ @@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1690,7 +1694,7 @@ retry: * just reschedule current. */ if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1737,7 +1741,7 @@ retry: activate_task(lowest_rq, next_task, 0); ret = 1; - resched_task(lowest_rq->curr); + resched_curr(lowest_rq); double_unlock_balance(rq, lowest_rq); @@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) return; if (pull_rt_task(rq)) - resched_task(rq->curr); + resched_curr(rq); } void __init init_sched_rt_class(void) @@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * Only reschedule if p is still on the same runqueue. */ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* For UP simply resched on drop of prio */ if (oldprio < p->prio) - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else { /* @@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * then reschedule. */ if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02e..579712f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,6 +477,9 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* Indicate more than one runnable task for any CPU */ + bool overload; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -884,20 +887,10 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct static_key *key) -{ - return static_key_true(key); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct static_key *key) -{ - return static_key_false(key); /* Out of line branch. */ -} - #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ - return static_branch__##enabled(key); \ + return static_key_##enabled(key); \ } #include "features.h" @@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; @@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(rq->cpu)) { - /* Order rq->nr_running write against the IPI */ - smp_wmb(); - smp_send_reschedule(rq->cpu); + /* + * Tick is needed if more than one task runs on a CPU. + * Send the target an IPI to kick it out of nohz mode. + * + * We assume that IPI implies full memory barrier and the + * new value of rq->nr_running is visible on reception + * from the target. + */ + tick_nohz_full_kick_cpu(rq->cpu); } - } #endif + } } static inline void sub_nr_running(struct rq *rq, unsigned count) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20a..15cab1a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function); */ int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { int ret = 0; do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); + ret = (*action)(&q->key); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { do { int ret; @@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(q->key.flags); + ret = action(&q->key); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p) __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); } EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + io_schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait_io); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc2..25b0043 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,15 +18,17 @@ #include <linux/compat.h> #include <linux/sched.h> #include <linux/seccomp.h> +#include <linux/slab.h> +#include <linux/syscalls.h> /* #define SECCOMP_DEBUG 1 */ #ifdef CONFIG_SECCOMP_FILTER #include <asm/syscall.h> #include <linux/filter.h> +#include <linux/pid.h> #include <linux/ptrace.h> #include <linux/security.h> -#include <linux/slab.h> #include <linux/tracehook.h> #include <linux/uaccess.h> @@ -54,7 +56,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - struct sk_filter *prog; + struct bpf_prog *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) * @filter: filter to verify * @flen: length of filter * - * Takes a previously checked filter (by sk_chk_filter) and + * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. @@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(int syscall) { - struct seccomp_filter *f; + struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ - if (WARN_ON(current->seccomp.filter == NULL)) + if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; + /* Make sure cross-thread synced filter points somewhere sane. */ + smp_read_barrier_depends(); + populate_seccomp_data(&sd); /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + for (; f; f = f->prev) { + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } return ret; } +#endif /* CONFIG_SECCOMP_FILTER */ + +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) + return false; + + return true; +} + +static inline void seccomp_assign_mode(struct task_struct *task, + unsigned long seccomp_mode) +{ + BUG_ON(!spin_is_locked(&task->sighand->siglock)); + + task->seccomp.mode = seccomp_mode; + /* + * Make sure TIF_SECCOMP cannot be set before the mode (and + * filter) is set. + */ + smp_mb__before_atomic(); + set_tsk_thread_flag(task, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, + struct seccomp_filter *child) +{ + /* NULL is the root ancestor. */ + if (parent == NULL) + return 1; + for (; child; child = child->prev) + if (child == parent) + return 1; + return 0; +} /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held. + * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. + */ +static inline pid_t seccomp_can_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Validate all threads being eligible for synchronization. */ + caller = current; + for_each_thread(caller, thread) { + pid_t failed; + + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || + (thread->seccomp.mode == SECCOMP_MODE_FILTER && + is_ancestor(thread->seccomp.filter, + caller->seccomp.filter))) + continue; + + /* Return the first thread that cannot be synchronized. */ + failed = task_pid_vnr(thread); + /* If the pid cannot be resolved, then return -ESRCH */ + if (unlikely(WARN_ON(failed == 0))) + failed = -ESRCH; + return failed; + } + + return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Synchronize all threads. */ + caller = current; + for_each_thread(caller, thread) { + /* Skip current, since it needs no changes. */ + if (thread == caller) + continue; + + /* Get a task reference for the new leaf node. */ + get_seccomp_filter(caller); + /* + * Drop the task reference to the shared ancestor since + * current's path will hold a reference. (This also + * allows a put before the assignment.) + */ + put_seccomp_filter(thread); + smp_store_release(&thread->seccomp.filter, + caller->seccomp.filter); + /* + * Opt the other thread into seccomp if needed. + * As threads are considered to be trust-realm + * equivalent (see ptrace_may_access), it is safe to + * allow one thread to transition the other. + */ + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + } + } +} + +/** + * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure. */ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size = fprog->len * sizeof(struct sock_filter); - unsigned long total_insns = fprog->len; + unsigned long fp_size; struct sock_filter *fp; int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) - return -EINVAL; - - for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ - if (total_insns > MAX_INSNS_PER_PATH) - return -ENOMEM; + return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); + fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ - if (!current->no_new_privs && + if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) - return -EACCES; + return ERR_PTR(-EACCES); fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); if (!fp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ ret = -EFAULT; @@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(fp, fprog->len); + ret = bpf_check_classic(fp, fprog->len); if (ret) goto free_prog; @@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto free_prog; - /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ - ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + /* Convert 'sock_filter' insns to 'bpf_insn' insns */ + ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) goto free_prog; @@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(sk_filter_size(new_len), + filter->prog = kzalloc(bpf_prog_size(new_len), GFP_KERNEL|__GFP_NOWARN); if (!filter->prog) goto free_filter; - ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; kfree(fp); @@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - sk_filter_select_runtime(filter->prog); + bpf_prog_select_runtime(filter->prog); - /* - * If there is an existing filter, make it the prev and don't drop its - * task reference. - */ - filter->prev = current->seccomp.filter; - current->seccomp.filter = filter; - return 0; + return filter; free_filter_prog: kfree(filter->prog); @@ -289,19 +418,20 @@ free_filter: kfree(filter); free_prog: kfree(fp); - return ret; + return ERR_PTR(ret); } /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ -static long seccomp_attach_user_filter(char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; - long ret = -EFAULT; + struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (is_compat_task()) { @@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter) #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; - ret = seccomp_attach_filter(&fprog); + filter = seccomp_prepare_filter(&fprog); out: - return ret; + return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags: flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Caller must be holding current->sighand->siglock lock. + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, + struct seccomp_filter *filter) +{ + unsigned long total_insns; + struct seccomp_filter *walker; + + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Validate resulting filter length. */ + total_insns = filter->prog->len; + for (walker = current->seccomp.filter; walker; walker = walker->prev) + total_insns += walker->prog->len + 4; /* 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* If thread sync has been requested, check that it is possible. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + int ret; + + ret = seccomp_can_sync_threads(); + if (ret) + return ret; + } + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + + /* Now that the new filter is in place, synchronize to all threads. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + seccomp_sync_threads(); + + return 0; } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk) atomic_inc(&orig->usage); } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_free(filter->prog); + kfree(filter); + } +} + /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ void put_seccomp_filter(struct task_struct *tsk) { @@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); - kfree(freeme); + seccomp_filter_free(freeme); } } @@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = { int __secure_computing(int this_syscall) { - int mode = current->seccomp.mode; int exit_sig = 0; int *syscall; u32 ret; - switch (mode) { + /* + * Make sure that any changes to mode from another thread have + * been seen after TIF_SECCOMP was seen. + */ + rmb(); + + switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT @@ -473,47 +662,152 @@ long prctl_get_seccomp(void) } /** - * prctl_set_seccomp: configures current->seccomp.mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * seccomp_set_mode_strict: internal function for setting strict seccomp * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters. Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_strict(void) +{ + const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; + long ret = -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + +#ifdef TIF_NOTSC + disable_TSC(); +#endif + seccomp_assign_mode(current, seccomp_mode); + ret = 0; + +out: + spin_unlock_irq(¤t->sighand->siglock); + + return ret; +} + +#ifdef CONFIG_SECCOMP_FILTER +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags: flags to change filter behavior + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes. * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) { + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + struct seccomp_filter *prepared = NULL; long ret = -EINVAL; - if (current->seccomp.mode && - current->seccomp.mode != seccomp_mode) + /* Validate flags. */ + if (flags & ~SECCOMP_FILTER_FLAG_MASK) + return -EINVAL; + + /* Prepare the new filter before holding any locks. */ + prepared = seccomp_prepare_user_filter(filter); + if (IS_ERR(prepared)) + return PTR_ERR(prepared); + + /* + * Make sure we cannot change seccomp or nnp state via TSYNC + * while another thread is in the middle of calling exec. + */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC && + mutex_lock_killable(¤t->signal->cred_guard_mutex)) + goto out_free; + + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + + ret = seccomp_attach_filter(flags, prepared); + if (ret) goto out; + /* Do not free the successfully attached filter. */ + prepared = NULL; + + seccomp_assign_mode(current, seccomp_mode); +out: + spin_unlock_irq(¤t->sighand->siglock); + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + mutex_unlock(¤t->signal->cred_guard_mutex); +out_free: + seccomp_filter_free(prepared); + return ret; +} +#else +static inline long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) +{ + return -EINVAL; +} +#endif + +/* Common entry point for both prctl and syscall. */ +static long do_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs) +{ + switch (op) { + case SECCOMP_SET_MODE_STRICT: + if (flags != 0 || uargs != NULL) + return -EINVAL; + return seccomp_set_mode_strict(); + case SECCOMP_SET_MODE_FILTER: + return seccomp_set_mode_filter(flags, uargs); + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, + const char __user *, uargs) +{ + return do_seccomp(op, flags, uargs); +} + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ + unsigned int op; + char __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: - ret = 0; -#ifdef TIF_NOTSC - disable_TSC(); -#endif + op = SECCOMP_SET_MODE_STRICT; + /* + * Setting strict mode through prctl always ignored filter, + * so make sure it is always NULL here to pass the internal + * check in do_seccomp(). + */ + uargs = NULL; break; -#ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - ret = seccomp_attach_user_filter(filter); - if (ret) - goto out; + op = SECCOMP_SET_MODE_FILTER; + uargs = filter; break; -#endif default: - goto out; + return -EINVAL; } - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -out: - return ret; + /* prctl interface doesn't have flags, so they are always zero. */ + return do_seccomp(op, 0, uargs); } diff --git a/kernel/signal.c b/kernel/signal.c index a4077e9..8f0876f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { + /* + * Disable interrupts early to avoid deadlocks. + * See rcu_read_unlock() comment header for details. + */ local_irq_save(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); @@ -2166,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2237,13 +2240,13 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, info); + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2251,13 +2254,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2307,7 +2310,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2328,7 +2331,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2338,34 +2341,32 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. */ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2375,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. */ clear_restore_sigmask(); - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -2387,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 80c33f8..aff8aa1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,7 @@ * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ +#include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> @@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func(csd->info); csd_unlock(csd); } + + /* + * Handle irq works queued remotely by irq_work_queue_on(). + * Smp functions above are typically synchronous so they + * better run first since some other CPUs may be busy waiting + * for them. + */ + irq_work_run(); } /* @@ -661,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } diff --git a/kernel/sys.c b/kernel/sys.c index 66a751e..ce81291 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b5..391d4dd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); @@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); @@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e2..75875a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd..e4ba9a5 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, {} }; diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70..875f64e 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void) pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); } else { + set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbf..0dbab6d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. */ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513..d626dc9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL config GENERIC_TIME_VSYSCALL_OLD bool -# ktime_t scalar 64bit nsec representation -config KTIME_SCALAR - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413f..7347426 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o @@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += udelay_test.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) + diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fe75444..4aec4a4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) return ret; } - +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502..2e949cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -32,6 +32,7 @@ #include <linux/kthread.h> #include "tick-internal.h" +#include "timekeeping_internal.h" void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow; + cycle_t csnow, wdnow, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data) continue; } - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); cs->cs_last = csnow; cs->wd_last = wdnow; diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c index 3ab2899..1c2fe7d 100644 --- a/kernel/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,6 +54,8 @@ #include <trace/events/timer.h> +#include "timekeeping.h" + /* * The timer bases: * @@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - s32 tai_offset; + ktime_t xtim, mono, boot, tai; + ktime_t off_real, off_boot, off_tai; - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - tai_offset = timekeeping_get_tai_offset(); + mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); + boot = ktime_add(mono, off_boot); + xtim = ktime_add(mono, off_real); + tai = ktime_add(xtim, off_tai); - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = - ktime_add(xtim, ktime_set(tai_offset, 0)); + base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; } /* @@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * too large for inlining: */ #if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - /* Make sure nsec fits into long */ - if (unlikely(nsec > KTIME_SEC_MAX)) - return (ktime_t){ .tv64 = KTIME_MAX }; - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - /* * Divide a ktime value by a nanosecond value */ @@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } +EXPORT_SYMBOL_GPL(ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* @@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. + * * Called with interrupts disabled and base->cpu_base.lock held */ static int hrtimer_reprogram(struct hrtimer *timer, @@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} - static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); } /* @@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { return 0; } @@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, leftmost = enqueue_hrtimer(timer, new_base); - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { + if (!leftmost) { + unlock_hrtimer_base(timer, &flags); + return ret; + } + + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + hrtimer_reprogram(timer, new_base)) { + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ if (wakeup) { /* * We need to drop cpu_base->lock to avoid a @@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } diff --git a/kernel/itimer.c b/kernel/time/itimer.c index 8d262b4..8d262b4 100644 --- a/kernel/itimer.c +++ b/kernel/time/itimer.c diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a..87a346f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { - struct timespec now, next; + struct timespec64 now; + struct timespec next; int fail = 1; /* @@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work) return; } - getnstimeofday(&now); + getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = now; + struct timespec adjust = timespec64_to_timespec(now); fail = -ENODEV; if (persistent_clock_is_local) @@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) static inline void process_adjtimex_modes(struct timex *txc, - struct timespec *ts, + struct timespec64 *ts, s32 *time_tai) { if (txc->modes & ADJ_STATUS) @@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { int result; @@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = ts->tv_sec; + txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4..bbd102a 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b89464..3b89464 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4..42b463a 100644 --- a/kernel/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,8 @@ #include <linux/export.h> #include <linux/hashtable.h> +#include "timekeeping.h" + /* * Management arrays for POSIX timers. Timers are now kept in static hash table * with 512 entries. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b1..c19c1d8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +#include "timekeeping.h" + extern seqlock_t jiffies_lock; #define CS_NAME_LEN 32 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7a..99aa6ee 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static bool can_stop_full_tick(void) @@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { }; /* - * Kick the current CPU if it's full dynticks in order to force it to + * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick(void) +void tick_nohz_full_kick_cpu(int cpu) { - if (tick_nohz_full_cpu(smp_processor_id())) - irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void nohz_full_kick_ipi(void *info) @@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str) int cpu; alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; @@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str) pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -332,9 +338,15 @@ static int tick_nohz_init_all(void) pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + return err; + } err = 0; cpumask_setall(tick_nohz_full_mask); cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + cpumask_clear(housekeeping_mask); + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; diff --git a/kernel/time.c b/kernel/time/time.c index 7c7964c..f0294ba 100644 --- a/kernel/time.c +++ b/kernel/time/time.c @@ -42,6 +42,7 @@ #include <asm/unistd.h> #include "timeconst.h" +#include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some @@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif /* * When we convert to jiffies then we interpret incoming values * the following way: @@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* * Add two timespec values and do a safety check for overflow. diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2..511bdf2 100644 --- a/kernel/timeconst.bc +++ b/kernel/time/timeconst.bc diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6a..f36b028 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,11 +32,34 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) -static struct timekeeper timekeeper; +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned; + static DEFINE_RAW_SPINLOCK(timekeeper_lock); -static seqcount_t timekeeper_seq; static struct timekeeper shadow_timekeeper; +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false; static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { - tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { + tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; tk->xtime_sec++; } } -static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; } -static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; tk_normalize_xtime(tk); } -static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { - struct timespec tmp; + struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ - set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); - WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); tk->wall_to_monotonic = wtm; - set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Verify consistency before modifying */ - WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); - - tk->total_sleep_time = t; - tk->offs_boot = timespec_to_ktime(t); + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /** @@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->clock; - tk->clock = clock; - tk->cycle_last = clock->cycle_last = clock->read(clock); + old_clock = tk->tkr.clock; + tk->tkr.clock = clock; + tk->tkr.read = clock->read; + tk->tkr.mask = clock->mask; + tk->tkr.cycle_last = tk->tkr.read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -133,78 +163,212 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->xtime_nsec >>= -shift_change; + tk->tkr.xtime_nsec >>= -shift_change; else - tk->xtime_nsec <<= shift_change; + tk->tkr.xtime_nsec <<= shift_change; } - tk->shift = clock->shift; + tk->tkr.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->mult = clock->mult; + tk->tkr.mult = clock->mult; + tk->ntp_err_mult = 0; } /* Timekeeper helper functions. */ #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 (*arch_gettimeoffset)(void); - -u32 get_arch_timeoffset(void) -{ - if (likely(arch_gettimeoffset)) - return arch_gettimeoffset(); - return 0; -} +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; #else -static inline u32 get_arch_timeoffset(void) { return 0; } +static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct timekeeper *tk) +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tkr->read(tkr->clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - nsec = cycle_delta * tk->mult + tk->xtime_nsec; - nsec >>= tk->shift; + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + struct clocksource *clock = tk->tkr.clock; + cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tk->tkr.read(clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tk: The timekeeper from which we take the update + * @tkf: The fast timekeeper to update + * @tbase: The time base for the fast timekeeper (mono/raw) + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. The + * update side does: + * + * smp_wmb(); <- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[0], tk); + * smp_wmb(); <- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[1], tk); + * + * The reader side does: + * + * do { + * seq = tkf->seq; + * smp_rmb(); + * idx = seq & 0x01; + * now = now(tkf->base[idx]); + * smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(struct timekeeper *tk) +{ + struct tk_read_base *base = tk_fast_mono.base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[0] */ + memcpy(base, &tk->tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); } +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +u64 notrace ktime_get_mono_fast_ns(void) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount(&tk_fast_mono.seq); + tkr = tk_fast_mono.base + (seq & 0x01); + now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + + } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); + return now; +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ + struct timespec xt; + + xt = timespec64_to_timespec(tk_xtime(tk)); + update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + tk->tkr.cycle_last); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); + tk->tkr.xtime_nsec -= remainder; + tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -217,7 +381,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; int ret; @@ -247,6 +411,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + s64 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec *= NSEC_PER_SEC; + nsec += tk->wall_to_monotonic.tv_nsec; + tk->tkr.base_mono = ns_to_ktime(nsec); + + /* Update the monotonic raw base */ + tk->base_raw = timespec64_to_ktime(tk->raw_time); +} + /* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { @@ -257,8 +444,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk_update_ktime_data(tk); + if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); + + update_fast_timekeeper(tk); } /** @@ -270,49 +462,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + struct clocksource *clock = tk->tkr.clock; + cycle_t cycle_now, delta; s64 nsec; - clock = tk->clock; - cycle_now = clock->read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - tk->cycle_last = clock->cycle_last = cycle_now; + cycle_now = tk->tkr.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + tk->tkr.cycle_last = cycle_now; - tk->xtime_nsec += cycle_delta * tk->mult; + tk->tkr.xtime_nsec += delta * tk->tkr.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; + tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&tk->raw_time, nsec); + nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); + timespec64_add_ns(&tk->raw_time, nsec); } /** - * __getnstimeofday - Returns the time of day in a timespec. + * __getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Updates the time of day in the timespec. * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -int __getnstimeofday(struct timespec *ts) +int __getnstimeofday64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs = 0; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + timespec64_add_ns(ts, nsecs); /* * Do not bail out early, in case there were callers still using @@ -322,116 +513,138 @@ int __getnstimeofday(struct timespec *ts) return -EAGAIN; return 0; } -EXPORT_SYMBOL(__getnstimeofday); +EXPORT_SYMBOL(__getnstimeofday64); /** - * getnstimeofday - Returns the time of day in a timespec. + * getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec (WARN if suspended). */ -void getnstimeofday(struct timespec *ts) +void getnstimeofday64(struct timespec64 *ts) { - WARN_ON(__getnstimeofday(ts)); + WARN_ON(__getnstimeofday64(ts)); } -EXPORT_SYMBOL(getnstimeofday); +EXPORT_SYMBOL(getnstimeofday64); ktime_t ktime_get(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - s64 secs, nsecs; + ktime_t base; + s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); - secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr.base_mono; + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); - /* - * Use ktime_set/ktime_add_ns to create a proper ktime on - * 32-bit architectures without CONFIG_KTIME_SCALAR. - */ - return ktime_add_ns(ktime_set(secs, 0), nsecs); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) { - struct timekeeper *tk = &timekeeper; - struct timespec tomono; - s64 nsec; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; + ktime_t base, *offset = offsets[offs]; + s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr.base_mono, *offset); + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); /** - * timekeeping_clocktai - Returns the TAI time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. + * @offs: which offset to use */ -void timekeeping_clocktai(struct timespec *ts) +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { - struct timekeeper *tk = &timekeeper; + ktime_t *offset = offsets[offs]; unsigned long seq; - u64 nsecs; - - WARN_ON(timekeeping_suspended); + ktime_t tconv; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); - ts->tv_sec = tk->xtime_sec + tk->tai_offset; - nsecs = timekeeping_get_ns(tk); + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); - } while (read_seqcount_retry(&timekeeper_seq, seq)); +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->base_raw; + nsecs = timekeeping_get_ns_raw(tk); -} -EXPORT_SYMBOL(timekeeping_clocktai); + } while (read_seqcount_retry(&tk_core.seq, seq)); + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); /** - * ktime_get_clocktai - Returns the TAI time of day in a ktime + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts: pointer to timespec variable * - * Returns the time of day in a ktime. + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. */ -ktime_t ktime_get_clocktai(void) +void ktime_get_ts64(struct timespec64 *ts) { - struct timespec ts; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; + s64 nsec; + unsigned int seq; + + WARN_ON(timekeeping_suspended); - timekeeping_clocktai(&ts); - return timespec_to_ktime(ts); + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr); + tomono = tk->wall_to_monotonic; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); } -EXPORT_SYMBOL(ktime_get_clocktai); +EXPORT_SYMBOL_GPL(ktime_get_ts64); #ifdef CONFIG_NTP_PPS @@ -446,23 +659,23 @@ EXPORT_SYMBOL(ktime_get_clocktai); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = tk->raw_time; + *ts_raw = timespec64_to_timespec(tk->raw_time); ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(tk); + nsecs_real = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -479,9 +692,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real); */ void do_gettimeofday(struct timeval *tv) { - struct timespec now; + struct timespec64 now; - getnstimeofday(&now); + getnstimeofday64(&now); tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } @@ -495,15 +708,15 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timekeeper *tk = &timekeeper; - struct timespec ts_delta, xt; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt, tmp; unsigned long flags; if (!timespec_valid_strict(tv)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -511,13 +724,14 @@ int do_settimeofday(const struct timespec *tv) ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(tk, tv); + tmp = timespec_to_timespec64(*tv); + tk_set_xtime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -535,33 +749,35 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec tmp; + struct timespec64 ts64, tmp; int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; + ts64 = timespec_to_timespec64(*ts); + raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid_strict(&tmp)) { + tmp = timespec64_add(tk_xtime(tk), ts64); + if (!timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + tk_xtime_add(tk, &ts64); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -578,14 +794,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ s32 timekeeping_get_tai_offset(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; s32 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = tk->tai_offset; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -606,14 +822,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ void timekeeping_set_tai_offset(s32 tai_offset) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __timekeeping_set_tai_offset(tk, tai_offset); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); } @@ -625,14 +841,14 @@ void timekeeping_set_tai_offset(s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* @@ -641,7 +857,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->clock; + old = tk->tkr.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -652,7 +868,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; @@ -667,29 +883,14 @@ static int change_clocksource(void *data) */ int timekeeping_notify(struct clocksource *clock) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - if (tk->clock == clock) + if (tk->tkr.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->clock == clock ? 0 : -1; -} - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); + return tk->tkr.clock == clock ? 0 : -1; } -EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -699,18 +900,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts64; unsigned long seq; s64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); nsecs = timekeeping_get_ns_raw(tk); - *ts = tk->raw_time; + ts64 = tk->raw_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts, nsecs); + timespec64_add_ns(&ts64, nsecs); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL(getrawmonotonic); @@ -719,16 +922,16 @@ EXPORT_SYMBOL(getrawmonotonic); */ int timekeeping_valid_for_hres(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; int ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -738,16 +941,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->max_idle_ns; + ret = tk->tkr.clock->max_idle_ns; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -787,14 +990,15 @@ void __weak read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot, tmp; - - read_persistent_clock(&now); + struct timespec64 now, boot, tmp; + struct timespec ts; - if (!timespec_valid_strict(&now)) { + read_persistent_clock(&ts); + now = timespec_to_timespec64(ts); + if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -802,8 +1006,9 @@ void __init timekeeping_init(void) } else if (now.tv_sec || now.tv_nsec) persistent_clock_exist = true; - read_boot_clock(&boot); - if (!timespec_valid_strict(&boot)) { + read_boot_clock(&ts); + boot = timespec_to_timespec64(ts); + if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -811,7 +1016,7 @@ void __init timekeeping_init(void) } raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); ntp_init(); clock = clocksource_default_clock(); @@ -822,24 +1027,21 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; + tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); - set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - tmp.tv_sec = 0; - tmp.tv_nsec = 0; - tk_set_sleep_time(tk, tmp); - - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; +static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval @@ -849,17 +1051,17 @@ static struct timespec timekeeping_suspend_time; * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - struct timespec *delta) + struct timespec64 *delta) { - if (!timespec_valid_strict(delta)) { + if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); - tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -875,7 +1077,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tmp; unsigned long flags; /* @@ -886,15 +1089,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta) return; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(tk, delta); + tmp = timespec_to_timespec64(*delta); + __timekeeping_inject_sleeptime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -910,20 +1114,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { - struct timekeeper *tk = &timekeeper; - struct clocksource *clock = tk->clock; + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr.clock; unsigned long flags; - struct timespec ts_new, ts_delta; + struct timespec64 ts_new, ts_delta; + struct timespec tmp; cycle_t cycle_now, cycle_delta; bool suspendtime_found = false; - read_persistent_clock(&ts_new); + read_persistent_clock(&tmp); + ts_new = timespec_to_timespec64(tmp); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and @@ -937,15 +1143,16 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = clock->read(clock); + cycle_now = tk->tkr.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > clock->cycle_last) { + cycle_now > tk->tkr.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, + tk->tkr.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -960,10 +1167,10 @@ static void timekeeping_resume(void) } nsec += ((u64) cycle_delta * mult) >> shift; - ts_delta = ns_to_timespec(nsec); + ts_delta = ns_to_timespec64(nsec); suspendtime_found = true; - } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { - ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); suspendtime_found = true; } @@ -971,11 +1178,11 @@ static void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->cycle_last = clock->cycle_last = cycle_now; + tk->tkr.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -988,12 +1195,14 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec delta, delta_delta; - static struct timespec old_delta; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + struct timespec tmp; - read_persistent_clock(&timekeeping_suspend_time); + read_persistent_clock(&tmp); + timekeeping_suspend_time = timespec_to_timespec64(tmp); /* * On some systems the persistent_clock can not be detected at @@ -1004,7 +1213,7 @@ static int timekeeping_suspend(void) persistent_clock_exist = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -1014,8 +1223,8 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec_sub(delta, old_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction @@ -1025,11 +1234,11 @@ static int timekeeping_suspend(void) } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = - timespec_add(timekeeping_suspend_time, delta_delta); + timespec64_add(timekeeping_suspend_time, delta_delta); } timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -1050,125 +1259,34 @@ static int __init timekeeping_init_ops(void) register_syscore_ops(&timekeeping_syscore_ops); return 0; } - device_initcall(timekeeping_init_ops); /* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. + * Apply a multiplier adjustment to the timekeeper */ -static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, - s64 error, s64 *interval, - s64 *offset) +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + bool negative, + int adj_scale) { - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; + s64 interval = tk->cycle_interval; + s32 mult_adj = 1; - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); - tick_error -= tk->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; + if (negative) { + mult_adj = -mult_adj; + interval = -interval; + offset = -offset; } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - s64 error, interval = tk->cycle_interval; - int adj; + mult_adj <<= adj_scale; + interval <<= adj_scale; + offset <<= adj_scale; /* - * The point of this is to check if the error is greater than half - * an interval. - * - * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. - * - * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is - * larger than half an interval. - * - * Note: It does not "save" on aggravation when reading the code. - */ - error = tk->ntp_error >> (tk->ntp_error_shift - 1); - if (error > interval) { - /* - * We now divide error by 4(via shift), which checks if - * the error is greater than twice the interval. - * If it is greater, we need a bigadjust, if its smaller, - * we can adjust by 1. - */ - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } else { - if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else { - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } - } else { - goto out_adjust; - } - } - - if (unlikely(tk->clock->maxadj && - (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_deferred_once(KERN_WARNING - "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->clock->name, (long)tk->mult + adj, - (long)tk->clock->mult + tk->clock->maxadj); - } - /* * So the following can be confusing. * - * To keep things simple, lets assume adj == 1 for now. + * To keep things simple, lets assume mult_adj == 1 for now. * - * When adj != 1, remember that the interval and offset values + * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier @@ -1212,12 +1330,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - tk->mult += adj; + tk->tkr.mult += mult_adj; tk->xtime_interval += interval; - tk->xtime_nsec -= offset; + tk->tkr.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, + s64 offset) +{ + s64 interval = tk->cycle_interval; + s64 xinterval = tk->xtime_interval; + s64 tick_error; + bool negative; + u32 adj; + + /* Remove any current error adj from freq calculation */ + if (tk->ntp_err_mult) + xinterval -= tk->cycle_interval; + + tk->ntp_tick = ntp_tick_length(); + + /* Calculate current error per tick */ + tick_error = ntp_tick_length() >> tk->ntp_error_shift; + tick_error -= (xinterval + tk->xtime_remainder); + + /* Don't worry about correcting it if its small */ + if (likely((tick_error >= 0) && (tick_error <= interval))) + return; + + /* preserve the direction of correction */ + negative = (tick_error < 0); + + /* Sort out the magnitude of the correction */ + tick_error = abs(tick_error); + for (adj = 0; tick_error > interval; adj++) + tick_error >>= 1; + + /* scale the corrections */ + timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. + */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + /* Correct for the current frequency error */ + timekeeping_freqadjust(tk, offset); + + /* Next make a small adjustment to fix any cumulative error */ + if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { + tk->ntp_err_mult = 1; + timekeeping_apply_adjustment(tk, offset, 0, 0); + } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { + /* Undo any existing error adjustment */ + timekeeping_apply_adjustment(tk, offset, 1, 0); + tk->ntp_err_mult = 0; + } + + if (unlikely(tk->tkr.clock->maxadj && + (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr.clock->name, (long)tk->tkr.mult, + (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + } -out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1232,12 +1416,11 @@ out_adjust: * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)tk->xtime_nsec < 0)) { - s64 neg = -(s64)tk->xtime_nsec; - tk->xtime_nsec = 0; + if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr.xtime_nsec; + tk->tkr.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } - } /** @@ -1250,26 +1433,26 @@ out_adjust: */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; unsigned int clock_set = 0; - while (tk->xtime_nsec >= nsecps) { + while (tk->tkr.xtime_nsec >= nsecps) { int leap; - tk->xtime_nsec -= nsecps; + tk->tkr.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { - struct timespec ts; + struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, - timespec_sub(tk->wall_to_monotonic, ts)); + timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); @@ -1301,9 +1484,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->cycle_last += interval; + tk->tkr.cycle_last += interval; - tk->xtime_nsec += tk->xtime_interval << shift; + tk->tkr.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1317,48 +1500,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; -} -#else -#define old_vsyscall_fixup(tk) -#endif - - - /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { - struct clocksource *clock; - struct timekeeper *real_tk = &timekeeper; + struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; @@ -1371,12 +1526,11 @@ void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = real_tk->clock; - #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), + tk->tkr.cycle_last, tk->tkr.mask); #endif /* Check if there's really nothing to do */ @@ -1418,9 +1572,7 @@ void update_wall_time(void) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&timekeeper_seq); - /* Update clock->cycle_last with the new value */ - clock->cycle_last = tk->cycle_last; + write_seqcount_begin(&tk_core.seq); /* * Update the real timekeeper. * @@ -1428,12 +1580,12 @@ void update_wall_time(void) * requires changes to all other timekeeper usage sites as * well, i.e. move the timekeeper pointer getter into the * spinlocked/seqcount protected sections. And we trade this - * memcpy under the timekeeper_seq against one before we start + * memcpy under the tk_core.seq against one before we start * updating. */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, clock_set); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) @@ -1454,83 +1606,16 @@ out: */ void getboottime(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; - struct timespec boottime = { - .tv_sec = tk->wall_to_monotonic.tv_sec + - tk->total_sleep_time.tv_sec, - .tv_nsec = tk->wall_to_monotonic.tv_nsec + - tk->total_sleep_time.tv_nsec - }; - - set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); -} -EXPORT_SYMBOL_GPL(getboottime); - -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts: pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. - * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. - */ -void get_monotonic_boottime(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; - struct timespec tomono, sleep; - s64 nsec; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; - sleep = tk->total_sleep_time; - - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - ts->tv_sec += tomono.tv_sec + sleep.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - -/** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ - struct timespec ts; - - get_monotonic_boottime(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. - * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = timespec_add(*ts, tk->total_sleep_time); + *ts = ktime_to_timespec(t); } -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); +EXPORT_SYMBOL_GPL(getboottime); unsigned long get_seconds(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } @@ -1538,43 +1623,44 @@ EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - return tk_xtime(tk); + return timespec64_to_timespec(tk_xtime(tk)); } struct timespec current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - return now; + return timespec64_to_timespec(now); } EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now, mono; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - return now; + + return timespec64_to_timespec(now); } /* @@ -1587,29 +1673,38 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, - * and sleep offsets. - * @xtim: pointer to timespec to be set with xtime - * @wtom: pointer to timespec to be set with wall_to_monotonic - * @sleep: pointer to timespec to be set with time in suspend + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, - struct timespec *wtom, struct timespec *sleep) +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - unsigned long seq; + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); - *xtim = tk_xtime(tk); - *wtom = tk->wall_to_monotonic; - *sleep = tk->total_sleep_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr.base_mono; + nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } #ifdef CONFIG_HIGH_RES_TIMERS /** - * ktime_get_update_offsets - hrtimer helper + * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset @@ -1617,57 +1712,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - ktime_t now; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - u64 secs, nsecs; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - secs = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + base = tk->tkr.base_mono; + nsecs = timekeeping_get_ns(&tk->tkr); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - now = ktime_add_ns(ktime_set(secs, 0), nsecs); - now = ktime_sub(now, *offs_real); - return now; + return ktime_add_ns(base, nsecs); } #endif /** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ - struct timekeeper *tk = &timekeeper; - unsigned long seq; - struct timespec wtom; - - do { - seq = read_seqcount_begin(&timekeeper_seq); - wtom = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - return timespec_to_ktime(wtom); -} -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - -/** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec ts; + struct timespec64 ts; s32 orig_tai, tai; int ret; @@ -1687,10 +1762,10 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday(&ts); + getnstimeofday64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); @@ -1699,7 +1774,7 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (tai != orig_tai) @@ -1719,11 +1794,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __hardpps(phase_ts, raw_ts); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 0000000..adc1fc9 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,20 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97..f6bd652 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void) } late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec *t) +void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; } diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea..4ea005a 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -3,12 +3,27 @@ /* * timekeeping debug functions */ +#include <linux/clocksource.h> #include <linux/time.h> #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec *t); +extern void tk_debug_account_sleep_time(struct timespec64 *t); #else #define tk_debug_account_sleep_time(x) #endif +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + cycle_t ret = (now - last) & mask; + + return (s64) ret > 0 ? ret : 0; +} +#else +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + return (now - last) & mask; +} +#endif + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/timer.c b/kernel/time/timer.c index 3bb01a3..aca5dfe 100644 --- a/kernel/timer.c +++ b/kernel/time/timer.c @@ -82,6 +82,7 @@ struct tvec_base { unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; + int cpu; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; } base->all_timers++; + + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_set_base(timer, base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); - /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. - */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) - wake_up_nohz_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu) } spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; + base->cpu = cpu; } else { base = per_cpu(tvec_bases, cpu); } diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c new file mode 100644 index 0000000..e622ba3 --- /dev/null +++ b/kernel/time/udelay_test.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + struct timespec ts1, ts2; + int time_passed; + + ktime_get_ts(&ts1); + udelay(usecs); + ktime_get_ts(&ts2); + time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec ts; + + ktime_get_ts(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", + loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511..d600af21 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, s); + *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); VERBOSE_TOROUT_ERRSTRING(f); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d440935..a5da09c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_TRACE_MCOUNT_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2611613..67d6369 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac9d1da..1654b12 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* Quick disabling of function tracer. */ -int function_trace_stop __read_mostly; - /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ @@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE +static struct ftrace_ops *removed_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(ops, enable); - - /* * If the new source is empty, just free dst and assign it * the empty_hash. */ if (!src->count) { - free_ftrace_hash_rcu(*dst); - rcu_assign_pointer(*dst, EMPTY_HASH); - /* still need to update the function records */ - ret = 0; - goto out; + new_hash = EMPTY_HASH; + goto update; } /* @@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; - ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - goto out; + return -ENOMEM; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } +update: + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ret = 0; - out: - /* - * Enable regardless of ret: - * On success, we enable the new hash. - * On failure, we re-enable the original hash. - */ ftrace_hash_rec_enable(ops, enable); - return ret; + return 0; } /* @@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end) return (int)!!ret; } +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + bool keep_regs = false; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(ops, rec->ip, rec)) { + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + keep_regs = true; + break; + } + } + } + + return keep_regs; +} + +static void ftrace_remove_tramp(struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + struct ftrace_func_entry *entry; + + entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); + if (!entry) + return; + + /* + * The tramp_hash entry will be removed at time + * of update. + */ + ops->nr_trampolines--; + rec->flags &= ~FTRACE_FL_TRAMP; +} + +static void ftrace_clear_tramps(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->nr_trampolines) + ftrace_remove_tramp(op, rec); + } while_for_each_ftrace_op(op); +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return; + + /* + * If there's only a single callback registered to a + * function, and the ops has a trampoline registered + * for it, then we can call it directly. + */ + if (ftrace_rec_count(rec) == 1 && ops->trampoline) { + rec->flags |= FTRACE_FL_TRAMP; + ops->nr_trampolines++; + } else { + /* + * If we are adding another function callback + * to this function, and the previous had a + * trampoline used, then we need to go back to + * the default trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* remove trampolines from any ops for this rec */ + ftrace_clear_tramps(rec); + } + /* * If any ops wants regs saved for this function * then all ops will get saved regs. @@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) rec->flags |= FTRACE_FL_REGS; } else { - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) return; rec->flags--; + + if (ops->trampoline && !ftrace_rec_count(rec)) + ftrace_remove_tramp(ops, rec); + + /* + * If the rec had REGS enabled and the ops that is + * being removed had REGS set, then see if there is + * still any ops for this record that wants regs. + * If not, we can stop recording them. + */ + if (ftrace_rec_count(rec) > 0 && + rec->flags & FTRACE_FL_REGS && + ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + if (!test_rec_ops_needs_regs(rec)) + rec->flags &= ~FTRACE_FL_REGS; + } + + /* + * flags will be cleared in ftrace_check_record() + * if rec count is zero. + */ } count++; /* Shortcut, if we handled all records, we are done. */ @@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && ftrace_rec_count(rec)) flag = FTRACE_FL_ENABLED; /* - * If enabling and the REGS flag does not match the REGS_EN, then - * do not ignore this record. Set flags to fail the compare against - * ENABLED. + * If enabling and the REGS flag does not match the REGS_EN, or + * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore + * this record. Set flags to fail the compare against ENABLED. */ - if (flag && - (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) - flag |= FTRACE_FL_REGS; + if (flag) { + if (!(rec->flags & FTRACE_FL_REGS) != + !(rec->flags & FTRACE_FL_REGS_EN)) + flag |= FTRACE_FL_REGS; + + if (!(rec->flags & FTRACE_FL_TRAMP) != + !(rec->flags & FTRACE_FL_TRAMP_EN)) + flag |= FTRACE_FL_TRAMP; + } /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) else rec->flags &= ~FTRACE_FL_REGS_EN; } + if (flag & FTRACE_FL_TRAMP) { + if (rec->flags & FTRACE_FL_TRAMP) + rec->flags |= FTRACE_FL_TRAMP_EN; + else + rec->flags &= ~FTRACE_FL_TRAMP_EN; + } } /* @@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert * from the save regs, to a non-save regs function or - * vice versa. + * vice versa, or from a trampoline call. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; @@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (update) { /* If there's no more users, clear all flags */ - if (!(rec->flags & ~FTRACE_FL_MASK)) + if (!ftrace_rec_count(rec)) rec->flags = 0; else /* Just disable the record (keep REGS state) */ @@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + /* Removed ops need to be tested first */ + if (removed_ops && removed_ops->tramp_hash) { + if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) + return removed_ops; + } + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!op->tramp_hash) + continue; + + if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) + return op; + + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(op, rec->ip, rec)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP) { + ops = ftrace_find_tramp_ops_new(rec); + if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS) return (unsigned long)FTRACE_REGS_ADDR; else @@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_curr(rec); + if (FTRACE_WARN_ON(!ops)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS_EN) return (unsigned long)FTRACE_REGS_ADDR; else @@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command) ftrace_run_stop_machine(command); } +static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int size, bits; + int ret; + + size = ops->nr_trampolines; + bits = 0; + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + ops->tramp_hash = alloc_ftrace_hash(bits); + /* + * TODO: a failed allocation is going to screw up + * the accounting of what needs to be modified + * and not. For now, we kill ftrace if we fail + * to allocate here. But there are ways around this, + * but that will take a little more work. + */ + if (!ops->tramp_hash) + return -ENOMEM; + + do_for_each_ftrace_rec(pg, rec) { + if (ftrace_rec_count(rec) == 1 && + ftrace_ops_test(ops, rec->ip, rec)) { + + /* + * If another ops adds to a rec, the rec will + * lose its trampoline and never get it back + * until all ops are off of it. + */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + continue; + + /* This record had better have a trampoline */ + if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) + return -1; + + ret = add_hash_entry(ops->tramp_hash, rec->ip); + if (ret < 0) + return ret; + } + } while_for_each_ftrace_rec(); + + /* The number of recs in the hash must match nr_trampolines */ + FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + + return 0; +} + +static int ftrace_save_tramp_hashes(void) +{ + struct ftrace_ops *op; + int ret; + + /* + * Now that any trampoline is being used, we need to save the + * hashes for the ops that have them. This allows the mapping + * back from the record to the ops that has the trampoline to + * know what code is being replaced. Modifying code must always + * verify what it is changing. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + + /* The tramp_hash is recreated each time. */ + free_ftrace_hash(op->tramp_hash); + op->tramp_hash = NULL; + + if (op->nr_trampolines) { + ret = ftrace_save_ops_tramp_hash(op); + if (ret) + return ret; + } + + } while_for_each_ftrace_op(op); + + return 0; +} + static void ftrace_run_update_code(int command) { int ret; @@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; - /* - * Do not call function tracer while we update the code. - * We are in stop machine. - */ - function_trace_stop++; /* * By default we use stop_machine() to modify the code. @@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - function_trace_stop--; - ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); + + ret = ftrace_save_tramp_hashes(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; -static int global_start_up; static void control_ops_free(struct ftrace_ops *ops) { @@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) ftrace_hash_rec_disable(ops, 1); - if (!global_start_up) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } + /* + * If the ops uses a trampoline, then it needs to be + * tested first on update. + */ + removed_ops = ops; + ftrace_run_update_code(command); + removed_ops = NULL; + /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. @@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - while (start_pg) { + pg = start_pg; + while (pg) { order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); start_pg = pg->next; @@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) { + if ((iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) || + (iter->flags & FTRACE_ITER_NOTRACE && + ftrace_hash_empty(ops->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v) return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { - seq_printf(m, "#### all functions enabled ####\n"); + if (iter->flags & FTRACE_ITER_NOTRACE) + seq_printf(m, "#### no functions disabled ####\n"); + else + seq_printf(m, "#### all functions enabled ####\n"); return 0; } @@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v) return 0; seq_printf(m, "%ps", (void *)rec->ip); - if (iter->flags & FTRACE_ITER_ENABLED) + if (iter->flags & FTRACE_ITER_ENABLED) { seq_printf(m, " (%ld)%s", - rec->flags & ~FTRACE_FL_MASK, - rec->flags & FTRACE_FL_REGS ? " R" : ""); + ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + struct ftrace_ops *ops; + + ops = ftrace_find_tramp_ops_curr(rec); + if (ops && ops->trampoline) + seq_printf(m, "\ttramp: %pS", + (void *)ops->trampoline); + else + seq_printf(m, "\ttramp: ERROR!"); + } + } + seq_printf(m, "\n"); return 0; @@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) return iter ? 0 : -ENOMEM; } -static void ftrace_filter_reset(struct ftrace_hash *hash) -{ - mutex_lock(&ftrace_lock); - ftrace_hash_clear(hash); - mutex_unlock(&ftrace_lock); -} - /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, hash = ops->filter_hash; if (file->f_mode & FMODE_WRITE) { - iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + iter->hash = alloc_ftrace_hash(size_bits); + else + iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); @@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_filter_reset(iter->hash); - if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, else orig_hash = &ops->notrace_hash; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (reset) + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + else + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { ret = -ENOMEM; goto out_regex_unlock; } - if (reset) - ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { ret = -EINVAL; goto out_regex_unlock; @@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) @@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str) } __setup("ftrace_graph_filter=", set_graph_function); -static void __init set_ftrace_early_graph(char *buf) +static int __init set_graph_notrace_function(char *str) +{ + strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; + unsigned long *table = ftrace_graph_funcs; + int *count = &ftrace_graph_count; + + if (!enable) { + table = ftrace_graph_notrace_funcs; + count = &ftrace_graph_notrace_count; + } while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void) ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) - set_ftrace_early_graph(ftrace_graph_buf); + set_ftrace_early_graph(ftrace_graph_buf, 1); + if (ftrace_graph_notrace_buf[0]) + set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } @@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v) return 0; if (ptr == (unsigned long *)1) { - seq_printf(m, "#### all functions enabled ####\n"); + struct ftrace_graph_data *fgd = m->private; + + if (fgd->table == ftrace_graph_funcs) + seq_printf(m, "#### all functions enabled ####\n"); + else + seq_printf(m, "#### no functions disabled ####\n"); return 0; } @@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - if (function_trace_stop) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip, regs)) { - if (WARN_ON(!op->func)) { - function_trace_stop = 1; - printk("op=%p %pS\n", op, op); + if (FTRACE_WARN_ON(!op->func)) { + pr_warn("op=%p %pS\n", op, op); goto out; } op->func(ip, parent_ip, op, regs); @@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, /* Function graph doesn't use the .func field of global_ops */ global_ops.flags |= FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE + /* Optimize function graph calling (if implemented by arch) */ + if (FTRACE_GRAPH_TRAMP_ADDR != 0) + global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; +#endif + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: @@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void) __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); global_ops.flags &= ~FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE + if (FTRACE_GRAPH_TRAMP_ADDR != 0) + global_ops.trampoline = 0; +#endif unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); @@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } - -void ftrace_graph_stop(void) -{ - ftrace_stop(); -} #endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b95381e..afb04b9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - /* The update must run on the CPU that is being updated. */ - preempt_disable(); - if (cpu == smp_processor_id() || !cpu_online(cpu)) { + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. - */ - preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - preempt_disable(); } - preempt_enable(); } /* wait for all the updates to complete */ @@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - preempt_disable(); - /* The update must run on the CPU that is being updated. */ - if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. - */ - preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - preempt_disable(); } - preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); @@ -3772,7 +3756,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; - if (iter->head >= local_read(&iter->head_page->page->commit)) { + if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 291397e..8a52839 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -820,11 +820,12 @@ static struct { const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local", 1 }, - { trace_clock_global, "global", 1 }, - { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 0 }, - { trace_clock, "perf", 1 }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, ARCH_TRACE_CLOCKS }; @@ -937,30 +938,6 @@ out: return ret; } -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; -} - static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; @@ -3699,6 +3676,7 @@ static const char readme_msg[] = #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT @@ -4238,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; char buf[64]; int r; @@ -4253,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; unsigned long val; int ret; @@ -4269,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + goto out; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + goto out; + } + + ret = cnt; +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -5170,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) #endif /* CONFIG_TRACER_SNAPSHOT */ +static const struct file_operations tracing_thresh_fops = { + .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -6107,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts) if (!topts) return; - for (cnt = 0; topts[cnt].opt; cnt++) { - if (topts[cnt].entry) - debugfs_remove(topts[cnt].entry); - } + for (cnt = 0; topts[cnt].opt; cnt++) + debugfs_remove(topts[cnt].entry); kfree(topts); } @@ -6533,7 +6560,7 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); + &global_trace, &tracing_thresh_fops); trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9258f5a..385391f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -339,6 +339,7 @@ struct tracer_flags { * @reset: called when one switches to another tracer * @start: called when tracing is unpaused (echo 1 > tracing_enabled) * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened * @close: called when the trace file is released @@ -357,6 +358,7 @@ struct tracer { void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); + int (*update_thresh)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 5d12bb4..4b9c114 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return ret; } + /* + * We checked and allowed to create parent, + * allow children without checking. + */ + if (p_event->parent) + return 0; + + /* + * It's ok to check current process (owner) permissions in here, + * because code below is called only via perf_event_open syscall. + */ + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2de53628..ef06ce7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) fmt + #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/kthread.h> @@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->entry = debugfs_create_dir(name, parent); if (!dir->entry) { - pr_warning("Failed to create system directory %s\n", name); + pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } @@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs '%s/filter' entry\n", name); + pr_warn("Could not create debugfs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) - pr_warning("No memory to create event subsystem %s\n", - name); + pr_warn("No memory to create event subsystem %s\n", name); return NULL; } @@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) name = ftrace_event_name(call); file->dir = debugfs_create_dir(name, d_events); if (!file->dir) { - pr_warning("Could not create debugfs '%s' directory\n", - name); + pr_warn("Could not create debugfs '%s' directory\n", name); return -1; } @@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) if (list_empty(head)) { ret = call->class->define_fields(call); if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", name); + pr_warn("Could not initialize trace point events/%s\n", + name); return -1; } } @@ -1621,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); - destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. After finding the call for this @@ -1649,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call) if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) - pr_warn("Could not initialize trace events/%s\n", - name); + pr_warn("Could not initialize trace events/%s\n", name); } return ret; @@ -1749,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_call_preds(call); + free_event_filter(call->filter); + call->filter = NULL; } static int probe_remove_event_call(struct ftrace_event_call *call) @@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(call)); } } @@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(file->event_call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(file->event_call)); } } @@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create early event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create early event %s\n", + ftrace_event_name(call)); } } @@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = debugfs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warning("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create debugfs 'set_event' entry\n"); return -ENOMEM; } d_events = debugfs_create_dir("events", parent); if (!d_events) { - pr_warning("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create debugfs 'events' directory\n"); return -ENOMEM; } @@ -2462,11 +2461,10 @@ static __init int event_trace_init(void) entry = debugfs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); + pr_warn("Could not create debugfs 'available_events' entry\n"); if (trace_define_common_fields()) - pr_warning("tracing: Failed to allocate common fields"); + pr_warn("tracing: Failed to allocate common fields"); ret = early_event_add_tracer(d_tracer, tr); if (ret) @@ -2475,7 +2473,7 @@ static __init int event_trace_init(void) #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) - pr_warning("Failed to register trace events module notifier\n"); + pr_warn("Failed to register trace events module notifier\n"); #endif return 0; } @@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void) * it and the self test should not be on. */ if (file->flags & FTRACE_EVENT_FL_ENABLED) { - pr_warning("Enabled event during self test!\n"); + pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } @@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling system %s\n", - system->name); + pr_warn("error enabling system %s\n", + system->name); continue; } @@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling system %s\n", - system->name); + pr_warn("error disabling system %s\n", + system->name); continue; } @@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling all events\n"); + pr_warn("error enabling all events\n"); return; } @@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void) /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling all events\n"); + pr_warn("error disabling all events\n"); return; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a86319..7a8c152 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void call_filter_disable(struct ftrace_event_call *call) -{ - call->flags &= ~TRACE_EVENT_FL_FILTERED; -} - static void filter_disable(struct ftrace_event_file *file) { struct ftrace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call_filter_disable(call); + call->flags &= ~TRACE_EVENT_FL_FILTERED; else file->flags &= ~FTRACE_EVENT_FL_FILTERED; } @@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -void destroy_call_preds(struct ftrace_event_call *call) -{ - __free_filter(call->filter); - call->filter = NULL; -} - -static void destroy_file_preds(struct ftrace_event_file *file) -{ - __free_filter(file->filter); - file->filter = NULL; -} - -/* - * Called when destroying the ftrace_event_file. - * The file is being freed, so we do not need to worry about - * the file being currently used. This is for module code removing - * the tracepoints from within it. - */ -void destroy_preds(struct ftrace_event_file *file) -{ - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - destroy_call_preds(file->event_call); - else - destroy_file_preds(file); -} - static struct event_filter *__alloc_filter(void) { struct event_filter *filter; @@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file) remove_filter_string(file->filter); } -static void filter_free_subsystem_preds(struct event_subsystem *system, +static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; - __remove_filter(file); } } @@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) } } -static void filter_free_subsystem_filters(struct event_subsystem *system, +static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; __free_subsystem_filter(file); } @@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter, static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, - char *filter_string, bool dry_run) { char *operand1 = NULL, *operand2 = NULL; @@ -1755,13 +1718,12 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct event_subsystem *system, +static int replace_system_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_file *file; - struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; LIST_HEAD(filter_list); @@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system, int err; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; /* * Try to see if the filter can be applied * (filter arg is ignored on dry_run) */ - err = replace_preds(call, NULL, ps, filter_string, true); + err = replace_preds(file->event_call, NULL, ps, true); if (err) event_set_no_set_filter_flag(file); else @@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; - call = file->event_call; - - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; if (event_no_set_filter_flag(file)) @@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) goto fail_mem; - err = replace_preds(call, filter, ps, filter_string, false); + err = replace_preds(file->event_call, filter, ps, false); if (err) { filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); @@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call, err = create_filter_start(filter_str, set_str, &ps, &filter); if (!err) { - err = replace_preds(call, filter, ps, filter_str, false); + err = replace_preds(call, filter, ps, false); if (err && set_str) append_filter_err(ps, filter); } @@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call, * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ -static int create_system_filter(struct event_subsystem *system, +static int create_system_filter(struct ftrace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { @@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, tr, ps, filter_str); + err = replace_system_preds(dir, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, tr); + filter_free_subsystem_preds(dir, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system, tr); + filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, tr, filter_string, &filter); + err = create_system_filter(dir, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4de3e57..f0a0c98 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,6 +15,33 @@ #include "trace.h" #include "trace_output.h" +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. + */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + /* When set, irq functions will be ignored */ static int ftrace_graph_skip_irqs; @@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long long calltime; int index; + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + if (!current->ret_stack) return -EBUSY; @@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) { if (tracing_thresh) return 1; @@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } -void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) @@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static int graph_trace_update_thresh(struct trace_array *tr) +{ + graph_trace_reset(tr); + return graph_trace_init(tr); +} + static int max_bytes_for_cpu; static enum print_line_t @@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) seq_printf(s, " | | | |\n"); } -void print_graph_headers(struct seq_file *s) +static void print_graph_headers(struct seq_file *s) { print_graph_headers_flags(s, tracer_flags.val); } @@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = { static struct tracer graph_trace __tracer_data = { .name = "function_graph", + .update_thresh = graph_trace_update_thresh, .open = graph_trace_open, .pipe_open = graph_trace_open, .close = graph_trace_close, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f3dad80..c6977d5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -int trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - int ret; - - ret = seq_write(m, s->buffer, len); - - /* - * Only reset this buffer if we successfully wrote to the - * seq_file buffer. - */ - if (!ret) - trace_seq_init(s); - - return ret; -} - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (s->full || !len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_printf); - -/** - * trace_seq_bitmask - put a list of longs as a bitmask print output - * @s: trace sequence descriptor - * @maskp: points to an array of unsigned longs that represent a bitmask - * @nmaskbits: The number of bits that are valid in @maskp - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * Writes a ASCII representation of a bitmask string into @s. - */ -int -trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, - int nmaskbits) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_bitmask); - -/** - * trace_seq_vprintf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = vsnprintf(s->buffer + s->len, len, fmt, args); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} -EXPORT_SYMBOL_GPL(trace_seq_vprintf); - -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. - */ -int trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -int trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - s->buffer[s->len++] = c; - - return 1; -} -EXPORT_SYMBOL(trace_seq_putc); - -int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) -{ - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - int i, j; - - if (s->full) - return 0; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -void *trace_seq_reserve(struct trace_seq *s, size_t len) -{ - void *ret; - - if (s->full) - return NULL; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return NULL; - } - - ret = s->buffer + s->len; - s->len += len; - - return ret; -} - -int trace_seq_path(struct trace_seq *s, const struct path *path) -{ - unsigned char *p; - - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - s->full = 1; - return 0; -} - const char * ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, @@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); @@ -430,7 +162,7 @@ const char * ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size) { - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); trace_seq_putc(p, 0); @@ -443,7 +175,7 @@ const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 127a9d8..80b25b5 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); extern int __unregister_ftrace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - #define SEQ_PUT_FIELD_RET(s, x) \ do { \ if (!trace_seq_putmem(s, &(x), sizeof(x))) \ @@ -46,7 +43,6 @@ do { \ #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ - BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ return TRACE_TYPE_PARTIAL_LINE; \ } while (0) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 0000000..1f24ed9 --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,428 @@ +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + * + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) + +/* How much buffer is written? */ +#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + unsigned int len = TRACE_SEQ_BUF_USED(s); + int ret; + + ret = seq_write(m, s->buffer, len); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. This lets the caller try again or + * do something else with the contents. + */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + va_list ap; + int ret; + + if (s->full || !len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. + * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + unsigned int len = TRACE_SEQ_BUF_LEFT(s); + int ret; + + if (s->full || !len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + unsigned int len = strlen(str); + + if (s->full) + return 0; + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple charater + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return 0; + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + s->buffer[s->len++] = c; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ + if (s->full) + return 0; + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +#define MAX_MEMHEX_BYTES 8U +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) +{ + unsigned char hex[HEX_CHARS]; + const unsigned char *data = mem; + unsigned int start_len; + int i, j; + int cnt = 0; + + if (s->full) + return 0; + + while (len) { + start_len = min(len, HEX_CHARS - 1); +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < start_len; i++) { +#else + for (i = start_len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + if (WARN_ON_ONCE(j == 0 || j/2 > len)) + break; + + /* j increments twice per loop */ + len -= j / 2; + hex[j++] = ' '; + + cnt += trace_seq_putmem(s, hex, j); + } + return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. + * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ + unsigned char *p; + + if (s->full) + return 0; + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + s->full = 1; + return 0; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the squence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), which ever comes first. + * + * On success, it returns a positive number of the number of bytes + * it copied. + * + * On failure it returns -EBUSY if all of the content in the + * sequence has been already read, which includes nothing in the + * sequenc (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. + */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ + int len; + int ret; + + if (!cnt) + return 0; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret == cnt) + return -EFAULT; + + cnt -= ret; + + s->readpos += cnt; + return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3c9b97e..33ff6a2 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1..975cb49 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - struct timespec uptime, ts; cputime_t utime, stime, utimescaled, stimescaled; - u64 ac_etime; + u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime = delta; + /* Convert to seconds for btime */ + do_div(delta, USEC_PER_SEC); + stats->ac_btime = get_seconds() - delta; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc0256..aa312b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, diff --git a/kernel/utsname.c b/kernel/utsname.c index fd39312..883aaaa 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd..a8d6914 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 35974ac..5dbe22a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -265,7 +265,6 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ @@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; - /* - * nr_idle and idle_list may disagree if idle rebinding is in - * progress. Never return %true if idle_list is empty. - */ - if (list_empty(&pool->idle_list)) - return false; - return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) return NULL; /* @@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set - * @wakeup: wakeup an idle worker if necessary * - * Set @flags in @worker->flags and adjust nr_running accordingly. If - * nr_running becomes zero and @wakeup is %true, an idle worker is - * woken up. + * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: * spin_lock_irq(pool->lock) */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags, - bool wakeup) +static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); - /* - * If transitioning into NOT_RUNNING, adjust nr_running and - * wake up an idle worker as necessary if requested by - * @wakeup. - */ + /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - if (wakeup) { - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - wake_up_worker(pool); - } else - atomic_dec(&pool->nr_running); + atomic_dec(&pool->nr_running); } worker->flags |= flags; @@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_delayed_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); + pwq_dec_nr_in_flight(pwq, get_work_color(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker) (worker->hentry.next || worker->hentry.pprev))) return; - /* can't use worker_set_flags(), also called from start_worker() */ + /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; @@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -static struct worker *alloc_worker(void) +static struct worker *alloc_worker(int node) { struct worker *worker; - worker = kzalloc(sizeof(*worker), GFP_KERNEL); + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); @@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker, detach_completion = pool->detach_completion; mutex_unlock(&pool->attach_mutex); + /* clear leftover flags without pool->lock after it is detached */ + worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); + if (detach_completion) complete(detach_completion); } @@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker, * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is attached to @pool. The new worker must be - * started by start_worker(). + * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (id < 0) goto fail; - worker = alloc_worker(); + worker = alloc_worker(pool->node); if (!worker) goto fail; @@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); + /* start the newly created worker */ + spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); + spin_unlock_irq(&pool->lock); + return worker; fail: @@ -1734,44 +1722,6 @@ fail: } /** - * start_worker - start a newly created worker - * @worker: worker to start - * - * Make the pool aware of @worker and start it. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void start_worker(struct worker *worker) -{ - worker->pool->nr_workers++; - worker_enter_idle(worker); - wake_up_process(worker->task); -} - -/** - * create_and_start_worker - create and start a worker for a pool - * @pool: the target pool - * - * Grab the managership of @pool and create and start a new worker for it. - * - * Return: 0 on success. A negative error code otherwise. - */ -static int create_and_start_worker(struct worker_pool *pool) -{ - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - } - - return worker ? 0 : -ENOMEM; -} - -/** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * @@ -1909,23 +1859,10 @@ restart: mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); - start_worker(worker); - if (WARN_ON_ONCE(need_to_create_worker(pool))) - goto restart; - return true; - } - - if (!need_to_create_worker(pool)) + if (create_worker(pool) || !need_to_create_worker(pool)) break; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CREATE_COOLDOWN); + schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; @@ -1933,6 +1870,11 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); + /* + * This is necessary even after a new worker was just successfully + * created as @pool->lock was dropped and the new worker might have + * already become busy. + */ if (need_to_create_worker(pool)) goto restart; return true; @@ -2020,13 +1962,8 @@ __acquires(&pool->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif - /* - * Ensure we're on the correct CPU. DISASSOCIATED test is - * necessary to avoid spurious warnings from rescuers servicing the - * unbound or a disassociated pool. - */ - WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && - !(pool->flags & POOL_DISASSOCIATED) && + /* ensure we're on the correct CPU */ + WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* @@ -2052,17 +1989,22 @@ __acquires(&pool->lock) list_del_init(&work->entry); /* - * CPU intensive works don't participate in concurrency - * management. They're the scheduler's responsibility. + * CPU intensive works don't participate in concurrency management. + * They're the scheduler's responsibility. This takes @worker out + * of concurrency management and the next code block will chain + * execution of the pending work items. */ if (unlikely(cpu_intensive)) - worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Unbound pool isn't concurrency managed and work items should be - * executed ASAP. Wake up another worker if necessary. + * Wake up another worker if necessary. The condition is always + * false for normal per-cpu workers since nr_running would always + * be >= 1 at this point. This is used to chain execution of the + * pending work items for WORKER_NOT_RUNNING workers such as the + * UNBOUND and CPU_INTENSIVE ones. */ - if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); /* @@ -2218,7 +2160,7 @@ recheck: } } while (keep_working(pool)); - worker_set_flags(worker, WORKER_PREP, false); + worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to @@ -2311,29 +2253,27 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); - spin_unlock_irq(&pool->lock); - - worker_detach_from_pool(rescuer, pool); - - spin_lock_irq(&pool->lock); /* * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're holding its lock. + * go away while we're still attached to it. */ put_pwq(pwq); /* - * Leave this pool. If keep_working() is %true, notify a + * Leave this pool. If need_more_worker() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); rescuer->pool = NULL; - spin_unlock(&pool->lock); - spin_lock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&wq_mayday_lock); } spin_unlock_irq(&wq_mayday_lock); @@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool) return; /* sanity checks */ - if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; @@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; - goto out_unlock; + return pool; } } @@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); -out_unlock: + return pool; fail: if (pool) @@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work) if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; - /* - * Unlink @pwq. Synchronization against wq->mutex isn't strictly - * necessary on release but do it anyway. It's easier to verify - * and consistent with the linking path. - */ mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); @@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq) if (!list_empty(&pwq->pwqs_node)) return; - /* - * Set the matching work_color. This is synchronized with - * wq->mutex to avoid confusing flush_workqueue(). - */ + /* set the matching work_color */ pwq->work_color = wq->work_color; /* sync max_active to the current setting */ @@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; - pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); + pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pwq_tbl || !new_attrs || !tmp_attrs) @@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* allocate wq and format name */ if (flags & WQ_UNBOUND) - tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); + tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) @@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - rescuer = alloc_worker(); + rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) goto err_destroy; @@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); @@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; @@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) return NOTIFY_BAD; } break; @@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_pool(pool, pi) { mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) { - spin_lock_irq(&pool->lock); - pool->flags &= ~POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); - + if (pool->cpu == cpu) rebind_workers(pool); - } else if (pool->cpu < 0) { + else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - } mutex_unlock(&pool->attach_mutex); } @@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void) cpumask_var_t *tbl; int node, cpu; - /* determine NUMA pwq table len - highest node id + 1 */ - for_each_node(node) - wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); - if (num_possible_nodes() <= 1) return; @@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); BUG_ON(!tbl); for_each_node(node) @@ -4936,7 +4858,7 @@ static int __init init_workqueues(void) for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(create_and_start_worker(pool) < 0); + BUG_ON(!create_worker(pool)); } } |