diff options
Diffstat (limited to 'kernel')
85 files changed, 2337 insertions, 2142 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index d15c0ee..addf773 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426..ab94d30 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u64 array_size; - u32 elem_size; + u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa..7949e8b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -955,7 +956,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -974,7 +975,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05..3905d4b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1..5bb5e49 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41..1712d31 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfea..5cb783f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d459357..13551e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; + /* The stack spill tracking logic in check_stack_write() + * and check_stack_read() relies on stack accesses being + * aligned. + */ + strict = true; break; default: break; @@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +/* truncate register to smaller size (in bytes) + * must be called with size < BPF_REG_SIZE + */ +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +{ + u64 mask; + + /* clear high bits in bit representation */ + reg->var_off = tnum_cast(reg->var_off, size); + + /* fix arithmetic bounds */ + mask = ((u64)1 << (size * 8)) - 1; + if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { + reg->umin_value &= mask; + reg->umax_value &= mask; + } else { + reg->umin_value = 0; + reg->umax_value = mask; + } + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - regs[value_regno].var_off = - tnum_cast(regs[value_regno].var_off, size); - __update_reg_bounds(®s[value_regno]); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); + return -EACCES; } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; @@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void coerce_reg_to_32(struct bpf_reg_state *reg) -{ - /* clear high 32 bits */ - reg->var_off = tnum_cast(reg->var_off, 4); - /* Update bounds */ - __update_reg_bounds(reg); -} - static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", + reg_type_str[type], val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", + reg_type_str[type], reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + reg_type_str[type]); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", + smin, reg_type_str[type]); + return false; + } + + return true; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - if (!env->allow_ptr_leaks) - verbose(env, - "R%d 32-bit pointer arithmetic prohibited\n", - dst); + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); return -EACCES; } @@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; + if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + return -EINVAL; + switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow @@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d tried to subtract pointer from scalar\n", - dst); + verbose(env, "R%d tried to subtract pointer from scalar\n", + dst); return -EACCES; } /* We don't allow subtraction from FP, because (according to @@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. */ if (ptr_reg->type == PTR_TO_STACK) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d subtraction from stack pointer prohibited\n", - dst); + verbose(env, "R%d subtraction from stack pointer prohibited\n", + dst); return -EACCES; } if (known && (ptr_reg->off - smin_val == @@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_AND: case BPF_OR: case BPF_XOR: - /* bitwise ops on pointers are troublesome, prohibit for now. - * (However, in principle we could allow some cases, e.g. - * ptr &= ~3 which would reduce min_value by 3.) - */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d bitwise operator %s on pointer prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + /* bitwise ops on pointers are troublesome, prohibit. */ + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; } + if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + return -EINVAL; + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; } +/* WARNING: This function does calculations on 64-bit values, but the actual + * execution may occur on 32-bit values. Therefore, things like bitshifts + * need extra checks in the 32-bit case. + */ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, @@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->64 */ - coerce_reg_to_32(dst_reg); - coerce_reg_to_32(&src_reg); - } smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + + if (!src_known && + opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + __mark_reg_unknown(dst_reg); + return 0; + } + switch (opcode) { case BPF_ADD: if (signed_add_overflows(dst_reg->smin_value, smin_val) || @@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->smin_value < 0) { - if (umin_val) { - /* Sign bit will be cleared */ - dst_reg->smin_value = 0; - } else { - /* Lost sign bit information */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - } else { - dst_reg->smin_value = - (u64)(dst_reg->smin_value) >> umax_val; - } + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; if (src_known) dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); @@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; } + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->32 */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; @@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); - int rc; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields - * an arbitrary scalar. + * an arbitrary scalar. Disallow all math except + * pointer subtraction */ - if (!env->allow_ptr_leaks) { - verbose(env, "R%d pointer %s pointer prohibited\n", - insn->dst_reg, - bpf_alu_string[opcode >> 4]); - return -EACCES; + if (opcode == BPF_SUB){ + mark_reg_unknown(env, regs, insn->dst_reg); + return 0; } - mark_reg_unknown(env, regs, insn->dst_reg); - return 0; + verbose(env, "R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; } else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range */ - rc = adjust_ptr_min_max_vals(env, insn, - src_reg, dst_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* scalar += unknown scalar */ - __mark_reg_unknown(&off_reg); - return adjust_scalar_min_max_vals( - env, insn, - dst_reg, off_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ - rc = adjust_ptr_min_max_vals(env, insn, - dst_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += scalar */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, *src_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); } } else { /* Pretend the src is a reg with a known value, since we only @@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, off_reg.type = SCALAR_VALUE; __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - if (ptr_reg) { /* pointer += K */ - rc = adjust_ptr_min_max_vals(env, insn, - ptr_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += K */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, off_reg); - } - return rc; - } + if (ptr_reg) /* pointer += K */ + return adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); } /* Got here implies adding two SCALAR_VALUEs */ @@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(env, regs, insn->dst_reg); - /* high 32 bits are known zero. */ - regs[insn->dst_reg].var_off = tnum_cast( - regs[insn->dst_reg].var_off, 4); - __update_reg_bounds(®s[insn->dst_reg]); + coerce_reg_to_size(®s[insn->dst_reg], 4); } } else { /* case: R = imm * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - __mark_reg_known(regs + insn->dst_reg, insn->imm); + if (BPF_CLASS(insn->code) == BPF_ALU64) { + __mark_reg_known(regs + insn->dst_reg, + insn->imm); + } else { + __mark_reg_known(regs + insn->dst_reg, + (u32)insn->imm); + } } } else if (opcode > BPF_END) { @@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; @@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); } else { - /* if we knew anything about the old value, we're not - * equal, because we can't know anything about the - * scalar value of the pointer in the new value. + /* We're trying to use a pointer in place of a scalar. + * Even if the scalar was unbounded, this could lead to + * pointer leaks because scalars are allowed to leak + * while pointers are not. We could make this safe in + * special cases if root is calling us, but it's + * probably not worth the hassle. */ - return rold->umin_value == 0 && - rold->umax_value == U64_MAX && - rold->smin_value == S64_MIN && - rold->smax_value == S64_MAX && - tnum_is_unknown(rold->var_off); + return false; } case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and @@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; @@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085d..a2c05d2 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe1..7e4c445 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && @@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8..9caeda6 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465..1e111dd 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config new file mode 100644 index 0000000..81ff078 --- /dev/null +++ b/kernel/configs/nopm.config @@ -0,0 +1,15 @@ +CONFIG_PM=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n + +# Triggers PM on OMAP +CONFIG_CPU_IDLE=n + +# Triggers enablement via hibernate callbacks +CONFIG_XEN=n + +# ARM/ARM64 architectures that select PM unconditionally +CONFIG_ARCH_OMAP2PLUS_TYPICAL=n +CONFIG_ARCH_RENESAS=n +CONFIG_ARCH_TEGRA=n +CONFIG_ARCH_VEXPRESS=n diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3..53f7dc6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); -static void inline cpuhp_lock_acquire(bool bringup) +static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } -static void inline cpuhp_lock_release(bool bringup) +static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else -static void inline cpuhp_lock_acquire(bool bringup) { } -static void inline cpuhp_lock_release(bool bringup) { } +static inline void cpuhp_lock_acquire(bool bringup) { } +static inline void cpuhp_lock_release(bool bringup) { } #endif @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b366389..4f63597 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c334..e2764d7 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1b2be63..772a43f 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -179,21 +179,6 @@ put_callchain_entry(int rctx) } struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs) -{ - bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; - /* Disallow cross-task user callchains. */ - bool crosstask = event->ctx->task && event->ctx->task != current; - const u32 max_stack = event->attr.sample_max_stack; - - if (!kernel && !user) - return NULL; - - return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); -} - -struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b69..d99fe3f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have @@ -4268,8 +4273,7 @@ again: struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); + list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. @@ -4284,6 +4288,11 @@ again: } mutex_unlock(&event->child_mutex); + list_for_each_entry_safe(child, tmp, &free_list, child_list) { + list_del(&child->child_list); + free_event(child); + } + no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; @@ -5815,19 +5824,11 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); + int size = 1; - __output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } + size += data->callchain->nr; + size *= sizeof(u64); + __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { @@ -5980,6 +5981,26 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; + +static struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) +{ + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; + struct perf_callchain_entry *callchain; + + if (!kernel && !user) + return &__empty_callchain; + + callchain = get_perf_callchain(regs, 0, kernel, user, + max_stack, crosstask, true); + return callchain ?: &__empty_callchain; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6002,9 +6023,7 @@ void perf_prepare_sample(struct perf_event_header *header, int size = 1; data->callchain = perf_callchain(event, regs); - - if (data->callchain) - size += data->callchain->nr; + size += data->callchain->nr; header->size += size * sizeof(u64); } @@ -8516,6 +8535,29 @@ fail_clear_files: return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -8532,8 +8574,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); @@ -9168,7 +9209,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. + */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. @@ -10703,6 +10750,19 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + + if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && + !child_ctx->task_ctx_data) { + struct pmu *pmu = child_event->pmu; + + child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, + GFP_KERNEL); + if (!child_ctx->task_ctx_data) { + free_event(child_event); + return NULL; + } + } + /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against @@ -10713,6 +10773,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); + /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09b1537..6dc725a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -/* Callchain handling */ -extern struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs); - static inline int get_recursion_context(int *recursion) { int rctx; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 267f6ef..ce6848e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = 0; - smp_wmb(); /* pairs with get_xol_area() */ - mm->uprobes_state.xol_area = area; + /* pairs with get_xol_area() */ + smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: up_write(&mm->mmap_sem); @@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void) if (!mm->uprobes_state.xol_area) __create_xol_area(0); - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } @@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void) struct xol_area *area; unsigned long trampoline_vaddr = -1; - area = current->mm->uprobes_state.xol_area; - smp_read_barrier_depends(); + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a..995453d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,12 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} +EXPORT_SYMBOL(abort); diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf..2295fc6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); diff --git a/kernel/futex.c b/kernel/futex.c index 57d0b36..7f719d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past @@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), + * + * or: * - * We have to replace the newowner TID in the user space variable. + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ diff --git a/kernel/groups.c b/kernel/groups.c index e357bc8..daae2f2 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 89e3558..6fc87cc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool -config IRQ_DOMAIN_DEBUG - bool "Expose hardware/virtual IRQ mapping via debugfs" - depends on IRQ_DOMAIN && DEBUG_FS - help - This option will show the mapping relationship between hardware irq - numbers and Linux irq numbers. The mapping is exposed via debugfs - in the file "irq_domain_mapping". - - If you don't know what this means you don't need it. - # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12d351..a37a3b4 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_present_cpumask(void) +static cpumask_var_t *alloc_node_to_possible_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_present_cpumask(cpumask_var_t *masks) +static void free_node_to_possible_cpumask(cpumask_var_t *masks) { int node; @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_present_cpumask(cpumask_var_t *masks) +static void build_node_to_possible_cpumask(cpumask_var_t *masks) { int cpu; - for_each_present_cpu(cpu) + for_each_possible_cpu(cpu) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_present_cpumask[n])) { + if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk, *node_to_present_cpumask; + cpumask_var_t nmsk, *node_to_possible_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; - node_to_present_cpumask = alloc_node_to_present_cpumask(); - if (!node_to_present_cpumask) + node_to_possible_cpumask = alloc_node_to_possible_cpumask(); + if (!node_to_possible_cpumask) goto out; /* Fill out vectors at the beginning that don't need affinity */ @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_present_cpumask(node_to_present_cpumask); - nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + build_node_to_possible_cpumask(node_to_possible_cpumask); + nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_present_cpumask[n]); + node_to_possible_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -192,7 +192,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_present_cpumask(node_to_present_cpumask); + free_node_to_possible_cpumask(node_to_possible_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity return 0; get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; + ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; put_online_cpus(); return ret; } diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef..e4d3819 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -12,6 +12,11 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); + + if (!__ratelimit(&ratelimit)) + return; + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac..acfaaef 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb..508c03d 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca..ab19371 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f600..e6a9c36 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_find_mapping); -#ifdef CONFIG_IRQ_DOMAIN_DEBUG -static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) -{ - struct irq_domain *domain; - struct irq_data *data; - - domain = desc->irq_data.domain; - data = &desc->irq_data; - - while (domain) { - unsigned int irq = data->irq; - unsigned long hwirq = data->hwirq; - struct irq_chip *chip; - bool direct; - - if (data == &desc->irq_data) - seq_printf(m, "%5d ", irq); - else - seq_printf(m, "%5d+ ", irq); - seq_printf(m, "0x%05lx ", hwirq); - - chip = irq_data_get_irq_chip(data); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", domain->name); -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - domain = domain->parent; - data = data->parent_data; -#else - domain = NULL; -#endif - } -} - -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - struct irq_domain *domain; - struct radix_tree_iter iter; - void __rcu **slot; - int i; - - seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", - "name", "mapped", "linear-max", "direct-max", "devtree-node"); - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - struct device_node *of_node; - const char *name; - - int count = 0; - - of_node = irq_domain_get_of_node(domain); - if (of_node) - name = of_node_full_name(of_node); - else if (is_fwnode_irqchip(domain->fwnode)) - name = container_of(domain->fwnode, struct irqchip_fwid, - fwnode)->name; - else - name = ""; - - radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) - count++; - seq_printf(m, "%c%-16s %6u %10u %10u %s\n", - domain == irq_default_domain ? '*' : ' ', domain->name, - domain->revmap_size + count, domain->revmap_size, - domain->revmap_direct_max_irq, - name); - } - mutex_unlock(&irq_domain_mutex); - - seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", - "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "active", "type", "domain"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - virq_debug_show_one(m, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @@ -1693,7 +1575,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1584,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1598,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 0ba0dd8..5187dfe 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu; + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + unsigned int bit; + best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit; + cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online) + if (!cm->online || cm->available <= maxavl) continue; + best_cpu = cpu; + maxavl = cm->available; + } + + if (maxavl) { + cm = per_cpu_ptr(m->maps, best_cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit < m->alloc_end) { cm->allocated++; @@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, m->global_available--; if (reserved) m->global_reserved--; - *mapped_cpu = cpu; - trace_irq_matrix_alloc(bit, cpu, m, cm); + *mapped_cpu = best_cpu; + trace_irq_matrix_alloc(bit, best_cpu, m, cm); return bit; } } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index edb987b..2f3c4f5 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } +/* + * Carefully check whether the device can use reservation mode. If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and render the device unusable. If the entry can be masked then the core + * logic will prevent the spurious interrupt and reservation mode can be + * used. For now reservation mode is restricted to PCI/MSI. + */ +static bool msi_check_reservation_mode(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + struct msi_desc *desc; + + if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + return false; + + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) + return false; + + if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + return false; + + /* + * Checking the first MSI descriptor is sufficient. MSIX supports + * masking and MSI does so when the maskbit is set. + */ + desc = first_msi_entry(dev); + return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - msi_alloc_info_t arg; + struct irq_data *irq_data; struct msi_desc *desc; + msi_alloc_info_t arg; int i, ret, virq; + bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); + can_reserve = msi_check_reservation_mode(domain, info, dev); + for_each_msi_entry(desc, dev) { virq = desc->irq; if (desc->nvec_used == 1) @@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * the MSI entries before the PCI layer enables MSI in the * card. Otherwise the card latches a random msi message. */ - if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { - struct irq_data *irq_data; + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + continue; + irq_data = irq_domain_get_irq_data(domain, desc->irq); + if (!can_reserve) + irqd_clr_can_reserve(irq_data); + ret = irq_domain_activate_irq(irq_data, can_reserve); + if (ret) + goto cleanup; + } + + /* + * If these interrupts use reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (can_reserve) { + for_each_msi_entry(desc, dev) { irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); - if (ret) - goto cleanup; - if (info->flags & MSI_FLAG_MUST_REACTIVATE) - irqd_clr_activated(irq_data); + irqd_clr_activated(irq_data); } } return 0; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 40e9d73..6b7cdf1 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work) */ flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - nflags = flags | IRQ_WORK_FLAGS; + nflags = flags | IRQ_WORK_CLAIMED; oflags = cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24..b451709 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33fa..7594c03 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 670d8d7..89b5f83 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -49,6 +49,7 @@ #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/nmi.h> #include <asm/sections.h> @@ -57,10 +58,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include <linux/slab.h> -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -664,18 +648,12 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; - bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -688,24 +666,11 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself. If the lock is in the per cpu area, - * the canonical address of the lock (per cpu offset removed) is - * used. + * If it is not initialised then it has never been locked, + * so it won't be present in the hash table. */ - if (unlikely(!lock->key)) { - unsigned long can_addr, addr = (unsigned long)lock; - - if (__is_kernel_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (__is_module_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (static_obj(lock)) - lock->key = (void *)lock; - else - return ERR_PTR(-EINVAL); - is_static = true; - } + if (unlikely(!lock->key)) + return NULL; /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -737,20 +702,36 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); + return NULL; } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif +/* + * Static locks do not have their class-keys yet - for them the key is + * the lock object itself. If the lock is in the per cpu area, the + * canonical address of the lock (per cpu offset removed) is used. + */ +static bool assign_lock_key(struct lockdep_map *lock) +{ + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else { + /* Debug-check: all keys must be persistent! */ + debug_locks_off(); + pr_err("INFO: trying to register non-static key.\n"); + pr_err("the code is fine but needs lockdep annotation.\n"); + pr_err("turning off the locking correctness validator.\n"); + dump_stack(); + return false; + } + + return true; +} /* * Register a lock's class in the hash-table, if the class is not present @@ -767,18 +748,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(!IS_ERR_OR_NULL(class))) + if (likely(class)) goto out_set_class_cache; - /* - * Debug-check: all keys must be persistent! - */ - if (IS_ERR(class)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); + if (!lock->key) { + if (!assign_lock_key(lock)) + return NULL; + } else if (!static_obj(lock->key)) { return NULL; } @@ -1151,41 +1127,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1168,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1198,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1802,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1967,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3236,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3344,7 +3277,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock, int read); +static int __lock_is_held(const struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3362,7 +3295,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3343,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3431,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3563,13 +3486,14 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; } -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +static int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; + const struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3580,7 +3504,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (IS_ERR_OR_NULL(class)) + if (!class) return 0; /* @@ -3745,19 +3669,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -3813,7 +3729,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 1; } -static int __lock_is_held(struct lockdep_map *lock, int read) +static int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -4027,7 +3943,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(struct lockdep_map *lock, int read) +int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -4384,7 +4300,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (!IS_ERR_OR_NULL(class)) + if (class) zap_class(class); } /* @@ -4580,6 +4496,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; + touch_nmi_watchdog(); } while_each_thread(g, p); pr_warn("\n"); @@ -4675,495 +4592,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - if (current->xhlocks) - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f24582d..6850ffd 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -77,10 +77,6 @@ struct lock_stress_stats { long n_lock_acquired; }; -int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); - /* Forward reference. */ static void lock_torture_cleanup(void); @@ -130,10 +126,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_lock_busted_write_unlock(void) @@ -179,10 +173,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) @@ -352,10 +344,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 5); else mdelay(longdelay_ms / 5); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_mutex_unlock(void) __releases(torture_mutex) @@ -507,10 +497,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) @@ -547,10 +535,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 10); else mdelay(longdelay_ms / 10); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_write(void) __releases(torture_rwsem) @@ -570,14 +556,12 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_read(void) __releases(torture_rwsem) @@ -715,8 +699,7 @@ static void __torture_print_stats(char *page, { bool fail = 0; int i, n_stress; - long max = 0; - long min = statp[0].n_lock_acquired; + long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; @@ -823,7 +806,7 @@ static void lock_torture_cleanup(void) * such, only perform the underlying torture-specific cleanups, * and avoid anything related to locktorture. */ - if (!cxt.lwsa) + if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { @@ -879,7 +862,7 @@ static int __init lock_torture_init(void) &percpu_rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ @@ -898,6 +881,13 @@ static int __init lock_torture_init(void) firsterr = -EINVAL; goto unwind; } + + if (nwriters_stress == 0 && nreaders_stress == 0) { + pr_alert("lock-torture: must run at least one locking thread\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cxt.cur_ops->init) cxt.cur_ops->init(); @@ -921,17 +911,19 @@ static int __init lock_torture_init(void) #endif /* Initialize the statistics so that each run gets its own numbers. */ + if (nwriters_stress) { + lock_is_write_held = 0; + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } - lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); - if (cxt.lwsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < cxt.nrealwriters_stress; i++) { - cxt.lwsa[i].n_lock_fail = 0; - cxt.lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; + } } if (cxt.cur_ops->readlock) { @@ -948,19 +940,21 @@ static int __init lock_torture_init(void) cxt.nrealreaders_stress = cxt.nrealwriters_stress; } - lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); - if (cxt.lrsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); - firsterr = -ENOMEM; - kfree(cxt.lwsa); - cxt.lwsa = NULL; - goto unwind; - } - - for (i = 0; i < cxt.nrealreaders_stress; i++) { - cxt.lrsa[i].n_lock_fail = 0; - cxt.lrsa[i].n_lock_acquired = 0; + if (nreaders_stress) { + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + cxt.lwsa = NULL; + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } } } @@ -990,12 +984,14 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), - GFP_KERNEL); - if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nwriters_stress) { + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } if (cxt.cur_ops->readlock) { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 294294c..38ece03 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) * @tail : The new queue tail code word * Return: The previous queue tail code word * - * xchg(lock, tail) + * xchg(lock, tail), which heads an address dependency * * p,*,* -> n,*,* ; prev = xchg(lock, node) */ @@ -409,13 +409,11 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); /* - * The above xchg_tail() is also a load of @lock which generates, - * through decode_tail(), a pointer. - * - * The address dependency matches the RELEASE of xchg_tail() - * such that the access to @prev must happen after. + * The above xchg_tail() is also a load of @lock which + * generates, through decode_tail(), a pointer. The address + * dependency matches the RELEASE of xchg_tail() such that + * the subsequent access to @prev happens after. */ - smp_read_barrier_depends(); WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6..65cc0cb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98c..68686b3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a75..936f3d1 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ diff --git a/kernel/module.c b/kernel/module.c index dea01ac..09e48ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) } #endif /* CONFIG_LIVEPATCH */ +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } + check_modinfo_retpoline(mod, info); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); pr_warn("%s: module is from the staging directory, the quality " diff --git a/kernel/pid.c b/kernel/pid.c index b13b624..5d30c87 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,7 +41,19 @@ #include <linux/sched/task.h> #include <linux/idr.h> -struct pid init_struct_pid = INIT_STRUCT_PID; +struct pid init_struct_pid = { + .count = ATOMIC_INIT(1), + .tasks = { + { .first = NULL }, + { .first = NULL }, + { .first = NULL }, + }, + .level = 0, + .numbers = { { + .nr = 0, + .ns = &init_pid_ns, + }, } +}; int pid_max = PID_MAX_DEFAULT; @@ -193,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) { - disable_pid_allocation(ns); + if (pid_ns_prepare_proc(ns)) goto out_free; - } } get_pid_ns(ns); @@ -226,6 +236,10 @@ out_free: while (++i <= ns->level) idr_remove(&ns->idr, (pid->numbers + i)->nr); + /* On failure to allocate the first pid, reset the state */ + if (ns->pid_allocated == PIDNS_ADDING) + idr_set_cursor(&ns->idr, 0); + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); diff --git a/kernel/power/main.c b/kernel/power/main.c index 3a2ca90..705c236 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex); #ifdef CONFIG_PM_SLEEP +void lock_system_sleep(void) +{ + current->flags |= PF_FREEZER_SKIP; + mutex_lock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(lock_system_sleep); + +void unlock_system_sleep(void) +{ + /* + * Don't use freezer_count() because we don't want the call to + * try_to_freeze() here. + * + * Reason: + * Fundamentally, we just don't need it, because freezing condition + * doesn't come into effect until we release the pm_mutex lock, + * since the freezer always works with pm_mutex held. + * + * More importantly, in the case of hibernation, + * unlock_system_sleep() gets called in snapshot_read() and + * snapshot_write() when the freezing condition is still in effect. + * Which means, if we use try_to_freeze() here, it would make them + * enter the refrigerator, thus causing hibernation to lockup. + */ + current->flags &= ~PF_FREEZER_SKIP; + mutex_unlock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(unlock_system_sleep); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bce0464..3d37c27 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, - * minus mapped file pages. + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages. */ static unsigned long minimum_image_size(unsigned long saveable) { @@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) - + global_node_page_state(NR_INACTIVE_FILE) - - global_node_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_INACTIVE_FILE); return saveable <= size ? 0 : saveable - size; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 293ead5..11b4282 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb) static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", @@ -879,7 +879,7 @@ out_clean: * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages, unsigned int flags) +static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; @@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags) return error; } if (flags & SF_NOCOMPRESS_MODE) { - if (!enough_swap(pages, flags)) { + if (!enough_swap(pages)) { pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471d..6334f2c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -30,31 +30,8 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. - * - * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH - * that counts the number of process-based reasons why RCU cannot - * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE - * is the value used to increment or decrement this field. - * - * The rest of the bits could in principle be used to count interrupts, - * but this would mean that a negative-one value in the interrupt - * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. - * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK - * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. - * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon - * initial exit from idle. - */ -#define DYNTICK_TASK_NEST_WIDTH 7 -#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) -#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) -#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) -#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) -#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ - DYNTICK_TASK_FLAG) +/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 1f87a02..d1ebdf9 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -106,10 +106,6 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -static int perf_runnable = IS_ENABLED(MODULE); -module_param(perf_runnable, int, 0444); -MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); - /* * Operations vector for selecting different types of tests. */ @@ -646,7 +642,7 @@ rcu_perf_init(void) &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + if (!torture_init_begin(perf_type, verbose)) return -EBUSY; /* Process args and tell the world that the perf'er is on the job. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74f6b01..308e6fd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -static int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); - #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ @@ -315,11 +311,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp) } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif + !(torture_random(rrsp) % (nrealreaders * 500))) + torture_preempt_schedule(); /* QS only if preemptible. */ } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -1731,7 +1725,7 @@ rcu_torture_init(void) &sched_ops, &tasks_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d58800..d5cea81 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irq_rcu_node(p) \ + spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + /* * Initialize SRCU combining tree. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_irq_rcu_node(snp); + spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - raw_spin_unlock_irq_rcu_node(snp); + spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); @@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp) if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); } } @@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL @@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - raw_spin_lock_rcu_node(sdp); + spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the raw_spin_lock_irq_rcu_node() + * period. This pairs with the spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2..491bdf3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -265,25 +265,12 @@ void rcu_bh_qs(void) #endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; /* - * There's a few places, currently just in the tracing infrastructure, - * that uses rcu_irq_enter() to make sure RCU is watching. But there's - * a small location where that will not even work. In those cases - * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() - * can be called. - */ -static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); - -bool rcu_irq_enter_disabled(void) -{ - return this_cpu_read(disable_rcu_irq_enter); -} - -/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. */ @@ -762,68 +749,39 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * rcu_eqs_enter_common - current CPU is entering an extended quiescent state + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. * - * Enter idle, doing appropriate accounting. The caller must have - * disabled interrupts. + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter_common(bool user) +static void rcu_eqs_enter(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp; - lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ + rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting != 1) { + rdtp->dynticks_nesting--; + return; } + + lockdep_assert_irqs_disabled(); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ - rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - __this_cpu_dec(disable_rcu_irq_enter); + WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); - - /* - * It is illegal to enter an extended quiescent state while - * in an RCU read-side critical section. - */ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); -} - -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - */ -static void rcu_eqs_enter(bool user) -{ - struct rcu_dynticks *rdtp; - - rdtp = this_cpu_ptr(&rcu_dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rcu_eqs_enter_common(user); - else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } /** @@ -834,10 +792,6 @@ static void rcu_eqs_enter(bool user) * critical sections can occur in irq handlers in idle, a possibility * handled by irq_enter() and irq_exit().) * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - * * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -867,6 +821,46 @@ void rcu_user_enter(void) #endif /* CONFIG_NO_HZ_FULL */ /** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + rcu_dynticks_eqs_enter(); +} + +/** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * * Exit from an interrupt handler, which might possibly result in entering @@ -875,8 +869,8 @@ void rcu_user_enter(void) * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -887,23 +881,14 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 1); - if (rdtp->dynticks_nesting <= 1) { - rcu_eqs_enter_common(true); - } else { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); - rdtp->dynticks_nesting--; - } + if (rdtp->dynticks_nmi_nesting == 1) + rcu_prepare_for_idle(); + rcu_nmi_exit(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_enter(); } /* @@ -922,55 +907,33 @@ void rcu_irq_exit_irqson(void) } /* - * rcu_eqs_exit_common - current CPU moving away from extended quiescent state - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_exit_common(long long oldval, int user) -{ - RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) - - rcu_dynticks_task_exit(); - rcu_dynticks_eqs_exit(); - rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), - oldval, rdtp->dynticks_nesting); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. */ static void rcu_eqs_exit(bool user) { struct rcu_dynticks *rdtp; - long long oldval; + long oldval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - } else { - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(oldval, user); - __this_cpu_dec(disable_rcu_irq_enter); + if (oldval) { + rdtp->dynticks_nesting++; + return; } + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + rcu_cleanup_after_idle(); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + WRITE_ONCE(rdtp->dynticks_nesting, 1); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** @@ -979,11 +942,6 @@ static void rcu_eqs_exit(bool user) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - * * If you add or remove a call to rcu_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -1013,65 +971,6 @@ void rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter(void) -{ - struct rcu_dynticks *rdtp; - long long oldval; - - lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_exit_common(oldval, true); -} - -/* - * Wrapper for rcu_irq_enter() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_enter(); - local_irq_restore(flags); -} - -/** * rcu_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and @@ -1086,7 +985,7 @@ void rcu_irq_enter_irqson(void) void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; + long incby = 2; /* Complain about underflow. */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); @@ -1103,45 +1002,61 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } - rdtp->dynticks_nmi_nesting += incby; + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); barrier(); } /** - * rcu_nmi_exit - inform RCU of exit from NMI context + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to rcu_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +void rcu_irq_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + lockdep_assert_irqs_disabled(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_exit(); + rcu_nmi_enter(); + if (rdtp->dynticks_nmi_nesting == 1) + rcu_cleanup_after_idle(); +} - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdtp->dynticks_nmi_nesting != 1) { - rdtp->dynticks_nmi_nesting -= 2; - return; - } +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - rdtp->dynticks_nmi_nesting = 0; - rcu_dynticks_eqs_enter(); + local_irq_save(flags); + rcu_irq_enter(); + local_irq_restore(flags); } /** @@ -1233,7 +1148,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; } /* @@ -2789,6 +2705,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; + + /* + * The following usually indicates a double call_rcu(). To track + * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. + */ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); @@ -3723,7 +3644,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; @@ -3752,7 +3673,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 46a5d19..6488a3b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -38,9 +38,8 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3..fb88a02 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -1687,7 +1686,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - have_rcu_nocb_mask = true; cpulist_parse(str, rcu_nocb_mask); return 1; } @@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { - if (have_rcu_nocb_mask) + if (cpumask_available(rcu_nocb_mask)) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } @@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); return; } - have_rcu_nocb_mask = true; } - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; #if defined(CONFIG_NO_HZ_FULL) @@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec4..0926aef 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f3..3da7a24 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * - * Pairs with the smp_store_release() in finish_lock_switch(). + * Pairs with the smp_store_release() in finish_task(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. @@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); @@ -2571,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2591,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); + prepare_task(next); prepare_arch_switch(next); } @@ -2646,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * the scheduled task must drop that reference. * * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev + * finish_task), otherwise a concurrent wakeup can get prev * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ @@ -2663,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) * to use. */ smp_mb__after_unlock_lock(); - finish_lock_switch(rq, prev); + finish_task(prev); + finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); @@ -4040,8 +4086,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; /* @@ -4108,6 +4153,9 @@ recheck: } if (user) { + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + retval = security_task_setscheduler(p); if (retval) return retval; @@ -4163,7 +4211,8 @@ change: } #endif #ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; /* @@ -4293,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) } EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -5097,17 +5151,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5187,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0..dd062a1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -60,7 +60,8 @@ struct sugov_cpu { u64 last_update; /* The fields below are only needed when sharing a policy. */ - unsigned long util; + unsigned long util_cfs; + unsigned long util_dl; unsigned long max; unsigned int flags; @@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long cfs_max; + struct rq *rq = cpu_rq(sg_cpu->cpu); - cfs_max = arch_scale_cpu_capacity(NULL, cpu); + sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->util_cfs = cpu_util_cfs(rq); + sg_cpu->util_dl = cpu_util_dl(rq); +} - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ + /* + * Ideally we would like to set util_dl as min/guaranteed freq and + * util_cfs + util_dl as requested freq. However, cpufreq is not yet + * ready for such an interface. So, we only do the latter for now. + */ + return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) { - if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -244,7 +252,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; @@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max, sg_cpu->cpu); + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) s64 delta_ns; /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). + * If the CFS CPU utilization was last updated before the + * previous frequency update and the time elapsed between the + * last update of the CPU utilization and the last frequency + * update is long enough, reset iowait_boost and util_cfs, as + * they are now probably stale. However, still consider the + * CPU contribution if it has some DEADLINE utilization + * (util_dl). */ delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - continue; + j_sg_cpu->util_cfs = 0; + if (j_sg_cpu->util_dl == 0) + continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; - j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; + j_util = sugov_aggregate_util(j_sg_cpu); if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max, sg_cpu->cpu); - raw_spin_lock(&sg_policy->update_lock); - sg_cpu->util = util; - sg_cpu->max = max; + sugov_get_util(sg_cpu); sg_cpu->flags = flags; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_RT) next_f = sg_policy->policy->cpuinfo.max_freq; else next_f = sugov_next_freq_shared(sg_cpu, time); @@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For RT and deadline tasks, the schedutil governor shoots the - * frequency to maximum. Special care must be taken to ensure that this - * kthread doesn't result in the same behavior. + * For RT tasks, the schedutil governor shoots the frequency to maximum. + * Special care must be taken to ensure that this kthread doesn't result + * in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is * updated only at the end of the sugov_work() function and before that @@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) return PTR_ERR(thread); } - ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + ret = sched_setattr_nocheck(thread, &attr); if (ret) { kthread_stop(thread); - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); return ret; } @@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736..9bb0e0c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) #endif static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_running_bw(dl_se->dl_bw, dl_rq); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; + BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + if (task_on_rq_queued(p)) return; rq = task_rq(p); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); - add_rq_bw(new_bw, &rq->dl); + __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); } /* @@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; + if (dl_entity_is_special(dl_se)) + return; + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); @@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p) */ if (zerolag_time < 0) { if (dl_task(p)) - sub_running_bw(dl_se->dl_bw, dl_rq); + sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); @@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) return; if (flags & ENQUEUE_MIGRATED) - add_rq_bw(dl_se->dl_bw, dl_rq); + add_rq_bw(dl_se, dl_rq); if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; @@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * when the "inactive timer" fired). * So, add it back. */ - add_running_bw(dl_se->dl_bw, dl_rq); + add_running_bw(dl_se, dl_rq); } } @@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec; + u64 delta_exec, scaled_delta_exec; + int cpu = cpu_of(rq); if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_DL); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); - dl_se->runtime -= delta_exec; + if (dl_entity_is_special(dl_se)) + return; + + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. + * + * For the others, we still need to scale reservation parameters + * according to current frequency and CPU maximum capacity. + */ + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { + scaled_delta_exec = grub_reclaim(delta_exec, + rq, + &curr->dl); + } else { + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + scaled_delta_exec = cap_scale(delta_exec, scale_freq); + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + + dl_se->runtime -= scaled_delta_exec; throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; + + /* If requested, inform the user about runtime overruns. */ + if (dl_runtime_exceeded(dl_se) && + (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) + dl_se->dl_overrun = 1; + __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD && dl_se->dl_non_contending) { - sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); - sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); + sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - sub_running_bw(dl_se->dl_bw, &rq->dl); + sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: task_rq_unlock(rq, p, &rf); @@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(p->dl.dl_bw, &rq->dl); - add_running_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); + add_running_bw(&p->dl, &rq->dl); } /* @@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) __dequeue_task_dl(rq, p, flags); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(p->dl.dl_bw, &rq->dl); - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); } /* @@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p) */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -2019,11 +2080,11 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(next_task->dl.dl_bw, &rq->dl); - sub_rq_bw(next_task->dl.dl_bw, &rq->dl); + sub_running_bw(&next_task->dl, &rq->dl); + sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); - add_running_bw(next_task->dl.dl_bw, &later_rq->dl); + add_rq_bw(&next_task->dl, &later_rq->dl); + add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(p->dl.dl_bw, &src_rq->dl); - sub_rq_bw(p->dl.dl_bw, &src_rq->dl); + sub_running_bw(&p->dl, &src_rq->dl); + sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(p->dl.dl_bw, &this_rq->dl); - add_running_bw(p->dl.dl_bw, &this_rq->dl); + add_rq_bw(&p->dl, &this_rq->dl); + add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) task_non_contending(p); if (!task_on_rq_queued(p)) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); /* * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { - add_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); return; } @@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return 0; + /* !deadline task may carry old deadline bandwidth */ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; @@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + /* special dl tasks don't actually use any parameter */ + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return true; + /* deadline != 0 */ if (attr->sched_deadline == 0) return false; @@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa8..7b65359 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. + * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization @@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_freq = arch_scale_freq_capacity(cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta += sa->period_contrib; @@ -4365,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -5689,8 +5685,8 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work @@ -5701,7 +5697,13 @@ static bool wake_affine_idle(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { - if (idle_cpu(this_cpu)) + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + */ + if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return true; if (sync && cpu_rq(this_cpu)->nr_running == 1) @@ -5765,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return affine; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); } /* @@ -5950,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(cpu_rq(i)); - if (load < min_load || (load == min_load && i == this_cpu)) { + if (load < min_load) { min_load = load; least_loaded_cpu = i; } @@ -6247,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); @@ -6255,7 +6257,7 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p) { return p->se.avg.util_avg; } @@ -6264,7 +6266,7 @@ static inline int task_util(struct task_struct *p) * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { unsigned long util, capacity; @@ -6449,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p) } #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6492,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr, se); + gran = wakeup_gran(se); if (vdiff > gran) return 1; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd79087..9bcbacb 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19..862a513 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); @@ -2206,7 +2212,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a2..2e95505 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + /* * Tells if entity @a should preempt entity @b. */ static inline bool dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { - return dl_time_before(a->deadline, b->deadline); + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); } /* @@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - /* * wake flags */ @@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_freq_capacity static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); sched_avg_update(rq); } #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif @@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks. */ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { @@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} + +#endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 2f5e87f..24d243e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -665,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched_rcu_qs(); + cond_resched(); return; } local_irq_enable(); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e776fc8..f6b5f19 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -95,6 +95,7 @@ config NO_HZ_FULL select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK + select CPU_ISOLATION help Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d325208..ae0c8a4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -60,6 +60,15 @@ #include "tick-internal.h" /* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + +/* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index @@ -70,7 +79,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, } }; @@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .seq = SEQCNT_ZERO(migration_cpu_base), .clock_base = { { .cpu_base = &migration_cpu_base, }, }, }; @@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { -#ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires; - if (!new_base->cpu_base->hres_active) - return 0; - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires <= new_base->cpu_base->expires_next; -#else - return 0; -#endif + return expires < new_base->cpu_base->expires_next; } -#ifdef CONFIG_NO_HZ_COMMON -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return base; - return &per_cpu(hrtimer_bases, get_nohz_timer_target()); -} -#else static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif return base; } -#endif /* * We switch the timer base to a power-optimized selected CPU target, @@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer) +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else + static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif @@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid, trace_hrtimer_init(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer) +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { - debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); + debug_hrtimer_activate(timer, mode); + trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) @@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *timer) +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { -#ifdef CONFIG_HIGH_RES_TIMERS - cpu_base->next_timer = timer; -#endif + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; } -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + unsigned int active, + ktime_t expires_next) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; - ktime_t expires, expires_next = KTIME_MAX; + struct hrtimer_clock_base *base; + ktime_t expires; - hrtimer_update_next_timer(cpu_base, NULL); - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; - if (!(active & 0x01)) - continue; - next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; - hrtimer_update_next_timer(cpu_base, timer); + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; } } /* @@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) expires_next = 0; return expires_next; } -#endif + +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but + * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE_ALL, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. + */ +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +{ + unsigned int active; + struct hrtimer *next_timer = NULL; + ktime_t expires_next = KTIME_MAX; + + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } + + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + } + + return expires_next; +} static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { @@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(&base->clock_was_set_seq, + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); -} -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? - */ -static bool hrtimer_hres_enabled __read_mostly = true; -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; -EXPORT_SYMBOL_GPL(hrtimer_resolution); - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - return (kstrtobool(str, &hrtimer_hres_enabled) == 0); -} - -__setup("highres=", setup_hrtimer_hres); + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; + return now; } /* @@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void) */ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return cpu_base->hres_active; + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; } static inline int hrtimer_hres_active(void) @@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - if (!cpu_base->hres_active) - return; + /* + * Find the current next expiration time. + */ + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - expires_next = __hrtimer_get_next_event(cpu_base); + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { + /* + * When the softirq is activated, hrtimer has to be + * programmed with the first hard hrtimer because soft + * timer interrupt could occur too late. + */ + if (cpu_base->softirq_activated) + expires_next = __hrtimer_get_next_event(cpu_base, + HRTIMER_ACTIVE_HARD); + else + cpu_base->softirq_expires_next = expires_next; + } if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next = expires_next; /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following @@ -581,81 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * set. So we'd effectivly block all timers until the T2 event * fires. */ - if (cpu_base->hang_detected) + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(cpu_base->expires_next, 1); } +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. - * - * Called with interrupts disabled and base->cpu_base.lock held + * High resolution timer enabled ? */ -static void hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); - - /* - * If the timer is not on the current cpu, we cannot reprogram - * the other cpus clock event device. - */ - if (base->cpu_base != cpu_base) - return; - - /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. - */ - if (cpu_base->in_hrtirq) - return; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Set it to 0. - */ - if (expires < 0) - expires = 0; - - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; - - /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (cpu_base->hang_detected) - return; +static bool hrtimer_hres_enabled __read_mostly = true; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); - /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. - */ - cpu_base->expires_next = expires; - tick_program_event(expires, 1); +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } +__setup("highres=", setup_hrtimer_hres); + /* - * Initialize the high resolution related parts of cpu_base + * hrtimer_high_res_enabled - query, if the highres mode is enabled */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +static inline int hrtimer_is_hres_enabled(void) { - base->expires_next = KTIME_MAX; - base->hres_active = 0; + return hrtimer_hres_enabled; } /* @@ -667,7 +686,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!base->hres_active) + if (!__hrtimer_hres_active(base)) return; raw_spin_lock(&base->lock); @@ -714,23 +733,102 @@ void clock_was_set_delayed(void) #else -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } -static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + + /* + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. + */ + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; + + if (expires >= cpu_base->expires_next) + return; + + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; + cpu_base->expires_next = expires; + + /* + * If hres is not active, hardware does not have to be + * programmed yet. + * + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + /* + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. + */ + tick_program_event(expires, 1); +} + +/* * Clock realtime was set * * Change the offset of the realtime clock vs. the monotonic @@ -835,9 +933,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { - debug_activate(timer); + debug_activate(timer, mode); base->cpu_base->active_bases |= 1 << base->index; @@ -870,7 +969,6 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); -#ifdef CONFIG_HIGH_RES_TIMERS /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first @@ -881,7 +979,6 @@ static void __remove_hrtimer(struct hrtimer *timer, */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); -#endif } /* @@ -930,22 +1027,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int leftmost; + ktime_t expires; - base = lock_hrtimer_base(timer, &flags); + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! + */ + if (expires == KTIME_MAX) + return; + + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + struct hrtimer_clock_base *new_base; /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); @@ -960,21 +1071,35 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - leftmost = enqueue_hrtimer(timer, new_base); - if (!leftmost) - goto unlock; + return enqueue_hrtimer(timer, new_base, mode); +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match. + */ + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer, true); - if (!hrtimer_is_hres_active(timer)) { - /* - * Kick to reschedule the next tick to handle the new timer - * on dynticks target. - */ - if (new_base->cpu_base->nohz_active) - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else { - hrtimer_reprogram(timer, new_base); - } -unlock: unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); @@ -1072,7 +1197,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base); + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1095,17 +1220,24 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); + int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; - int base; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; - base = hrtimer_clockid_to_base(clock_id); + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1114,7 +1246,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: timer mode abs/rel + * @mode: The modes which are relevant for intitialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) @@ -1133,19 +1271,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ bool hrtimer_active(const struct hrtimer *timer) { - struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; unsigned int seq; do { - cpu_base = READ_ONCE(timer->base->cpu_base); - seq = raw_read_seqcount_begin(&cpu_base->seq); + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + base->running == timer) return true; - } while (read_seqcount_retry(&cpu_base->seq, seq) || - cpu_base != READ_ONCE(timer->base->cpu_base)); + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); return false; } @@ -1171,7 +1309,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now) + struct hrtimer *timer, ktime_t *now, + unsigned long flags) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1179,16 +1318,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - cpu_base->running = timer; + base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; @@ -1202,15 +1341,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer->is_rel = false; /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. */ - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and @@ -1223,33 +1362,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); - WARN_ON_ONCE(cpu_base->running != timer); - cpu_base->running = NULL; + WARN_ON_ONCE(base->running != timer); + base->running = NULL; } -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; + struct hrtimer_clock_base *base; + unsigned int active = cpu_base->active_bases & active_mask; - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; - if (!(active & 0x01)) - continue; - basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1272,11 +1409,28 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (basenow < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow, flags); } } } +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); + + cpu_base->softirq_activated = 0; + hrtimer_update_softirq_timer(cpu_base, true); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1287,13 +1441,14 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; + unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; @@ -1306,17 +1461,23 @@ retry: */ cpu_base->expires_next = KTIME_MAX; - __hrtimer_run_queues(cpu_base, now); + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base); + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { @@ -1337,7 +1498,7 @@ retry: * Acquire base lock for updating the offsets and retrieving * the current time. */ - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) @@ -1350,7 +1511,8 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; @@ -1392,6 +1554,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) @@ -1409,10 +1572,17 @@ void hrtimer_run_queues(void) return; } - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - __hrtimer_run_queues(cpu_base, now); - raw_spin_unlock(&cpu_base->lock); + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* @@ -1590,7 +1760,13 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; - hrtimer_init_hres(cpu_base); + cpu_base->active_bases = 0; + cpu_base->hres_active = 0; + cpu_base->hang_detected = 0; + cpu_base->next_timer = NULL; + cpu_base->softirq_next_timer = NULL; + cpu_base->expires_next = KTIME_MAX; + cpu_base->softirq_expires_next = KTIME_MAX; return 0; } @@ -1622,7 +1798,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. */ - enqueue_hrtimer(timer, new_base); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } @@ -1634,6 +1810,12 @@ int hrtimers_dead_cpu(unsigned int scpu) BUG_ON(cpu_online(scpu)); tick_cancel_sched_timer(scpu); + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); @@ -1649,12 +1831,19 @@ int hrtimers_dead_cpu(unsigned int scpu) &new_base->clock_base[i]); } + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + local_bh_enable(); return 0; } @@ -1663,18 +1852,19 @@ int hrtimers_dead_cpu(unsigned int scpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + * @mode: timer mode + * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, int clock) + const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; @@ -1695,7 +1885,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1717,7 +1907,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless @@ -1756,7 +1946,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc55..cc91d90 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -216,7 +216,7 @@ struct posix_clock_desc { static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) { - struct file *fp = fget(CLOCKID_TO_FD(id)); + struct file *fp = fget(clockid_to_fd(id)); int err = -EINVAL; if (!fp) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1f27887a..2541bd8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -14,6 +14,7 @@ #include <linux/tick.h> #include <linux/workqueue.h> #include <linux/compat.h> +#include <linux/sched/deadline.h> #include "posix-timers.h" @@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers, return 0; } +static inline void check_dl_overrun(struct task_struct *tsk) +{ + if (tsk->dl.dl_overrun) { + tsk->dl.dl_overrun = 0; + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk, u64 expires; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputime_expires is zero, then there are no active * per thread CPU timers. @@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). @@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } + if (dl_task(tsk) && tsk->dl.dl_overrun) + return 1; + return 0; } @@ -1189,9 +1207,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 now; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval) { + if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update @@ -1363,8 +1380,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) +#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881..ec999f3 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f8e1845..e277284 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; -#else +extern void timers_update_nohz(void); +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif +#else /* CONFIG_NO_HZ_COMMON */ +static inline void timers_update_nohz(void) { } #define tick_nohz_active (0) #endif -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void timers_update_migration(bool update_nohz); -#else -static inline void timers_update_migration(bool update_nohz) { } -#endif - DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f0..29a5733 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* @@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void) } /** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + +/** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. @@ -1079,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) - timers_update_migration(true); + timers_update_nohz(); } /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf8..48150ab 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -200,8 +200,6 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; - bool migration_enabled; - bool nohz_active; bool is_idle; bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); @@ -210,45 +208,64 @@ struct timer_base { static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON + +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_MUTEX(timer_keys_mutex); + +static void timer_update_keys(struct work_struct *work); +static DECLARE_WORK(timer_update_work, timer_update_keys); + +#ifdef CONFIG_SMP unsigned int sysctl_timer_migration = 1; -void timers_update_migration(bool update_nohz) +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); + +static void timers_update_migration(void) { - bool on = sysctl_timer_migration && tick_nohz_active; - unsigned int cpu; + if (sysctl_timer_migration && tick_nohz_active) + static_branch_enable(&timers_migration_enabled); + else + static_branch_disable(&timers_migration_enabled); +} +#else +static inline void timers_update_migration(void) { } +#endif /* !CONFIG_SMP */ - /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) - return; +static void timer_update_keys(struct work_struct *work) +{ + mutex_lock(&timer_keys_mutex); + timers_update_migration(); + static_branch_enable(&timers_nohz_active); + mutex_unlock(&timer_keys_mutex); +} - for_each_possible_cpu(cpu) { - per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; - per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; - per_cpu(hrtimer_bases.migration_enabled, cpu) = on; - if (!update_nohz) - continue; - per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; - per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; - per_cpu(hrtimer_bases.nohz_active, cpu) = true; - } +void timers_update_nohz(void) +{ + schedule_work(&timer_update_work); } int timer_migration_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - static DEFINE_MUTEX(mutex); int ret; - mutex_lock(&mutex); + mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) - timers_update_migration(false); - mutex_unlock(&mutex); + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); return ret; } -#endif + +static inline bool is_timers_nohz_active(void) +{ + return static_branch_unlikely(&timers_nohz_active); +} +#else +static inline bool is_timers_nohz_active(void) { return false; } +#endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -534,7 +551,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!is_timers_nohz_active()) return; /* @@ -823,11 +840,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } @@ -837,11 +853,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } @@ -851,21 +866,20 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -#ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * get_target_base(struct timer_base *base, unsigned tflags) { -#ifdef CONFIG_SMP - if ((tflags & TIMER_PINNED) || !base->migration_enabled) - return get_timer_this_cpu_base(tflags); - return get_timer_cpu_base(tflags, get_nohz_timer_target()); -#else - return get_timer_this_cpu_base(tflags); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && + !(tflags & TIMER_PINNED)) + return get_timer_cpu_base(tflags, get_nohz_timer_target()); #endif + return get_timer_this_cpu_base(tflags); } static inline void forward_timer_base(struct timer_base *base) { +#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; /* @@ -889,16 +903,8 @@ static inline void forward_timer_base(struct timer_base *base) base->clk = jnow; else base->clk = base->next_expiry; -} -#else -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - return get_timer_this_cpu_base(tflags); -} - -static inline void forward_timer_base(struct timer_base *base) { } #endif +} /* @@ -1009,8 +1015,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1034,6 +1038,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1684,7 +1690,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) base->must_forward_clk = false; __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } @@ -1698,7 +1704,7 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->clk)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; @@ -1855,6 +1861,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; diff --git a/kernel/torture.c b/kernel/torture.c index 637e172..37b9401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -47,6 +47,7 @@ #include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include "rcu/rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); @@ -60,7 +61,6 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); -static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -500,7 +500,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - ftrace_dump(DUMP_ALL); + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -572,17 +572,19 @@ static int stutter; */ void stutter_wait(const char *title) { + int spt; + cond_resched_rcu_qs(); - while (READ_ONCE(stutter_pause_test) || - (torture_runnable && !READ_ONCE(*torture_runnable))) { - if (stutter_pause_test) - if (READ_ONCE(stutter_pause_test) == 1) - schedule_timeout_interruptible(1); - else - while (READ_ONCE(stutter_pause_test)) - cond_resched(); - else + spt = READ_ONCE(stutter_pause_test); + for (; spt; spt = READ_ONCE(stutter_pause_test)) { + if (spt == 1) { + schedule_timeout_interruptible(1); + } else if (spt == 2) { + while (READ_ONCE(stutter_pause_test)) + cond_resched(); + } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } torture_shutdown_absorb(title); } } @@ -596,17 +598,15 @@ static int torture_stutter(void *arg) { VERBOSE_TOROUT_STRING("torture_stutter task started"); do { - if (!torture_must_stop()) { - if (stutter > 1) { - schedule_timeout_interruptible(stutter - 1); - WRITE_ONCE(stutter_pause_test, 2); - } - schedule_timeout_interruptible(1); + if (!torture_must_stop() && stutter > 1) { WRITE_ONCE(stutter_pause_test, 1); + schedule_timeout_interruptible(stutter - 1); + WRITE_ONCE(stutter_pause_test, 2); + schedule_timeout_interruptible(1); } + WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -659,7 +659,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) } torture_type = ttype; verbose = v; - torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; return true; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad1..f54dc62 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. @@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c3..40207c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf366..554b517 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a9..5af2842 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } @@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; @@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** @@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); unsigned long flags; + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b6..4f3a8e2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2682,17 +2689,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, if (unlikely(in_nmi())) return; - /* - * It is possible that a function is being traced in a - * location that RCU is not watching. A call to - * rcu_irq_enter() will make sure that it is, but there's - * a few internal rcu functions that could be traced - * where that wont work either. In those cases, we just - * do nothing. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - rcu_irq_enter_irqson(); __ftrace_trace_stack(buffer, flags, skip, pc, NULL); rcu_irq_exit_irqson(); @@ -2711,11 +2707,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } @@ -4178,37 +4173,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4216,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4238,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; @@ -6780,7 +6764,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -6834,14 +6818,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; @@ -7599,6 +7575,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } @@ -7622,7 +7599,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 79f838a..22fee76 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg) * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched_rcu_qs(); + cond_resched(); } return 0; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa..1b87157 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d4..8741148 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad1..b611cd3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc..3c7bfc4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 685c50a..671b134 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp, } /* - * rcu_assign_pointer has a smp_wmb() which makes sure that the new - * probe callbacks array is consistent before setting a pointer to it. - * This array is referenced by __DO_TRACE from - * include/linux/tracepoints.h. A matching smp_read_barrier_depends() - * is used. + * rcu_assign_pointer has as smp_store_release() which makes sure + * that the new probe callbacks array is consistent before setting + * a pointer to it. This array is referenced by __DO_TRACE from + * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); if (!static_key_enabled(&tp->key)) diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a49..ef1da2a 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710..8c34981 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> -#include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> @@ -48,6 +47,8 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> +#include <linux/sched/isolation.h> +#include <linux/nmi.h> #include "workqueue_internal.h" @@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -2135,7 +2136,7 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched_rcu_qs(); + cond_resched(); spin_lock_irq(&pool->lock); @@ -4463,6 +4464,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4497,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); @@ -4510,9 +4523,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4589,16 +4601,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { @@ -4709,12 +4711,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4722,9 +4725,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } @@ -4957,6 +4957,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5559,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |