From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 4 Jan 2018 20:02:09 -0800 Subject: bpf: sockmap missing NULL psock check Add psock NULL check to handle a racing sock event that can get the sk_callback_lock before this case but after xchg happens causing the refcnt to hit zero and sock user data (psock) to be null and queued for garbage collection. Also add a comment in the code because this is a bit subtle and not obvious in my opinion. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41..1712d31 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); -- cgit v1.1 From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index >= max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426..aaa3198 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b2487..b414d6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call obusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } -- cgit v1.1 From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa..51ec2dd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) -- cgit v1.1