From 0e4364560361d57e8cd873a8990327f3471d7d8a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 17 May 2018 09:08:43 -0500 Subject: bpf: sockmap, fix uninitialized variable There is a potential execution path in which variable err is returned without being properly initialized previously. Fix this by initializing variable err to 0. Addresses-Coverity-ID: 1468964 ("Uninitialized scalar variable") Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap") Signed-off-by: Gustavo A. R. Silva Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c6de139..41b41fc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1713,7 +1713,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; - int err; + int err = 0; /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs -- cgit v1.1 From a78622932c27e8ec33e5ba180f3d2e87fb806b28 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 17 May 2018 09:11:02 -0500 Subject: bpf: sockmap, fix double-free `e' is being freed twice. Fix this by removing one of the kfree() calls. Addresses-Coverity-ID: 1468983 ("Double free") Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: Gustavo A. R. Silva Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 41b41fc..c682669 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1823,7 +1823,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: - kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) -- cgit v1.1 From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:21 +0200 Subject: xsk: clean up SPDX headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up SPDX-License-Identifier and removing licensing leftovers. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index cb3a121..b3c5574 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include -- cgit v1.1 From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:16:58 -0700 Subject: bpf: allow sk_msg programs to read sock fields Currently sk_msg programs only have access to the raw data. However, it is often useful when building policies to have the policies specific to the socket endpoint. This allows using the socket tuple as input into filters, etc. This patch adds ctx access to the sock fields. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c682669..3b28955 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk, } bpf_compute_data_pointers_sg(md); + md->sk = sk; rc = (*prog->bpf_func)(md, prog->insnsi); psock->apply_bytes = md->apply_bytes; -- cgit v1.1 From dcab51f19b291d5ee23724c51b0a3a597c16451a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 15:03:31 -0700 Subject: bpf: Expose check_uarg_tail_zero() This patch exposes check_uarg_tail_zero() which will be reused by a later BTF patch. Its name is changed to bpf_check_uarg_tail_zero(). Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bfcde94..2b29ef8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -65,9 +65,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +int bpf_check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -1899,7 +1899,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1998,7 +1998,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2038,7 +2038,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); if (err) return err; @@ -2110,7 +2110,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = check_uarg_tail_zero(uattr, sizeof(attr), size); + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); -- cgit v1.1 From f80442a4cd1864154beaa060bb483a2c9f7d811f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:18 -0700 Subject: bpf: btf: Change how section is supported in btf_header There are currently unused section descriptions in the btf_header. Those sections are here to support future BTF use cases. For example, the func section (func_off) is to support function signature (e.g. the BPF prog function signature). Instead of spelling out all potential sections up-front in the btf_header. This patch makes changes to btf_header such that extending it (e.g. adding a section) is possible later. The unused ones can be removed for now and they can be added back later. This patch: 1. adds a hdr_len to the btf_header. It will allow adding sections (and other info like parent_label and parent_name) later. The check is similar to the existing bpf_attr. If a user passes in a longer hdr_len, the kernel ensures the extra tailing bytes are 0. 2. allows the section order in the BTF object to be different from its sec_off order in btf_header. 3. each sec_off is followed by a sec_len. It must not have gap or overlapping among sections. The string section is ensured to be at the end due to the 4 bytes alignment requirement of the type section. The above changes will allow enough flexibility to add new sections (and other info) to the btf_header later. This patch also removes an unnecessary !err check at the end of btf_parse(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 209 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 158 insertions(+), 51 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ded10ab..75da6cb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); struct btf { - union { - struct btf_header *hdr; - void *data; - }; + void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; + struct btf_header hdr; u32 nr_types; u32 types_size; u32 data_size; @@ -228,6 +227,11 @@ enum resolve_mode { #define MAX_RESOLVE_DEPTH 32 +struct btf_sec_info { + u32 off; + u32 len; +}; + struct btf_verifier_env { struct btf *btf; u8 *visit_states; @@ -418,14 +422,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return !BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr->str_len; + BTF_STR_OFFSET(offset) < btf->hdr.str_len; } static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!BTF_STR_OFFSET(offset)) return "(anon)"; - else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) + else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len) return &btf->strings[BTF_STR_OFFSET(offset)]; else return "(invalid-name-offset)"; @@ -536,7 +540,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } -static void btf_verifier_log_hdr(struct btf_verifier_env *env) +static void btf_verifier_log_hdr(struct btf_verifier_env *env, + u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; @@ -545,19 +550,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env) if (!bpf_verifier_log_needed(log)) return; - hdr = btf->hdr; + hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); - __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); - __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); - __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); - __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); - __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); - __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) @@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env) struct btf_header *hdr; void *cur, *end; - hdr = btf->hdr; + hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->str_off; + end = btf->nohdr_data + hdr->type_len; env->log_type_id = 1; while (cur < end) { @@ -1866,8 +1868,20 @@ static int btf_check_all_types(struct btf_verifier_env *env) static int btf_parse_type_sec(struct btf_verifier_env *env) { + const struct btf_header *hdr = &env->btf->hdr; int err; + /* Type section must align to 4 bytes */ + if (hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned type_off"); + return -EINVAL; + } + + if (!hdr->type_len) { + btf_verifier_log(env, "No type found"); + return -EINVAL; + } + err = btf_check_all_metas(env); if (err) return err; @@ -1881,10 +1895,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) struct btf *btf = env->btf; const char *start, *end; - hdr = btf->hdr; + hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; + if (end != btf->data + btf->data_size) { + btf_verifier_log(env, "String section is not at the end"); + return -EINVAL; + } + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || start[0] || end[-1]) { btf_verifier_log(env, "Invalid string section"); @@ -1896,20 +1915,122 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env) +static const size_t btf_sec_info_offset[] = { + offsetof(struct btf_header, type_off), + offsetof(struct btf_header, str_off), +}; + +static int btf_sec_info_cmp(const void *a, const void *b) +{ + const struct btf_sec_info *x = a; + const struct btf_sec_info *y = b; + + return (int)(x->off - y->off) ? : (int)(x->len - y->len); +} + +static int btf_check_sec_info(struct btf_verifier_env *env, + u32 btf_data_size) { + const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset); + struct btf_sec_info secs[nr_secs]; + u32 total, expected_total, i; const struct btf_header *hdr; - struct btf *btf = env->btf; - u32 meta_left; + const struct btf *btf; + + btf = env->btf; + hdr = &btf->hdr; - if (btf->data_size < sizeof(*hdr)) { + /* Populate the secs from hdr */ + for (i = 0; i < nr_secs; i++) + secs[i] = *(struct btf_sec_info *)((void *)hdr + + btf_sec_info_offset[i]); + + sort(secs, nr_secs, sizeof(struct btf_sec_info), + btf_sec_info_cmp, NULL); + + /* Check for gaps and overlap among sections */ + total = 0; + expected_total = btf_data_size - hdr->hdr_len; + for (i = 0; i < nr_secs; i++) { + if (expected_total < secs[i].off) { + btf_verifier_log(env, "Invalid section offset"); + return -EINVAL; + } + if (total < secs[i].off) { + /* gap */ + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + if (total > secs[i].off) { + btf_verifier_log(env, "Section overlap found"); + return -EINVAL; + } + if (expected_total - total < secs[i].len) { + btf_verifier_log(env, + "Total section length too long"); + return -EINVAL; + } + total += secs[i].len; + } + + /* There is data other than hdr and known sections */ + if (expected_total != total) { + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, + u32 btf_data_size) +{ + const struct btf_header *hdr; + u32 hdr_len, hdr_copy; + /* + * Minimal part of the "struct btf_header" that + * contains the hdr_len. + */ + struct btf_min_header { + u16 magic; + u8 version; + u8 flags; + u32 hdr_len; + } __user *min_hdr; + struct btf *btf; + int err; + + btf = env->btf; + min_hdr = btf_data; + + if (btf_data_size < sizeof(*min_hdr)) { + btf_verifier_log(env, "hdr_len not found"); + return -EINVAL; + } + + if (get_user(hdr_len, &min_hdr->hdr_len)) + return -EFAULT; + + if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - btf_verifier_log_hdr(env); + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); + if (err) { + if (err == -E2BIG) + btf_verifier_log(env, "Unsupported btf_header"); + return err; + } + + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) + return -EFAULT; + + hdr = &btf->hdr; + + btf_verifier_log_hdr(env, btf_data_size); - hdr = btf->hdr; if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; @@ -1925,26 +2046,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -ENOTSUPP; } - meta_left = btf->data_size - sizeof(*hdr); - if (!meta_left) { + if (btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } - if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || - /* Type section must align to 4 bytes */ - hdr->type_off & (sizeof(u32) - 1)) { - btf_verifier_log(env, "Invalid type_off"); - return -EINVAL; - } - - if (meta_left < hdr->str_off || - meta_left - hdr->str_off < hdr->str_len) { - btf_verifier_log(env, "Invalid str_off or str_len"); - return -EINVAL; - } - - btf->nohdr_data = btf->hdr + 1; + err = btf_check_sec_info(env, btf_data_size); + if (err) + return err; return 0; } @@ -1987,6 +2096,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, err = -ENOMEM; goto errout; } + env->btf = btf; + + err = btf_parse_hdr(env, btf_data, btf_data_size); + if (err) + goto errout; data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -1996,18 +2110,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; + btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } - env->btf = btf; - - err = btf_parse_hdr(env); - if (err) - goto errout; - err = btf_parse_str_sec(env); if (err) goto errout; @@ -2016,16 +2125,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (err) goto errout; - if (!err && log->level && bpf_verifier_log_full(log)) { + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; } - if (!err) { - btf_verifier_env_free(env); - refcount_set(&btf->refcnt, 1); - return btf; - } + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; errout: btf_verifier_env_free(env); -- cgit v1.1 From 4ef5f5741e340d625e668f78d44eb8c9a6a4a26e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:19 -0700 Subject: bpf: btf: Check array->index_type Instead of ingoring the array->index_type field. Enforce that it must be a BTF_KIND_INT in size 1/2/4/8 bytes. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 80 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 24 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75da6cb..e388a65 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -443,6 +443,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return btf->types[type_id]; } +/* + * Regular int is not a bit field and it must be either + * u8/u16/u32/u64. + */ +static bool btf_type_int_is_regular(const struct btf_type *t) +{ + u16 nr_bits, nr_bytes; + u32 int_data; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + return false; + } + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -1308,14 +1330,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* We are a little forgiving on array->index_type since - * the kernel is not using it. - */ - /* Array elem cannot be in type void, - * so !array->type is not allowed. + /* Array elem type and index type cannot be in type void, + * so !array->type and !array->index_type are not allowed. */ if (!array->type || BTF_TYPE_PARENT(array->type)) { - btf_verifier_log_type(env, t, "Invalid type_id"); + btf_verifier_log_type(env, t, "Invalid elem"); + return -EINVAL; + } + + if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) { + btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } @@ -1328,11 +1352,32 @@ static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); - const struct btf_type *elem_type; - u32 elem_type_id = array->type; + const struct btf_type *elem_type, *index_type; + u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; + /* Check array->index_type */ + index_type_id = array->index_type; + index_type = btf_type_by_id(btf, index_type_id); + if (btf_type_is_void_or_null(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, index_type) && + !env_type_is_resolved(env, index_type_id)) + return env_stack_push(env, index_type, index_type_id); + + index_type = btf_type_id_size(btf, &index_type_id, NULL); + if (!index_type || !btf_type_is_int(index_type) || + !btf_type_int_is_regular(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + /* Check array->type */ + elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_is_void_or_null(elem_type)) { btf_verifier_log_type(env, v->t, @@ -1350,22 +1395,9 @@ static int btf_array_resolve(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_is_int(elem_type)) { - int int_type_data = btf_type_int(elem_type); - u16 nr_bits = BTF_INT_BITS(int_type_data); - u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - - /* Put more restriction on array of int. The int cannot - * be a bit field and it must be either u8/u16/u32/u64. - */ - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_type_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { - btf_verifier_log_type(env, v->t, - "Invalid array of int"); - return -EINVAL; - } + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { + btf_verifier_log_type(env, v->t, "Invalid array of int"); + return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { -- cgit v1.1 From aea2f7b8911617d7a8c23fb73d69e78764f91b58 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:20 -0700 Subject: bpf: btf: Remove unused bits from uapi/linux/btf.h This patch does the followings: 1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k. We can raise it later. 2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID. They are currently encoded at the highest bit of a u32. It is because the current use case does not require supporting parent type (i.e type_id referring to a type in another BTF file). It also does not support referring to a string in ELF. The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are defined in btf.c instead of uapi/linux/btf.h. 3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough. There is unused bits headroom if we ever needed it later. 4. The root bit in BTF_INFO is also removed because it is not used in the current use case. 5. Remove BTF_INT_VARARGS since func type is not supported now. The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits. The above can be added back later because the verifier ensures the unused bits are zeros. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e388a65..9cbeabb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -163,13 +163,16 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) +#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INT_MASK 0x0fffffff +#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) + /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -/* 64k. We can raise it later. The hard limit is S32_MAX. */ -#define BTF_MAX_NR_TYPES 65535 #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ @@ -383,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; - else if (encoding == BTF_INT_VARARGS) - return "VARARGS"; else return "UNKN"; } @@ -421,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return !BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr.str_len; + return BTF_STR_OFFSET_VALID(offset) && + offset < btf->hdr.str_len; } static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (!BTF_STR_OFFSET(offset)) + if (!offset) return "(anon)"; - else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len) - return &btf->strings[BTF_STR_OFFSET(offset)]; + else if (offset < btf->hdr.str_len) + return &btf->strings[offset]; else return "(invalid-name-offset)"; } @@ -598,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_NR_TYPES) { + if (btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); - new_size = min_t(u32, BTF_MAX_NR_TYPES, + new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvzalloc(new_size * sizeof(*new_types), @@ -934,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, } int_data = btf_type_int(t); + if (int_data & ~BTF_INT_MASK) { + btf_verifier_log_basic(env, t, "Invalid int_data:%x", + int_data); + return -EINVAL; + } + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U64) { @@ -947,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * Only one of the encoding bits is allowed and it + * should be sufficient for the pretty print purpose (i.e. decoding). + * Multiple bits can be allowed later if it is found + * to be insufficient. + */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && - encoding != BTF_INT_BOOL && - encoding != BTF_INT_VARARGS) { + encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } @@ -1126,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (BTF_TYPE_PARENT(t->type)) { + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } @@ -1333,12 +1345,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. */ - if (!array->type || BTF_TYPE_PARENT(array->type)) { + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { btf_verifier_log_type(env, t, "Invalid elem"); return -EINVAL; } - if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) { + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } @@ -1507,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } /* A member cannot be in type void */ - if (!member->type || BTF_TYPE_PARENT(member->type)) { + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, "Invalid type_id"); return -EINVAL; @@ -1760,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); + if (t->info & ~BTF_INFO_MASK) { + btf_verifier_log(env, "[%u] Invalid btf_info:%x", + env->log_type_id, t->info); + return -EINVAL; + } + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", -- cgit v1.1 From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:21 -0700 Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id" could cause confusion because the "id" of "btf_id" means the BPF obj id given to the BTF object while "btf_key_id" and "btf_value_id" means the BTF type id within that BTF object. To make it clear, btf_key_id and btf_value_id are renamed to btf_key_type_id and btf_value_type_id. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/syscall.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0fd8d8f..544e58f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, } seq_printf(m, "%u: ", *(u32 *)key); - btf_type_seq_show(map->btf, map->btf_value_id, value, m); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2b29ef8..0b4c945 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -422,7 +422,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_id +#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -457,10 +457,10 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->usercnt, 1); if (bpf_map_support_seq_show(map) && - (attr->btf_key_id || attr->btf_value_id)) { + (attr->btf_key_type_id || attr->btf_value_type_id)) { struct btf *btf; - if (!attr->btf_key_id || !attr->btf_value_id) { + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } @@ -471,16 +471,16 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_id, - attr->btf_value_id); + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; } map->btf = btf; - map->btf_key_id = attr->btf_key_id; - map->btf_value_id = attr->btf_value_id; + map->btf_key_type_id = attr->btf_key_type_id; + map->btf_value_type_id = attr->btf_value_type_id; } err = security_bpf_map_alloc(map); @@ -2013,8 +2013,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, if (map->btf) { info.btf_id = btf_id(map->btf); - info.btf_key_id = map->btf_key_id; - info.btf_value_id = map->btf_value_id; + info.btf_key_type_id = map->btf_key_type_id; + info.btf_value_type_id = map->btf_value_type_id; } if (bpf_map_is_dev_bound(map)) { -- cgit v1.1 From a2889a4c2d3aefdf6f2a636fa1531243653a7633 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 May 2018 11:32:36 -0700 Subject: bpf: btf: Avoid variable length array Sparse warning: kernel/bpf/btf.c:1985:34: warning: Variable length array is used. This patch directly uses ARRAY_SIZE(). Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9cbeabb..7e90fd1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1981,8 +1981,7 @@ static int btf_sec_info_cmp(const void *a, const void *b) static int btf_check_sec_info(struct btf_verifier_env *env, u32 btf_data_size) { - const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset); - struct btf_sec_info secs[nr_secs]; + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; const struct btf_header *hdr; const struct btf *btf; @@ -1991,17 +1990,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env, hdr = &btf->hdr; /* Populate the secs from hdr */ - for (i = 0; i < nr_secs; i++) + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); - sort(secs, nr_secs, sizeof(struct btf_sec_info), - btf_sec_info_cmp, NULL); + sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; - for (i = 0; i < nr_secs; i++) { + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL; -- cgit v1.1 From 2162fed49fa86cb3b5c296565837e674c53cc7ae Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:45 +0530 Subject: bpf: support 64-bit offsets for bpf function calls The imm field of a bpf instruction is a signed 32-bit integer. For JITed bpf-to-bpf function calls, it holds the offset of the start address of the callee's JITed image from __bpf_call_base. For some architectures, such as powerpc64, this offset may be as large as 64 bits and cannot be accomodated in the imm field without truncation. We resolve this by: [1] Additionally using the auxiliary data of each function to keep a list of start addresses of the JITed images for all functions determined by the verifier. [2] Retaining the subprog id inside the off field of the call instructions and using it to index into the list mentioned above and lookup the callee's address. To make sure that the existing JIT compilers continue to work without requiring changes, we keep the imm field as it is. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9e4b13..559cb74b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->off = 0; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) func[subprog]->bpf_func - __bpf_call_base; } + + /* we use the aux data to keep a list of the start addresses + * of the JITed images for each function in the program + * + * for some architectures, such as powerpc64, the imm field + * might not be large enough to hold the offset of the start + * address of the callee's JITed image from __bpf_call_base + * + * in such cases, we can lookup the start address of a callee + * by using its subprog id, available from the off field of + * the call instruction, as an index for this list + */ + func[i]->aux->func = func; + func[i]->aux->func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; -- cgit v1.1 From dbecd7388476aedeb66389febea84d5450d28773 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:48 +0530 Subject: bpf: get kernel symbol addresses via syscall This adds new two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of kernel symbol addresses for all functions in a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command. When bpf_jit_kallsyms is enabled, we can get the address of the corresponding kernel symbol for a callee function and resolve the symbol's name. The address is determined by adding the value of the call instruction's imm field to __bpf_call_base. This offset gets assigned to the imm field by the verifier. For some architectures, such as powerpc64, the imm field is not large enough to hold this offset. We resolve this by: [1] Assigning the subprog id to the imm field of a call instruction in the verifier instead of the offset of the callee's symbol's address from __bpf_call_base. [2] Determining the address of a callee's corresponding symbol by using the imm field as an index for the list of kernel symbol addresses now available from the program info. Suggested-by: Daniel Borkmann Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 25 +++++++++++++++++++++++++ kernel/bpf/verifier.c | 7 +------ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0b4c945..068a4fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; + info.nr_jited_ksyms = 0; goto done; } @@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_jited_ksyms; + info.nr_jited_ksyms = prog->aux->func_cnt; + if (info.nr_jited_ksyms && ulen) { + if (bpf_dump_raw_ok()) { + u64 __user *user_ksyms; + ulong ksym_addr; + u32 i; + + /* copy the address of the kernel symbol + * corresponding to each function + */ + ulen = min_t(u32, info.nr_jited_ksyms, ulen); + user_ksyms = u64_to_user_ptr(info.jited_ksyms); + for (i = 0; i < ulen; i++) { + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; + ksym_addr &= PAGE_MASK; + if (put_user((u64) ksym_addr, &user_ksyms[i])) + return -EFAULT; + } + } else { + info.jited_ksyms = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 559cb74b..8c4d9d0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - unsigned long addr; - if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog]->bpf_func; - addr &= PAGE_MASK; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - addr - __bpf_call_base; + insn->imm = subprog; } prog->jited = 1; -- cgit v1.1 From 4d56a76ead2fcd856e677cdc9445ad331a409b8c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:51 +0530 Subject: bpf: fix multi-function JITed dump obtained via syscall Currently, for multi-function programs, we cannot get the JITed instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD command. Because of this, userspace tools such as bpftool fail to identify a multi-function program as being JITed or not. With the JIT enabled and the test program running, this can be verified as follows: # cat /proc/sys/net/core/bpf_jit_enable 1 Before applying this patch: # bpftool prog list 1: kprobe name foo tag b811aab41a39ad3d gpl loaded_at 2018-05-16T11:43:38+0530 uid 0 xlated 216B not jited memlock 65536B ... # bpftool prog dump jited id 1 no instructions returned After applying this patch: # bpftool prog list 1: kprobe name foo tag b811aab41a39ad3d gpl loaded_at 2018-05-16T12:13:01+0530 uid 0 xlated 216B jited 308B memlock 65536B ... # bpftool prog dump jited id 1 0: nop 4: nop 8: mflr r0 c: std r0,16(r1) 10: stdu r1,-112(r1) 14: std r31,104(r1) 18: addi r31,r1,48 1c: li r3,10 ... Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 068a4fc..c8e987a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1970,13 +1970,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, * for offload. */ ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; + if (prog->aux->func_cnt) { + u32 i; + + info.jited_prog_len = 0; + for (i = 0; i < prog->aux->func_cnt; i++) + info.jited_prog_len += prog->aux->func[i]->jited_len; + } else { + info.jited_prog_len = prog->jited_len; + } + if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok()) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + + /* for multi-function programs, copy the JITed + * instructions for all the functions + */ + if (prog->aux->func_cnt) { + u32 len, free, i; + u8 *img; + + free = ulen; + for (i = 0; i < prog->aux->func_cnt; i++) { + len = prog->aux->func[i]->jited_len; + len = min_t(u32, len, free); + img = (u8 *) prog->aux->func[i]->bpf_func; + if (copy_to_user(uinsns, img, len)) + return -EFAULT; + uinsns += len; + free -= len; + if (!free) + break; + } + } else { + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } } else { info.jited_prog_insns = 0; } -- cgit v1.1 From 815581c11cc29f74af252b6306ea1ec94160231a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:52 +0530 Subject: bpf: get JITed image lengths of functions via syscall This adds new two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of the JITed image lengths of each function for a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command. This can be used by userspace applications like bpftool to split up the contiguous JITed dump, also obtained via the system call, into more relatable chunks corresponding to each function. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c8e987a..788456c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2037,6 +2037,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_jited_func_lens; + info.nr_jited_func_lens = prog->aux->func_cnt; + if (info.nr_jited_func_lens && ulen) { + if (bpf_dump_raw_ok()) { + u32 __user *user_lens; + u32 func_len, i; + + /* copy the JITed image lengths for each function */ + ulen = min_t(u32, info.nr_jited_func_lens, ulen); + user_lens = u64_to_user_ptr(info.jited_func_lens); + for (i = 0; i < ulen; i++) { + func_len = prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + info.jited_func_lens = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.1 From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:16 +0100 Subject: ipv6: sr: Add seg6local action End.BPF This patch adds the End.BPF action to the LWT seg6local infrastructure. This action works like any other seg6local End action, meaning that an IPv6 header with SRH is needed, whose DA has to be equal to the SID of the action. It will also advance the SRH to the next segment, the BPF program does not have to take care of this. Since the BPF program may not be a source of instability in the kernel, it is important to ensure that the integrity of the packet is maintained before yielding it back to the IPv6 layer. The hook hence keeps track if the SRH has been altered through the helpers, and re-validates its content if needed with seg6_validate_srh. The state kept for validation is stored in a per-CPU buffer. The BPF program is not allowed to directly write into the packet, and only some fields of the SRH can be altered through the helper bpf_lwt_seg6_store_bytes. Performances profiling has shown that the SRH re-validation does not induce a significant overhead. If the altered SRH is deemed as invalid, the packet is dropped. This validation is also done before executing any action through bpf_lwt_seg6_action, and will not be performed again if the SRH is not modified after calling the action. The BPF program may return 3 types of return codes: - BPF_OK: the End.BPF action will look up the next destination through seg6_lookup_nexthop. - BPF_REDIRECT: if an action has been executed through the bpf_lwt_seg6_action helper, the BPF program should return this value, as the skb's destination is already set and the default lookup should not be performed. - BPF_DROP : the packet will be dropped. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c4d9d0..967cacf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; -- cgit v1.1 From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:09 -0700 Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY Currently, suppose a userspace application has loaded a bpf program and attached it to a tracepoint/kprobe/uprobe, and a bpf introspection tool, e.g., bpftool, wants to show which bpf program is attached to which tracepoint/kprobe/uprobe. Such attachment information will be really useful to understand the overall bpf deployment in the system. There is a name field (16 bytes) for each program, which could be used to encode the attachment point. There are some drawbacks for this approaches. First, bpftool user (e.g., an admin) may not really understand the association between the name and the attachment point. Second, if one program is attached to multiple places, encoding a proper name which can imply all these attachments becomes difficult. This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. Given a pid and fd, if the is associated with a tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return . prog_id . tracepoint name, or . k[ret]probe funcname + offset or kernel addr, or . u[ret]probe filename + offset to the userspace. The user can use "bpftool prog" to find more information about bpf program itself with prog_id. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 788456c..388d4fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -2178,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) return btf_get_fd_by_id(attr->btf_id); } +static int bpf_task_fd_query_copy(const union bpf_attr *attr, + union bpf_attr __user *uattr, + u32 prog_id, u32 fd_type, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + u32 len = buf ? strlen(buf) : 0, input_len; + int err = 0; + + if (put_user(len, &uattr->task_fd_query.buf_len)) + return -EFAULT; + input_len = attr->task_fd_query.buf_len; + if (input_len && ubuf) { + if (!len) { + /* nothing to copy, just make ubuf NULL terminated */ + char zero = '\0'; + + if (put_user(zero, ubuf)) + return -EFAULT; + } else if (input_len >= len + 1) { + /* ubuf can hold the string with NULL terminator */ + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + /* ubuf cannot hold the string with NULL terminator, + * do a partial copy with NULL terminator. + */ + char zero = '\0'; + + err = -ENOSPC; + if (copy_to_user(ubuf, buf, input_len - 1)) + return -EFAULT; + if (put_user(zero, ubuf + input_len - 1)) + return -EFAULT; + } + } + + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || + put_user(fd_type, &uattr->task_fd_query.fd_type) || + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) + return -EFAULT; + + return err; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + u32 fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0) + return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, fd_type; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + fd_type, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -2264,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit v1.1 From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary use return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 565f9ec..06c400e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,13 +48,15 @@ * calls will fail at this point. */ #include +#include #include +#include #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; struct rcu_head rcu; @@ -240,21 +242,38 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: implement a bulking/enqueue step later */ + return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } @@ -405,6 +424,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } -- cgit v1.1 From 5d053f9da4311a86bc58be8588bb5660fb3f0724 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:51 +0200 Subject: bpf: devmap prepare xdp frames for bulking Like cpumap create queue for xdp frames that will be bulked. For now, this patch simply invoke ndo_xdp_xmit foreach frame. This happens, either when the map flush operation is envoked, or when the limit DEV_MAP_BULK_SIZE is reached. V5: Avoid memleak on error path in dev_map_update_elem() Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 06c400e..15293b9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -55,10 +55,17 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + unsigned int count; +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -208,6 +215,34 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq) +{ + struct net_device *dev = obj->dev; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + int err; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + if (err) + xdp_return_frame(xdpf); + } + bq->count = 0; + + return 0; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -223,6 +258,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct xdp_bulk_queue *bq; struct net_device *netdev; /* This is possible if the dev entry is removed by user space @@ -232,6 +268,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq); netdev = dev->dev; if (likely(netdev->netdev_ops->ndo_xdp_flush)) netdev->netdev_ops->ndo_xdp_flush(netdev); @@ -254,6 +293,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return obj; } +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq); + + bq->q[bq->count++] = xdpf; + return 0; +} + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) { struct net_device *dev = dst->dev; @@ -266,8 +319,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - /* TODO: implement a bulking/enqueue step later */ - return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + return bq_enqueue(dst, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) @@ -282,13 +334,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_flush) { struct net_device *fl = dev->dev; + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq); + fl->netdev_ops->ndo_xdp_flush(dev->dev); } } @@ -300,6 +357,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -332,6 +390,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -346,13 +405,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } -- cgit v1.1 From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: xdp: add tracepoint for devmap like cpumap have Notice how this allow us get XDP statistic without affecting the XDP performance, as tracepoint is no-longer activated on a per packet basis. V5: Spotted by John Fastabend. Fix 'sent' also counted 'drops' in this patch, a later patch corrected this, but it was a mistake in this intermediate step. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 15293b9..ff2f3bf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -58,6 +58,7 @@ #define DEV_MAP_BULK_SIZE 16 struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; unsigned int count; }; @@ -219,6 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; + int sent = 0, drops = 0; int i; if (unlikely(!bq->count)) @@ -235,11 +237,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, int err; err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) + if (err) { + drops++; xdp_return_frame(xdpf); + } else { + sent++; + } } bq->count = 0; + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev); + bq->dev_rx = NULL; return 0; } @@ -296,18 +305,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + { struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(obj, bq); + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq stored per-CPU and must be flushed + * from net_device drivers NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + bq->q[bq->count++] = xdpf; return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; @@ -319,7 +338,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf); + return bq_enqueue(dst, xdpf, dev_rx); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) -- cgit v1.1 From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:07 +0200 Subject: xdp: introduce xdp_return_frame_rx_napi When sending an xdp_frame through xdp_do_redirect call, then error cases can happen where the xdp_frame needs to be dropped, and returning an -errno code isn't sufficient/possible any-longer (e.g. for cpumap case). This is already fully supported, by simply calling xdp_return_frame. This patch is an optimization, which provides xdp_return_frame_rx_napi, which is a faster variant for these error cases. It take advantage of the protection provided by XDP RX running under NAPI protection. This change is mostly relevant for drivers using the page_pool allocator as it can take advantage of this. (Tested with mlx5). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c95b04e..e0918d1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ff2f3bf..a9cd5c9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -239,7 +239,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } else { sent++; } -- cgit v1.1 From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch change the API for ndo_xdp_xmit to support bulking xdp_frames. When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call. Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames avail as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE show the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a9cd5c9..7790831 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -232,24 +232,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; - int err; - - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) { - drops++; - xdp_return_frame_rx_napi(xdpf); - } else { - sent++; - } + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + if (sent < 0) { + sent = 0; + goto error; } + drops = bq->count - sent; +out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev); bq->dev_rx = NULL; return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; } /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled -- cgit v1.1 From e74de52e55c092e7113f839e74400ce9dbe12ceb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:17 +0200 Subject: xdp/trace: extend tracepoint in devmap with an err Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code allow people to easier identify the reason behind the ndo_xdp_xmit call to a given driver is failing. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7790831..ae16d0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -220,7 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; - int sent = 0, drops = 0; + int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) @@ -234,6 +234,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); if (sent < 0) { + err = sent; sent = 0; goto error; } @@ -242,7 +243,7 @@ out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, - sent, drops, bq->dev_rx, dev); + sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; return 0; error: -- cgit v1.1