From 0e4364560361d57e8cd873a8990327f3471d7d8a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 17 May 2018 09:08:43 -0500
Subject: bpf: sockmap, fix uninitialized variable

There is a potential execution path in which variable err is
returned without being properly initialized previously.

Fix this by initializing variable err to 0.

Addresses-Coverity-ID: 1468964 ("Uninitialized scalar variable")
Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c6de139..41b41fc 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1713,7 +1713,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	struct smap_psock_map_entry *e = NULL;
 	struct smap_psock *psock;
 	bool new = false;
-	int err;
+	int err = 0;
 
 	/* 1. If sock map has BPF programs those will be inherited by the
 	 * sock being added. If the sock is already attached to BPF programs
-- 
cgit v1.1


From a78622932c27e8ec33e5ba180f3d2e87fb806b28 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 17 May 2018 09:11:02 -0500
Subject: bpf: sockmap, fix double-free

`e' is being freed twice.

Fix this by removing one of the kfree() calls.

Addresses-Coverity-ID: 1468983 ("Double free")
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 41b41fc..c682669 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1823,7 +1823,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
-	kfree(e);
 	smap_release_sock(psock, sock);
 out_progs:
 	if (verdict)
-- 
cgit v1.1


From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Fri, 18 May 2018 14:00:21 +0200
Subject: xsk: clean up SPDX headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clean up SPDX-License-Identifier and removing licensing leftovers.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/xskmap.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index cb3a121..b3c5574 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XSKMAP used for AF_XDP sockets
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/bpf.h>
-- 
cgit v1.1


From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 17 May 2018 14:16:58 -0700
Subject: bpf: allow sk_msg programs to read sock fields

Currently sk_msg programs only have access to the raw data. However,
it is often useful when building policies to have the policies specific
to the socket endpoint. This allows using the socket tuple as input
into filters, etc.

This patch adds ctx access to the sock fields.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c682669..3b28955 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 	}
 
 	bpf_compute_data_pointers_sg(md);
+	md->sk = sk;
 	rc = (*prog->bpf_func)(md, prog->insnsi);
 	psock->apply_bytes = md->apply_bytes;
 
-- 
cgit v1.1


From dcab51f19b291d5ee23724c51b0a3a597c16451a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 15:03:31 -0700
Subject: bpf: Expose check_uarg_tail_zero()

This patch exposes check_uarg_tail_zero() which will
be reused by a later BTF patch.  Its name is changed to
bpf_check_uarg_tail_zero().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde94..2b29ef8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -65,9 +65,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
  * copy_from_user() call. However, this is not a concern since this function is
  * meant to be a future-proofing of bits.
  */
-static int check_uarg_tail_zero(void __user *uaddr,
-				size_t expected_size,
-				size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+			     size_t expected_size,
+			     size_t actual_size)
 {
 	unsigned char __user *addr;
 	unsigned char __user *end;
@@ -1899,7 +1899,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	u32 ulen;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -1998,7 +1998,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -2038,7 +2038,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
 	if (err)
 		return err;
 
@@ -2110,7 +2110,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
 	if (err)
 		return err;
 	size = min_t(u32, size, sizeof(attr));
-- 
cgit v1.1


From f80442a4cd1864154beaa060bb483a2c9f7d811f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:18 -0700
Subject: bpf: btf: Change how section is supported in btf_header

There are currently unused section descriptions in the btf_header.  Those
sections are here to support future BTF use cases.  For example, the
func section (func_off) is to support function signature (e.g. the BPF
prog function signature).

Instead of spelling out all potential sections up-front in the btf_header.
This patch makes changes to btf_header such that extending it (e.g. adding
a section) is possible later.  The unused ones can be removed for now and
they can be added back later.

This patch:
1. adds a hdr_len to the btf_header.  It will allow adding
sections (and other info like parent_label and parent_name)
later.  The check is similar to the existing bpf_attr.
If a user passes in a longer hdr_len, the kernel
ensures the extra tailing bytes are 0.

2. allows the section order in the BTF object to be
different from its sec_off order in btf_header.

3. each sec_off is followed by a sec_len.  It must not have gap or
overlapping among sections.

The string section is ensured to be at the end due to the 4 bytes
alignment requirement of the type section.

The above changes will allow enough flexibility to
add new sections (and other info) to the btf_header later.

This patch also removes an unnecessary !err check
at the end of btf_parse().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 209 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 158 insertions(+), 51 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ded10ab..75da6cb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -12,6 +12,7 @@
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/sort.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr);
 static DEFINE_SPINLOCK(btf_idr_lock);
 
 struct btf {
-	union {
-		struct btf_header *hdr;
-		void *data;
-	};
+	void *data;
 	struct btf_type **types;
 	u32 *resolved_ids;
 	u32 *resolved_sizes;
 	const char *strings;
 	void *nohdr_data;
+	struct btf_header hdr;
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
@@ -228,6 +227,11 @@ enum resolve_mode {
 
 #define MAX_RESOLVE_DEPTH 32
 
+struct btf_sec_info {
+	u32 off;
+	u32 len;
+};
+
 struct btf_verifier_env {
 	struct btf *btf;
 	u8 *visit_states;
@@ -418,14 +422,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!BTF_STR_OFFSET(offset))
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
 		return &btf->strings[BTF_STR_OFFSET(offset)];
 	else
 		return "(invalid-name-offset)";
@@ -536,7 +540,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "\n");
 }
 
-static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+				 u32 btf_data_size)
 {
 	struct bpf_verifier_log *log = &env->log;
 	const struct btf *btf = env->btf;
@@ -545,19 +550,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env)
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
 	__btf_verifier_log(log, "version: %u\n", hdr->version);
 	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-	__btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
-	__btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
-	__btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
-	__btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
-	__btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+	__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
 	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
 	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
 	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
-	__btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
 }
 
 static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
@@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	struct btf_header *hdr;
 	void *cur, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	cur = btf->nohdr_data + hdr->type_off;
-	end = btf->nohdr_data + hdr->str_off;
+	end = btf->nohdr_data + hdr->type_len;
 
 	env->log_type_id = 1;
 	while (cur < end) {
@@ -1866,8 +1868,20 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
+	const struct btf_header *hdr = &env->btf->hdr;
 	int err;
 
+	/* Type section must align to 4 bytes */
+	if (hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Unaligned type_off");
+		return -EINVAL;
+	}
+
+	if (!hdr->type_len) {
+		btf_verifier_log(env, "No type found");
+		return -EINVAL;
+	}
+
 	err = btf_check_all_metas(env);
 	if (err)
 		return err;
@@ -1881,10 +1895,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	struct btf *btf = env->btf;
 	const char *start, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	start = btf->nohdr_data + hdr->str_off;
 	end = start + hdr->str_len;
 
+	if (end != btf->data + btf->data_size) {
+		btf_verifier_log(env, "String section is not at the end");
+		return -EINVAL;
+	}
+
 	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
 	    start[0] || end[-1]) {
 		btf_verifier_log(env, "Invalid string section");
@@ -1896,20 +1915,122 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	return 0;
 }
 
-static int btf_parse_hdr(struct btf_verifier_env *env)
+static const size_t btf_sec_info_offset[] = {
+	offsetof(struct btf_header, type_off),
+	offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+	const struct btf_sec_info *x = a;
+	const struct btf_sec_info *y = b;
+
+	return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+			      u32 btf_data_size)
 {
+	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
+	struct btf_sec_info secs[nr_secs];
+	u32 total, expected_total, i;
 	const struct btf_header *hdr;
-	struct btf *btf = env->btf;
-	u32 meta_left;
+	const struct btf *btf;
+
+	btf = env->btf;
+	hdr = &btf->hdr;
 
-	if (btf->data_size < sizeof(*hdr)) {
+	/* Populate the secs from hdr */
+	for (i = 0; i < nr_secs; i++)
+		secs[i] = *(struct btf_sec_info *)((void *)hdr +
+						   btf_sec_info_offset[i]);
+
+	sort(secs, nr_secs, sizeof(struct btf_sec_info),
+	     btf_sec_info_cmp, NULL);
+
+	/* Check for gaps and overlap among sections */
+	total = 0;
+	expected_total = btf_data_size - hdr->hdr_len;
+	for (i = 0; i < nr_secs; i++) {
+		if (expected_total < secs[i].off) {
+			btf_verifier_log(env, "Invalid section offset");
+			return -EINVAL;
+		}
+		if (total < secs[i].off) {
+			/* gap */
+			btf_verifier_log(env, "Unsupported section found");
+			return -EINVAL;
+		}
+		if (total > secs[i].off) {
+			btf_verifier_log(env, "Section overlap found");
+			return -EINVAL;
+		}
+		if (expected_total - total < secs[i].len) {
+			btf_verifier_log(env,
+					 "Total section length too long");
+			return -EINVAL;
+		}
+		total += secs[i].len;
+	}
+
+	/* There is data other than hdr and known sections */
+	if (expected_total != total) {
+		btf_verifier_log(env, "Unsupported section found");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+			 u32 btf_data_size)
+{
+	const struct btf_header *hdr;
+	u32 hdr_len, hdr_copy;
+	/*
+	 * Minimal part of the "struct btf_header" that
+	 * contains the hdr_len.
+	 */
+	struct btf_min_header {
+		u16	magic;
+		u8	version;
+		u8	flags;
+		u32	hdr_len;
+	} __user *min_hdr;
+	struct btf *btf;
+	int err;
+
+	btf = env->btf;
+	min_hdr = btf_data;
+
+	if (btf_data_size < sizeof(*min_hdr)) {
+		btf_verifier_log(env, "hdr_len not found");
+		return -EINVAL;
+	}
+
+	if (get_user(hdr_len, &min_hdr->hdr_len))
+		return -EFAULT;
+
+	if (btf_data_size < hdr_len) {
 		btf_verifier_log(env, "btf_header not found");
 		return -EINVAL;
 	}
 
-	btf_verifier_log_hdr(env);
+	err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+	if (err) {
+		if (err == -E2BIG)
+			btf_verifier_log(env, "Unsupported btf_header");
+		return err;
+	}
+
+	hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+	if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+		return -EFAULT;
+
+	hdr = &btf->hdr;
+
+	btf_verifier_log_hdr(env, btf_data_size);
 
-	hdr = btf->hdr;
 	if (hdr->magic != BTF_MAGIC) {
 		btf_verifier_log(env, "Invalid magic");
 		return -EINVAL;
@@ -1925,26 +2046,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 		return -ENOTSUPP;
 	}
 
-	meta_left = btf->data_size - sizeof(*hdr);
-	if (!meta_left) {
+	if (btf_data_size == hdr->hdr_len) {
 		btf_verifier_log(env, "No data");
 		return -EINVAL;
 	}
 
-	if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
-	    /* Type section must align to 4 bytes */
-	    hdr->type_off & (sizeof(u32) - 1)) {
-		btf_verifier_log(env, "Invalid type_off");
-		return -EINVAL;
-	}
-
-	if (meta_left < hdr->str_off ||
-	    meta_left - hdr->str_off < hdr->str_len) {
-		btf_verifier_log(env, "Invalid str_off or str_len");
-		return -EINVAL;
-	}
-
-	btf->nohdr_data = btf->hdr + 1;
+	err = btf_check_sec_info(env, btf_data_size);
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -1987,6 +2096,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 		err = -ENOMEM;
 		goto errout;
 	}
+	env->btf = btf;
+
+	err = btf_parse_hdr(env, btf_data, btf_data_size);
+	if (err)
+		goto errout;
 
 	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
@@ -1996,18 +2110,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	btf->data = data;
 	btf->data_size = btf_data_size;
+	btf->nohdr_data = btf->data + btf->hdr.hdr_len;
 
 	if (copy_from_user(data, btf_data, btf_data_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
 
-	env->btf = btf;
-
-	err = btf_parse_hdr(env);
-	if (err)
-		goto errout;
-
 	err = btf_parse_str_sec(env);
 	if (err)
 		goto errout;
@@ -2016,16 +2125,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 	if (err)
 		goto errout;
 
-	if (!err && log->level && bpf_verifier_log_full(log)) {
+	if (log->level && bpf_verifier_log_full(log)) {
 		err = -ENOSPC;
 		goto errout;
 	}
 
-	if (!err) {
-		btf_verifier_env_free(env);
-		refcount_set(&btf->refcnt, 1);
-		return btf;
-	}
+	btf_verifier_env_free(env);
+	refcount_set(&btf->refcnt, 1);
+	return btf;
 
 errout:
 	btf_verifier_env_free(env);
-- 
cgit v1.1


From 4ef5f5741e340d625e668f78d44eb8c9a6a4a26e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:19 -0700
Subject: bpf: btf: Check array->index_type

Instead of ingoring the array->index_type field.  Enforce that
it must be a BTF_KIND_INT in size 1/2/4/8 bytes.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 80 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 24 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 75da6cb..e388a65 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -443,6 +443,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 	return btf->types[type_id];
 }
 
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+	u16 nr_bits, nr_bytes;
+	u32 int_data;
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data);
+	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+	if (BITS_PER_BYTE_MASKED(nr_bits) ||
+	    BTF_INT_OFFSET(int_data) ||
+	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+		return false;
+	}
+
+	return true;
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
 					      const char *fmt, ...)
 {
@@ -1308,14 +1330,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* We are a little forgiving on array->index_type since
-	 * the kernel is not using it.
-	 */
-	/* Array elem cannot be in type void,
-	 * so !array->type is not allowed.
+	/* Array elem type and index type cannot be in type void,
+	 * so !array->type and !array->index_type are not allowed.
 	 */
 	if (!array->type || BTF_TYPE_PARENT(array->type)) {
-		btf_verifier_log_type(env, t, "Invalid type_id");
+		btf_verifier_log_type(env, t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+		btf_verifier_log_type(env, t, "Invalid index");
 		return -EINVAL;
 	}
 
@@ -1328,11 +1352,32 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 			     const struct resolve_vertex *v)
 {
 	const struct btf_array *array = btf_type_array(v->t);
-	const struct btf_type *elem_type;
-	u32 elem_type_id = array->type;
+	const struct btf_type *elem_type, *index_type;
+	u32 elem_type_id, index_type_id;
 	struct btf *btf = env->btf;
 	u32 elem_size;
 
+	/* Check array->index_type */
+	index_type_id = array->index_type;
+	index_type = btf_type_by_id(btf, index_type_id);
+	if (btf_type_is_void_or_null(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, index_type) &&
+	    !env_type_is_resolved(env, index_type_id))
+		return env_stack_push(env, index_type, index_type_id);
+
+	index_type = btf_type_id_size(btf, &index_type_id, NULL);
+	if (!index_type || !btf_type_is_int(index_type) ||
+	    !btf_type_int_is_regular(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	/* Check array->type */
+	elem_type_id = array->type;
 	elem_type = btf_type_by_id(btf, elem_type_id);
 	if (btf_type_is_void_or_null(elem_type)) {
 		btf_verifier_log_type(env, v->t,
@@ -1350,22 +1395,9 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (btf_type_is_int(elem_type)) {
-		int int_type_data = btf_type_int(elem_type);
-		u16 nr_bits = BTF_INT_BITS(int_type_data);
-		u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-
-		/* Put more restriction on array of int.  The int cannot
-		 * be a bit field and it must be either u8/u16/u32/u64.
-		 */
-		if (BITS_PER_BYTE_MASKED(nr_bits) ||
-		    BTF_INT_OFFSET(int_type_data) ||
-		    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-		     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
-			btf_verifier_log_type(env, v->t,
-					      "Invalid array of int");
-			return -EINVAL;
-		}
+	if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid array of int");
+		return -EINVAL;
 	}
 
 	if (array->nelems && elem_size > U32_MAX / array->nelems) {
-- 
cgit v1.1


From aea2f7b8911617d7a8c23fb73d69e78764f91b58 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:20 -0700
Subject: bpf: btf: Remove unused bits from uapi/linux/btf.h

This patch does the followings:
1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k.  We can
   raise it later.

2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID.  They are
   currently encoded at the highest bit of a u32.
   It is because the current use case does not require supporting
   parent type (i.e type_id referring to a type in another BTF file).
   It also does not support referring to a string in ELF.

   The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced
   by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are
   defined in btf.c instead of uapi/linux/btf.h.

3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough.
   There is unused bits headroom if we ever needed it later.

4. The root bit in BTF_INFO is also removed because it is not
   used in the current use case.

5. Remove BTF_INT_VARARGS since func type is not supported now.
   The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits.

The above can be added back later because the verifier
ensures the unused bits are zeros.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 52 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 17 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e388a65..9cbeabb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -163,13 +163,16 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
 /* 16MB for 64k structs and each has 16 members and
  * a few MB spaces for the string section.
  * The hard limit is S32_MAX.
  */
 #define BTF_MAX_SIZE (16 * 1024 * 1024)
-/* 64k. We can raise it later. The hard limit is S32_MAX. */
-#define BTF_MAX_NR_TYPES 65535
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
@@ -383,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding)
 		return "CHAR";
 	else if (encoding == BTF_INT_BOOL)
 		return "BOOL";
-	else if (encoding == BTF_INT_VARARGS)
-		return "VARARGS";
 	else
 		return "UNKN";
 }
@@ -421,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
-	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
+	return BTF_STR_OFFSET_VALID(offset) &&
+		offset < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
-	if (!BTF_STR_OFFSET(offset))
+	if (!offset)
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
-		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else if (offset < btf->hdr.str_len)
+		return &btf->strings[offset];
 	else
 		return "(invalid-name-offset)";
 }
@@ -598,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 		struct btf_type **new_types;
 		u32 expand_by, new_size;
 
-		if (btf->types_size == BTF_MAX_NR_TYPES) {
+		if (btf->types_size == BTF_MAX_TYPE) {
 			btf_verifier_log(env, "Exceeded max num of types");
 			return -E2BIG;
 		}
 
 		expand_by = max_t(u32, btf->types_size >> 2, 16);
-		new_size = min_t(u32, BTF_MAX_NR_TYPES,
+		new_size = min_t(u32, BTF_MAX_TYPE,
 				 btf->types_size + expand_by);
 
 		new_types = kvzalloc(new_size * sizeof(*new_types),
@@ -934,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 	}
 
 	int_data = btf_type_int(t);
+	if (int_data & ~BTF_INT_MASK) {
+		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+				       int_data);
+		return -EINVAL;
+	}
+
 	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
 
 	if (nr_bits > BITS_PER_U64) {
@@ -947,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	/*
+	 * Only one of the encoding bits is allowed and it
+	 * should be sufficient for the pretty print purpose (i.e. decoding).
+	 * Multiple bits can be allowed later if it is found
+	 * to be insufficient.
+	 */
 	encoding = BTF_INT_ENCODING(int_data);
 	if (encoding &&
 	    encoding != BTF_INT_SIGNED &&
 	    encoding != BTF_INT_CHAR &&
-	    encoding != BTF_INT_BOOL &&
-	    encoding != BTF_INT_VARARGS) {
+	    encoding != BTF_INT_BOOL) {
 		btf_verifier_log_type(env, t, "Unsupported encoding");
 		return -ENOTSUPP;
 	}
@@ -1126,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (BTF_TYPE_PARENT(t->type)) {
+	if (!BTF_TYPE_ID_VALID(t->type)) {
 		btf_verifier_log_type(env, t, "Invalid type_id");
 		return -EINVAL;
 	}
@@ -1333,12 +1345,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 	/* Array elem type and index type cannot be in type void,
 	 * so !array->type and !array->index_type are not allowed.
 	 */
-	if (!array->type || BTF_TYPE_PARENT(array->type)) {
+	if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
 		btf_verifier_log_type(env, t, "Invalid elem");
 		return -EINVAL;
 	}
 
-	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+	if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
 		btf_verifier_log_type(env, t, "Invalid index");
 		return -EINVAL;
 	}
@@ -1507,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 		}
 
 		/* A member cannot be in type void */
-		if (!member->type || BTF_TYPE_PARENT(member->type)) {
+		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid type_id");
 			return -EINVAL;
@@ -1760,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 	}
 	meta_left -= sizeof(*t);
 
+	if (t->info & ~BTF_INFO_MASK) {
+		btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+				 env->log_type_id, t->info);
+		return -EINVAL;
+	}
+
 	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
 	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
 		btf_verifier_log(env, "[%u] Invalid kind:%u",
-- 
cgit v1.1


From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:21 -0700
Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info

In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id"
could cause confusion because the "id" of "btf_id" means the BPF obj id
given to the BTF object while
"btf_key_id" and "btf_value_id" means the BTF type id within
that BTF object.

To make it clear, btf_key_id and btf_value_id are
renamed to btf_key_type_id and btf_value_type_id.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/arraymap.c |  2 +-
 kernel/bpf/syscall.c  | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0fd8d8f..544e58f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	}
 
 	seq_printf(m, "%u: ", *(u32 *)key);
-	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
 	seq_puts(m, "\n");
 
 	rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b29ef8..0b4c945 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -422,7 +422,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -457,10 +457,10 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->usercnt, 1);
 
 	if (bpf_map_support_seq_show(map) &&
-	    (attr->btf_key_id || attr->btf_value_id)) {
+	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
 		struct btf *btf;
 
-		if (!attr->btf_key_id || !attr->btf_value_id) {
+		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
 			err = -EINVAL;
 			goto free_map_nouncharge;
 		}
@@ -471,16 +471,16 @@ static int map_create(union bpf_attr *attr)
 			goto free_map_nouncharge;
 		}
 
-		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
-					      attr->btf_value_id);
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+					      attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
 			goto free_map_nouncharge;
 		}
 
 		map->btf = btf;
-		map->btf_key_id = attr->btf_key_id;
-		map->btf_value_id = attr->btf_value_id;
+		map->btf_key_type_id = attr->btf_key_type_id;
+		map->btf_value_type_id = attr->btf_value_type_id;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -2013,8 +2013,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 
 	if (map->btf) {
 		info.btf_id = btf_id(map->btf);
-		info.btf_key_id = map->btf_key_id;
-		info.btf_value_id = map->btf_value_id;
+		info.btf_key_type_id = map->btf_key_type_id;
+		info.btf_value_type_id = map->btf_value_type_id;
 	}
 
 	if (bpf_map_is_dev_bound(map)) {
-- 
cgit v1.1


From a2889a4c2d3aefdf6f2a636fa1531243653a7633 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 23 May 2018 11:32:36 -0700
Subject: bpf: btf: Avoid variable length array

Sparse warning:
kernel/bpf/btf.c:1985:34: warning: Variable length array is used.

This patch directly uses ARRAY_SIZE().

Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9cbeabb..7e90fd1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1981,8 +1981,7 @@ static int btf_sec_info_cmp(const void *a, const void *b)
 static int btf_check_sec_info(struct btf_verifier_env *env,
 			      u32 btf_data_size)
 {
-	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
-	struct btf_sec_info secs[nr_secs];
+	struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
 	u32 total, expected_total, i;
 	const struct btf_header *hdr;
 	const struct btf *btf;
@@ -1991,17 +1990,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
 	hdr = &btf->hdr;
 
 	/* Populate the secs from hdr */
-	for (i = 0; i < nr_secs; i++)
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
 		secs[i] = *(struct btf_sec_info *)((void *)hdr +
 						   btf_sec_info_offset[i]);
 
-	sort(secs, nr_secs, sizeof(struct btf_sec_info),
-	     btf_sec_info_cmp, NULL);
+	sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+	     sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
 
 	/* Check for gaps and overlap among sections */
 	total = 0;
 	expected_total = btf_data_size - hdr->hdr_len;
-	for (i = 0; i < nr_secs; i++) {
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
 		if (expected_total < secs[i].off) {
 			btf_verifier_log(env, "Invalid section offset");
 			return -EINVAL;
-- 
cgit v1.1


From 2162fed49fa86cb3b5c296565837e674c53cc7ae Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:45 +0530
Subject: bpf: support 64-bit offsets for bpf function calls

The imm field of a bpf instruction is a signed 32-bit integer.
For JITed bpf-to-bpf function calls, it holds the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxiliary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b13..559cb74b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
cgit v1.1


From dbecd7388476aedeb66389febea84d5450d28773 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:48 +0530
Subject: bpf: get kernel symbol addresses via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c  | 25 +++++++++++++++++++++++++
 kernel/bpf/verifier.c |  7 +------
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c945..068a4fc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 559cb74b..8c4d9d0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
cgit v1.1


From 4d56a76ead2fcd856e677cdc9445ad331a409b8c Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:51 +0530
Subject: bpf: fix multi-function JITed dump obtained via syscall

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 068a4fc..c8e987a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1970,13 +1970,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		u32 i;
+
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free, i;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					len = min_t(u32, len, free);
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+					if (!free)
+						break;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
-- 
cgit v1.1


From 815581c11cc29f74af252b6306ea1ec94160231a Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:52 +0530
Subject: bpf: get JITed image lengths of functions via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of the JITed image lengths of each function for a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool
to split up the contiguous JITed dump, also obtained via
the system call, into more relatable chunks corresponding
to each function.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c8e987a..788456c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2037,6 +2037,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.1


From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:16 +0100
Subject: ipv6: sr: Add seg6local action End.BPF

This patch adds the End.BPF action to the LWT seg6local infrastructure.
This action works like any other seg6local End action, meaning that an IPv6
header with SRH is needed, whose DA has to be equal to the SID of the
action. It will also advance the SRH to the next segment, the BPF program
does not have to take care of this.

Since the BPF program may not be a source of instability in the kernel, it
is important to ensure that the integrity of the packet is maintained
before yielding it back to the IPv6 layer. The hook hence keeps track if
the SRH has been altered through the helpers, and re-validates its
content if needed with seg6_validate_srh. The state kept for validation is
stored in a per-CPU buffer. The BPF program is not allowed to directly
write into the packet, and only some fields of the SRH can be altered
through the helper bpf_lwt_seg6_store_bytes.

Performances profiling has shown that the SRH re-validation does not induce
a significant overhead. If the altered SRH is deemed as invalid, the packet
is dropped.

This validation is also done before executing any action through
bpf_lwt_seg6_action, and will not be performed again if the SRH is not
modified after calling the action.

The BPF program may return 3 types of return codes:
    - BPF_OK: the End.BPF action will look up the next destination through
             seg6_lookup_nexthop.
    - BPF_REDIRECT: if an action has been executed through the
          bpf_lwt_seg6_action helper, the BPF program should return this
          value, as the skb's destination is already set and the default
          lookup should not be performed.
    - BPF_DROP : the packet will be dropped.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c4d9d0..967cacf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
-- 
cgit v1.1


From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:09 -0700
Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 788456c..388d4fe 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -2178,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	return btf_get_fd_by_id(attr->btf_id);
 }
 
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 fd_type,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	u32 len = buf ? strlen(buf) : 0, input_len;
+	int err = 0;
+
+	if (put_user(len, &uattr->task_fd_query.buf_len))
+		return -EFAULT;
+	input_len = attr->task_fd_query.buf_len;
+	if (input_len && ubuf) {
+		if (!len) {
+			/* nothing to copy, just make ubuf NULL terminated */
+			char zero = '\0';
+
+			if (put_user(zero, ubuf))
+				return -EFAULT;
+		} else if (input_len >= len + 1) {
+			/* ubuf can hold the string with NULL terminator */
+			if (copy_to_user(ubuf, buf, len + 1))
+				return -EFAULT;
+		} else {
+			/* ubuf cannot hold the string with NULL terminator,
+			 * do a partial copy with NULL terminator.
+			 */
+			char zero = '\0';
+
+			err = -ENOSPC;
+			if (copy_to_user(ubuf, buf, input_len - 1))
+				return -EFAULT;
+			if (put_user(zero, ubuf + input_len - 1))
+				return -EFAULT;
+		}
+	}
+
+	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+		return -EFAULT;
+
+	return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	u32 fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		err = bpf_task_fd_query_copy(attr, uattr,
+					     raw_tp->prog->aux->id,
+					     BPF_FD_TYPE_RAW_TRACEPOINT,
+					     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, fd_type;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     fd_type, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2264,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.1


From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:46 +0200
Subject: bpf: devmap introduce dev_map_enqueue

Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.

V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>

V5: Cleanups requested by Daniel
 - Newlines before func definition
 - Use BUILD_BUG_ON checks
 - Remove unnecessary use return value store in dev_map_enqueue

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ec..06c400e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,13 +48,15 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
@@ -240,21 +242,38 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* TODO: implement a bulking/enqueue step later */
+	return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
@@ -405,6 +424,9 @@ static struct notifier_block dev_map_notifier = {
 
 static int __init dev_map_init(void)
 {
+	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+		     offsetof(struct _bpf_dtab_netdev, dev));
 	register_netdevice_notifier(&dev_map_notifier);
 	return 0;
 }
-- 
cgit v1.1


From 5d053f9da4311a86bc58be8588bb5660fb3f0724 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:51 +0200
Subject: bpf: devmap prepare xdp frames for bulking

Like cpumap create queue for xdp frames that will be bulked.  For now,
this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.

V5: Avoid memleak on error path in dev_map_update_elem()

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 06c400e..15293b9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -55,10 +55,17 @@
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -208,6 +215,34 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+			 struct xdp_bulk_queue *bq)
+{
+	struct net_device *dev = obj->dev;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+		int err;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		if (err)
+			xdp_return_frame(xdpf);
+	}
+	bq->count = 0;
+
+	return 0;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -223,6 +258,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct xdp_bulk_queue *bq;
 		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
@@ -232,6 +268,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq);
 		netdev = dev->dev;
 		if (likely(netdev->netdev_ops->ndo_xdp_flush))
 			netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -254,6 +293,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq);
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 {
 	struct net_device *dev = dst->dev;
@@ -266,8 +319,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	/* TODO: implement a bulking/enqueue step later */
-	return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	return bq_enqueue(dst, xdpf);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
@@ -282,13 +334,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
 		struct net_device *fl = dev->dev;
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq);
+
 			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
@@ -300,6 +357,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -332,6 +390,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -346,13 +405,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
+			free_percpu(dev->bulkq);
 			kfree(dev);
 			return -EINVAL;
 		}
-- 
cgit v1.1


From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:57 +0200
Subject: xdp: add tracepoint for devmap like cpumap have

Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.

V5: Spotted by John Fastabend.
 Fix 'sent' also counted 'drops' in this patch, a later patch corrected
 this, but it was a mistake in this intermediate step.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 15293b9..ff2f3bf 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -58,6 +58,7 @@
 #define DEV_MAP_BULK_SIZE 16
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
 	unsigned int count;
 };
 
@@ -219,6 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -235,11 +237,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		int err;
 
 		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err)
+		if (err) {
+			drops++;
 			xdp_return_frame(xdpf);
+		} else {
+			sent++;
+		}
 	}
 	bq->count = 0;
 
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev);
+	bq->dev_rx = NULL;
 	return 0;
 }
 
@@ -296,18 +305,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(obj, bq);
 
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
 	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
@@ -319,7 +338,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	return bq_enqueue(dst, xdpf);
+	return bq_enqueue(dst, xdpf, dev_rx);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
-- 
cgit v1.1


From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:07 +0200
Subject: xdp: introduce xdp_return_frame_rx_napi

When sending an xdp_frame through xdp_do_redirect call, then error
cases can happen where the xdp_frame needs to be dropped, and
returning an -errno code isn't sufficient/possible any-longer
(e.g. for cpumap case). This is already fully supported, by simply
calling xdp_return_frame.

This patch is an optimization, which provides xdp_return_frame_rx_napi,
which is a faster variant for these error cases.  It take advantage of
the protection provided by XDP RX running under NAPI protection.

This change is mostly relevant for drivers using the page_pool
allocator as it can take advantage of this. (Tested with mlx5).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cpumap.c | 2 +-
 kernel/bpf/devmap.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c95b04e..e0918d1 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ff2f3bf..a9cd5c9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -239,7 +239,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		} else {
 			sent++;
 		}
-- 
cgit v1.1


From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:12 +0200
Subject: xdp: change ndo_xdp_xmit API to support bulking

This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.

When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.

Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.

Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
V4: Isolated ndo, driver changes and callers.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a9cd5c9..7790831 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -232,24 +232,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	for (i = 0; i < bq->count; i++) {
-		struct xdp_frame *xdpf = bq->q[i];
-		int err;
-
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err) {
-			drops++;
-			xdp_return_frame_rx_napi(xdpf);
-		} else {
-			sent++;
-		}
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	if (sent < 0) {
+		sent = 0;
+		goto error;
 	}
+	drops = bq->count - sent;
+out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
 			      sent, drops, bq->dev_rx, dev);
 	bq->dev_rx = NULL;
 	return 0;
+error:
+	/* If ndo_xdp_xmit fails with an errno, no frames have been
+	 * xmit'ed and it's our responsibility to them free all.
+	 */
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		/* RX path under NAPI protection, can return frames faster */
+		xdp_return_frame_rx_napi(xdpf);
+		drops++;
+	}
+	goto out;
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
-- 
cgit v1.1


From e74de52e55c092e7113f839e74400ce9dbe12ceb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:17 +0200
Subject: xdp/trace: extend tracepoint in devmap with an err

Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code
allow people to easier identify the reason behind the ndo_xdp_xmit
call to a given driver is failing.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 7790831..ae16d0c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -220,7 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
-	int sent = 0, drops = 0;
+	int sent = 0, drops = 0, err = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -234,6 +234,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 
 	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
 	if (sent < 0) {
+		err = sent;
 		sent = 0;
 		goto error;
 	}
@@ -242,7 +243,7 @@ out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
-			      sent, drops, bq->dev_rx, dev);
+			      sent, drops, bq->dev_rx, dev, err);
 	bq->dev_rx = NULL;
 	return 0;
 error:
-- 
cgit v1.1