From fcfb126defda3cee3f1d9460dbe9a2ccac4fbd21 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Tue, 16 Jan 2018 16:05:19 -0800
Subject: bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info

For host JIT, there are "jited_len"/"bpf_func" fields in struct bpf_prog
used by all host JIT targets to get jited image and it's length. While for
offload, targets are likely to have different offload mechanisms that these
info are kept in device private data fields.

Therefore, BPF_OBJ_GET_INFO_BY_FD syscall needs an unified way to get JIT
length and contents info for offload targets.

One way is to introduce new callback to parse device private data then fill
those fields in bpf_prog_info. This might be a little heavy, the other way
is to add generic fields which will be initialized by all offload targets.

This patch follow the second approach to introduce two new fields in
struct bpf_dev_offload and teach bpf_prog_get_info_by_fd about them to fill
correct jited_prog_len and jited_prog_insns in bpf_prog_info.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/offload.c | 23 +++++++++++++++++++++++
 kernel/bpf/syscall.c | 31 ++++++++++++++++++-------------
 3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5c2c104..025b1c2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -234,6 +234,8 @@ struct bpf_prog_offload {
 	struct list_head	offloads;
 	bool			dev_state;
 	const struct bpf_prog_offload_ops *dev_ops;
+	void			*jited_image;
+	u32			jited_len;
 };
 
 struct bpf_prog_aux {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index a88cebf..6c0baa1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 		.prog	= prog,
 		.info	= info,
 	};
+	struct bpf_prog_aux *aux = prog->aux;
 	struct inode *ns_inode;
 	struct path ns_path;
+	char __user *uinsns;
 	void *res;
+	u32 ulen;
 
 	res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
 	if (IS_ERR(res)) {
@@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 		return PTR_ERR(res);
 	}
 
+	down_read(&bpf_devs_lock);
+
+	if (!aux->offload) {
+		up_read(&bpf_devs_lock);
+		return -ENODEV;
+	}
+
+	ulen = info->jited_prog_len;
+	info->jited_prog_len = aux->offload->jited_len;
+	if (info->jited_prog_len & ulen) {
+		uinsns = u64_to_user_ptr(info->jited_prog_insns);
+		ulen = min_t(u32, info->jited_prog_len, ulen);
+		if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
+			up_read(&bpf_devs_lock);
+			return -EFAULT;
+		}
+	}
+
+	up_read(&bpf_devs_lock);
+
 	ns_inode = ns_path.dentry->d_inode;
 	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
 	info->netns_ino = ns_inode->i_ino;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c691b9e..c285244 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1724,19 +1724,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		goto done;
 	}
 
-	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
-	if (info.jited_prog_len && ulen) {
-		if (bpf_dump_raw_ok()) {
-			uinsns = u64_to_user_ptr(info.jited_prog_insns);
-			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
-		} else {
-			info.jited_prog_insns = 0;
-		}
-	}
-
 	ulen = info.xlated_prog_len;
 	info.xlated_prog_len = bpf_prog_insn_size(prog);
 	if (info.xlated_prog_len && ulen) {
@@ -1762,6 +1749,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		err = bpf_prog_offload_info_fill(&info, prog);
 		if (err)
 			return err;
+		goto done;
+	}
+
+	/* NOTE: the following code is supposed to be skipped for offload.
+	 * bpf_prog_offload_info_fill() is the place to fill similar fields
+	 * for offload.
+	 */
+	ulen = info.jited_prog_len;
+	info.jited_prog_len = prog->jited_len;
+	if (info.jited_prog_len && ulen) {
+		if (bpf_dump_raw_ok()) {
+			uinsns = u64_to_user_ptr(info.jited_prog_insns);
+			ulen = min_t(u32, info.jited_prog_len, ulen);
+			if (copy_to_user(uinsns, prog->bpf_func, ulen))
+				return -EFAULT;
+		} else {
+			info.jited_prog_insns = 0;
+		}
 	}
 
 done:
-- 
cgit v1.1


From eb1d7db927a9653f1402473c777839e0456a7836 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Tue, 16 Jan 2018 16:05:20 -0800
Subject: nfp: bpf: set new jit info fields

This patch set those new jit info fields introduced in this patch set.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index e2859b2..c452bf9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -127,6 +127,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
 	unsigned int stack_size;
 	unsigned int max_instr;
+	int err;
 
 	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
 	if (prog->aux->stack_depth > stack_size) {
@@ -143,7 +144,14 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 	if (!nfp_prog->prog)
 		return -ENOMEM;
 
-	return nfp_bpf_jit(nfp_prog);
+	err = nfp_bpf_jit(nfp_prog);
+	if (err)
+		return err;
+
+	prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64);
+	prog->aux->offload->jited_image = nfp_prog->prog;
+
+	return 0;
 }
 
 static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
-- 
cgit v1.1


From e65935969d0fac9df28d9c49bdbab5d8d8286a20 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Tue, 16 Jan 2018 16:05:21 -0800
Subject: tools: bpftool: improve architecture detection by using ifindex

The current architecture detection method in bpftool is designed for host
case.

For offload case, we can't use the architecture of "bpftool" itself.
Instead, we could call the existing "ifindex_to_name_ns" to get DEVNAME,
then read pci id from /sys/class/dev/DEVNAME/device/vendor, finally we map
vendor id to bfd arch name which will finally be used to select bfd backend
for the disassembler.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/common.c     | 72 ++++++++++++++++++++++++++++++++++++++++++
 tools/bpf/bpftool/jit_disasm.c | 16 +++++++++-
 tools/bpf/bpftool/main.h       |  5 ++-
 tools/bpf/bpftool/prog.c       | 12 ++++++-
 4 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 6601c95..0b482c0 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -34,6 +34,7 @@
 /* Author: Jakub Kicinski <kubakici@wp.pl> */
 
 #include <errno.h>
+#include <fcntl.h>
 #include <fts.h>
 #include <libgen.h>
 #include <mntent.h>
@@ -433,6 +434,77 @@ ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
 	return if_indextoname(ifindex, buf);
 }
 
+static int read_sysfs_hex_int(char *path)
+{
+	char vendor_id_buf[8];
+	int len;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		p_err("Can't open %s: %s", path, strerror(errno));
+		return -1;
+	}
+
+	len = read(fd, vendor_id_buf, sizeof(vendor_id_buf));
+	close(fd);
+	if (len < 0) {
+		p_err("Can't read %s: %s", path, strerror(errno));
+		return -1;
+	}
+	if (len >= (int)sizeof(vendor_id_buf)) {
+		p_err("Value in %s too long", path);
+		return -1;
+	}
+
+	vendor_id_buf[len] = 0;
+
+	return strtol(vendor_id_buf, NULL, 0);
+}
+
+static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name)
+{
+	char full_path[64];
+
+	snprintf(full_path, sizeof(full_path), "/sys/class/net/%s/device/%s",
+		 devname, entry_name);
+
+	return read_sysfs_hex_int(full_path);
+}
+
+const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino)
+{
+	char devname[IF_NAMESIZE];
+	int vendor_id;
+	int device_id;
+
+	if (!ifindex_to_name_ns(ifindex, ns_dev, ns_ino, devname)) {
+		p_err("Can't get net device name for ifindex %d: %s", ifindex,
+		      strerror(errno));
+		return NULL;
+	}
+
+	vendor_id = read_sysfs_netdev_hex_int(devname, "vendor");
+	if (vendor_id < 0) {
+		p_err("Can't get device vendor id for %s", devname);
+		return NULL;
+	}
+
+	switch (vendor_id) {
+	case 0x19ee:
+		device_id = read_sysfs_netdev_hex_int(devname, "device");
+		if (device_id != 0x4000 &&
+		    device_id != 0x6000 &&
+		    device_id != 0x6003)
+			p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch");
+		return "NFP-6xxx";
+	default:
+		p_err("Can't get bfd arch name for device vendor id 0x%04x",
+		      vendor_id);
+		return NULL;
+	}
+}
+
 void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
 {
 	char name[IF_NAMESIZE];
diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index 57d32e8..8743932 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -76,7 +76,8 @@ static int fprintf_json(void *out, const char *fmt, ...)
 	return 0;
 }
 
-void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes)
+void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
+		       const char *arch)
 {
 	disassembler_ftype disassemble;
 	struct disassemble_info info;
@@ -100,6 +101,19 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes)
 	else
 		init_disassemble_info(&info, stdout,
 				      (fprintf_ftype) fprintf);
+
+	/* Update architecture info for offload. */
+	if (arch) {
+		const bfd_arch_info_type *inf = bfd_scan_arch(arch);
+
+		if (inf) {
+			bfdf->arch_info = inf;
+		} else {
+			p_err("No libfd support for %s", arch);
+			return;
+		}
+	}
+
 	info.arch = bfd_get_arch(bfdf);
 	info.mach = bfd_get_mach(bfdf);
 	info.buffer = image;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 65b526f..b8e9584 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -121,7 +121,10 @@ int do_cgroup(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 
-void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes);
+void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
+		       const char *arch);
 void print_hex_data_json(uint8_t *data, size_t len);
 
+const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
+
 #endif
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 099e21cf..e8e2baa 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -776,7 +776,17 @@ static int do_dump(int argc, char **argv)
 		}
 	} else {
 		if (member_len == &info.jited_prog_len) {
-			disasm_print_insn(buf, *member_len, opcodes);
+			const char *name = NULL;
+
+			if (info.ifindex) {
+				name = ifindex_to_bfd_name_ns(info.ifindex,
+							      info.netns_dev,
+							      info.netns_ino);
+				if (!name)
+					goto err_free;
+			}
+
+			disasm_print_insn(buf, *member_len, opcodes, name);
 		} else {
 			kernel_syms_load(&dd);
 			if (json_output)
-- 
cgit v1.1


From e2e3224122e64ebe15fe02a63e8fe09b64a8c743 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 17 Jan 2018 11:17:37 +0100
Subject: samples/bpf: xdp2skb_meta comment explain why pkt-data pointers are
 invalidated

Improve the 'unknown reason' comment, with an actual explaination of why
the ctx pkt-data pointers need to be loaded after the helper function
bpf_xdp_adjust_meta().  Based on the explaination Daniel gave.

Fixes: 36e04a2d78d9 ("samples/bpf: xdp2skb_meta shows transferring info from XDP to SKB")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdp2skb_meta_kern.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
index 12e1024..0c12048 100644
--- a/samples/bpf/xdp2skb_meta_kern.c
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -35,15 +35,17 @@ int _xdp_mark(struct xdp_md *ctx)
 	void *data, *data_end;
 	int ret;
 
-	/* Reserve space in-front data pointer for our meta info.
+	/* Reserve space in-front of data pointer for our meta info.
 	 * (Notice drivers not supporting data_meta will fail here!)
 	 */
 	ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
 	if (ret < 0)
 		return XDP_ABORTED;
 
-	/* For some unknown reason, these ctx pointers must be read
-	 * after bpf_xdp_adjust_meta, else verifier will reject prog.
+	/* Notice: Kernel-side verifier requires that loading of
+	 * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
+	 * as pkt-data pointers are invalidated.  Helpers that require
+	 * this are determined/marked by bpf_helper_changes_pkt_data()
 	 */
 	data = (void *)(unsigned long)ctx->data;
 
-- 
cgit v1.1


From eefa864a81501161c05b35f14197677c937e5b9a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 17 Jan 2018 09:19:32 -0800
Subject: bpf: change fake_ip for bpf_trace_printk helper

Currently, for bpf_trace_printk helper, fake ip address 0x1
is used with comments saying that fake ip will not be printed.
This is indeed true for 4.12 and earlier version, but for
4.13 and later version, the ip address will be printed if
it cannot be resolved with kallsym. Running samples/bpf/tracex5
program and you will have the following in the debugfs
trace_pipe output:
  ...
  <...>-1819  [003] ....   443.497877: 0x00000001: mmap
  <...>-1819  [003] ....   443.498289: 0x00000001: syscall=102 (one of get/set uid/pid/gid)
  ...

The kernel commit changed this behavior is:
  commit feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b
  Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
  Date:   Thu Jun 22 17:04:55 2017 -0400

      tracing: Show address when function names are not found
  ...

This patch changed the comment and also altered the fake ip
address to 0x0 as users may think 0x1 has some special meaning
while it doesn't. The new output:
  ...
  <...>-1799  [002] ....    25.953576: 0: mmap
  <...>-1799  [002] ....    25.953865: 0: read(fd=0, buf=00000000053936b5, size=512)
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f274468..fc2838a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -245,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
  */
 #define __BPF_TP_EMIT()	__BPF_ARG3_TP()
 #define __BPF_TP(...)							\
-	__trace_printk(1 /* Fake ip will not be printed. */,		\
+	__trace_printk(0 /* Fake ip */,					\
 		       fmt, ##__VA_ARGS__)
 
 #define __BPF_ARG1_TP(...)						\
-- 
cgit v1.1


From cb5f7334d479414adc6afe60105283277e297489 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 17 Jan 2018 12:05:36 +0100
Subject: bpf: add comments to BPF ld/ldx sizes

Doc BPF ld/ldx size defines as comments in code, as it makes in
faster to lookup in a programming/review setting, than looking up
the sizes in Documentation/networking/filter.txt.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h        | 2 +-
 include/uapi/linux/bpf_common.h | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c2259e..74dc4dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -17,7 +17,7 @@
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
-#define BPF_DW		0x18	/* double word */
+#define BPF_DW		0x18	/* double word (64-bit) */
 #define BPF_XADD	0xc0	/* exclusive add */
 
 /* alu/jmp fields */
diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h
index 18be907..ee97668 100644
--- a/include/uapi/linux/bpf_common.h
+++ b/include/uapi/linux/bpf_common.h
@@ -15,9 +15,10 @@
 
 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20
-- 
cgit v1.1


From b223e3b4e03a13739ab462560b791a4c692fd86e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 18 Jan 2018 12:35:21 +0300
Subject: tools/bpf_jit_disasm: silence a static checker warning

There is a static checker warning that "proglen" has an upper bound but
no lower bound.  The allocation will just fail harmlessly so it's not a
big deal.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpf_jit_disasm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c
index 30044bc..58c2bab 100644
--- a/tools/bpf/bpf_jit_disasm.c
+++ b/tools/bpf/bpf_jit_disasm.c
@@ -172,7 +172,8 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen,
 {
 	char *ptr, *pptr, *tmp;
 	off_t off = 0;
-	int ret, flen, proglen, pass, ulen = 0;
+	unsigned int proglen;
+	int ret, flen, pass, ulen = 0;
 	regmatch_t pmatch[1];
 	unsigned long base;
 	regex_t regex;
@@ -199,7 +200,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen,
 	}
 
 	ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so);
-	ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx",
+	ret = sscanf(ptr, "flen=%d proglen=%u pass=%d image=%lx",
 		     &flen, &proglen, &pass, &base);
 	if (ret != 4) {
 		regfree(&regex);
@@ -239,7 +240,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen,
 	}
 
 	assert(ulen == proglen);
-	printf("%d bytes emitted from JIT compiler (pass:%d, flen:%d)\n",
+	printf("%u bytes emitted from JIT compiler (pass:%d, flen:%d)\n",
 	       proglen, pass, flen);
 	printf("%lx + <x>:\n", base);
 
-- 
cgit v1.1


From e7b2823a582a5bca5ee47644f448e317178e8824 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 18 Jan 2018 17:49:08 +0100
Subject: bpf: Sync kernel ABI header with tooling header

Update tools/include/uapi/linux/bpf.h to bring it in sync with
include/uapi/linux/bpf.h.  The listed commits forgot to update it.

Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
Fixes: f19397a5c656 ("bpf: Add access to snd_cwnd and others in sock_ops")
Fixes: 06ef0ccb5a36 ("bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 69f96af..7c2259e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -900,6 +900,9 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
+	/* Below access go through struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
 enum sk_action {
@@ -956,6 +959,12 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 };
 
 /* List of known BPF sock_ops operators.
@@ -1010,7 +1019,8 @@ struct bpf_perf_event_value {
 #define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
 
 struct bpf_cgroup_dev_ctx {
-	__u32 access_type; /* (access << 16) | type */
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
 	__u32 major;
 	__u32 minor;
 };
-- 
cgit v1.1


From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 17 Jan 2018 16:52:02 -0800
Subject: bpf: allow socket_filter programs to use bpf_prog_test_run

in order to improve test coverage allow socket_filter program type
to be run via bpf_prog_test_run command.
Since such programs can be loaded by non-root tighten
permissions for bpf_prog_test_run to be root only
to avoid surprises.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 2 ++
 net/core/filter.c    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c285244..97a825f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index db2ee8c..30fafaa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4526,6 +4526,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
 };
 
 const struct bpf_prog_ops sk_filter_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
 };
 
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
-- 
cgit v1.1


From 111e6b45315c8d13658f23885b30eb9df3ea2914 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 17 Jan 2018 16:52:03 -0800
Subject: selftests/bpf: make test_verifier run most programs

to improve test coverage make test_verifier run all successfully loaded
programs on 64-byte zero initialized data.
For clsbpf and xdp it means empty 64-byte packet.
For lwt and socket_filters it's 64-byte packet where skb->data
points after L2.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 50 ++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 9601798..6c22edb 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -29,6 +29,7 @@
 #include <linux/filter.h>
 #include <linux/bpf_perf_event.h>
 #include <linux/bpf.h>
+#include <linux/if_ether.h>
 
 #include <bpf/bpf.h>
 
@@ -49,6 +50,8 @@
 #define MAX_INSNS	512
 #define MAX_FIXUPS	8
 #define MAX_NR_MAPS	4
+#define POINTER_VALUE	0xcafe4all
+#define TEST_DATA_LEN	64
 
 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS	(1 << 0)
 #define F_LOAD_WITH_STRICT_ALIGNMENT		(1 << 1)
@@ -62,6 +65,7 @@ struct bpf_test {
 	int fixup_map_in_map[MAX_FIXUPS];
 	const char *errstr;
 	const char *errstr_unpriv;
+	uint32_t retval;
 	enum {
 		UNDEF,
 		ACCEPT,
@@ -95,6 +99,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = -3,
 	},
 	{
 		"unreachable",
@@ -210,6 +215,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"test8 ld_imm64",
@@ -517,6 +523,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R0 leaks addr",
 		.result = ACCEPT,
 		.result_unpriv = REJECT,
+		.retval = POINTER_VALUE,
 	},
 	{
 		"check valid spill/fill, skb mark",
@@ -803,6 +810,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R1 pointer comparison",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
+		.retval = -ENOENT,
 	},
 	{
 		"jump test 4",
@@ -1823,6 +1831,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = 0xfaceb00c,
 	},
 	{
 		"PTR_TO_STACK store/load - bad alignment on off",
@@ -1881,6 +1890,7 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.result_unpriv = REJECT,
 		.errstr_unpriv = "R0 leaks addr",
+		.retval = POINTER_VALUE,
 	},
 	{
 		"unpriv: add const to pointer",
@@ -2054,6 +2064,7 @@ static struct bpf_test tests[] = {
 			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
 			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
 				     BPF_FUNC_get_hash_recalc),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
@@ -2818,6 +2829,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 1,
 	},
 	{
 		"direct packet access: test12 (and, good access)",
@@ -2842,6 +2854,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 1,
 	},
 	{
 		"direct packet access: test13 (branches, good access)",
@@ -2872,6 +2885,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 1,
 	},
 	{
 		"direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)",
@@ -2895,6 +2909,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 1,
 	},
 	{
 		"direct packet access: test15 (spill with xadd)",
@@ -3181,6 +3196,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 1,
 	},
 	{
 		"direct packet access: test28 (marking on <=, bad access)",
@@ -5798,6 +5814,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = 0 /* csum_diff of 64-byte packet */,
 	},
 	{
 		"helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
@@ -6166,6 +6183,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = 42 /* ultimate return value */,
 	},
 	{
 		"ld_ind: check calling conv, r1",
@@ -6237,6 +6255,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"check bpf_perf_event_data->sample_period byte load permitted",
@@ -7224,6 +7243,7 @@ static struct bpf_test tests[] = {
 		},
 		.fixup_map1 = { 3 },
 		.result = ACCEPT,
+		.retval = POINTER_VALUE,
 		.result_unpriv = REJECT,
 		.errstr_unpriv = "R0 leaks addr as return value"
 	},
@@ -7244,6 +7264,7 @@ static struct bpf_test tests[] = {
 		},
 		.fixup_map1 = { 3 },
 		.result = ACCEPT,
+		.retval = POINTER_VALUE,
 		.result_unpriv = REJECT,
 		.errstr_unpriv = "R0 leaks addr as return value"
 	},
@@ -7685,6 +7706,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = TEST_DATA_LEN,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
@@ -8705,6 +8727,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "function calls to other bpf functions are allowed for root only",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"calls: overlapping caller/callee",
@@ -8900,6 +8923,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_ACT,
 		.result = ACCEPT,
+		.retval = TEST_DATA_LEN,
 	},
 	{
 		"calls: callee using args1",
@@ -8912,6 +8936,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "allowed for root only",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
+		.retval = POINTER_VALUE,
 	},
 	{
 		"calls: callee using wrong args2",
@@ -8942,6 +8967,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "allowed for root only",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
+		.retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN,
 	},
 	{
 		"calls: callee changing pkt pointers",
@@ -8990,6 +9016,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = TEST_DATA_LEN + TEST_DATA_LEN,
 	},
 	{
 		"calls: calls with stack arith",
@@ -9008,6 +9035,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = 42,
 	},
 	{
 		"calls: calls with misaligned stack access",
@@ -9041,6 +9069,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = 43,
 	},
 	{
 		"calls: calls control flow, jump test 2",
@@ -9533,6 +9562,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_XDP,
 		.result = ACCEPT,
+		.retval = 42,
 	},
 	{
 		"calls: write into callee stack frame",
@@ -10144,6 +10174,7 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = POINTER_VALUE,
 	},
 	{
 		"calls: pkt_ptr spill into caller stack 2",
@@ -10209,6 +10240,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"calls: pkt_ptr spill into caller stack 4",
@@ -10242,6 +10274,7 @@ static struct bpf_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"calls: pkt_ptr spill into caller stack 5",
@@ -10650,10 +10683,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	int fd_prog, expected_ret, reject_from_alignment;
 	struct bpf_insn *prog = test->insns;
 	int prog_len = probe_filter_length(prog);
+	char data_in[TEST_DATA_LEN] = {};
 	int prog_type = test->prog_type;
 	int map_fds[MAX_NR_MAPS];
 	const char *expected_err;
-	int i;
+	uint32_t retval;
+	int i, err;
 
 	for (i = 0; i < MAX_NR_MAPS; i++)
 		map_fds[i] = -1;
@@ -10696,6 +10731,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		}
 	}
 
+	if (fd_prog >= 0) {
+		err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in),
+					NULL, NULL, &retval, NULL);
+		if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) {
+			printf("Unexpected bpf_prog_test_run error\n");
+			goto fail_log;
+		}
+		if (!err && retval != test->retval &&
+		    test->retval != POINTER_VALUE) {
+			printf("FAIL retval %d != %d\n", retval, test->retval);
+			goto fail_log;
+		}
+	}
 	(*passes)++;
 	printf("OK%s\n", reject_from_alignment ?
 	       " (NOTE: reject due to unknown alignment)" : "");
-- 
cgit v1.1


From ad46061fca87c0ab6670af3a44e03237f99d7a1f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:25 -0800
Subject: bpf: arraymap: move checks out of alloc function

Use the new callback to perform allocation checks for array maps.
The fd maps don't need a special allocation callback, they only
need a special check callback.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/arraymap.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ab94d30..6833609 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
-	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
-	struct bpf_array *array;
-	u64 array_size, mask64;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
 		 */
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
+	u32 elem_size, index_mask, max_entries;
+	bool unpriv = !capable(CAP_SYS_ADMIN);
+	struct bpf_array *array;
+	u64 array_size, mask64;
 
 	elem_size = round_up(attr->value_size, 8);
 
@@ -327,6 +335,7 @@ static void array_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -337,6 +346,7 @@ const struct bpf_map_ops array_map_ops = {
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -345,12 +355,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return array_map_alloc(attr);
+		return -EINVAL;
+	return array_map_alloc_check(attr);
 }
 
 static void fd_array_map_free(struct bpf_map *map)
@@ -474,7 +484,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 }
 
 const struct bpf_map_ops prog_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +572,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 }
 
 const struct bpf_map_ops perf_event_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -592,7 +604,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops cgroup_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -610,7 +623,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;
 
-	map = fd_array_map_alloc(attr);
+	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -673,6 +686,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 }
 
 const struct bpf_map_ops array_of_maps_map_ops = {
+	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
 	.map_get_next_key = array_map_get_next_key,
-- 
cgit v1.1


From 32852649ba3f74aab10025f2e59ca2b49d5cccfa Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:26 -0800
Subject: bpf: arraymap: use bpf_map_init_from_attr()

Arraymap was not converted to use bpf_map_init_from_attr()
to avoid merge conflicts with emergency fixes.  Do it now.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/arraymap.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6833609..b1f6648 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -120,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
-	array->map.map_type = attr->map_type;
-	array->map.key_size = attr->key_size;
-	array->map.value_size = attr->value_size;
-	array->map.max_entries = attr->max_entries;
-	array->map.map_flags = attr->map_flags;
-	array->map.numa_node = numa_node;
+	bpf_map_init_from_attr(&array->map, attr);
 	array->elem_size = elem_size;
 
 	if (!percpu)
-- 
cgit v1.1


From 7a0ef6939548b9eb74bf464daf55ad68a23602a2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:27 -0800
Subject: bpf: offload: allow array map offload

The special handling of different map types is left to the driver.
Allow offload of array maps by simply adding it to accepted types.
For nfp we have to make sure array elements are not deleted.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 ++
 kernel/bpf/offload.c                             | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index c452bf9..1a357aa 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -176,6 +176,8 @@ nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap,
 static int
 nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key)
 {
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY)
+		return -EINVAL;
 	return nfp_bpf_ctrl_del_entry(offmap, key);
 }
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 6c0baa1..2657976 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -299,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
-	if (attr->map_type != BPF_MAP_TYPE_HASH)
+	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+	    attr->map_type != BPF_MAP_TYPE_HASH)
 		return ERR_PTR(-EINVAL);
 
 	offmap = kzalloc(sizeof(*offmap), GFP_USER);
-- 
cgit v1.1


From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:28 -0800
Subject: bpf: offload: report device information about offloaded maps

Tell user space about device on which the map was created.
Unfortunate reality of user ABI makes sharing this code
with program offload difficult but the information is the
same.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 55 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 +++++
 tools/include/uapi/linux/bpf.h |  3 +++
 5 files changed, 69 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 025b1c2..66df387 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -586,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
 int bpf_map_offload_update_elem(struct bpf_map *map,
 				void *key, void *value, u64 flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 74dc4dc..406c19d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2657976..c940107 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return ret;
 }
 
+struct ns_get_path_bpf_map_args {
+	struct bpf_offloaded_map *offmap;
+	struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_map_args *args = private_data;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (args->offmap->netdev) {
+		args->info->ifindex = args->offmap->netdev->ifindex;
+		net = dev_net(args->offmap->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+	struct ns_get_path_bpf_map_args args = {
+		.offmap	= map_to_offmap(map),
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
 {
 	struct bpf_offloaded_map *offmap;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 97a825f..5bdb0cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_info_fill(&info, map);
+		if (err)
+			return err;
+	}
+
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
 		return -EFAULT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7c2259e..af1f49a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.1


From 064a07cba2919bcfbadf9edf5c26c740e69fa585 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:29 -0800
Subject: tools: bpftool: report device information for offloaded maps

Print the information about device on which map is created.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 8d7db9d..a152c1a 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -428,6 +428,9 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
 
 	jsonw_name(json_wtr, "flags");
 	jsonw_printf(json_wtr, "%#x", info->map_flags);
+
+	print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
+
 	jsonw_uint_field(json_wtr, "bytes_key", info->key_size);
 	jsonw_uint_field(json_wtr, "bytes_value", info->value_size);
 	jsonw_uint_field(json_wtr, "max_entries", info->max_entries);
@@ -469,7 +472,9 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
 	if (*info->name)
 		printf("name %s  ", info->name);
 
-	printf("flags 0x%x\n", info->map_flags);
+	printf("flags 0x%x", info->map_flags);
+	print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
+	printf("\n");
 	printf("\tkey %uB  value %uB  max_entries %u",
 	       info->key_size, info->value_size, info->max_entries);
 
-- 
cgit v1.1


From 395cacb5f1a0a290f1ae9ca4692c400d2b57a705 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:30 -0800
Subject: netdevsim: bpf: support fake map offload

Add to netdevsim ability to pretend it's offloading BPF maps.
We only allow allocation of tiny 2 entry maps, to keep things
simple.  Mutex lock may seem heavy for the operations we
perform, but we want to make sure callbacks can sleep.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/netdevsim/bpf.c       | 246 ++++++++++++++++++++++++++++++++++++++
 drivers/net/netdevsim/netdevsim.h |   3 +
 2 files changed, 249 insertions(+)

diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 5134d5c..b3851bb 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -17,6 +17,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
+#include <linux/mutex.h>
 #include <linux/rtnetlink.h>
 #include <net/pkt_cls.h>
 
@@ -31,6 +32,19 @@ struct nsim_bpf_bound_prog {
 	struct list_head l;
 };
 
+#define NSIM_BPF_MAX_KEYS		2
+
+struct nsim_bpf_bound_map {
+	struct netdevsim *ns;
+	struct bpf_offloaded_map *map;
+	struct mutex mutex;
+	struct nsim_map_entry {
+		void *key;
+		void *value;
+	} entry[NSIM_BPF_MAX_KEYS];
+	struct list_head l;
+};
+
 static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data)
 {
 	const char **str = file->private;
@@ -284,6 +298,224 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
 	return 0;
 }
 
+static bool
+nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key)
+{
+	return e->key && !memcmp(key, e->key, map->key_size);
+}
+
+static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(nmap->entry); i++)
+		if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key))
+			return i;
+
+	return -ENOENT;
+}
+
+static int
+nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+
+	nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_USER);
+	if (!nmap->entry[idx].key)
+		return -ENOMEM;
+	nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_USER);
+	if (!nmap->entry[idx].value) {
+		kfree(nmap->entry[idx].key);
+		nmap->entry[idx].key = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int
+nsim_map_get_next_key(struct bpf_offloaded_map *offmap,
+		      void *key, void *next_key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx = -ENOENT;
+
+	mutex_lock(&nmap->mutex);
+
+	if (key)
+		idx = nsim_map_key_find(offmap, key);
+	if (idx == -ENOENT)
+		idx = 0;
+	else
+		idx++;
+
+	for (; idx < ARRAY_SIZE(nmap->entry); idx++) {
+		if (nmap->entry[idx].key) {
+			memcpy(next_key, nmap->entry[idx].key,
+			       offmap->map.key_size);
+			break;
+		}
+	}
+
+	mutex_unlock(&nmap->mutex);
+
+	if (idx == ARRAY_SIZE(nmap->entry))
+		return -ENOENT;
+	return 0;
+}
+
+static int
+nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx >= 0)
+		memcpy(value, nmap->entry[idx].value, offmap->map.value_size);
+
+	mutex_unlock(&nmap->mutex);
+
+	return idx < 0 ? idx : 0;
+}
+
+static int
+nsim_map_update_elem(struct bpf_offloaded_map *offmap,
+		     void *key, void *value, u64 flags)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx, err = 0;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx < 0 && flags == BPF_EXIST) {
+		err = idx;
+		goto exit_unlock;
+	}
+	if (idx >= 0 && flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto exit_unlock;
+	}
+
+	if (idx < 0) {
+		for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++)
+			if (!nmap->entry[idx].key)
+				break;
+		if (idx == ARRAY_SIZE(nmap->entry)) {
+			err = -E2BIG;
+			goto exit_unlock;
+		}
+
+		err = nsim_map_alloc_elem(offmap, idx);
+		if (err)
+			goto exit_unlock;
+	}
+
+	memcpy(nmap->entry[idx].key, key, offmap->map.key_size);
+	memcpy(nmap->entry[idx].value, value, offmap->map.value_size);
+exit_unlock:
+	mutex_unlock(&nmap->mutex);
+
+	return err;
+}
+
+static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	int idx;
+
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY)
+		return -EINVAL;
+
+	mutex_lock(&nmap->mutex);
+
+	idx = nsim_map_key_find(offmap, key);
+	if (idx >= 0) {
+		kfree(nmap->entry[idx].key);
+		kfree(nmap->entry[idx].value);
+		memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx]));
+	}
+
+	mutex_unlock(&nmap->mutex);
+
+	return idx < 0 ? idx : 0;
+}
+
+static const struct bpf_map_dev_ops nsim_bpf_map_ops = {
+	.map_get_next_key	= nsim_map_get_next_key,
+	.map_lookup_elem	= nsim_map_lookup_elem,
+	.map_update_elem	= nsim_map_update_elem,
+	.map_delete_elem	= nsim_map_delete_elem,
+};
+
+static int
+nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap)
+{
+	struct nsim_bpf_bound_map *nmap;
+	unsigned int i;
+	int err;
+
+	if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY &&
+		    offmap->map.map_type != BPF_MAP_TYPE_HASH))
+		return -EINVAL;
+	if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS)
+		return -ENOMEM;
+	if (offmap->map.map_flags)
+		return -EINVAL;
+
+	nmap = kzalloc(sizeof(*nmap), GFP_USER);
+	if (!nmap)
+		return -ENOMEM;
+
+	offmap->dev_priv = nmap;
+	nmap->ns = ns;
+	nmap->map = offmap;
+	mutex_init(&nmap->mutex);
+
+	if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) {
+		for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) {
+			u32 *key;
+
+			err = nsim_map_alloc_elem(offmap, i);
+			if (err)
+				goto err_free;
+			key = nmap->entry[i].key;
+			*key = i;
+		}
+	}
+
+	offmap->dev_ops = &nsim_bpf_map_ops;
+	list_add_tail(&nmap->l, &ns->bpf_bound_maps);
+
+	return 0;
+
+err_free:
+	while (--i) {
+		kfree(nmap->entry[i].key);
+		kfree(nmap->entry[i].value);
+	}
+	kfree(nmap);
+	return err;
+}
+
+static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap)
+{
+	struct nsim_bpf_bound_map *nmap = offmap->dev_priv;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) {
+		kfree(nmap->entry[i].key);
+		kfree(nmap->entry[i].value);
+	}
+	list_del_init(&nmap->l);
+	mutex_destroy(&nmap->mutex);
+	kfree(nmap);
+}
+
 int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -328,6 +560,14 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 			return err;
 
 		return nsim_xdp_set_prog(ns, bpf);
+	case BPF_OFFLOAD_MAP_ALLOC:
+		if (!ns->bpf_map_accept)
+			return -EOPNOTSUPP;
+
+		return nsim_bpf_map_alloc(ns, bpf->offmap);
+	case BPF_OFFLOAD_MAP_FREE:
+		nsim_bpf_map_free(bpf->offmap);
+		return 0;
 	default:
 		return -EINVAL;
 	}
@@ -336,6 +576,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 int nsim_bpf_init(struct netdevsim *ns)
 {
 	INIT_LIST_HEAD(&ns->bpf_bound_progs);
+	INIT_LIST_HEAD(&ns->bpf_bound_maps);
 
 	debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir,
 			   &ns->bpf_offloaded_id);
@@ -362,12 +603,17 @@ int nsim_bpf_init(struct netdevsim *ns)
 	debugfs_create_bool("bpf_xdpoffload_accept", 0600, ns->ddir,
 			    &ns->bpf_xdpoffload_accept);
 
+	ns->bpf_map_accept = true;
+	debugfs_create_bool("bpf_map_accept", 0600, ns->ddir,
+			    &ns->bpf_map_accept);
+
 	return 0;
 }
 
 void nsim_bpf_uninit(struct netdevsim *ns)
 {
 	WARN_ON(!list_empty(&ns->bpf_bound_progs));
+	WARN_ON(!list_empty(&ns->bpf_bound_maps));
 	WARN_ON(ns->xdp_prog);
 	WARN_ON(ns->bpf_offloaded);
 }
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 32270de..b803612 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -61,6 +61,9 @@ struct netdevsim {
 	bool bpf_tc_non_bound_accept;
 	bool bpf_xdpdrv_accept;
 	bool bpf_xdpoffload_accept;
+
+	bool bpf_map_accept;
+	struct list_head bpf_bound_maps;
 };
 
 extern struct dentry *nsim_ddir;
-- 
cgit v1.1


From 7fedbb7c5a7c4bda418bc1056c06c81db36e4299 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:31 -0800
Subject: selftest/bpf: extend the offload test with map checks

Check map device information is reported correctly, and perform
basic map operations.  Check device destruction gets rid of the
maps and map allocation failure path by telling netdevsim to
reject map offload via DebugFS.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   3 +-
 tools/testing/selftests/bpf/sample_map_ret0.c |  34 +++++
 tools/testing/selftests/bpf/test_offload.py   | 206 +++++++++++++++++++++++---
 3 files changed, 218 insertions(+), 25 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/sample_map_ret0.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a8aa7e2..3a44b65 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -19,7 +19,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
-	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o
+	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
+	sample_map_ret0.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/sample_map_ret0.c b/tools/testing/selftests/bpf/sample_map_ret0.c
new file mode 100644
index 0000000..0756303
--- /dev/null
+++ b/tools/testing/selftests/bpf/sample_map_ret0.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") htab = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(long),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") array = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(long),
+	.max_entries = 2,
+};
+
+/* Sample program which should always load for testing control paths. */
+SEC(".text") int func()
+{
+	__u64 key64 = 0;
+	__u32 key = 0;
+	long *value;
+
+	value = bpf_map_lookup_elem(&htab, &key);
+	if (!value)
+		return 1;
+	value = bpf_map_lookup_elem(&array, &key64);
+	if (!value)
+		return 1;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index e3c750f..833b9c1 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -20,6 +20,7 @@ import os
 import pprint
 import random
 import string
+import struct
 import subprocess
 import time
 
@@ -156,6 +157,14 @@ def bpftool_prog_list(expected=None, ns=""):
                  (len(progs), expected))
     return progs
 
+def bpftool_map_list(expected=None, ns=""):
+    _, maps = bpftool("map show", JSON=True, ns=ns, fail=True)
+    if expected is not None:
+        if len(maps) != expected:
+            fail(True, "%d BPF maps loaded, expected %d" %
+                 (len(maps), expected))
+    return maps
+
 def bpftool_prog_list_wait(expected=0, n_retry=20):
     for i in range(n_retry):
         nprogs = len(bpftool_prog_list())
@@ -164,6 +173,14 @@ def bpftool_prog_list_wait(expected=0, n_retry=20):
         time.sleep(0.05)
     raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
 
+def bpftool_map_list_wait(expected=0, n_retry=20):
+    for i in range(n_retry):
+        nmaps = len(bpftool_map_list())
+        if nmaps == expected:
+            return
+        time.sleep(0.05)
+    raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps))
+
 def ip(args, force=False, JSON=True, ns="", fail=True):
     if force:
         args = "-force " + args
@@ -193,6 +210,26 @@ def mknetns(n_retry=10):
             return name
     return None
 
+def int2str(fmt, val):
+    ret = []
+    for b in struct.pack(fmt, val):
+        ret.append(int(b))
+    return " ".join(map(lambda x: str(x), ret))
+
+def str2int(strtab):
+    inttab = []
+    for i in strtab:
+        inttab.append(int(i, 16))
+    ba = bytearray(inttab)
+    if len(strtab) == 4:
+        fmt = "I"
+    elif len(strtab) == 8:
+        fmt = "Q"
+    else:
+        raise Exception("String array of len %d can't be unpacked to an int" %
+                        (len(strtab)))
+    return struct.unpack(fmt, ba)[0]
+
 class DebugfsDir:
     """
     Class for accessing DebugFS directories as a dictionary.
@@ -311,13 +348,13 @@ class NetdevSim:
         return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu),
                   fail=fail)
 
-    def set_xdp(self, bpf, mode, force=False, fail=True):
+    def set_xdp(self, bpf, mode, force=False, JSON=True, fail=True):
         return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf),
-                  force=force, fail=fail)
+                  force=force, JSON=JSON, fail=fail)
 
-    def unset_xdp(self, mode, force=False, fail=True):
+    def unset_xdp(self, mode, force=False, JSON=True, fail=True):
         return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode),
-                  force=force, fail=fail)
+                  force=force, JSON=JSON, fail=fail)
 
     def ip_link_show(self, xdp):
         _, link = ip("link show dev %s" % (self['ifname']))
@@ -390,12 +427,16 @@ class NetdevSim:
 
 ################################################################################
 def clean_up():
+    global files, netns, devs
+
     for dev in devs:
         dev.remove()
     for f in files:
         cmd("rm -f %s" % (f))
     for ns in netns:
         cmd("ip netns delete %s" % (ns))
+    files = []
+    netns = []
 
 def pin_prog(file_name, idx=0):
     progs = bpftool_prog_list(expected=(idx + 1))
@@ -405,16 +446,31 @@ def pin_prog(file_name, idx=0):
 
     return file_name, bpf_pinned(file_name)
 
-def check_dev_info(other_ns, ns, pin_file=None, removed=False):
-    if removed:
-        bpftool_prog_list(expected=0)
-        ret, err = bpftool("prog show pin %s" % (pin_file), fail=False)
-        fail(ret == 0, "Showing prog with removed device did not fail")
-        fail(err["error"].find("No such device") == -1,
-             "Showing prog with removed device expected ENODEV, error is %s" %
-             (err["error"]))
-        return
-    progs = bpftool_prog_list(expected=int(not removed), ns=ns)
+def pin_map(file_name, idx=0, expected=1):
+    maps = bpftool_map_list(expected=expected)
+    m = maps[idx]
+    bpftool("map pin id %d %s" % (m["id"], file_name))
+    files.append(file_name)
+
+    return file_name, bpf_pinned(file_name)
+
+def check_dev_info_removed(prog_file=None, map_file=None):
+    bpftool_prog_list(expected=0)
+    ret, err = bpftool("prog show pin %s" % (prog_file), fail=False)
+    fail(ret == 0, "Showing prog with removed device did not fail")
+    fail(err["error"].find("No such device") == -1,
+         "Showing prog with removed device expected ENODEV, error is %s" %
+         (err["error"]))
+
+    bpftool_map_list(expected=0)
+    ret, err = bpftool("map show pin %s" % (map_file), fail=False)
+    fail(ret == 0, "Showing map with removed device did not fail")
+    fail(err["error"].find("No such device") == -1,
+         "Showing map with removed device expected ENODEV, error is %s" %
+         (err["error"]))
+
+def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False):
+    progs = bpftool_prog_list(expected=1, ns=ns)
     prog = progs[0]
 
     fail("dev" not in prog.keys(), "Device parameters not reported")
@@ -423,16 +479,17 @@ def check_dev_info(other_ns, ns, pin_file=None, removed=False):
     fail("ns_dev" not in dev.keys(), "Device parameters not reported")
     fail("ns_inode" not in dev.keys(), "Device parameters not reported")
 
-    if not removed and not other_ns:
+    if not other_ns:
         fail("ifname" not in dev.keys(), "Ifname not reported")
         fail(dev["ifname"] != sim["ifname"],
              "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"]))
     else:
         fail("ifname" in dev.keys(), "Ifname is reported for other ns")
-        if removed:
-            fail(dev["ifindex"] != 0, "Device perameters not zero on removed")
-            fail(dev["ns_dev"] != 0, "Device perameters not zero on removed")
-            fail(dev["ns_inode"] != 0, "Device perameters not zero on removed")
+
+    maps = bpftool_map_list(expected=2, ns=ns)
+    for m in maps:
+        fail("dev" not in m.keys(), "Device parameters not reported")
+        fail(dev != m["dev"], "Map's device different than program's")
 
 # Parse command line
 parser = argparse.ArgumentParser()
@@ -464,7 +521,7 @@ if out.find("/sys/kernel/debug type debugfs") == -1:
     cmd("mount -t debugfs none /sys/kernel/debug")
 
 # Check samples are compiled
-samples = ["sample_ret0.o"]
+samples = ["sample_ret0.o", "sample_map_ret0.o"]
 for s in samples:
     ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False)
     skip(ret != 0, "sample %s/%s not found, please compile it" %
@@ -739,8 +796,9 @@ try:
     bpftool_prog_list_wait(expected=0)
 
     sim = NetdevSim()
-    sim.set_ethtool_tc_offloads(True)
-    sim.set_xdp(obj, "offload")
+    map_obj = bpf_obj("sample_map_ret0.o")
+    start_test("Test loading program with maps...")
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
 
     start_test("Test bpftool bound info reporting (own ns)...")
     check_dev_info(False, "")
@@ -757,11 +815,111 @@ try:
     sim.set_ns("")
     check_dev_info(False, "")
 
-    pin_file, _ = pin_prog("/sys/fs/bpf/tmp")
+    prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog")
+    map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2)
     sim.remove()
 
     start_test("Test bpftool bound info reporting (removed dev)...")
-    check_dev_info(True, "", pin_file=pin_file, removed=True)
+    check_dev_info_removed(prog_file=prog_file, map_file=map_file)
+
+    # Remove all pinned files and reinstantiate the netdev
+    clean_up()
+    bpftool_prog_list_wait(expected=0)
+
+    sim = NetdevSim()
+
+    start_test("Test map update (no flags)...")
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+    maps = bpftool_map_list(expected=2)
+    array = maps[0] if maps[0]["type"] == "array" else maps[1]
+    htab = maps[0] if maps[0]["type"] == "hash" else maps[1]
+    for m in maps:
+        for i in range(2):
+            bpftool("map update id %d key %s value %s" %
+                    (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+    for m in maps:
+        ret, _ = bpftool("map update id %d key %s value %s" %
+                         (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+                         fail=False)
+        fail(ret == 0, "added too many entries")
+
+    start_test("Test map update (exists)...")
+    for m in maps:
+        for i in range(2):
+            bpftool("map update id %d key %s value %s exist" %
+                    (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+    for m in maps:
+        ret, err = bpftool("map update id %d key %s value %s exist" %
+                           (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+                           fail=False)
+        fail(ret == 0, "updated non-existing key")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map update (noexist)...")
+    for m in maps:
+        for i in range(2):
+            ret, err = bpftool("map update id %d key %s value %s noexist" %
+                               (m["id"], int2str("I", i), int2str("Q", i * 3)),
+                               fail=False)
+        fail(ret == 0, "updated existing key")
+        fail(err["error"].find("File exists") == -1,
+             "expected EEXIST, error is '%s'" % (err["error"]))
+
+    start_test("Test map dump...")
+    for m in maps:
+        _, entries = bpftool("map dump id %d" % (m["id"]))
+        for i in range(2):
+            key = str2int(entries[i]["key"])
+            fail(key != i, "expected key %d, got %d" % (key, i))
+            val = str2int(entries[i]["value"])
+            fail(val != i * 3, "expected value %d, got %d" % (val, i * 3))
+
+    start_test("Test map getnext...")
+    for m in maps:
+        _, entry = bpftool("map getnext id %d" % (m["id"]))
+        key = str2int(entry["next_key"])
+        fail(key != 0, "next key %d, expected %d" % (key, 0))
+        _, entry = bpftool("map getnext id %d key %s" %
+                           (m["id"], int2str("I", 0)))
+        key = str2int(entry["next_key"])
+        fail(key != 1, "next key %d, expected %d" % (key, 1))
+        ret, err = bpftool("map getnext id %d key %s" %
+                           (m["id"], int2str("I", 1)), fail=False)
+        fail(ret == 0, "got next key past the end of map")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map delete (htab)...")
+    for i in range(2):
+        bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i)))
+
+    start_test("Test map delete (array)...")
+    for i in range(2):
+        ret, err = bpftool("map delete id %d key %s" %
+                           (htab["id"], int2str("I", i)), fail=False)
+        fail(ret == 0, "removed entry from an array")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map remove...")
+    sim.unset_xdp("offload")
+    bpftool_map_list_wait(expected=0)
+    sim.remove()
+
+    sim = NetdevSim()
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+    sim.remove()
+    bpftool_map_list_wait(expected=0)
+
+    start_test("Test map creation fail path...")
+    sim = NetdevSim()
+    sim.dfs["bpf_map_accept"] = "N"
+    ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False)
+    fail(ret == 0,
+         "netdevsim didn't refuse to create a map with offload disabled")
 
     print("%s: OK" % (os.path.basename(__file__)))
 
-- 
cgit v1.1


From ca027a1c45e30d89c5cc6dcacbdcea74e1ff65fc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:32 -0800
Subject: nfp: bpf: add short busy wait for FW replies

Scheduling out and in for every FW message can slow us down
unnecessarily.  Our experiments show that even under heavy load
the FW responds to 99.9% messages within 200 us.  Add a short
busy wait before entering the wait queue.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index 71e6586..80d3aa0 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -157,7 +157,14 @@ nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type,
 			int tag)
 {
 	struct sk_buff *skb;
-	int err;
+	int i, err;
+
+	for (i = 0; i < 50; i++) {
+		udelay(4);
+		skb = nfp_bpf_reply(bpf, tag);
+		if (skb)
+			return skb;
+	}
 
 	err = wait_event_interruptible_timeout(bpf->cmsg_wq,
 					       skb = nfp_bpf_reply(bpf, tag),
-- 
cgit v1.1


From a55aaf6db0587e1a6e79dce8ada0e237d6068afb Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 19 Jan 2018 14:17:45 +0000
Subject: bpftool: recognize BPF_MAP_TYPE_CPUMAP maps

Add BPF_MAP_TYPE_CPUMAP map type to the list
of map type recognized by bpftool and define
corresponding text representation.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Quentin Monnet <quentin.monnet@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a152c1a..f95fa67 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -66,6 +66,7 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_HASH_OF_MAPS]	= "hash_of_maps",
 	[BPF_MAP_TYPE_DEVMAP]		= "devmap",
 	[BPF_MAP_TYPE_SOCKMAP]		= "sockmap",
+	[BPF_MAP_TYPE_CPUMAP]		= "cpumap",
 };
 
 static unsigned int get_possible_cpus(void)
-- 
cgit v1.1


From b7bcc0bbb8acb640258bb451f1f9391737da48b1 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Thu, 18 Jan 2018 17:36:24 -0700
Subject: selftests: bpf: update .gitignore with missing generated files

Update .gitignore with missing generated files.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/.gitignore | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 541d9d7..1e09d77 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -3,3 +3,10 @@ test_maps
 test_lru_map
 test_lpm_map
 test_tag
+FEATURE-DUMP.libbpf
+fixdep
+test_align
+test_dev_cgroup
+test_progs
+test_verifier_log
+feature
-- 
cgit v1.1


From b471f2f1de8b816f1e799b80aa92588f3566e4bd Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 18 Jan 2018 15:08:50 -0800
Subject: bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map

Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY
command. This command is handy when users want to enumerate
keys. Otherwise, a different map which supports key
enumeration may be required to store the keys. If the
map data is sparse and all map data are to be deleted without
closing file descriptor, using MAP_GET_NEXT_KEY to find
all keys is much faster than enumerating all key space.

This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map.
If user provided key pointer is NULL or the key does not have
an exact match in the trie, the first key will be returned.
Otherwise, the next key will be returned.

In this implemenation, key enumeration follows a postorder
traversal of internal trie. More specific keys
will be returned first than less specific ones, given
a sequence of MAP_GET_NEXT_KEY syscalls.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 584e022..d7ea962 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -591,9 +591,100 @@ unlock:
 	raw_spin_unlock(&trie->lock);
 }
 
-static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
-	return -ENOTSUPP;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+	struct lpm_trie_node *node, *next_node = NULL, *parent;
+	struct lpm_trie_node **node_stack = NULL;
+	struct lpm_trie_node __rcu **root;
+	int err = 0, stack_ptr = -1;
+	unsigned int next_bit;
+	size_t matchlen;
+
+	/* The get_next_key follows postorder. For the 4 node example in
+	 * the top of this file, the trie_get_next_key() returns the following
+	 * one after another:
+	 *   192.168.0.0/24
+	 *   192.168.1.0/24
+	 *   192.168.128.0/24
+	 *   192.168.0.0/16
+	 *
+	 * The idea is to return more specific keys before less specific ones.
+	 */
+
+	/* Empty trie */
+	if (!rcu_dereference(trie->root))
+		return -ENOENT;
+
+	/* For invalid key, find the leftmost node in the trie */
+	if (!key || key->prefixlen > trie->max_prefixlen) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
+			     GFP_USER | __GFP_NOWARN);
+	if (!node_stack)
+		return -ENOMEM;
+
+	/* Try to find the exact node for the given key */
+	for (node = rcu_dereference(trie->root); node;) {
+		node_stack[++stack_ptr] = node;
+		matchlen = longest_prefix_match(trie, node, key);
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	/* The node with the exactly-matching key has been found,
+	 * find the first node in postorder after the matched node.
+	 */
+	node = node_stack[stack_ptr];
+	while (stack_ptr > 0) {
+		parent = node_stack[stack_ptr - 1];
+		if (rcu_dereference(parent->child[0]) == node &&
+		    rcu_dereference(parent->child[1])) {
+			root = &parent->child[1];
+			goto find_leftmost;
+		}
+		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+			next_node = parent;
+			goto do_copy;
+		}
+
+		node = parent;
+		stack_ptr--;
+	}
+
+	/* did not find anything */
+	err = -ENOENT;
+	goto free_stack;
+
+find_leftmost:
+	/* Find the leftmost non-intermediate node, all intermediate nodes
+	 * have exact two children, so this function will never return NULL.
+	 */
+	for (node = rcu_dereference(*root); node;) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			next_node = node;
+		node = rcu_dereference(node->child[0]);
+	}
+do_copy:
+	next_key->prefixlen = next_node->prefixlen;
+	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+	       next_node->data, trie->data_size);
+free_stack:
+	kfree(node_stack);
+	return err;
 }
 
 const struct bpf_map_ops trie_map_ops = {
-- 
cgit v1.1


From 8c417dc15f9522672795981dcb63d9099ca6bd8c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 18 Jan 2018 15:08:51 -0800
Subject: tools/bpf: add a testcase for MAP_GET_NEXT_KEY command of LPM_TRIE
 map

A test case is added in tools/testing/selftests/bpf/test_lpm_map.c
for MAP_GET_NEXT_KEY command. A four node trie, which
is described in kernel/bpf/lpm_trie.c, is built and the
MAP_GET_NEXT_KEY results are checked.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_lpm_map.c | 122 +++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
index f614806..0815108 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -521,6 +521,126 @@ static void test_lpm_delete(void)
 	close(map_fd);
 }
 
+static void test_lpm_get_next_key(void)
+{
+	struct bpf_lpm_trie_key *key_p, *next_key_p;
+	size_t key_size;
+	__u32 value = 0;
+	int map_fd;
+
+	key_size = sizeof(*key_p) + sizeof(__u32);
+	key_p = alloca(key_size);
+	next_key_p = alloca(key_size);
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value),
+				100, BPF_F_NO_PREALLOC);
+	assert(map_fd >= 0);
+
+	/* empty tree. get_next_key should return ENOENT */
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* get and verify the first key, get the second one should fail. */
+	key_p->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 16 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168);
+
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* no exact matching key should get the first one in post order. */
+	key_p->prefixlen = 8;
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 16 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168);
+
+	/* add one more element (total two) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* Add one more element (total three) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* Add one more element (total four) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 1);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* no exact matching key should return the first one in post order */
+	key_p->prefixlen = 22;
+	inet_pton(AF_INET, "192.168.1.0", key_p->data);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 0);
+
+	close(map_fd);
+}
+
 int main(void)
 {
 	struct rlimit limit  = { RLIM_INFINITY, RLIM_INFINITY };
@@ -545,6 +665,8 @@ int main(void)
 
 	test_lpm_delete();
 
+	test_lpm_get_next_key();
+
 	printf("test_lpm: OK\n");
 	return 0;
 }
-- 
cgit v1.1


From 417f1d9f217922d822b64e8323458d7d03a12d4f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 19 Jan 2018 17:15:50 +0100
Subject: samples/bpf: xdp_monitor include cpumap tracepoints in monitoring

The xdp_redirect_cpu sample have some "builtin" monitoring of the
tracepoints for xdp_cpumap_*, but it is practical to have an external
tool that can monitor these transpoint as an easy way to troubleshoot
an application using XDP + cpumap.

Specifically I need such external tool when working on Suricata and
XDP cpumap redirect. Extend the xdp_monitor tool sample with
monitoring of these xdp_cpumap_* tracepoints.  Model the output format
like xdp_redirect_cpu.

Given I needed to handle per CPU decoding for cpumap, this patch also
add per CPU info on the existing monitor events.  This resembles part
of the builtin monitoring output from sample xdp_rxq_info.  Thus, also
covering part of that sample in an external monitoring tool.

Performance wise, the cpumap tracepoints uses bulking, which cause
them to have very little overhead.  Thus, they are enabled by default.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdp_monitor_kern.c |  94 +++++++++-
 samples/bpf/xdp_monitor_user.c | 416 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 443 insertions(+), 67 deletions(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index c969141..211db8d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -1,6 +1,7 @@
-/* XDP monitor tool, based on tracepoints
+/* SPDX-License-Identifier: GPL-2.0
+ *  Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
  *
- *  Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * XDP monitor tool, based on tracepoints
  */
 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
@@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx)
 
 	return 0;
 }
+
+/* Common stats data record shared with _user.c */
+struct datarec {
+	u64 processed;
+	u64 dropped;
+	u64 info;
+};
+#define MAX_CPUS 64
+
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= MAX_CPUS,
+};
+
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int to_cpu;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+	u32 to_cpu = ctx->to_cpu;
+	struct datarec *rec;
+
+	if (to_cpu >= MAX_CPUS)
+		return 1;
+
+	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	if (ctx->processed > 0)
+		rec->info += 1;
+
+	return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int sched;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Count times kthread yielded CPU via schedule call */
+	if (ctx->sched)
+		rec->info++;
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index eaba165..eec1452 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -1,4 +1,5 @@
-/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 static const char *__doc__=
  "XDP monitor tool, based on tracepoints\n"
@@ -40,6 +41,9 @@ static const struct option long_options[] = {
 	{0, 0, NULL,  0 }
 };
 
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM	5
+
 static void usage(char *argv[])
 {
 	int i;
@@ -108,23 +112,93 @@ static const char *action2str(int action)
 	return NULL;
 }
 
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 dropped;
+	__u64 info;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
 struct record {
-	__u64 counter;
 	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+struct u64rec {
+	__u64 processed;
+};
+struct record_u64 {
+	/* record for _kern side __u64 values */
+	__u64 timestamp;
+	struct u64rec total;
+	struct u64rec *cpu;
 };
 
 struct stats_record {
-	struct record xdp_redir[REDIR_RES_MAX];
-	struct record xdp_exception[XDP_ACTION_MAX];
+	struct record_u64 xdp_redirect[REDIR_RES_MAX];
+	struct record_u64 xdp_exception[XDP_ACTION_MAX];
+	struct record xdp_cpumap_kthread;
+	struct record xdp_cpumap_enqueue[MAX_CPUS];
 };
 
-static void stats_print_headers(bool err_only)
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
 {
-	if (err_only)
-		printf("\n%s\n", __doc_err_only__);
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_processed = 0;
+	__u64 sum_dropped = 0;
+	__u64 sum_info = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
 
-	printf("%-14s %-11s %-10s %-18s %-9s\n",
-	       "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].dropped = values[i].dropped;
+		sum_dropped        += values[i].dropped;
+		rec->cpu[i].info = values[i].info;
+		sum_info        += values[i].info;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.dropped   = sum_dropped;
+	rec->total.info      = sum_info;
+	return true;
+}
+
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct u64rec values[nr_cpus];
+	__u64 sum_total = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_total            += values[i].processed;
+	}
+	rec->total.processed = sum_total;
+	return true;
 }
 
 static double calc_period(struct record *r, struct record *p)
@@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p)
 	return period_;
 }
 
-static double calc_pps(struct record *r, struct record *p, double period)
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->dropped - p->dropped;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
 {
 	__u64 packets = 0;
 	double pps = 0;
 
 	if (period > 0) {
-		packets = r->counter - p->counter;
+		packets = r->info - p->info;
 		pps = packets / period;
 	}
 	return pps;
 }
 
-static void stats_print(struct stats_record *rec,
-			struct stats_record *prev,
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
 			bool err_only)
 {
-	double period = 0, pps = 0;
-	struct record *r, *p;
-	int i = 0;
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	int rec_i = 0, i, to_cpu;
+	double t = 0, pps = 0;
 
-	char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+	/* Header */
+	printf("%-15s %-7s %-12s %-12s %-9s\n",
+	       "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
 
 	/* tracepoint: xdp:xdp_redirect_* */
 	if (err_only)
-		i = REDIR_ERROR;
-
-	for (; i < REDIR_RES_MAX; i++) {
-		r = &rec->xdp_redir[i];
-		p = &prev->xdp_redir[i];
-
-		if (p->timestamp) {
-			period = calc_period(r, p);
-			pps = calc_pps(r, p, period);
+		rec_i = REDIR_ERROR;
+
+	for (; rec_i < REDIR_RES_MAX; rec_i++) {
+		struct record_u64 *rec, *prev;
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+		rec  =  &stats_rec->xdp_redirect[rec_i];
+		prev = &stats_prev->xdp_redirect[rec_i];
+		t = calc_period_u64(rec, prev);
+
+		for (i = 0; i < nr_cpus; i++) {
+			struct u64rec *r = &rec->cpu[i];
+			struct u64rec *p = &prev->cpu[i];
+
+			pps = calc_pps_u64(r, p, t);
+			if (pps > 0)
+				printf(fmt1, "XDP_REDIRECT", i,
+				       rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+				       err2str(rec_i));
 		}
-		printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+		pps = calc_pps_u64(&rec->total, &prev->total, t);
+		printf(fmt2, "XDP_REDIRECT", "total",
+		       rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
 	}
 
 	/* tracepoint: xdp:xdp_exception */
-	for (i = 0; i < XDP_ACTION_MAX; i++) {
-		r = &rec->xdp_exception[i];
-		p = &prev->xdp_exception[i];
-		if (p->timestamp) {
-			period = calc_period(r, p);
-			pps = calc_pps(r, p, period);
+	for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+		struct record_u64 *rec, *prev;
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+		rec  =  &stats_rec->xdp_exception[rec_i];
+		prev = &stats_prev->xdp_exception[rec_i];
+		t = calc_period_u64(rec, prev);
+
+		for (i = 0; i < nr_cpus; i++) {
+			struct u64rec *r = &rec->cpu[i];
+			struct u64rec *p = &prev->cpu[i];
+
+			pps = calc_pps_u64(r, p, t);
+			if (pps > 0)
+				printf(fmt1, "Exception", i,
+				       0.0, pps, err2str(rec_i));
 		}
+		pps = calc_pps_u64(&rec->total, &prev->total, t);
 		if (pps > 0)
-			printf(fmt, action2str(i), "Exception",
-			       pps, pps, period);
+			printf(fmt2, "Exception", "total",
+			       0.0, pps, action2str(rec_i));
 	}
-	printf("\n");
-}
 
-static __u64 get_key32_value64_percpu(int fd, __u32 key)
-{
-	/* For percpu maps, userspace gets a value per possible CPU */
-	unsigned int nr_cpus = bpf_num_possible_cpus();
-	__u64 values[nr_cpus];
-	__u64 sum = 0;
-	int i;
-
-	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-		fprintf(stderr,
-			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-		return 0;
+	/* cpumap enqueue stats */
+	for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+		char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		char *info_str = "";
+		double drop, info;
+
+		rec  =  &stats_rec->xdp_cpumap_enqueue[to_cpu];
+		prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				info_str = "bulk-average";
+				info = pps / info; /* calc average bulk size */
+			}
+			if (pps > 0)
+				printf(fmt1, "cpumap-enqueue",
+				       i, to_cpu, pps, drop, info, info_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		if (pps > 0) {
+			drop = calc_drop(&rec->total, &prev->total, t);
+			info = calc_info(&rec->total, &prev->total, t);
+			if (info > 0) {
+				info_str = "bulk-average";
+				info = pps / info; /* calc average bulk size */
+			}
+			printf(fmt2, "cpumap-enqueue",
+			       "sum", to_cpu, pps, drop, info, info_str);
+		}
 	}
 
-	/* Sum values from each CPU */
-	for (i = 0; i < nr_cpus; i++) {
-		sum += values[i];
+	/* cpumap kthread stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_cpumap_kthread;
+		prev = &stats_prev->xdp_cpumap_kthread;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0)
+				i_str = "sched";
+			if (pps > 0)
+				printf(fmt1, "cpumap-kthread",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0)
+			i_str = "sched-sum";
+		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
-	return sum;
+
+	printf("\n");
 }
 
 static bool stats_collect(struct stats_record *rec)
@@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec)
 	 */
 
 	fd = map_data[0].fd; /* map0: redirect_err_cnt */
-	for (i = 0; i < REDIR_RES_MAX; i++) {
-		rec->xdp_redir[i].timestamp = gettime();
-		rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
-	}
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
 
 	fd = map_data[1].fd; /* map1: exception_cnt */
 	for (i = 0; i < XDP_ACTION_MAX; i++) {
-		rec->xdp_exception[i].timestamp = gettime();
-		rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+		map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
 	}
 
+	fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+	for (i = 0; i < MAX_CPUS; i++)
+		map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
 	return true;
 }
 
+static void *alloc_rec_per_cpu(int record_size)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	void *array;
+	size_t size;
+
+	size = record_size * nr_cpus;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+	struct stats_record *rec;
+	int rec_sz;
+	int i;
+
+	/* Alloc main stats_record structure */
+	rec = malloc(sizeof(*rec));
+	memset(rec, 0, sizeof(*rec));
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+
+	/* Alloc stats stored per CPU for each record */
+	rec_sz = sizeof(struct u64rec);
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	rec_sz = sizeof(struct datarec);
+	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+
+	for (i = 0; i < MAX_CPUS; i++)
+		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+	int i;
+
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		free(r->xdp_redirect[i].cpu);
+
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		free(r->xdp_exception[i].cpu);
+
+	free(r->xdp_cpumap_kthread.cpu);
+
+	for (i = 0; i < MAX_CPUS; i++)
+		free(r->xdp_cpumap_enqueue[i].cpu);
+
+	free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
 static void stats_poll(int interval, bool err_only)
 {
-	struct stats_record rec, prev;
+	struct stats_record *rec, *prev;
 
-	memset(&rec, 0, sizeof(rec));
+	rec  = alloc_stats_record();
+	prev = alloc_stats_record();
+	stats_collect(rec);
+
+	if (err_only)
+		printf("\n%s\n", __doc_err_only__);
 
 	/* Trick to pretty printf with thousands separators use %' */
 	setlocale(LC_NUMERIC, "en_US");
@@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only)
 	fflush(stdout);
 
 	while (1) {
-		memcpy(&prev, &rec, sizeof(rec));
-		stats_collect(&rec);
-		stats_print_headers(err_only);
-		stats_print(&rec, &prev, err_only);
+		swap(&prev, &rec);
+		stats_collect(rec);
+		stats_print(rec, prev, err_only);
 		fflush(stdout);
 		sleep(interval);
 	}
+
+	free_stats_record(rec);
+	free_stats_record(prev);
 }
 
 static void print_bpf_prog_info(void)
-- 
cgit v1.1


From 901334159419afc8c1b8556243fc53e9617472d2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:29 +0100
Subject: bpf, verifier: detect misconfigured mem, size argument pair

I've seen two patch proposals now for helper additions that used
ARG_PTR_TO_MEM or similar in reg_X but no corresponding ARG_CONST_SIZE
in reg_X+1. Verifier won't complain in such case, but it will omit
verifying the memory passed to the helper thus ending up badly.
Detect such buggy helper function signature and bail out during
verification rather than finding them through review.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 79 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 24 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e7a43e..5eeb200 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1837,6 +1837,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+{
+	return type == ARG_PTR_TO_MEM ||
+	       type == ARG_PTR_TO_MEM_OR_NULL ||
+	       type == ARG_PTR_TO_UNINIT_MEM;
+}
+
+static bool arg_type_is_mem_size(enum bpf_arg_type type)
+{
+	return type == ARG_CONST_SIZE ||
+	       type == ARG_CONST_SIZE_OR_ZERO;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
@@ -1886,9 +1899,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_MEM ||
-		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
-		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
+	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a SCALAR_VALUE type. Final test
@@ -1949,25 +1960,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
 						   false, NULL);
-	} else if (arg_type == ARG_CONST_SIZE ||
-		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-		/* bpf_xxx(..., buf, len) call will access 'len' bytes
-		 * from stack pointer 'buf'. Check it
-		 * note: regno == len, regno - 1 == buf
-		 */
-		if (regno == 0) {
-			/* kernel subsystem misconfigured verifier */
-			verbose(env,
-				"ARG_CONST_SIZE cannot be first argument\n");
-			return -EACCES;
-		}
-
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
-
 		if (!tnum_is_const(reg->var_off))
 			/* For unprivileged variable accesses, disable raw
 			 * mode so that the program is required to
@@ -2111,7 +2109,7 @@ error:
 	return -EINVAL;
 }
 
-static int check_raw_mode(const struct bpf_func_proto *fn)
+static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
@@ -2126,7 +2124,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
-	return count > 1 ? -EINVAL : 0;
+	/* We only support one arg being in raw mode at the moment,
+	 * which is sufficient for the helper functions we have
+	 * right now.
+	 */
+	return count <= 1;
+}
+
+static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
+				    enum bpf_arg_type arg_next)
+{
+	return (arg_type_is_mem_ptr(arg_curr) &&
+	        !arg_type_is_mem_size(arg_next)) ||
+	       (!arg_type_is_mem_ptr(arg_curr) &&
+		arg_type_is_mem_size(arg_next));
+}
+
+static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
+{
+	/* bpf_xxx(..., buf, len) call will access 'len'
+	 * bytes from memory 'buf'. Both arg types need
+	 * to be paired, so make sure there's no buggy
+	 * helper function specification.
+	 */
+	if (arg_type_is_mem_size(fn->arg1_type) ||
+	    arg_type_is_mem_ptr(fn->arg5_type)  ||
+	    check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
+	    check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
+	    check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
+	    check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+		return false;
+
+	return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn)
+{
+	return check_raw_mode_ok(fn) &&
+	       check_arg_pair_ok(fn) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2282,7 +2317,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	if (env->ops->get_func_proto)
 		fn = env->ops->get_func_proto(func_id);
-
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
 			func_id);
@@ -2306,10 +2340,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	/* We only support one arg being in raw mode at the moment, which
-	 * is sufficient for the helper functions we have right now.
-	 */
-	err = check_raw_mode(fn);
+	err = check_func_proto(fn);
 	if (err) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
-- 
cgit v1.1


From 205c380778d09291668da5267057a80385f89437 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:30 +0100
Subject: bpf: add csum_diff helper to xdp as well

Useful for porting cls_bpf programs w/o increasing program
complexity limits much at the same time, so add the helper
to XDP as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 30fafaa..e517885 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3456,6 +3456,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
 	case BPF_FUNC_xdp_adjust_meta:
-- 
cgit v1.1


From fcd1c9177195489c40198d2769649439dd88505b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:31 +0100
Subject: bpf: add couple of test cases for signed extended imms

Add a couple of test cases for interpreter and JIT that are
related to an issue we faced some time ago in Cilium [1],
which is fixed in LLVM with commit e53750e1e086 ("bpf: fix
bug on silently truncating 64-bit immediate").

Test cases were run-time checking kernel to behave as intended
which should also provide some guidance for current or new
JITs in case they should trip over this. Added for cBPF and
eBPF.

  [1] https://github.com/cilium/cilium/pull/2162

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 lib/test_bpf.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index f369889..e3938e3 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6109,6 +6109,110 @@ static struct bpf_test tests[] = {
 		{ { ETH_HLEN, 42 } },
 		.fill_helper = bpf_fill_ld_abs_vlan_push_pop2,
 	},
+	/* Checking interpreter vs JIT wrt signed extended imms. */
+	{
+		"JNE signed compare, test 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12),
+			BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000),
+			BPF_MOV64_REG(R2, R1),
+			BPF_ALU64_REG(BPF_AND, R2, R3),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_IMM(BPF_JNE, R2, -17104896, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JNE signed compare, test 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12),
+			BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000),
+			BPF_MOV64_REG(R2, R1),
+			BPF_ALU64_REG(BPF_AND, R2, R3),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_IMM(BPF_JNE, R2, 0xfefb0000, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JNE signed compare, test 3",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12),
+			BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000),
+			BPF_ALU32_IMM(BPF_MOV, R4, 0xfefb0000),
+			BPF_MOV64_REG(R2, R1),
+			BPF_ALU64_REG(BPF_AND, R2, R3),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_REG(BPF_JNE, R2, R4, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"JNE signed compare, test 4",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, -17104896),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_IMM(BPF_JNE, R1, -17104896, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"JNE signed compare, test 5",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0xfefb0000),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_IMM(BPF_JNE, R1, 0xfefb0000, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JNE signed compare, test 6",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0x7efb0000),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_JMP_IMM(BPF_JNE, R1, 0x7efb0000, 1),
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"JNE signed compare, test 7",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffff0000),
+			BPF_STMT(BPF_MISC | BPF_TAX, 0),
+			BPF_STMT(BPF_LD | BPF_IMM, 0xfefbbc12),
+			BPF_STMT(BPF_ALU | BPF_AND | BPF_X, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xfefb0000, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 1),
+			BPF_STMT(BPF_RET | BPF_K, 2),
+		},
+		CLASSIC | FLAG_NO_DATA,
+		{},
+		{ { 0, 2 } },
+	},
 };
 
 static struct net_device dev;
-- 
cgit v1.1


From 87c1793b1b7f34915e9e64cdb503efb281c769a7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:32 +0100
Subject: bpf: add couple of test cases for div/mod by zero

Add couple of missing test cases for eBPF div/mod by zero to the
new test_verifier prog runtime feature. Also one for an empty prog
and only exit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 87 +++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 6c22edb..efca10d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -102,6 +102,93 @@ static struct bpf_test tests[] = {
 		.retval = -3,
 	},
 	{
+		"DIV32 by 0, zero check 1",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV32 by 0, zero check 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV64 by 0, zero check",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"MOD32 by 0, zero check 1",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"MOD32 by 0, zero check 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"MOD64 by 0, zero check",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_2, 1),
+			BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"empty prog",
+		.insns = {
+		},
+		.errstr = "last insn is not an exit or jmp",
+		.result = REJECT,
+	},
+	{
+		"only exit insn",
+		.insns = {
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 !read_ok",
+		.result = REJECT,
+	},
+	{
 		"unreachable",
 		.insns = {
 			BPF_EXIT_INSN(),
-- 
cgit v1.1


From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:33 +0100
Subject: bpf: get rid of pure_initcall dependency to enable jits

Having a pure_initcall() callback just to permanently enable BPF
JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave
a small race window in future where JIT is still disabled on boot.
Since we know about the setting at compilation time anyway, just
initialize it properly there. Also consolidate all the individual
bpf_jit_enable variables into a single one and move them under one
location. Moreover, don't allow for setting unspecified garbage
values on them.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c         |  2 --
 arch/arm64/net/bpf_jit_comp.c     |  2 --
 arch/mips/net/bpf_jit.c           |  2 --
 arch/mips/net/ebpf_jit.c          |  2 --
 arch/powerpc/net/bpf_jit_comp.c   |  2 --
 arch/powerpc/net/bpf_jit_comp64.c |  2 --
 arch/s390/net/bpf_jit_comp.c      |  2 --
 arch/sparc/net/bpf_jit_comp_32.c  |  2 --
 arch/sparc/net/bpf_jit_comp_64.c  |  2 --
 arch/x86/net/bpf_jit_comp.c       |  2 --
 kernel/bpf/core.c                 | 19 ++++++++++++-------
 net/core/sysctl_net_core.c        | 18 ++++++++++++------
 net/socket.c                      |  9 ---------
 13 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 4425189..a15e7cd 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -25,8 +25,6 @@
 
 #include "bpf_jit_32.h"
 
-int bpf_jit_enable __read_mostly;
-
 #define STACK_OFFSET(k)	(k)
 #define TMP_REG_1	(MAX_BPF_JIT_REG + 0)	/* TEMP Register 1 */
 #define TMP_REG_2	(MAX_BPF_JIT_REG + 1)	/* TEMP Register 2 */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index acaa935e..8d456ee 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -31,8 +31,6 @@
 
 #include "bpf_jit.h"
 
-int bpf_jit_enable __read_mostly;
-
 #define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 #define TCALL_CNT (MAX_BPF_JIT_REG + 2)
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 44b9250..4d8cb9b 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1207,8 +1207,6 @@ jmp_cmp:
 	return 0;
 }
 
-int bpf_jit_enable __read_mostly;
-
 void bpf_jit_compile(struct bpf_prog *fp)
 {
 	struct jit_ctx ctx;
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 97069a1..4e34703 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx)
 		(ctx->idx * 4) - 4;
 }
 
-int bpf_jit_enable __read_mostly;
-
 enum which_ebpf_reg {
 	src_reg,
 	src_reg_no_fp,
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index f9941b3..872d1f6 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -18,8 +18,6 @@
 
 #include "bpf_jit32.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline void bpf_flush_icache(void *start, void *end)
 {
 	smp_wmb();
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 6771c63..217a78e 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -21,8 +21,6 @@
 
 #include "bpf_jit64.h"
 
-int bpf_jit_enable __read_mostly;
-
 static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
 {
 	memset32(area, BREAKPOINT_INSTRUCTION, size/4);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 1dfadbd..e501887 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -28,8 +28,6 @@
 #include <asm/set_memory.h>
 #include "bpf_jit.h"
 
-int bpf_jit_enable __read_mostly;
-
 struct bpf_jit {
 	u32 seen;		/* Flags to remember seen eBPF instructions */
 	u32 seen_reg[16];	/* Array to remember which registers are used */
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index 09e318e..3bd8ca9 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -11,8 +11,6 @@
 
 #include "bpf_jit_32.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 635fdef..50a24d7 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -12,8 +12,6 @@
 
 #include "bpf_jit_64.h"
 
-int bpf_jit_enable __read_mostly;
-
 static inline bool is_simm13(unsigned int value)
 {
 	return value + 0x1000 < 0x2000;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 87f214f..b881a97 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -15,8 +15,6 @@
 #include <asm/set_memory.h>
 #include <linux/bpf.h>
 
-int bpf_jit_enable __read_mostly;
-
 /*
  * assembly code in arch/x86/net/bpf_jit.S
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25e723b..bc9c5b1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;
 
-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
 }
 
 #else
-static unsigned int __bpf_prog_ret0(const void *ctx,
-				    const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
 {
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
 	return 0;
 }
 #endif
@@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 #else
-	fp->bpf_func = __bpf_prog_ret0;
+	fp->bpf_func = __bpf_prog_ret0_warn;
 #endif
 
 	/* eBPF JITs can rewrite the program in case constant
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a47ad6c..6d39b4c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -25,6 +25,7 @@
 
 static int zero = 0;
 static int one = 1;
+static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
@@ -325,13 +326,14 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-		.proc_handler	= proc_dointvec
-#else
 		.proc_handler	= proc_dointvec_minmax,
+# ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
-#endif
+# else
+		.extra1		= &zero,
+		.extra2		= &two,
+# endif
 	},
 # ifdef CONFIG_HAVE_EBPF_JIT
 	{
@@ -339,14 +341,18 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "bpf_jit_kallsyms",
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
 	},
 # endif
 #endif
diff --git a/net/socket.c b/net/socket.c
index fbfae1e..1536515b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2613,15 +2613,6 @@ out_fs:
 
 core_initcall(sock_init);	/* early initcall */
 
-static int __init jit_init(void)
-{
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
-	bpf_jit_enable = 1;
-#endif
-	return 0;
-}
-pure_initcall(jit_init);
-
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-- 
cgit v1.1


From 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:34 +0100
Subject: bpf: restrict access to core bpf sysctls

Given BPF reaches far beyond just networking these days, it was
never intended to allow setting and in some cases reading those
knobs out of a user namespace root running without CAP_SYS_ADMIN,
thus tighten such access.

Also the bpf_jit_enable = 2 debugging mode should only be allowed
if kptr_restrict is not set since it otherwise can leak addresses
to the kernel log. Dump a note to the kernel log that this is for
debugging JITs only when enabled.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/sysctl_net_core.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6d39b4c..f2d0462 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -251,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
+#ifdef CONFIG_BPF_JIT
+static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+					   void __user *buffer, size_t *lenp,
+					   loff_t *ppos)
+{
+	int ret, jit_enable = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &jit_enable;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		if (jit_enable < 2 ||
+		    (jit_enable == 2 && bpf_dump_raw_ok())) {
+			*(int *)table->data = jit_enable;
+			if (jit_enable == 2)
+				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
+		} else {
+			ret = -EPERM;
+		}
+	}
+	return ret;
+}
+
+# ifdef CONFIG_HAVE_EBPF_JIT
+static int
+proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+				    void __user *buffer, size_t *lenp,
+				    loff_t *ppos)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+# endif
+#endif
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -326,7 +366,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_enable,
 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		.extra1		= &one,
 		.extra2		= &one,
@@ -341,7 +381,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_harden,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
@@ -350,7 +390,7 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_kallsyms,
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-- 
cgit v1.1


From de0a444ddae61b77683c27cd390e518e33858d20 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:35 +0100
Subject: bpf, x86: small optimization in alu ops with imm

For the BPF_REG_0 (BPF_REG_A in cBPF, respectively), we can use
the short form of the opcode as dst mapping is on eax/rax and
thus save a byte per such operation. Added to add/sub/and/or/xor
for 32/64 bit when K immediate is used. There may be more such
low-hanging fruit to add in future as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b881a97..5acee51 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -152,6 +152,11 @@ static bool is_ereg(u32 reg)
 			     BIT(BPF_REG_AX));
 }
 
+static bool is_axreg(u32 reg)
+{
+	return reg == BPF_REG_0;
+}
+
 /* add modifiers if 'reg' maps to x64 registers r8..r15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
@@ -445,16 +450,36 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));
 
+			/* b3 holds 'normal' opcode, b2 short form only valid
+			 * in case dst is eax/rax.
+			 */
 			switch (BPF_OP(insn->code)) {
-			case BPF_ADD: b3 = 0xC0; break;
-			case BPF_SUB: b3 = 0xE8; break;
-			case BPF_AND: b3 = 0xE0; break;
-			case BPF_OR: b3 = 0xC8; break;
-			case BPF_XOR: b3 = 0xF0; break;
+			case BPF_ADD:
+				b3 = 0xC0;
+				b2 = 0x05;
+				break;
+			case BPF_SUB:
+				b3 = 0xE8;
+				b2 = 0x2D;
+				break;
+			case BPF_AND:
+				b3 = 0xE0;
+				b2 = 0x25;
+				break;
+			case BPF_OR:
+				b3 = 0xC8;
+				b2 = 0x0D;
+				break;
+			case BPF_XOR:
+				b3 = 0xF0;
+				b2 = 0x35;
+				break;
 			}
 
 			if (is_imm8(imm32))
 				EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+			else if (is_axreg(dst_reg))
+				EMIT1_off32(b2, imm32);
 			else
 				EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
 			break;
-- 
cgit v1.1


From 4bd95f4b99e921f51783bfddcd9738e9d3eef2b5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:36 +0100
Subject: bpf: add upper complexity limit to verifier log

Given the limit could potentially get further adjustments in the
future, add it to the log so it becomes obvious what the current
limit is w/o having to check the source first. This may also be
helpful for debugging complexity related issues on kernels that
backport from upstream.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5eeb200..caae495 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4810,7 +4810,8 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
-	verbose(env, "processed %d insns, stack depth ", insn_processed);
+	verbose(env, "processed %d insns (limit %d), stack depth ",
+		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
 	for (i = 0; i < env->subprog_cnt + 1; i++) {
 		u32 depth = env->subprog_stack_depth[i];
 
-- 
cgit v1.1


From 1728a4f2ad6840746a6b1b9f01d652c5842f7e8d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Jan 2018 01:24:37 +0100
Subject: bpf: move event_output to const_size_or_zero for xdp/skb as well

Similar rationale as in a60dd35d2e39 ("bpf: change bpf_perf_event_output
arg5 type to ARG_CONST_SIZE_OR_ZERO"), change the type to CONST_SIZE_OR_ZERO
such that we can better deal with optimized code. No changes needed in
bpf_event_output() as it can also deal with 0 size entirely (e.g. as only
wake-up signal with empty frame in perf RB, or packet dumps w/o meta data
as another such possibility).

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index e517885..9b9b70d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2861,7 +2861,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -3150,7 +3150,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
-- 
cgit v1.1