summaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to 'samples')
-rw-r--r--samples/bpf/Makefile17
-rw-r--r--samples/bpf/README.rst10
-rw-r--r--samples/bpf/bpf_load.c2
-rw-r--r--samples/bpf/cgroup_helpers.c4
-rw-r--r--samples/bpf/map_perf_test_kern.c2
-rw-r--r--samples/bpf/map_perf_test_user.c3
-rw-r--r--samples/bpf/syscall_tp_user.c66
-rw-r--r--samples/bpf/tcp_basertt_kern.c78
-rw-r--r--samples/bpf/tcp_bbf.readme26
-rw-r--r--samples/bpf/test_cgrp2_attach2.c224
-rw-r--r--samples/bpf/trace_event_kern.c10
-rw-r--r--samples/bpf/trace_event_user.c13
-rw-r--r--samples/bpf/tracex6_kern.c26
-rw-r--r--samples/bpf/tracex6_user.c13
-rw-r--r--samples/bpf/xdp1_user.c8
-rw-r--r--samples/bpf/xdp_monitor_kern.c60
-rw-r--r--samples/bpf/xdp_monitor_user.c123
-rw-r--r--samples/bpf/xdp_redirect_cpu_kern.c609
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c697
-rw-r--r--samples/bpf/xdp_redirect_map_user.c7
20 files changed, 1925 insertions, 73 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index cf17c79..ea2b9e6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example
hostprogs-y += load_sock_ops
hostprogs-y += xdp_redirect
hostprogs-y += xdp_redirect_map
+hostprogs-y += xdp_redirect_cpu
hostprogs-y += xdp_monitor
hostprogs-y += syscall_tp
@@ -84,6 +85,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
@@ -127,8 +129,10 @@ always += tcp_bufs_kern.o
always += tcp_cong_kern.o
always += tcp_iw_kern.o
always += tcp_clamp_kern.o
+always += tcp_basertt_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
+always += xdp_redirect_cpu_kern.o
always += xdp_monitor_kern.o
always += syscall_tp_kern.o
@@ -169,6 +173,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
HOSTLOADLIBES_test_map_in_map += -lelf
HOSTLOADLIBES_xdp_redirect += -lelf
HOSTLOADLIBES_xdp_redirect_map += -lelf
+HOSTLOADLIBES_xdp_redirect_cpu += -lelf
HOSTLOADLIBES_xdp_monitor += -lelf
HOSTLOADLIBES_syscall_tp += -lelf
@@ -177,6 +182,12 @@ HOSTLOADLIBES_syscall_tp += -lelf
LLC ?= llc
CLANG ?= clang
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+HOSTCC = $(CROSS_COMPILE)gcc
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
# Trick to allow make to be run from this directory
all:
$(MAKE) -C ../../ $(CURDIR)/
@@ -224,9 +235,9 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/%.o: $(src)/%.c
$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
-I$(srctree)/tools/testing/selftests/bpf/ \
- -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
- -Wno-compare-distinct-pointer-types \
+ -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
- -Wno-unknown-warning-option \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
index 79f9a58..5f27e4f 100644
--- a/samples/bpf/README.rst
+++ b/samples/bpf/README.rst
@@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or
'clang' command via redefining LLC or CLANG on the make command line::
make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
+environment variables before calling make. This will direct make to build
+samples for the cross target.
+
+export ARCH=arm64
+export CROSS_COMPILE="aarch64-linux-gnu-"
+make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 6aa5009..18b1c8d 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -221,6 +221,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
+ maps[i].name,
maps[i].def.key_size,
inner_map_fd,
maps[i].def.max_entries,
@@ -228,6 +229,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
numa_node);
} else {
map_fd[i] = bpf_create_map_node(maps[i].def.type,
+ maps[i].name,
maps[i].def.key_size,
maps[i].def.value_size,
maps[i].def.max_entries,
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
index 9d1be94..88bdcf4 100644
--- a/samples/bpf/cgroup_helpers.c
+++ b/samples/bpf/cgroup_helpers.c
@@ -56,7 +56,7 @@ int setup_cgroup_environment(void)
return 1;
}
- if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
+ if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) {
log_err("mount cgroup2");
return 1;
}
@@ -163,7 +163,7 @@ int create_and_get_cgroup(char *path)
format_cgroup_path(cgroup_path, path);
if (mkdir(cgroup_path, 0777) && errno != EEXIST) {
- log_err("mkdiring cgroup");
+ log_err("mkdiring cgroup %s .. %s", path, cgroup_path);
return 0;
}
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 098c857..2b2ffb9 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
return 0;
}
-SEC("kprobe/sys_getpgrp")
+SEC("kprobe/sys_getppid")
int stress_array_map_lookup(struct pt_regs *ctx)
{
u32 key = 1, i;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index f388254..519d9af 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu)
inner_lru_map_fds[cpu] =
bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH,
+ test_map_names[INNER_LRU_HASH_PREALLOC],
sizeof(uint32_t),
sizeof(long),
inner_lru_hash_size, 0,
@@ -282,7 +283,7 @@ static void test_array_lookup(int cpu)
start_time = time_get_ns();
for (i = 0; i < max_cnt; i++)
- syscall(__NR_getpgrp, 0);
+ syscall(__NR_getppid, 0);
printf("%d:array_lookup %lld lookups per sec\n",
cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
}
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
index a3cb91e..9169d32 100644
--- a/samples/bpf/syscall_tp_user.c
+++ b/samples/bpf/syscall_tp_user.c
@@ -23,6 +23,13 @@
* This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
*/
+static void usage(const char *cmd)
+{
+ printf("USAGE: %s [-i num_progs] [-h]\n", cmd);
+ printf(" -i num_progs # number of progs of the test\n");
+ printf(" -h # help\n");
+}
+
static void verify_map(int map_id)
{
__u32 key = 0;
@@ -32,22 +39,29 @@ static void verify_map(int map_id)
fprintf(stderr, "map_lookup failed: %s\n", strerror(errno));
return;
}
- if (val == 0)
+ if (val == 0) {
fprintf(stderr, "failed: map #%d returns value 0\n", map_id);
+ return;
+ }
+ val = 0;
+ if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) {
+ fprintf(stderr, "map_update failed: %s\n", strerror(errno));
+ return;
+ }
}
-int main(int argc, char **argv)
+static int test(char *filename, int num_progs)
{
- struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
- char filename[256];
- int fd;
+ int i, fd, map0_fds[num_progs], map1_fds[num_progs];
- setrlimit(RLIMIT_MEMLOCK, &r);
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
- if (load_bpf_file(filename)) {
- fprintf(stderr, "%s", bpf_log_buf);
- return 1;
+ for (i = 0; i < num_progs; i++) {
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "%s", bpf_log_buf);
+ return 1;
+ }
+ printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]);
+ map0_fds[i] = map_fd[0];
+ map1_fds[i] = map_fd[1];
}
/* current load_bpf_file has perf_event_open default pid = -1
@@ -64,8 +78,34 @@ int main(int argc, char **argv)
close(fd);
/* verify the map */
- verify_map(map_fd[0]);
- verify_map(map_fd[1]);
+ for (i = 0; i < num_progs; i++) {
+ verify_map(map0_fds[i]);
+ verify_map(map1_fds[i]);
+ }
return 0;
}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int opt, num_progs = 1;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, "i:h")) != -1) {
+ switch (opt) {
+ case 'i':
+ num_progs = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ return 0;
+ }
+ }
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ return test(filename, num_progs);
+}
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
new file mode 100644
index 0000000..4bf4fc5
--- /dev/null
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set base_rtt to 80us when host is running TCP-NV and
+ * both hosts are in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_basertt(struct bpf_sock_ops *skops)
+{
+ char cong[20];
+ char nv[] = "nv";
+ int rv = 0, n;
+ int op;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check if both hosts are in the same datacenter. For this
+ * example they are if the 1st 5.5 bytes in the IPv6 address
+ * are the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_BASE_RTT:
+ n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) {
+ /* Set base_rtt to 80us */
+ rv = 80;
+ } else if (n) {
+ rv = n;
+ } else {
+ rv = -1;
+ }
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_bbf.readme b/samples/bpf/tcp_bbf.readme
new file mode 100644
index 0000000..831fb60
--- /dev/null
+++ b/samples/bpf/tcp_bbf.readme
@@ -0,0 +1,26 @@
+This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops)
+programs. These programs attach to a cgroupv2. The following commands create
+a cgroupv2 and attach a bash shell to the group.
+
+ mkdir -p /tmp/cgroupv2
+ mount -t cgroup2 none /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2/foo
+ bash
+ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+
+Anything that runs under this shell belongs to the foo cgroupv2 To load
+(attach) one of the tcp_*_kern.o programs:
+
+ ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o
+
+If the "-l" flag is used, the load_sock_ops program will continue to run
+printing the BPF log buffer. The tcp_*_kern.o programs use special print
+functions to print logging information (if enabled by the ifdef).
+
+If using netperf/netserver to create traffic, you need to run them under the
+cgroupv2 to which the BPF programs are attached (i.e. under bash shell
+attached to the cgroupv2).
+
+To remove (unattach) a socket_ops BPF program from a cgroupv2:
+
+ ./load_sock_ops -r /tmp/cgroupv2/foo
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 3049b1f2..3e8232c 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -30,7 +30,7 @@
#define FOO "/foo"
#define BAR "/foo/bar/"
-#define PING_CMD "ping -c1 -w1 127.0.0.1"
+#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null"
char bpf_log_buf[BPF_LOG_BUF_SIZE];
@@ -55,8 +55,7 @@ static int prog_load(int verdict)
return ret;
}
-
-int main(int argc, char **argv)
+static int test_foo_bar(void)
{
int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
@@ -189,8 +188,223 @@ out:
close(bar);
cleanup_cgroup_environment();
if (!rc)
- printf("PASS\n");
+ printf("### override:PASS\n");
else
- printf("FAIL\n");
+ printf("### override:FAIL\n");
return rc;
}
+
+static int map_fd = -1;
+
+static int prog_load_cnt(int verdict, int val)
+{
+ if (map_fd < 0)
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ struct bpf_insn prog[] = {
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+ int ret;
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ if (ret < 0) {
+ log_err("Loading program");
+ printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+ return 0;
+ }
+ return ret;
+}
+
+
+static int test_multiprog(void)
+{
+ __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
+ int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0;
+ int drop_prog, allow_prog[6] = {}, rc = 0;
+ unsigned long long value;
+ int i = 0;
+
+ for (i = 0; i < 6; i++) {
+ allow_prog[i] = prog_load_cnt(1, 1 << i);
+ if (!allow_prog[i])
+ goto err;
+ }
+ drop_prog = prog_load_cnt(0, 1);
+ if (!drop_prog)
+ goto err;
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ cg1 = create_and_get_cgroup("/cg1");
+ if (!cg1)
+ goto err;
+ cg2 = create_and_get_cgroup("/cg1/cg2");
+ if (!cg2)
+ goto err;
+ cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
+ if (!cg3)
+ goto err;
+ cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
+ if (!cg4)
+ goto err;
+ cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
+ if (!cg5)
+ goto err;
+
+ if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
+ goto err;
+
+ if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+ log_err("Attaching prog to cg1");
+ goto err;
+ }
+ if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+ log_err("Unexpected success attaching the same prog to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+ log_err("Attaching prog2 to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) {
+ log_err("Attaching prog to cg2");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) {
+ log_err("Attaching prog to cg3");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) {
+ log_err("Attaching prog to cg4");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) {
+ log_err("Attaching prog to cg5");
+ goto err;
+ }
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 32);
+
+ /* query the number of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ NULL, NULL, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ /* retrieve prog_ids of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ assert(attach_flags == 0);
+ saved_prog_id = prog_ids[0];
+ /* check enospc handling */
+ prog_ids[0] = 0;
+ prog_cnt = 2;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == -1 &&
+ errno == ENOSPC);
+ assert(prog_cnt == 4);
+ /* check that prog_ids are returned even when buffer is too small */
+ assert(prog_ids[0] == saved_prog_id);
+ /* retrieve prog_id of single attached prog in cg5 */
+ prog_ids[0] = 0;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 1);
+ assert(prog_ids[0] == saved_prog_id);
+
+ /* detach bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg5");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 16);
+
+ /* detach 3rd from bottom program and ping again */
+ errno = 0;
+ if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Unexpected success on detach from cg3");
+ goto err;
+ }
+ if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching from cg3");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 16);
+
+ /* detach 2nd from bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg4");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 4);
+
+ prog_cnt = 4;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 3);
+ assert(attach_flags == 0);
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 0);
+ goto out;
+err:
+ rc = 1;
+
+out:
+ for (i = 0; i < 6; i++)
+ if (allow_prog[i] > 0)
+ close(allow_prog[i]);
+ close(cg1);
+ close(cg2);
+ close(cg3);
+ close(cg4);
+ close(cg5);
+ cleanup_cgroup_environment();
+ if (!rc)
+ printf("### multi:PASS\n");
+ else
+ printf("### multi:FAIL\n");
+ return rc;
+}
+
+int main(int argc, char **argv)
+{
+ int rc = 0;
+
+ rc = test_foo_bar();
+ if (rc)
+ return rc;
+
+ return test_multiprog();
+}
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index 41b6115..a77a583d 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = {
SEC("perf_event")
int bpf_prog1(struct bpf_perf_event_data *ctx)
{
+ char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+ char time_fmt2[] = "Get Time Failed, ErrCode: %d";
char fmt[] = "CPU-%d period %lld ip %llx";
u32 cpu = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value value_buf;
struct key_t key;
u64 *val, one = 1;
+ int ret;
if (ctx->sample_period < 10000)
/* ignore warmup */
@@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
return 0;
}
+ ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+ if (!ret)
+ bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+ else
+ bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
val = bpf_map_lookup_elem(&counts, &key);
if (val)
(*val)++;
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 7bd827b..bf4f1b6 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
int *pmu_fd = malloc(nr_cpus * sizeof(int));
int i, error = 0;
+ /* system wide perf event, no need to inherit */
+ attr->inherit = 0;
+
/* open perf_event on all cpus */
for (i = 0; i < nr_cpus; i++) {
pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
@@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr)
{
int pmu_fd;
+ /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
+ * Enabling inherit will cause bpf_perf_prog_read_time helper failure.
+ */
+ attr->inherit = 1;
+
/* open task bound event */
pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
if (pmu_fd < 0) {
@@ -175,14 +183,12 @@ static void test_bpf_perf_event(void)
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
- .inherit = 1,
};
struct perf_event_attr attr_type_sw = {
.sample_freq = SAMPLE_FREQ,
.freq = 1,
.type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_CPU_CLOCK,
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_l1d = {
.sample_freq = SAMPLE_FREQ,
@@ -192,7 +198,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_L1D |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_branch_miss = {
.sample_freq = SAMPLE_FREQ,
@@ -202,7 +207,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_BPU |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_type_raw = {
.sample_freq = SAMPLE_FREQ,
@@ -210,7 +214,6 @@ static void test_bpf_perf_event(void)
.type = PERF_TYPE_RAW,
/* Intel Instruction Retired */
.config = 0xc0,
- .inherit = 1,
};
printf("Test HW_CPU_CYCLES\n");
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
index e7d1803..46c557a 100644
--- a/samples/bpf/tracex6_kern.c
+++ b/samples/bpf/tracex6_kern.c
@@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = {
.value_size = sizeof(u64),
.max_entries = 64,
};
+struct bpf_map_def SEC("maps") values2 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_perf_event_value),
+ .max_entries = 64,
+};
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)
@@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx)
return 0;
}
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value *val, buf;
+ int error;
+
+ error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+ if (error)
+ return 0;
+
+ val = bpf_map_lookup_elem(&values2, &key);
+ if (val)
+ *val = buf;
+ else
+ bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index a05a99a..3341a96 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -22,6 +22,7 @@
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
+ struct bpf_perf_event_value value2;
int pmu_fd, error = 0;
cpu_set_t set;
__u64 value;
@@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr)
fprintf(stderr, "Value missing for CPU %d\n", cpu);
error = 1;
goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+ }
+ /* The above bpf_map_lookup_elem should trigger the second kprobe */
+ if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+ fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+ value2.counter, value2.enabled, value2.running);
}
- fprintf(stderr, "CPU %d: %llu\n", cpu, value);
on_exit:
assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 2431c03..fdaefe9 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -14,6 +14,7 @@
#include <string.h>
#include <unistd.h>
#include <libgen.h>
+#include <sys/resource.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -69,6 +70,7 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
const char *optstr = "SN";
char filename[256];
int opt;
@@ -91,6 +93,12 @@ int main(int argc, char **argv)
usage(basename(argv[0]));
return 1;
}
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
ifindex = strtoul(argv[optind], NULL, 0);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 74f3fd8..2fe2f76 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -13,23 +13,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
/* TODO: have entries for all possible errno's */
};
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = XDP_UNKNOWN + 1,
+};
+
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
* Code in: kernel/include/trace/events/xdp.h
*/
struct xdp_redirect_ctx {
- unsigned short common_type; // offset:0; size:2; signed:0;
- unsigned char common_flags; // offset:2; size:1; signed:0;
- unsigned char common_preempt_count;// offset:3; size:1; signed:0;
- int common_pid; // offset:4; size:4; signed:1;
-
- int prog_id; // offset:8; size:4; signed:1;
- u32 act; // offset:12 size:4; signed:0;
- int ifindex; // offset:16 size:4; signed:1;
- int err; // offset:20 size:4; signed:1;
- int to_ifindex; // offset:24 size:4; signed:1;
- u32 map_id; // offset:28 size:4; signed:0;
- int map_index; // offset:32 size:4; signed:1;
-}; // offset:36
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
enum {
XDP_REDIRECT_SUCCESS = 0,
@@ -48,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key);
if (!cnt)
- return 0;
+ return 1;
*cnt += 1;
return 0; /* Indicate event was filtered (no further processing)*/
@@ -86,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
{
return xdp_redirect_collect_stat(ctx);
}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ u64 *cnt;;
+ u32 key;
+
+ key = ctx->act;
+ if (key > XDP_REDIRECT)
+ key = XDP_UNKNOWN;
+
+ cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!cnt)
+ return 1;
+ *cnt += 1;
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index b51b4f5..eaba165 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -20,6 +20,7 @@ static const char *__doc_err_only__=
#include <unistd.h>
#include <locale.h>
+#include <sys/resource.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
@@ -61,7 +62,7 @@ static void usage(char *argv[])
}
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
-__u64 gettime(void)
+static __u64 gettime(void)
{
struct timespec t;
int res;
@@ -89,6 +90,23 @@ static const char *err2str(int err)
return redir_names[err];
return NULL;
}
+/* enum xdp_action */
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+ [XDP_REDIRECT] = "XDP_REDIRECT",
+ [XDP_UNKNOWN] = "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+ if (action < XDP_ACTION_MAX)
+ return xdp_action_names[action];
+ return NULL;
+}
struct record {
__u64 counter;
@@ -97,6 +115,7 @@ struct record {
struct stats_record {
struct record xdp_redir[REDIR_RES_MAX];
+ struct record xdp_exception[XDP_ACTION_MAX];
};
static void stats_print_headers(bool err_only)
@@ -104,39 +123,72 @@ static void stats_print_headers(bool err_only)
if (err_only)
printf("\n%s\n", __doc_err_only__);
- printf("%-14s %-10s %-18s %-9s\n",
- "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period");
+ printf("%-14s %-11s %-10s %-18s %-9s\n",
+ "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static double calc_pps(struct record *r, struct record *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->counter - p->counter;
+ pps = packets / period;
+ }
+ return pps;
}
static void stats_print(struct stats_record *rec,
struct stats_record *prev,
bool err_only)
{
+ double period = 0, pps = 0;
+ struct record *r, *p;
int i = 0;
+ char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+
+ /* tracepoint: xdp:xdp_redirect_* */
if (err_only)
i = REDIR_ERROR;
for (; i < REDIR_RES_MAX; i++) {
- struct record *r = &rec->xdp_redir[i];
- struct record *p = &prev->xdp_redir[i];
- __u64 period = 0;
- __u64 packets = 0;
- double pps = 0;
- double period_ = 0;
+ r = &rec->xdp_redir[i];
+ p = &prev->xdp_redir[i];
if (p->timestamp) {
- packets = r->counter - p->counter;
- period = r->timestamp - p->timestamp;
- if (period > 0) {
- period_ = ((double) period / NANOSEC_PER_SEC);
- pps = packets / period_;
- }
+ period = calc_period(r, p);
+ pps = calc_pps(r, p, period);
}
+ printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+ }
- printf("%-14s %-10.0f %'-18.0f %f\n",
- err2str(i), pps, pps, period_);
+ /* tracepoint: xdp:xdp_exception */
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ r = &rec->xdp_exception[i];
+ p = &prev->xdp_exception[i];
+ if (p->timestamp) {
+ period = calc_period(r, p);
+ pps = calc_pps(r, p, period);
+ }
+ if (pps > 0)
+ printf(fmt, action2str(i), "Exception",
+ pps, pps, period);
}
+ printf("\n");
}
static __u64 get_key32_value64_percpu(int fd, __u32 key)
@@ -160,25 +212,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key)
return sum;
}
-static bool stats_collect(int fd, struct stats_record *rec)
+static bool stats_collect(struct stats_record *rec)
{
+ int fd;
int i;
/* TODO: Detect if someone unloaded the perf event_fd's, as
* this can happen by someone running perf-record -e
*/
+ fd = map_data[0].fd; /* map0: redirect_err_cnt */
for (i = 0; i < REDIR_RES_MAX; i++) {
rec->xdp_redir[i].timestamp = gettime();
rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
}
+
+ fd = map_data[1].fd; /* map1: exception_cnt */
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ rec->xdp_exception[i].timestamp = gettime();
+ rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+ }
+
return true;
}
static void stats_poll(int interval, bool err_only)
{
struct stats_record rec, prev;
- int map_fd;
memset(&rec, 0, sizeof(rec));
@@ -190,23 +250,24 @@ static void stats_poll(int interval, bool err_only)
printf("\n%s", __doc__);
/* TODO Need more advanced stats on error types */
- if (verbose)
- printf(" - Stats map: %s\n", map_data[0].name);
- map_fd = map_data[0].fd;
-
- stats_print_headers(err_only);
+ if (verbose) {
+ printf(" - Stats map0: %s\n", map_data[0].name);
+ printf(" - Stats map1: %s\n", map_data[1].name);
+ printf("\n");
+ }
fflush(stdout);
while (1) {
memcpy(&prev, &rec, sizeof(rec));
- stats_collect(map_fd, &rec);
+ stats_collect(&rec);
+ stats_print_headers(err_only);
stats_print(&rec, &prev, err_only);
fflush(stdout);
sleep(interval);
}
}
-void print_bpf_prog_info(void)
+static void print_bpf_prog_info(void)
{
int i;
@@ -235,6 +296,7 @@ void print_bpf_prog_info(void)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
int longindex = 0, opt;
int ret = EXIT_SUCCESS;
char bpf_obj_file[256];
@@ -265,13 +327,18 @@ int main(int argc, char **argv)
}
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return EXIT_FAILURE;
+ }
+
if (load_bpf_file(bpf_obj_file)) {
printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
- return 1;
+ return EXIT_FAILURE;
}
if (!prog_fd[0]) {
printf("ERROR - load_bpf_file: %s\n", strerror(errno));
- return 1;
+ return EXIT_FAILURE;
}
if (debug) {
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c
new file mode 100644
index 0000000..303e9e7
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_kern.c
@@ -0,0 +1,609 @@
+/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
+ *
+ * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/if_vlan.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/udp.h>
+
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define MAX_CPUS 12 /* WARNING - sync with _user.c */
+
+/* Special map type that can XDP_REDIRECT frames to another CPU */
+struct bpf_map_def SEC("maps") cpu_map = {
+ .type = BPF_MAP_TYPE_CPUMAP,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+
+/* Common stats data record to keep userspace more simple */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback. Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rx_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") redirect_err_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 2,
+ /* TODO: have entries for all possible errno's */
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_CPUS,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Set of maps controlling available CPU, and for iterating through
+ * selectable redirect CPUs.
+ */
+struct bpf_map_def SEC("maps") cpus_available = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+struct bpf_map_def SEC("maps") cpus_count = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+struct bpf_map_def SEC("maps") cpus_iterator = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Helper parse functions */
+
+/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
+ *
+ * Returns false on error and non-supported ether-type
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+static __always_inline
+bool parse_eth(struct ethhdr *eth, void *data_end,
+ u16 *eth_proto, u64 *l3_offset)
+{
+ u16 eth_type;
+ u64 offset;
+
+ offset = sizeof(*eth);
+ if ((void *)eth + offset > data_end)
+ return false;
+
+ eth_type = eth->h_proto;
+
+ /* Skip non 802.3 Ethertypes */
+ if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
+ return false;
+
+ /* Handle VLAN tagged packet */
+ if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ offset += sizeof(*vlan_hdr);
+ if ((void *)eth + offset > data_end)
+ return false;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ }
+ /* TODO: Handle double VLAN tagged packet */
+
+ *eth_proto = ntohs(eth_type);
+ *l3_offset = offset;
+ return true;
+}
+
+static __always_inline
+u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+ struct udphdr *udph;
+ u16 dport;
+
+ if (iph + 1 > data_end)
+ return 0;
+ if (!(iph->protocol == IPPROTO_UDP))
+ return 0;
+
+ udph = (void *)(iph + 1);
+ if (udph + 1 > data_end)
+ return 0;
+
+ dport = ntohs(udph->dest);
+ return dport;
+}
+
+static __always_inline
+int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static __always_inline
+int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp_cpu_map0")
+int xdp_prognum0_no_touch(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map1_touch_data")
+int xdp_prognum1_touch_data(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u16 eth_type;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Validate packet length is minimum Eth header size */
+ if (eth + 1 > data_end)
+ return XDP_ABORTED;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ /* Read packet data, and use it (drop non 802.3 Ethertypes) */
+ eth_type = eth->h_proto;
+ if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ rec->dropped++;
+ return XDP_DROP;
+ }
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map2_round_robin")
+int xdp_prognum2_round_robin(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 cpu_dest;
+ u32 *cpu_lookup;
+ u32 key0 = 0;
+
+ u32 *cpu_selected;
+ u32 *cpu_iterator;
+ u32 *cpu_max;
+ u32 cpu_idx;
+
+ cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+ if (!cpu_max)
+ return XDP_ABORTED;
+
+ cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
+ if (!cpu_iterator)
+ return XDP_ABORTED;
+ cpu_idx = *cpu_iterator;
+
+ *cpu_iterator += 1;
+ if (*cpu_iterator == *cpu_max)
+ *cpu_iterator = 0;
+
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key0);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map3_proto_separate")
+int xdp_prognum3_proto_separate(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map4_ddos_filter_pktgen")
+int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u16 dest_port;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ /* DDoS filter UDP port 9 (pktgen) */
+ dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
+ if (dest_port == 9) {
+ if (rec)
+ rec->dropped++;
+ return XDP_DROP;
+ }
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+
+char _license[] SEC("license") = "GPL";
+
+/*** Trace point code ***/
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_redirect_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
+
+enum {
+ XDP_REDIRECT_SUCCESS = 0,
+ XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline
+int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+{
+ u32 key = XDP_REDIRECT_ERROR;
+ struct datarec *rec;
+ int err = ctx->err;
+
+ if (!err)
+ key = XDP_REDIRECT_SUCCESS;
+
+ rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->dropped += 1;
+
+ return 0; /* Indicate event was filtered (no further processing)*/
+ /*
+ * Returning 1 here would allow e.g. a perf-record tracepoint
+ * to see and record these events, but it doesn't work well
+ * in-practice as stopping perf-record also unload this
+ * bpf_prog. Plus, there is additional overhead of doing so.
+ */
+}
+
+SEC("tracepoint/xdp/xdp_redirect_err")
+int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+SEC("tracepoint/xdp/xdp_redirect_map_err")
+int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!rec)
+ return 1;
+ rec->dropped += 1;
+
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int to_cpu; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+ u32 to_cpu = ctx->to_cpu;
+ struct datarec *rec;
+
+ if (to_cpu >= MAX_CPUS)
+ return 1;
+
+ rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ if (ctx->processed > 0)
+ rec->issue += 1;
+
+ /* Inception: It's possible to detect overload situations, via
+ * this tracepoint. This can be used for creating a feedback
+ * loop to XDP, which can take appropriate actions to mitigate
+ * this overload situation.
+ */
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int sched; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Count times kthread yielded CPU via schedule call */
+ if (ctx->sched)
+ rec->issue++;
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
new file mode 100644
index 0000000..35fec9f
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -0,0 +1,697 @@
+/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+static const char *__doc__ =
+ " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#define MAX_CPUS 12 /* WARNING - sync with _kern.c */
+
+/* How many xdp_progs are defined in _kern.c */
+#define MAX_PROG 5
+
+/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead
+ * use bpf/libbpf.h), but cannot as (currently) needed for XDP
+ * attaching to a device via set_link_xdp_fd()
+ */
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK 0
+#define EXIT_FAIL 1
+#define EXIT_FAIL_OPTION 2
+#define EXIT_FAIL_XDP 3
+#define EXIT_FAIL_BPF 4
+#define EXIT_FAIL_MEM 5
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"dev", required_argument, NULL, 'd' },
+ {"skb-mode", no_argument, NULL, 'S' },
+ {"debug", no_argument, NULL, 'D' },
+ {"sec", required_argument, NULL, 's' },
+ {"prognum", required_argument, NULL, 'p' },
+ {"qsize", required_argument, NULL, 'q' },
+ {"cpu", required_argument, NULL, 'c' },
+ {"stress-mode", no_argument, NULL, 'x' },
+ {"no-separators", no_argument, NULL, 'z' },
+ {0, 0, NULL, 0 }
+};
+
+static void int_exit(int sig)
+{
+ fprintf(stderr,
+ "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+ ifindex, ifname);
+ if (ifindex > -1)
+ set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(EXIT_OK);
+}
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf("\n");
+ printf(" Usage: %s (options-see-below)\n", argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf(" short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+}
+
+/* gettime returns the current time of day in nanoseconds.
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
+ * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE)
+ */
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAIL);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct stats_record {
+ struct record rx_cnt;
+ struct record redir_err;
+ struct record kthread;
+ struct record exception;
+ struct record enq[MAX_CPUS];
+};
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_dropped = 0;
+ __u64 sum_issue = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].dropped = values[i].dropped;
+ sum_dropped += values[i].dropped;
+ rec->cpu[i].issue = values[i].issue;
+ sum_issue += values[i].issue;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.dropped = sum_dropped;
+ rec->total.issue = sum_issue;
+ return true;
+}
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec *array;
+ size_t size;
+
+ size = sizeof(struct datarec) * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int i;
+
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+ rec->rx_cnt.cpu = alloc_record_per_cpu();
+ rec->redir_err.cpu = alloc_record_per_cpu();
+ rec->kthread.cpu = alloc_record_per_cpu();
+ rec->exception.cpu = alloc_record_per_cpu();
+ for (i = 0; i < MAX_CPUS; i++)
+ rec->enq[i].cpu = alloc_record_per_cpu();
+
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ int i;
+
+ for (i = 0; i < MAX_CPUS; i++)
+ free(r->enq[i].cpu);
+ free(r->exception.cpu);
+ free(r->kthread.cpu);
+ free(r->redir_err.cpu);
+ free(r->rx_cnt.cpu);
+ free(r);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->dropped - p->dropped;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+ struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->issue - p->issue;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ int prog_num)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ double pps = 0, drop = 0, err = 0;
+ struct record *rec, *prev;
+ int to_cpu;
+ double t;
+ int i;
+
+ /* Header */
+ printf("Running XDP/eBPF prog_num:%d\n", prog_num);
+ printf("%-15s %-7s %-14s %-11s %-9s\n",
+ "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
+
+ /* XDP rx_cnt */
+ {
+ char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
+ char *errstr = "";
+
+ rec = &stats_rec->rx_cnt;
+ prev = &stats_prev->rx_cnt;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ errstr = "cpu-dest/err";
+ if (pps > 0)
+ printf(fmt_rx, "XDP-RX",
+ i, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ printf(fm2_rx, "XDP-RX", "total", pps, drop);
+ }
+
+ /* cpumap enqueue stats */
+ for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+ char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *errstr = "";
+
+ rec = &stats_rec->enq[to_cpu];
+ prev = &stats_prev->enq[to_cpu];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ if (pps > 0)
+ printf(fmt, "cpumap-enqueue",
+ i, to_cpu, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ if (pps > 0) {
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ printf(fm2, "cpumap-enqueue",
+ "sum", to_cpu, pps, drop, err, errstr);
+ }
+ }
+
+ /* cpumap kthread stats */
+ {
+ char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *e_str = "";
+
+ rec = &stats_rec->kthread;
+ prev = &stats_prev->kthread;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ e_str = "sched";
+ if (pps > 0)
+ printf(fmt_k, "cpumap_kthread",
+ i, pps, drop, err, e_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0)
+ e_str = "sched-sum";
+ printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
+ }
+
+ /* XDP redirect err tracepoints (very unlikely) */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->redir_err;
+ prev = &stats_prev->redir_err;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "redirect_err", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "redirect_err", "total", pps, drop);
+ }
+
+ /* XDP general exception tracepoints */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->exception;
+ prev = &stats_prev->exception;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "xdp_exception", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "xdp_exception", "total", pps, drop);
+ }
+
+ printf("\n");
+ fflush(stdout);
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+ int fd, i;
+
+ fd = map_fd[1]; /* map: rx_cnt */
+ map_collect_percpu(fd, 0, &rec->rx_cnt);
+
+ fd = map_fd[2]; /* map: redirect_err_cnt */
+ map_collect_percpu(fd, 1, &rec->redir_err);
+
+ fd = map_fd[3]; /* map: cpumap_enqueue_cnt */
+ for (i = 0; i < MAX_CPUS; i++)
+ map_collect_percpu(fd, i, &rec->enq[i]);
+
+ fd = map_fd[4]; /* map: cpumap_kthread_cnt */
+ map_collect_percpu(fd, 0, &rec->kthread);
+
+ fd = map_fd[8]; /* map: exception_cnt */
+ map_collect_percpu(fd, 0, &rec->exception);
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static int create_cpu_entry(__u32 cpu, __u32 queue_size,
+ __u32 avail_idx, bool new)
+{
+ __u32 curr_cpus_count = 0;
+ __u32 key = 0;
+ int ret;
+
+ /* Add a CPU entry to cpumap, as this allocate a cpu entry in
+ * the kernel for the cpu.
+ */
+ ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0);
+ if (ret) {
+ fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* Inform bpf_prog's that a new CPU is available to select
+ * from via some control maps.
+ */
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Add to avail CPUs failed\n");
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* When not replacing/updating existing entry, bump the count */
+ /* map_fd[6] = cpus_count */
+ ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count);
+ if (ret) {
+ fprintf(stderr, "Failed reading curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ if (new) {
+ curr_cpus_count++;
+ ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0);
+ if (ret) {
+ fprintf(stderr, "Failed write curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+ /* map_fd[7] = cpus_iterator */
+ printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n",
+ new ? "Add-new":"Replace", cpu, avail_idx,
+ queue_size, curr_cpus_count);
+
+ return 0;
+}
+
+/* CPUs are zero-indexed. Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU index'es not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+ __u32 invalid_cpu = MAX_CPUS;
+ int ret, i;
+
+ for (i = 0; i < MAX_CPUS; i++) {
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Failed marking CPU unavailable\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+}
+
+/* Stress cpumap management code by concurrently changing underlying cpumap */
+static void stress_cpumap(void)
+{
+ /* Changing qsize will cause kernel to free and alloc a new
+ * bpf_cpu_map_entry, with an associated/complicated tear-down
+ * procedure.
+ */
+ create_cpu_entry(1, 1024, 0, false);
+ create_cpu_entry(1, 128, 0, false);
+ create_cpu_entry(1, 16000, 0, false);
+}
+
+static void stats_poll(int interval, bool use_separators, int prog_num,
+ bool stress_mode)
+{
+ struct stats_record *record, *prev;
+
+ record = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(record);
+
+ /* Trick to pretty printf with thousands separators use %' */
+ if (use_separators)
+ setlocale(LC_NUMERIC, "en_US");
+
+ while (1) {
+ swap(&prev, &record);
+ stats_collect(record);
+ stats_print(record, prev, prog_num);
+ sleep(interval);
+ if (stress_mode)
+ stress_cpumap();
+ }
+
+ free_stats_record(record);
+ free_stats_record(prev);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ bool use_separators = true;
+ bool stress_mode = false;
+ char filename[256];
+ bool debug = false;
+ int added_cpus = 0;
+ int longindex = 0;
+ int interval = 2;
+ int prog_num = 0;
+ int add_cpu = -1;
+ __u32 qsize;
+ int opt;
+
+ /* Notice: choosing he queue size is very important with the
+ * ixgbe driver, because it's driver page recycling trick is
+ * dependend on pages being returned quickly. The number of
+ * out-standing packets in the system must be less-than 2x
+ * RX-ring size.
+ */
+ qsize = 128+64;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+ return EXIT_FAIL;
+ }
+
+ if (!prog_fd[0]) {
+ fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAIL;
+ }
+
+ mark_cpus_unavailable();
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hSd:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'd':
+ if (strlen(optarg) >= IF_NAMESIZE) {
+ fprintf(stderr, "ERR: --dev name too long\n");
+ goto error;
+ }
+ ifname = (char *)&ifname_buf;
+ strncpy(ifname, optarg, IF_NAMESIZE);
+ ifindex = if_nametoindex(ifname);
+ if (ifindex == 0) {
+ fprintf(stderr,
+ "ERR: --dev name unknown err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'D':
+ debug = true;
+ break;
+ case 'x':
+ stress_mode = true;
+ break;
+ case 'z':
+ use_separators = false;
+ break;
+ case 'p':
+ /* Selecting eBPF prog to load */
+ prog_num = atoi(optarg);
+ if (prog_num < 0 || prog_num >= MAX_PROG) {
+ fprintf(stderr,
+ "--prognum too large err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 'c':
+ /* Add multiple CPUs */
+ add_cpu = strtoul(optarg, NULL, 0);
+ if (add_cpu >= MAX_CPUS) {
+ fprintf(stderr,
+ "--cpu nr too large for cpumap err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ create_cpu_entry(add_cpu, qsize, added_cpus, true);
+ added_cpus++;
+ break;
+ case 'q':
+ qsize = atoi(optarg);
+ break;
+ case 'h':
+ error:
+ default:
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ /* Required option */
+ if (ifindex == -1) {
+ fprintf(stderr, "ERR: required option --dev missing\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ /* Required option */
+ if (add_cpu == -1) {
+ fprintf(stderr, "ERR: required option --cpu missing\n");
+ fprintf(stderr, " Specify multiple --cpu option to add more\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+
+ /* Remove XDP program when program is interrupted */
+ signal(SIGINT, int_exit);
+
+ if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
+ fprintf(stderr, "link set xdp fd failed\n");
+ return EXIT_FAIL_XDP;
+ }
+
+ if (debug) {
+ printf("Debug-mode reading trace pipe (fix #define DEBUG)\n");
+ read_trace_pipe();
+ }
+
+ stats_poll(interval, use_separators, prog_num, stress_mode);
+ return EXIT_OK;
+}
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index d4d86a2..978a532 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -20,6 +20,7 @@
#include <string.h>
#include <unistd.h>
#include <libgen.h>
+#include <sys/resource.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -74,6 +75,7 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
const char *optstr = "SN";
char filename[256];
int ret, opt, key = 0;
@@ -97,6 +99,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
ifindex_in = strtoul(argv[optind], NULL, 0);
ifindex_out = strtoul(argv[optind + 1], NULL, 0);
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
OpenPOWER on IntegriCloud