From 15898a011b3d0390869f31167c4403835bc04954 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:33 +0300 Subject: net: Convert uevent_net_ops uevent_net_init() and uevent_net_exit() create and destroy netlink socket, and these actions serialized in netlink code. Parallel execution with other pernet_operations makes the socket disappear earlier from uevent_sock_list on ->exit. As userspace can't be interested in broadcast messages of dying net, and, as I see, no one in kernel listen them, we may safely make uevent_net_ops async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9fe6ec8..9539d7a 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -650,6 +650,7 @@ found: static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, + .async = true, }; static int __init kobject_uevent_init(void) -- cgit v1.1 From 94e5e3087a67c765be98592b36d8d187566478d5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Mar 2018 13:17:30 +0100 Subject: net: add uevent socket member This commit adds struct uevent_sock to struct net. Since struct uevent_sock records the position of the uevent socket in the uevent socket list we can trivially remove it from the uevent socket list during cleanup. This speeds up the old removal codepath. Note, list_del() will hit __list_del_entry_valid() in its call chain which will validate that the element is a member of the list. If it isn't it will take care that the list is not modified. Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9539d7a..54cfbae 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -32,11 +32,13 @@ u64 uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif -#ifdef CONFIG_NET + struct uevent_sock { struct list_head list; struct sock *sk; }; + +#ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); #endif @@ -621,6 +623,9 @@ static int uevent_net_init(struct net *net) kfree(ue_sk); return -ENODEV; } + + net->uevent_sock = ue_sk; + mutex_lock(&uevent_sock_mutex); list_add_tail(&ue_sk->list, &uevent_sock_list); mutex_unlock(&uevent_sock_mutex); @@ -629,17 +634,9 @@ static int uevent_net_init(struct net *net) static void uevent_net_exit(struct net *net) { - struct uevent_sock *ue_sk; + struct uevent_sock *ue_sk = net->uevent_sock; mutex_lock(&uevent_sock_mutex); - list_for_each_entry(ue_sk, &uevent_sock_list, list) { - if (sock_net(ue_sk->sk) == net) - goto found; - } - mutex_unlock(&uevent_sock_mutex); - return; - -found: list_del(&ue_sk->list); mutex_unlock(&uevent_sock_mutex); -- cgit v1.1 From 692ec06d7c92af8ca841a6367648b9b3045344fd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Mar 2018 13:17:31 +0100 Subject: netns: send uevent messages This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets to allow sending uevent messages into the network namespace the socket belongs to. Currently non-initial network namespaces are already isolated and don't receive uevents. There are a number of cases where it is beneficial for a sufficiently privileged userspace process to send a uevent into a network namespace. One such use case would be debugging and fuzzing of a piece of software which listens and reacts to uevents. By running a copy of that software inside a network namespace, specific uevents could then be presented to it. More concretely, this would allow for easy testing of udevd/ueventd. This will also allow some piece of software to run components inside a separate network namespace and then effectively filter what that software can receive. Some examples of software that do directly listen to uevents and that we have in the past attempted to run inside a network namespace are rbd (CEPH client) or the X server. Implementation: The implementation has been kept as simple as possible from the kernel's perspective. Specifically, a simple input method uevent_net_rcv() is added to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing af_netlink infrastructure and does neither add an additional netlink family nor requires any user-visible changes. For example, by using netlink_rcv_skb() we can make use of existing netlink infrastructure to report back informative error messages to userspace. Furthermore, this implementation does not introduce any overhead for existing uevent generating codepaths. The struct netns got a new uevent socket member that records the uevent socket associated with that network namespace including its position in the uevent socket list. Since we record the uevent socket for each network namespace in struct net we don't have to walk the whole uevent socket list. Instead we can directly retrieve the relevant uevent socket and send the message. At exit time we can now also trivially remove the uevent socket from the uevent socket list. This keeps the codepath very performant without introducing needless overhead and even makes older codepaths faster. Uevent sequence numbers are kept global. When a uevent message is sent to another network namespace the implementation will simply increment the global uevent sequence number and append it to the received uevent. This has the advantage that the kernel will never need to parse the received uevent message to replace any existing uevent sequence numbers. Instead it is up to the userspace process to remove any existing uevent sequence numbers in case the uevent message to be sent contains any. Security: In order for a caller to send uevent messages to a target network namespace the caller must have CAP_SYS_ADMIN in the owning user namespace of the target network namespace. Additionally, any received uevent message is verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space needed to append the uevent sequence number. Testing: This patch has been tested and verified to work with the following udev implementations: 1. CentOS 6 with udevd version 147 2. Debian Sid with systemd-udevd version 237 3. Android 7.1.1 with ueventd Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 54cfbae..fa10ad8 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -604,12 +605,88 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) +static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, + struct netlink_ext_ack *extack) +{ + /* u64 to chars: 2^64 - 1 = 21 chars */ + char buf[sizeof("SEQNUM=") + 21]; + struct sk_buff *skbc; + int ret; + + /* bump and prepare sequence number */ + ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", ++uevent_seqnum); + if (ret < 0 || (size_t)ret >= sizeof(buf)) + return -ENOMEM; + ret++; + + /* verify message does not overflow */ + if ((skb->len + ret) > UEVENT_BUFFER_SIZE) { + NL_SET_ERR_MSG(extack, "uevent message too big"); + return -EINVAL; + } + + /* copy skb and extend to accommodate sequence number */ + skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL); + if (!skbc) + return -ENOMEM; + + /* append sequence number */ + skb_put_data(skbc, buf, ret); + + /* remove msg header */ + skb_pull(skbc, NLMSG_HDRLEN); + + /* set portid 0 to inform userspace message comes from kernel */ + NETLINK_CB(skbc).portid = 0; + NETLINK_CB(skbc).dst_group = 1; + + ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL); + /* ENOBUFS should be handled in userspace */ + if (ret == -ENOBUFS || ret == -ESRCH) + ret = 0; + + return ret; +} + +static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net; + int ret; + + if (!nlmsg_data(nlh)) + return -EINVAL; + + /* + * Verify that we are allowed to send messages to the target + * network namespace. The caller must have CAP_SYS_ADMIN in the + * owning user namespace of the target network namespace. + */ + net = sock_net(NETLINK_CB(skb).sk); + if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) { + NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability"); + return -EPERM; + } + + mutex_lock(&uevent_sock_mutex); + ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); + mutex_unlock(&uevent_sock_mutex); + + return ret; +} + +static void uevent_net_rcv(struct sk_buff *skb) +{ + netlink_rcv_skb(skb, &uevent_net_rcv_skb); +} + static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, - .flags = NL_CFG_F_NONROOT_RECV, + .input = uevent_net_rcv, + .flags = NL_CFG_F_NONROOT_RECV }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); -- cgit v1.1 From 76db8087c4c991dcd17f5ea8ac0eafd0696ab450 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 21 Mar 2018 16:31:04 -0700 Subject: net: bpf: add a test for skb_segment in test_bpf module Without the previous commit, "modprobe test_bpf" will have the following errors: ... [ 98.149165] ------------[ cut here ]------------ [ 98.159362] kernel BUG at net/core/skbuff.c:3667! [ 98.169756] invalid opcode: 0000 [#1] SMP PTI [ 98.179370] Modules linked in: [ 98.179371] test_bpf(+) ... which triggers the bug the previous commit intends to fix. The skbs are constructed to mimic what mlx5 may generate. The packet size/header may not mimic real cases in production. But the processing flow is similar. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- lib/test_bpf.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 3e93354..b2badf6 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6574,6 +6574,93 @@ static bool exclude_test(int test_id) return test_id < test_range[0] || test_id > test_range[1]; } +static __init struct sk_buff *build_test_skb(void) +{ + u32 headroom = NET_SKB_PAD + NET_IP_ALIGN + ETH_HLEN; + struct sk_buff *skb[2]; + struct page *page[2]; + int i, data_size = 8; + + for (i = 0; i < 2; i++) { + page[i] = alloc_page(GFP_KERNEL); + if (!page[i]) { + if (i == 0) + goto err_page0; + else + goto err_page1; + } + + /* this will set skb[i]->head_frag */ + skb[i] = dev_alloc_skb(headroom + data_size); + if (!skb[i]) { + if (i == 0) + goto err_skb0; + else + goto err_skb1; + } + + skb_reserve(skb[i], headroom); + skb_put(skb[i], data_size); + skb[i]->protocol = htons(ETH_P_IP); + skb_reset_network_header(skb[i]); + skb_set_mac_header(skb[i], -ETH_HLEN); + + skb_add_rx_frag(skb[i], 0, page[i], 0, 64, 64); + // skb_headlen(skb[i]): 8, skb[i]->head_frag = 1 + } + + /* setup shinfo */ + skb_shinfo(skb[0])->gso_size = 1448; + skb_shinfo(skb[0])->gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb[0])->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb[0])->gso_segs = 0; + skb_shinfo(skb[0])->frag_list = skb[1]; + + /* adjust skb[0]'s len */ + skb[0]->len += skb[1]->len; + skb[0]->data_len += skb[1]->data_len; + skb[0]->truesize += skb[1]->truesize; + + return skb[0]; + +err_skb1: + __free_page(page[1]); +err_page1: + kfree_skb(skb[0]); +err_skb0: + __free_page(page[0]); +err_page0: + return NULL; +} + +static __init int test_skb_segment(void) +{ + netdev_features_t features; + struct sk_buff *skb, *segs; + int ret = -1; + + features = NETIF_F_SG | NETIF_F_GSO_PARTIAL | NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM; + features |= NETIF_F_RXCSUM; + skb = build_test_skb(); + if (!skb) { + pr_info("%s: failed to build_test_skb", __func__); + goto done; + } + + segs = skb_segment(skb, features); + if (segs) { + kfree_skb_list(segs); + ret = 0; + pr_info("%s: success in skb_segment!", __func__); + } else { + pr_info("%s: failed in skb_segment!", __func__); + } + kfree_skb(skb); +done: + return ret; +} + static __init int test_bpf(void) { int i, err_cnt = 0, pass_cnt = 0; @@ -6632,9 +6719,11 @@ static int __init test_bpf_init(void) return ret; ret = test_bpf(); - destroy_bpf_tests(); - return ret; + if (ret) + return ret; + + return test_skb_segment(); } static void __exit test_bpf_exit(void) -- cgit v1.1 From 2f635ceeb22ba13c307236d69795fbb29cfa3e7c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:13 +0300 Subject: net: Drop pernet_operations::async Synchronous pernet_operations are not allowed anymore. All are asynchronous. So, drop the structure member. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index fa10ad8..15ea216 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -724,7 +724,6 @@ static void uevent_net_exit(struct net *net) static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, - .async = true, }; static int __init kobject_uevent_init(void) -- cgit v1.1 From 99fe29d3a25f813146816b322367b4c6a0fdec84 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 Mar 2018 14:48:36 +0300 Subject: test_bpf: Fix NULL vs IS_ERR() check in test_skb_segment() The skb_segment() function returns error pointers on error. It never returns NULL. Fixes: 76db8087c4c9 ("net: bpf: add a test for skb_segment in test_bpf module") Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Reviewed-by: Yonghong Song Signed-off-by: David S. Miller --- lib/test_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_bpf.c b/lib/test_bpf.c index b2badf6..8e15780 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6649,7 +6649,7 @@ static __init int test_skb_segment(void) } segs = skb_segment(skb, features); - if (segs) { + if (!IS_ERR(segs)) { kfree_skb_list(segs); ret = 0; pr_info("%s: success in skb_segment!", __func__); -- cgit v1.1 From f385178679b6561d2e717567d12e07c7f927ee59 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 30 Mar 2018 09:20:59 +0900 Subject: lib/scatterlist: add sg_init_marker() helper sg_init_marker initializes sg_magic in the sg table and calls sg_mark_end() on the last entry of the table. This can be useful to avoid memset in sg_init_table() when scatterlist is already zeroed out For example: when scatterlist is embedded inside other struct and that container struct is zeroed out Suggested-by: Daniel Borkmann Signed-off-by: Prashant Bhole Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- lib/scatterlist.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 53728d3..06dad7a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -132,14 +132,7 @@ EXPORT_SYMBOL(sg_last); void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); -#ifdef CONFIG_DEBUG_SG - { - unsigned int i; - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; - } -#endif - sg_mark_end(&sgl[nents - 1]); + sg_init_marker(sgl, nents); } EXPORT_SYMBOL(sg_init_table); -- cgit v1.1 From ae6da1f503abb5a5081f9f6c4a6881de97830f3e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:48 -0700 Subject: rhashtable: add schedule points Rehashing and destroying large hash table takes a lot of time, and happens in process context. It is safe to add cond_resched() in rhashtable_rehash_table() and rhashtable_free_and_destroy() Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 47de025..2b2b799 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht) err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; + cond_resched(); } /* Publish the new table pointer. */ @@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; + cond_resched(); for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; -- cgit v1.1