From c05cdb1b864f548c0c3d8ae3b51264e6739a69b1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 3 Jun 2013 09:46:28 +0000 Subject: netlink: allow large data transfers from user-space I can hit ENOBUFS in the sendmsg() path with a large batch that is composed of many netlink messages. Here that limit is 8 MBytes of skbuff data area as kmalloc does not manage to get more than that. While discussing atomic rule-set for nftables with Patrick McHardy, we decided to put all rule-set updates that need to be applied atomically in one single batch to simplify the existing approach. However, as explained above, the existing netlink code limits us to a maximum of ~20000 rules that fit in one single batch without hitting ENOBUFS. iptables does not have such limitation as it is using vmalloc. This patch adds netlink_alloc_large_skb() which is only used in the netlink_sendmsg() path. It uses alloc_skb if the memory requested is <= one memory page, that should be the common case for most subsystems, else vmalloc for higher memory allocations. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d0b3dd6..68c1673 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -750,6 +750,10 @@ static void netlink_skb_destructor(struct sk_buff *skb) skb->head = NULL; } #endif + if (is_vmalloc_addr(skb->head)) { + vfree(skb->head); + skb->head = NULL; + } if (skb->sk != NULL) sock_rfree(skb); } @@ -1420,6 +1424,35 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } +static struct sk_buff *netlink_alloc_large_skb(unsigned int size) +{ + struct sk_buff *skb; + void *data; + + if (size <= NLMSG_GOODSIZE) + return alloc_skb(size, GFP_KERNEL); + + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) + return NULL; + + data = vmalloc(size); + if (data == NULL) + goto err; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + skb->destructor = netlink_skb_destructor; + + return skb; +err: + kfree_skb(skb); + return NULL; +} + /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the @@ -1510,7 +1543,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) return skb; delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) + if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { @@ -2096,7 +2129,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); + skb = netlink_alloc_large_skb(len); if (skb == NULL) goto out; -- cgit v1.1 From da12c90e099789a63073fc82a19542ce54d4efb9 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 6 Jun 2013 14:49:11 +0800 Subject: netlink: Add compare function for netlink_table As we know, netlink sockets are private resource of net namespace, they can communicate with each other only when they in the same net namespace. this works well until we try to add namespace support for other subsystems which use netlink. Don't like ipv4 and route table.., it is not suited to make these subsytems belong to net namespace, Such as audit and crypto subsystems,they are more suitable to user namespace. So we must have the ability to make the netlink sockets in same user namespace can communicate with each other. This patch adds a new function pointer "compare" for netlink_table, we can decide if the netlink sockets can communicate with each other through this netlink_table self-defined compare function. The behavior isn't changed if we don't provide the compare function for netlink_table. Signed-off-by: Gao feng Acked-by: Serge E. Hallyn Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 68c1673..9b6b115 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -858,16 +858,23 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } +static bool netlink_compare(struct net *net, struct sock *sk) +{ + return net_eq(sock_net(sk), net); +} + static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - struct nl_portid_hash *hash = &nl_table[protocol].hash; + struct netlink_table *table = &nl_table[protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *sk; read_lock(&nl_table_lock); head = nl_portid_hashfn(hash, portid); sk_for_each(sk, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->portid == portid)) { + if (table->compare(net, sk) && + (nlk_sk(sk)->portid == portid)) { sock_hold(sk); goto found; } @@ -980,7 +987,8 @@ netlink_update_listeners(struct sock *sk) static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; int err = -EADDRINUSE; struct sock *osk; @@ -990,7 +998,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) head = nl_portid_hashfn(hash, portid); len = 0; sk_for_each(osk, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->portid == portid)) + if (table->compare(net, osk) && + (nlk_sk(osk)->portid == portid)) break; len++; } @@ -1165,6 +1174,7 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].compare = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -1187,7 +1197,8 @@ static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *osk; s32 portid = task_tgid_vnr(current); @@ -1199,7 +1210,7 @@ retry: netlink_table_grab(); head = nl_portid_hashfn(hash, portid); sk_for_each(osk, head) { - if (!net_eq(sock_net(osk), net)) + if (!table->compare(net, osk)) continue; if (nlk_sk(osk)->portid == portid) { /* Bind collision, search negative portid values. */ @@ -2315,9 +2326,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; + nl_table[unit].compare = netlink_compare; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].flags = cfg->flags; + if (cfg->compare) + nl_table[unit].compare = cfg->compare; } nl_table[unit].registered = 1; } else { @@ -2740,6 +2754,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *s; struct nl_seq_iter *iter; + struct net *net; int i, j; ++*pos; @@ -2747,11 +2762,12 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); + net = seq_file_net(seq); iter = seq->private; s = v; do { s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); + } while (s && !nl_table[s->sk_protocol].compare(net, s)); if (s) return s; @@ -2763,7 +2779,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) + + while (s && !nl_table[s->sk_protocol].compare(net, s)) s = sk_next(s); if (s) { iter->link = i; -- cgit v1.1 From ca15febfe98f7c681ac345fc1d2ee1b8decaa493 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 13 Jun 2013 10:05:38 +0800 Subject: netlink: make compare exist all the time Commit da12c90e099789a63073fc82a19542ce54d4efb9 "netlink: Add compare function for netlink_table" only set compare at the time we create kernel netlink, and reset compare to NULL at the time we finially release netlink socket, but netlink_lookup wants the compare exist always. So we should set compare after we allocate nl_table, and never reset it. make comapre exist all the time. Reported-by: Fengguang Wu Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9b6b115..8978755 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1174,7 +1174,6 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; - nl_table[sk->sk_protocol].compare = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -2326,7 +2325,6 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; - nl_table[unit].compare = netlink_compare; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].flags = cfg->flags; @@ -2973,6 +2971,8 @@ static int __init netlink_proto_init(void) hash->shift = 0; hash->mask = 0; hash->rehash_time = jiffies; + + nl_table[i].compare = netlink_compare; } netlink_add_usersock_entry(); -- cgit v1.1 From bcbde0d449eda7afa8f63280b165c8300dbd00e2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jun 2013 19:38:07 +0200 Subject: net: netlink: virtual tap device management Similarly to the networking receive path with ptype_all taps, we add the possibility to register netdevices that are for ARPHRD_NETLINK to the netlink subsystem, so that those can be used for netlink analyzers resp. debuggers. We do not offer a direct callback function as out-of-tree modules could do crap with it. Instead, a netdevice must be registered properly and only receives a clone, managed by the netlink layer. Symbols are exported as GPL-only. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 275d901..6967fbc 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -101,6 +102,9 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static ATOMIC_NOTIFIER_HEAD(netlink_chain); +static DEFINE_SPINLOCK(netlink_tap_lock); +static struct list_head netlink_tap_all __read_mostly; + static inline u32 netlink_group_mask(u32 group) { return group ? 1 << (group - 1) : 0; @@ -111,6 +115,100 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } +int netlink_add_tap(struct netlink_tap *nt) +{ + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) + return -EINVAL; + + spin_lock(&netlink_tap_lock); + list_add_rcu(&nt->list, &netlink_tap_all); + spin_unlock(&netlink_tap_lock); + + if (nt->module) + __module_get(nt->module); + + return 0; +} +EXPORT_SYMBOL_GPL(netlink_add_tap); + +int __netlink_remove_tap(struct netlink_tap *nt) +{ + bool found = false; + struct netlink_tap *tmp; + + spin_lock(&netlink_tap_lock); + + list_for_each_entry(tmp, &netlink_tap_all, list) { + if (nt == tmp) { + list_del_rcu(&nt->list); + found = true; + goto out; + } + } + + pr_warn("__netlink_remove_tap: %p not found\n", nt); +out: + spin_unlock(&netlink_tap_lock); + + if (found && nt->module) + module_put(nt->module); + + return found ? 0 : -ENODEV; +} +EXPORT_SYMBOL_GPL(__netlink_remove_tap); + +int netlink_remove_tap(struct netlink_tap *nt) +{ + int ret; + + ret = __netlink_remove_tap(nt); + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(netlink_remove_tap); + +static int __netlink_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct sk_buff *nskb; + int ret = -ENOMEM; + + dev_hold(dev); + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + nskb->dev = dev; + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + } + + dev_put(dev); + return ret; +} + +static void __netlink_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct netlink_tap *tmp; + + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + ret = __netlink_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +static void netlink_deliver_tap(struct sk_buff *skb) +{ + rcu_read_lock(); + + if (unlikely(!list_empty(&netlink_tap_all))) + __netlink_deliver_tap(skb); + + rcu_read_unlock(); +} + static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -1518,6 +1616,8 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; + netlink_deliver_tap(skb); + #ifdef CONFIG_NETLINK_MMAP if (netlink_skb_is_mmaped(skb)) netlink_queue_mmaped_skb(sk, skb); @@ -1578,6 +1678,11 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { + /* We could do a netlink_deliver_tap(skb) here as well + * but since this is intended for the kernel only, we + * should rather let it stay under the hood. + */ + ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; @@ -2975,6 +3080,8 @@ static int __init netlink_proto_init(void) nl_table[i].compare = netlink_compare; } + INIT_LIST_HEAD(&netlink_tap_all); + netlink_add_usersock_entry(); sock_register(&netlink_family_ops); -- cgit v1.1 From 3a36515f729458c8efa0c124c7262d5843ad5c37 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Fri, 28 Jun 2013 03:04:23 +0200 Subject: netlink: fix splat in skb_clone with large messages Since (c05cdb1 netlink: allow large data transfers from user-space), netlink splats if it invokes skb_clone on large netlink skbs since: * skb_shared_info was not correctly initialized. * skb->destructor is not set in the cloned skb. This was spotted by trinity: [ 894.990671] BUG: unable to handle kernel paging request at ffffc9000047b001 [ 894.991034] IP: [] skb_clone+0x24/0xc0 [...] [ 894.991034] Call Trace: [ 894.991034] [] nl_fib_input+0x6a/0x240 [ 894.991034] [] ? _raw_read_unlock+0x26/0x40 [ 894.991034] [] netlink_unicast+0x169/0x1e0 [ 894.991034] [] netlink_sendmsg+0x251/0x3d0 Fix it by: 1) introducing a new netlink_skb_clone function that is used in nl_fib_input, that sets our special skb->destructor in the cloned skb. Moreover, handle the release of the large cloned skb head area in the destructor path. 2) not allowing large skbuffs in the netlink broadcast path. I cannot find any reasonable use of the large data transfer using netlink in that path, moreover this helps to skip extra skb_clone handling. I found two more netlink clients that are cloning the skbs, but they are not in the sendmsg path. Therefore, the sole client cloning that I found seems to be the fib frontend. Thanks to Eric Dumazet for helping to address this issue. Reported-by: Fengguang Wu Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'net/netlink/af_netlink.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6967fbc..0c61b59 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -849,7 +849,10 @@ static void netlink_skb_destructor(struct sk_buff *skb) } #endif if (is_vmalloc_addr(skb->head)) { - vfree(skb->head); + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) + vfree(skb->head); + skb->head = NULL; } if (skb->sk != NULL) @@ -1532,33 +1535,31 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } -static struct sk_buff *netlink_alloc_large_skb(unsigned int size) +static struct sk_buff *netlink_alloc_large_skb(unsigned int size, + int broadcast) { struct sk_buff *skb; void *data; - if (size <= NLMSG_GOODSIZE) + if (size <= NLMSG_GOODSIZE || broadcast) return alloc_skb(size, GFP_KERNEL); - skb = alloc_skb_head(GFP_KERNEL); - if (skb == NULL) - return NULL; + size = SKB_DATA_ALIGN(size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); data = vmalloc(size); if (data == NULL) - goto err; + return NULL; - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - skb->destructor = netlink_skb_destructor; + skb = build_skb(data, size); + if (skb == NULL) + vfree(data); + else { + skb->head_frag = 0; + skb->destructor = netlink_skb_destructor; + } return skb; -err: - kfree_skb(skb); - return NULL; } /* @@ -2244,7 +2245,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = netlink_alloc_large_skb(len); + skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; -- cgit v1.1