From cb820f8e4b7f73d1a32175e6591735b25bb5398d Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 19 Jul 2013 19:40:09 +0200 Subject: net: Provide a generic socket error queue delivery method for Tx time stamps. This patch moves the private error queue delivery function from the af_packet code to the core socket method. In this way, network layers only needing the error queue for transmit time stamping can share common code. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- net/core/sock.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 548d716..85e8de1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include #include +#include #include #include #include @@ -2425,6 +2426,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * -- cgit v1.1 From 64dc61306ce7da370833289739e2f52dfc6b37ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jul 2013 20:26:31 -0700 Subject: net: add sk_stream_is_writeable() helper Several call sites use the hardcoded following condition : sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) Lets use a helper because TCP_NOTSENT_LOWAT support will change this condition for TCP sockets. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index f5df85d..512f0a2 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + if (sk_stream_is_writeable(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); -- cgit v1.1 From 18afa4b028b46f8b45ca64f94aefe717c297b07d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Jul 2013 16:13:17 +0200 Subject: net: Make devnet_rename_seq static No users outside net/core/dev.c. Signed-off-by: Thomas Gleixner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 26755dd..dfd9f5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id; static DEFINE_HASHTABLE(napi_hash, 8); -seqcount_t devnet_rename_seq; +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { -- cgit v1.1 From c26bf4a51308c85a6f97628253b99767a84ff90a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 25 Jul 2013 18:12:18 +0200 Subject: pktgen: Add UDPCSUM flag to support UDP checksums UDP checksums are optional, hence pktgen has been omitting them in favour of performance. The optional flag UDPCSUM enables UDP checksumming. If the output device supports hardware checksumming the skb is prepared and marked CHECKSUM_PARTIAL, otherwise the checksum is generated in software. Signed-off-by: Thomas Graf Cc: Eric Dumazet Cc: Ben Greear Signed-off-by: David S. Miller --- net/core/pktgen.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9640972..48cebf2 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -160,6 +160,7 @@ #include #include #include +#include #include #ifdef CONFIG_XFRM #include @@ -198,6 +199,7 @@ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ +#define F_UDPCSUM (1<<16) /* Include UDP checksum */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -631,6 +633,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPDST_RND) seq_printf(seq, "UDPDST_RND "); + if (pkt_dev->flags & F_UDPCSUM) + seq_printf(seq, "UDPCSUM "); + if (pkt_dev->flags & F_MPLS_RND) seq_printf(seq, "MPLS_RND "); @@ -1228,6 +1233,12 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!NODE_ALLOC") == 0) pkt_dev->flags &= ~F_NODE; + else if (strcmp(f, "UDPCSUM") == 0) + pkt_dev->flags |= F_UDPCSUM; + + else if (strcmp(f, "!UDPCSUM") == 0) + pkt_dev->flags &= ~F_UDPCSUM; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -2733,7 +2744,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); udph->len = htons(datalen + 8); /* DATA + udphdr */ - udph->check = 0; /* No checksum */ + udph->check = 0; iph->ihl = 5; iph->version = 4; @@ -2752,6 +2763,24 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, udph->source, udph->dest); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(udph->source, udph->dest, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); #ifdef CONFIG_XFRM @@ -2768,7 +2797,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct sk_buff *skb = NULL; __u8 *eth; struct udphdr *udph; - int datalen; + int datalen, udplen; struct ipv6hdr *iph; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; @@ -2844,10 +2873,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, net_info_ratelimited("increased datalen to %d\n", datalen); } + udplen = datalen + sizeof(struct udphdr); udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); - udph->len = htons(datalen + sizeof(struct udphdr)); - udph->check = 0; /* No checksum */ + udph->len = htons(udplen); + udph->check = 0; *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ @@ -2858,7 +2888,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->hop_limit = 32; - iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->payload_len = htons(udplen); iph->nexthdr = IPPROTO_UDP; iph->daddr = pkt_dev->cur_in6_daddr; @@ -2868,6 +2898,23 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V6_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); return skb; -- cgit v1.1 From 03c633e73353d3c116cab6164933d69ee2180470 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 25 Jul 2013 14:08:04 +0200 Subject: pktgen: Use ip_send_check() to compute checksum Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 48cebf2..2dec6f1 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2758,8 +2758,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); - iph->check = 0; - iph->check = ip_fast_csum((void *)iph, iph->ihl); + ip_send_check(iph); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; -- cgit v1.1 From 73d94e9481a20817abe2f1b41ee441bb4f6461f7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Jul 2013 16:21:26 +1000 Subject: pktgen: add needed include file Fixes this on PowerPC (at least): net/core/pktgen.c: In function 'fill_packet_ipv6': net/core/pktgen.c:2906:3: error: implicit declaration of function 'csum_ipv6_magic' [-Werror=implicit-function-declaration] udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); ^ Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/core/pktgen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2dec6f1..261357a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -161,6 +161,7 @@ #include #include #include +#include #include #ifdef CONFIG_XFRM #include -- cgit v1.1 From 66b52b0dc82c5c88d769dc1c7d44cf45d0deb07c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:49 +0200 Subject: net: add ndo to get id of physical port of the device This patch adds a ndo for getting physical port of the device. Driver which is aware of being virtual function of some physical port should implement this ndo. This is applicable not only for IOV, but for other solutions (NPAR, multichannel) as well. Basically if there is possible to have multiple netdevs on the single hw port. Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index dfd9f5d..58eb802 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4989,6 +4989,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) EXPORT_SYMBOL(dev_change_carrier); /** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * -- cgit v1.1 From 66cae9ed6bc46b8cc57a9693f99f69926f3cc7ef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:50 +0200 Subject: rtnl: export physical port id via RT netlink Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: Narendra K Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de7408..0b2972c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_port_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, }; EXPORT_SYMBOL(ifla_policy); -- cgit v1.1 From ff80e519ab1b3a6abb2c6bbf684b98be07111879 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:51 +0200 Subject: net: export physical port id via sysfs Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: Narendra K Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 981fed3..8826b0d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -334,6 +334,27 @@ static ssize_t store_group(struct device *dev, struct device_attribute *attr, return netdev_store(dev, attr, buf, len, change_group); } +static ssize_t show_phys_port_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_port_id ppid; + + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} + static struct device_attribute net_class_attributes[] = { __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), @@ -355,6 +376,7 @@ static struct device_attribute net_class_attributes[] = { __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, store_tx_queue_len), __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), + __ATTR(phys_port_id, S_IRUGO, show_phys_port_id, NULL), {} }; -- cgit v1.1 From fca418955148e4f4555d7ce911e9eee3e7970a7f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Jul 2013 11:07:36 -0700 Subject: flow_dissector: clean up IPIP case Explicitly set proto to ETH_P_IP and jump directly to ip processing. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 00ee068..3259446 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -139,7 +139,8 @@ ipv6: break; } case IPPROTO_IPIP: - goto again; + proto = htons(ETH_P_IP); + goto ip; default: break; } -- cgit v1.1 From b438f940d3541f478c6b37106ed095f1be7959ef Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Jul 2013 11:07:42 -0700 Subject: flow_dissector: add support for IPPROTO_IPV6 Support IPPROTO_IPV6 similar to IPPROTO_IPIP Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3259446..ade9ff1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -141,6 +141,9 @@ ipv6: case IPPROTO_IPIP: proto = htons(ETH_P_IP); goto ip; + case IPPROTO_IPV6: + proto = htons(ETH_P_IPV6); + goto ipv6; default: break; } -- cgit v1.1 From f2f872f9272a79a1048877ea14c15576f46c225e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jul 2013 17:55:08 -0700 Subject: netem: Introduce skb_orphan_partial() helper Commit 547669d483e578 ("tcp: xps: fix reordering issues") added unexpected reorders in case netem is used in a MQ setup for high performance test bed. ETH=eth0 tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq for i in `seq 1 32` do tc qd add dev $ETH parent 1:$i netem delay 100ms done As all tcp packets are orphaned by netem, TCP stack believes it can set skb->ooo_okay on all packets. In order to allow producers to send more packets, we want to keep sk_wmem_alloc from reaching sk_sndbuf limit. We can do that by accounting one byte per skb in netem queues, so that TCP stack is not fooled too much. Tested: With above MQ/netem setup, scaling number of concurrent flows gives linear results and no reorders/retransmits lpq83:~# for n in 1 10 20 30 40 50 60 70 80 90 100 do echo -n "n:$n " ; ./super_netperf $n -H 10.7.7.84; done n:1 198.46 n:10 2002.69 n:20 4000.98 n:30 6006.35 n:40 8020.93 n:50 10032.3 n:60 12081.9 n:70 13971.3 n:80 16009.7 n:90 17117.3 n:100 17425.5 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 85e8de1..a753d97 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1576,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ -- cgit v1.1 From 7764a45a8f1fe74d4f7d301eaca2e558e7e2831a Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Thu, 1 Aug 2013 02:17:15 +0200 Subject: fib_rules: add .suppress operation This change adds a new operation to the fib_rules_ops struct; it allows the suppression of routing decisions if certain criteria are not met by its results. The first implemented constraint is a minimum prefix length added to the structures of routing rules. If a rule is added with a minimum prefix length >0, only routes meeting this threshold will be considered. Any other (more general) routing table entries will be ignored. When configuring a system with multiple network uplinks and default routes, it is often convinient to reference the main routing table multiple times - but omitting the default route. Using this patch and a modified "ip" utility, this can be achieved by using the following command sequence: $ ip route add table secuplink default via 10.42.23.1 $ ip rule add pref 100 table main prefixlength 1 $ ip rule add pref 150 fwmark 0xA table secuplink With this setup, packets marked 0xA will be processed by the additional routing table "secuplink", but only if no suitable route in the main routing table can be found. By using a minimal prefixlength of 1, the default route (/0) of the table "main" is hidden to packets processed by rule 100; packets traveling to destinations with more specific routing entries are processed as usual. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- net/core/fib_rules.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2173544..2ef5040 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -226,6 +226,9 @@ jumped: else err = ops->action(rule, fl, flags, arg); + if (!err && ops->suppress && ops->suppress(rule, arg)) + continue; + if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -337,6 +340,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); + if (tb[FRA_TABLE_PREFIXLEN_MIN]) + rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -523,6 +528,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -548,6 +554,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; -- cgit v1.1 From 6ef94cfafba159d6b1a902ccb3349ac6a34ff6ad Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Fri, 2 Aug 2013 17:19:56 +0200 Subject: fib_rules: add route suppression based on ifgroup This change adds the ability to suppress a routing decision based upon the interface group the selected interface belongs to. This allows it to exclude specific devices from a routing decision. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- net/core/fib_rules.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2ef5040..5040a61 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -343,6 +343,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TABLE_PREFIXLEN_MIN]) rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_IFGROUP]) + rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -529,6 +532,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -588,6 +592,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; -- cgit v1.1 From 63134803a6369dcf7dddf7f0d5e37b9566b308d2 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Fri, 2 Aug 2013 19:07:38 +0200 Subject: neighbour: populate neigh_parms on alloc before calling ndo_neigh_setup dev->ndo_neigh_setup() might need some of the values of neigh_parms, so populate them before calling it. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b7de821..576d46f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1441,16 +1441,18 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, atomic_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + release_net(net); + dev_put(dev); kfree(p); return NULL; } - dev_hold(dev); - p->dev = dev; - write_pnet(&p->net, hold_net(net)); - p->sysctl_table = NULL; write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; -- cgit v1.1 From 73f5698e77219bfc3ea1903759fe8e20ab5b285e Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Sat, 3 Aug 2013 14:14:43 +0200 Subject: fib_rules: fix suppressor names and default values This change brings the suppressor attribute names into line; it also changes the data types to provide a more consistent interface. While -1 indicates that the suppressor is not enabled, values >= 0 for suppress_prefixlen or suppress_ifgroup reject routing decisions violating the constraint. This changes the previously presented behaviour of suppress_prefixlen, where a prefix length _less_ than the attribute value was rejected. After this change, a prefix length less than *or* equal to the value is considered a violation of the rule constraint. It also changes the default values for default and newly added rules (disabling any suppression for those). Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- net/core/fib_rules.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 5040a61..2e65413 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->flags = flags; r->fr_net = hold_net(ops->fro_net); + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); @@ -340,11 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (tb[FRA_TABLE_PREFIXLEN_MIN]) - rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + rule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + rule->suppress_ifgroup = -1; if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -531,7 +538,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ - + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -558,7 +565,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; - if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; -- cgit v1.1 From b4bf07771faaf959b0a916d35b1b930c030e30a8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:03 +0800 Subject: net: move iov_pages() to net/core/iovec.c To let it be reused and reduce code duplication. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/iovec.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index de178e4..b77eeec 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -212,3 +212,27 @@ out_fault: goto out; } EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iov->iov_len)) { + offset -= iov->iov_len; + ++iov; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iov[seg].iov_base + offset; + len = iov[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} +EXPORT_SYMBOL(iov_pages); -- cgit v1.1 From c3bdeb5c7cc073ccf5ff9624642022a8613a956e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:04 +0800 Subject: net: move zerocopy_sg_from_iovec() to net/core/datagram.c To let it be reused and reduce code duplication. Also document this function. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 8ab48cd..f987fae 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,6 +573,99 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * @skb: buffer to copy + * @from: io vector to copy to + * @offset: offset in the io vector to start copying from + * @count: amount of vectors to copy to buffer from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + * Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = skb_headlen(skb); + int size, offset1 = 0; + int i = 0; + + /* Skip over from offset */ + while (count && (offset >= from->iov_len)) { + offset -= from->iov_len; + ++from; + --count; + } + + /* copy up to skb headlen */ + while (count && (copy > 0)) { + size = min_t(unsigned int, copy, from->iov_len - offset); + if (copy_from_user(skb->data + offset1, from->iov_base + offset, + size)) + return -EFAULT; + if (copy > size) { + ++from; + --count; + offset = 0; + } else + offset += size; + copy -= size; + offset1 += size; + } + + if (len == offset1) + return 0; + + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + len = from->iov_len - offset; + if (!len) { + offset = 0; + ++from; + continue; + } + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + int j; + + for (j = 0; j < num_pages; j++) + put_page(page[i + j]); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + __skb_fill_page_desc(skb, i, page[i], off, size); + skb_shinfo(skb)->nr_frags++; + /* increase sk_wmem_alloc */ + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) -- cgit v1.1 From 0a57ec62dfa4b639d86aba60c236904def1d0356 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:05 +0800 Subject: net: use skb_fill_page_desc() in zerocopy_sg_from_iovec() Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index f987fae..bec0867 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -652,8 +652,7 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, while (len) { int off = base & ~PAGE_MASK; int size = min_t(int, len, PAGE_SIZE - off); - __skb_fill_page_desc(skb, i, page[i], off, size); - skb_shinfo(skb)->nr_frags++; + skb_fill_page_desc(skb, i, page[i], off, size); /* increase sk_wmem_alloc */ base += size; len -= size; -- cgit v1.1 From 234a4267085e85d0cc5552c340833bb2d82a50e7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:06 +0800 Subject: net: remove the useless comment in zerocopy_sg_from_iovec() Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index bec0867..a8a795c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -653,7 +653,6 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, int off = base & ~PAGE_MASK; int size = min_t(int, len, PAGE_SIZE - off); skb_fill_page_desc(skb, i, page[i], off, size); - /* increase sk_wmem_alloc */ base += size; len -= size; i++; -- cgit v1.1 From 0433547aa7443cefc89d9b533169bdc50f1206b8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:07 +0800 Subject: net: use release_pages() in zerocopy_sg_from_iovec() To reduce the duplicated codes. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index a8a795c..badcd93 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -638,10 +639,7 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, return -EMSGSIZE; num_pages = get_user_pages_fast(base, size, 0, &page[i]); if (num_pages != size) { - int j; - - for (j = 0; j < num_pages; j++) - put_page(page[i + j]); + release_pages(&page[i], num_pages, 0); return -EFAULT; } truesize = size * PAGE_SIZE; -- cgit v1.1 From 3d9953a2ef2182d56e268742259b11dedb8e281d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:08 +0800 Subject: net: use skb_copy_datagram_from_iovec() in zerocopy_sg_from_iovec() Use skb_copy_datagram_from_iovec() to avoid code duplication and make it easy to be read. Also we can do the skipping inside the zero-copy loop. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/datagram.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index badcd93..af814e7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -591,48 +591,31 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, int offset, size_t count) { int len = iov_length(from, count) - offset; - int copy = skb_headlen(skb); - int size, offset1 = 0; + int copy = min_t(int, skb_headlen(skb), len); + int size; int i = 0; - /* Skip over from offset */ - while (count && (offset >= from->iov_len)) { - offset -= from->iov_len; - ++from; - --count; - } - /* copy up to skb headlen */ - while (count && (copy > 0)) { - size = min_t(unsigned int, copy, from->iov_len - offset); - if (copy_from_user(skb->data + offset1, from->iov_base + offset, - size)) - return -EFAULT; - if (copy > size) { - ++from; - --count; - offset = 0; - } else - offset += size; - copy -= size; - offset1 += size; - } + if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) + return -EFAULT; - if (len == offset1) + if (len == copy) return 0; + offset += copy; while (count--) { struct page *page[MAX_SKB_FRAGS]; int num_pages; unsigned long base; unsigned long truesize; - len = from->iov_len - offset; - if (!len) { - offset = 0; + /* Skip over from offset and copied */ + if (offset >= from->iov_len) { + offset -= from->iov_len; ++from; continue; } + len = from->iov_len - offset; base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; if (i + size > MAX_SKB_FRAGS) -- cgit v1.1 From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:38:47 -0700 Subject: net: attempt high order allocations in sock_alloc_send_pskb() Adding paged frags skbs to af_unix sockets introduced a performance regression on large sends because of additional page allocations, even if each skb could carry at least 100% more payload than before. We can instruct sock_alloc_send_pskb() to attempt high order allocations. Most of the time, it does a single page allocation instead of 8. I added an additional parameter to sock_alloc_send_pskb() to let other users to opt-in for this new feature on followup patches. Tested: Before patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 46861.15 After patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 57981.11 Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- net/core/sock.c | 100 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 83667de..5b6beba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1741,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1767,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | __GFP_NOWARN, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1819,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1827,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); -- cgit v1.1 From 64261f230a9157f5f520ce30ec6827d679375e2f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 17:51:09 +0200 Subject: dev: move skb_scrub_packet() after eth_type_trans() skb_scrub_packet() was called before eth_type_trans() to let eth_type_trans() set pkt_type. In fact, we should force pkt_type to PACKET_HOST, so move the call after eth_type_trans(). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 58eb802..1ed2b66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1691,13 +1691,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); /* eth_type_trans() can set pkt_type. - * clear pkt_type _after_ calling eth_type_trans() + * call skb_scrub_packet() after it to clear pkt_type _after_ calling + * eth_type_trans(). */ - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb); return netif_rx(skb); } -- cgit v1.1 From fce9b9be89cece975675142a3953bfb5299d195d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Aug 2013 12:35:42 +0300 Subject: rtnetlink: remove an unneeded test We know that "dev" is a valid pointer at this point, so we can remove the test and clean up a little. Signed-off-by: Dan Carpenter Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0b2972c..242084e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1867,10 +1867,10 @@ replay: else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) + if (err < 0) { free_netdev(dev); - if (err < 0) goto out; + } err = rtnl_configure_link(dev, ifm); if (err < 0) -- cgit v1.1 From aa9d85605f5ab070b64842b3eba797cf81698ae1 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:04 +0200 Subject: net: rename netdev_upper to netdev_adjacent Rename the structure to reflect the upcoming addition of lower_dev_list. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1ed2b66..5072e2c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4367,7 +4367,7 @@ softnet_break: goto out; } -struct netdev_upper { +struct netdev_adjacent { struct net_device *dev; bool master; struct list_head list; @@ -4378,7 +4378,7 @@ struct netdev_upper { static void __append_search_uppers(struct list_head *search_list, struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { /* check if this upper is not already in search list */ @@ -4391,8 +4391,8 @@ static bool __netdev_search_upper_dev(struct net_device *dev, struct net_device *upper_dev) { LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; + struct netdev_adjacent *upper; + struct netdev_adjacent *tmp; bool ret = false; __append_search_uppers(&search_list, dev); @@ -4408,10 +4408,10 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, +static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { if (upper->dev == upper_dev) @@ -4462,7 +4462,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4470,7 +4470,7 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) return NULL; upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; @@ -4486,10 +4486,10 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; @@ -4499,7 +4499,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4580,7 +4580,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); -- cgit v1.1 From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:05 +0200 Subject: net: add lower_dev_list to net_device and make a full mesh This patch adds lower_dev_list list_head to net_device, which is the same as upper_dev_list, only for lower devices, and begins to use it in the same way as the upper list. It also changes the way the whole adjacent device lists work - now they contain *all* of upper/lower devices, not only the first level. The first level devices are distinguished by the bool neighbour field in netdev_adjacent, also added by this patch. There are cases when a device can be added several times to the adjacent list, the simplest would be: /---- eth0.10 ---\ eth0- --- bond0 \---- eth0.20 ---/ where both bond0 and eth0 'see' each other in the adjacent lists two times. To avoid duplication of netdev_adjacent structures ref_nr is being kept as the number of times the device was added to the list. The 'full view' is achieved by adding, on link creation, all of the upper_dev's upper_dev_list devices as upper devices to all of the lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice versa. On unlink they are removed using the same logic. I've tested it with thousands vlans/bonds/bridges, everything works ok and no observable lags even on a huge number of interfaces. Memory footprint for 128 devices interconnected with each other via both upper and lower (which is impossible, but for the comparison) lists would be: 128*128*2*sizeof(netdev_adjacent) = 1.5MB but in the real world we usualy have at most several devices with slaves and a lot of vlans, so the footprint will be much lower. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 285 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 258 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5072e2c..2aa914e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4369,7 +4369,16 @@ softnet_break: struct netdev_adjacent { struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ bool master; + + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + struct list_head list; struct rcu_head rcu; struct list_head search_list; @@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct netdev_adjacent *upper; + struct netdev_adjacent *adj; + struct list_head *dev_list; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; + + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_find_adj(dev, udev, true); +} + +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_find_adj(dev, ldev, false); +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, upper); + + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; + INIT_LIST_HEAD(&adj->search_list); + + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } + + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + + return 0; +} + +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); +} + +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); +} + +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) +{ + struct netdev_adjacent *adj; + + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); + + if (!adj) + BUG(); + + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_remove(dev, udev, true); +} + +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_adjacent_dev_remove(dev, ldev, false); +} + +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) +{ + int ret; + + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; + + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; + } + + return 0; +} + +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +} + +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +} + +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); +} + + static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_adjacent *upper; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(j, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink(dev, upper_dev); + + return ret; } /** @@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_adjacent *upper; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + __netdev_adjacent_dev_unlink(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v1.1 From 620f3186caa8124e0efaf329751cf51c5d55c731 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:06 +0200 Subject: net: remove search_list from netdev_adjacent We already don't need it cause we see every upper/lower device in the list already. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2aa914e..749925a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4381,42 +4381,8 @@ struct netdev_adjacent { struct list_head list; struct rcu_head rcu; - struct list_head search_list; }; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_adjacent *upper; - - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); - } -} - -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) -{ - LIST_HEAD(search_list); - struct netdev_adjacent *upper; - struct netdev_adjacent *tmp; - bool ret = false; - - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; - break; - } - __append_search_uppers(&search_list, upper->dev); - } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; -} - static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, bool upper) @@ -4544,7 +4510,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->master = master; adj->neighbour = neighbour; adj->ref_nr = 1; - INIT_LIST_HEAD(&adj->search_list); dev_hold(adj_dev); pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", @@ -4671,7 +4636,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_find_upper(upper_dev, dev)) return -EBUSY; if (__netdev_find_upper(dev, upper_dev)) -- cgit v1.1 From 48311f46853c0361f9fba7e0e6bb1652d633c049 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:07 +0200 Subject: net: add netdev_upper_get_next_dev_rcu(dev, iter) This function returns the next dev in the dev->upper_dev_list after the struct list_head **iter position, and updates *iter accordingly. Returns NULL if there are no devices left. Caller must hold RCU read lock. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 749925a..6fbb0c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->upper_dev_list) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device -- cgit v1.1 From 6da7c8fcbcbdb50ec68c61b40d554c74850fdb91 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 27 Aug 2013 16:19:08 -0700 Subject: qdisc: allow setting default queuing discipline By default, the pfifo_fast queue discipline has been used by default for all devices. But we have better choices now. This patch allow setting the default queueing discipline with sysctl. This allows easy use of better queueing disciplines on all devices without having to use tc qdisc scripts. It is intended to allow an easy path for distributions to make fq_codel or sfq the default qdisc. This patch also makes pfifo_fast more of a first class qdisc, since it is now possible to manually override the default and explicitly use pfifo_fast. The behavior for systems who do not use the sysctl is unchanged, they still get pfifo_fast Also removes leftover random # in sysctl net core. Signed-off-by: Stephen Hemminger Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 31107ab..cca4441 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -20,6 +20,7 @@ #include #include #include +#include static int zero = 0; static int one = 1; @@ -193,6 +194,26 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, } #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -315,7 +336,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -# +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc + }, #endif #endif /* CONFIG_NET */ { -- cgit v1.1 From 3e25c65ed085b361cc91a8f02e028f1158c9f255 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 29 Aug 2013 06:38:47 -0600 Subject: net: neighbour: Remove CONFIG_ARPD This config option is superfluous in that it only guards a call to neigh_app_ns(). Enabling CONFIG_ARPD by default has no change in behavior. There will now be call to __neigh_notify() for each ARP resolution, which has no impact unless there is a user space daemon waiting to receive the notification, i.e., the case for which CONFIG_ARPD was designed anyways. Suggested-by: Eric W. Biederman Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: "Eric W. Biederman" Cc: Gao feng Cc: Joe Perches Cc: Veaceslav Falico Signed-off-by: Tim Gardner Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 60533db..6072610 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2759,13 +2759,11 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -#ifdef CONFIG_ARPD void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); } EXPORT_SYMBOL(neigh_app_ns); -#endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL static int zero; -- cgit v1.1 From 8b27f27797cac5ed9b2f3e63dac89a7ae70e70a7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:56 +0200 Subject: skb: allow skb_scrub_packet() to be used by tunnels This function was only used when a packet was sent to another netns. Now, it can also be used after tunnel encapsulation or decapsulation. Only skb_orphan() should not be done when a packet is not crossing netns. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/skbuff.c | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6fbb0c9..07684e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1697,7 +1697,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) * call skb_scrub_packet() after it to clear pkt_type _after_ calling * eth_type_trans(). */ - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); return netif_rx(skb); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2c3d0f5..d81cff1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, EXPORT_SYMBOL(skb_try_coalesce); /** - * skb_scrub_packet - scrub an skb before sending it to another netns + * skb_scrub_packet - scrub an skb * * @skb: buffer to clean - * - * skb_scrub_packet can be used to clean an skb before injecting it in - * another namespace. We have to clear all information in the skb that - * could impact namespace isolation. + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. */ -void skb_scrub_packet(struct sk_buff *skb) +void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb_orphan(skb); + if (xnet) + skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; -- cgit v1.1 From 82476b316084e6826a9dd339d1dad892a598af9a Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Mon, 2 Sep 2013 16:26:51 +0200 Subject: net: correctly interlink lower/upper devices Currently we're linking upper devices to lower ones, which results in upside-down relationship: upper devices seeing lower devices via its upper lists. Fix this by correctly linking lower devices to the upper ones. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 07684e8..5c713f2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4679,8 +4679,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, * versa, and don't forget the devices itself. All of these * links are non-neighbours. */ - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { - list_for_each_entry(j, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { ret = __netdev_adjacent_dev_link(i->dev, j->dev); if (ret) goto rollback_mesh; -- cgit v1.1