From 06e3041164a1c52d6e8a369af8be3827f428272b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 18 Oct 2012 17:55:31 +0000
Subject: pktgen: Use ipv6_addr_any

Use the standard test for a non-zero ipv6 address.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d1dc14c..1d1c216 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2427,11 +2427,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
 		}
 	} else {		/* IPV6 * */
 
-		if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
-		    pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
-		    pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
-		    pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ;
-		else {
+		if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
 			int i;
 
 			/* Only random destinations yet */
-- 
cgit v1.1


From f7b86bfe8d9f10e5a9fcacf52dddf29d0d58a33b Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 18 Oct 2012 23:55:56 +0000
Subject: sockopt: Make SO_BINDTODEVICE readable

The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but
cannot be get via sockopt API. The only way we can find the device id a
socket is bound to is via sock-diag interface. But the diag works only on
hashed sockets, while the opt in question can be set for yet unhashed one.

That said, in order to know what device a socket is bound to (we do want
to know this in checkpoint-restore project) I propose to make this option
getsockopt-able and report the respective device index.

Another solution to the problem might be to teach the sock-diag reporting
info on unhashed sockets. Should I go this way instead?

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 8a146cf..c49412c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_NOFCS:
 		v.val = sock_flag(sk, SOCK_NOFCS);
 		break;
+	case SO_BINDTODEVICE:
+		v.val = sk->sk_bound_dev_if;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.1


From 47b70db5558388b3f4ecd10b492a0b3f6d680789 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Fri, 19 Oct 2012 01:09:30 +0000
Subject: net:dev: remove double indentical assignment in
 dev_change_net_namespace().

This patch removes double assignment of err to -EINVAL in dev_change_net_namespace().

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09cb3f6..b4978e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6264,7 +6264,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 		goto out;
 
 	/* Ensure the device has been registrered */
-	err = -EINVAL;
 	if (dev->reg_state != NETREG_REGISTERED)
 		goto out;
 
-- 
cgit v1.1


From c80bbeaec98b36eeba9c6c77061226034d5c4622 Mon Sep 17 00:00:00 2001
From: Hans Zhang <zhanghonghui@huawei.com>
Date: Mon, 22 Oct 2012 22:21:23 +0000
Subject: netlink: cleanup the unnecessary return value check

It's no needed to check the return value of tab since the NULL situation
has been handled already, and the rtnl_msg_handlers[PF_UNSPEC] has been
initialized as non-NULL during the rtnetlink_init().

Signed-off-by: Hans Zhang <zhanghonghui@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 76d4c2c..64fe3cc 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -128,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
 	if (tab == NULL || tab[msgindex].doit == NULL)
 		tab = rtnl_msg_handlers[PF_UNSPEC];
 
-	return tab ? tab[msgindex].doit : NULL;
+	return tab[msgindex].doit;
 }
 
 static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
@@ -143,7 +143,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
 	if (tab == NULL || tab[msgindex].dumpit == NULL)
 		tab = rtnl_msg_handlers[PF_UNSPEC];
 
-	return tab ? tab[msgindex].dumpit : NULL;
+	return tab[msgindex].dumpit;
 }
 
 static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
@@ -158,7 +158,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
 	if (tab == NULL || tab[msgindex].calcit == NULL)
 		tab = rtnl_msg_handlers[PF_UNSPEC];
 
-	return tab ? tab[msgindex].calcit : NULL;
+	return tab[msgindex].calcit;
 }
 
 /**
-- 
cgit v1.1


From c658f19db3c3077a3328a0e0ea182f6777337c40 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 25 Oct 2012 04:16:55 +0000
Subject: cgroup: net_prio: Mark local used function static

net_prio_attach() is only access via cgroup_subsys callbacks,
therefore we can reduce the visibility of this function.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <cgroups@vger.kernel.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 79285a3..847c02b 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -248,7 +248,7 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
 	return 0;
 }
 
-void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *p;
 	void *v;
-- 
cgit v1.1


From 3ace03cc2a03eadf83a59eecada68b37bc1a46ae Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 25 Oct 2012 04:16:57 +0000
Subject: cgroup: net_cls: Remove rcu_read_lock/unlock

As Eric pointed out:
"Hey task_cls_classid() has its own rcu protection since commit
3fb5a991916091a908d (cls_cgroup: Fix rcu lockdep warning)

So we can safely revert Paul commit (1144182a8757f2a1)
(We no longer need rcu_read_lock/unlock here)"

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index c49412c..9fedbbf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1221,9 +1221,7 @@ void sock_update_classid(struct sock *sk)
 {
 	u32 classid;
 
-	rcu_read_lock();  /* doing current task, which cannot vanish. */
 	classid = task_cls_classid(current);
-	rcu_read_unlock();
 	if (classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
-- 
cgit v1.1


From fd9a08a7b83074e34c13c6340f673f7a51f53489 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 25 Oct 2012 04:16:58 +0000
Subject: cgroup: net_cls: Pass in task to sock_update_classid()

sock_update_classid() assumes that the update operation always are
applied on the current task. sock_update_classid() needs to know on
which tasks to work on in order to be able to migrate task between
cgroups using the struct cgroup_subsys attach() callback.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Joe Perches <joe@perches.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Stanislav Kinsbursky <skinsbursky@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <cgroups@vger.kernel.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 9fedbbf..0a023b8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1217,11 +1217,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 
 #ifdef CONFIG_CGROUPS
 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
-void sock_update_classid(struct sock *sk)
+void sock_update_classid(struct sock *sk, struct task_struct *task)
 {
 	u32 classid;
 
-	classid = task_cls_classid(current);
+	classid = task_cls_classid(task);
 	if (classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
@@ -1264,7 +1264,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_net_set(sk, get_net(net));
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
-		sock_update_classid(sk);
+		sock_update_classid(sk, current);
 		sock_update_netprioidx(sk, current);
 	}
 
-- 
cgit v1.1


From e5a55a898720096f43bc24938f8875c0a1b34cd7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 24 Oct 2012 08:12:57 +0000
Subject: net: create generic bridge ops

The PF_BRIDGE:RTM_{GET|SET}LINK nlmsg family and type are
currently embedded in the ./net/bridge module. This prohibits
them from being used by other bridging devices. One example
of this being hardware that has embedded bridging components.

In order to use these nlmsg types more generically this patch
adds two net_device_ops hooks. One to set link bridge attributes
and another to dump the current bride attributes.

	ndo_bridge_setlink()
	ndo_bridge_getlink()

CC: Lennert Buytenhek <buytenh@wantstofly.org>
CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 64fe3cc..a068666 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2252,6 +2252,83 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	int idx = 0;
+	u32 portid = NETLINK_CB(cb->skb).portid;
+	u32 seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct net_device *master = dev->master;
+
+		if (idx < cb->args[0])
+			continue;
+
+		if (master && master->netdev_ops->ndo_bridge_getlink) {
+			const struct net_device_ops *bops = master->netdev_ops;
+			int err = bops->ndo_bridge_getlink(skb, portid,
+							   seq, dev);
+
+			if (err < 0)
+				break;
+			else
+				idx++;
+		}
+
+		if (ops->ndo_bridge_getlink) {
+			int err = ops->ndo_bridge_getlink(skb, portid,
+							  seq, dev);
+
+			if (err < 0)
+				break;
+			else
+				idx++;
+		}
+	}
+	rcu_read_unlock();
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ifinfomsg *ifm;
+	struct net_device *dev;
+	int err = -EINVAL;
+
+	if (nlmsg_len(nlh) < sizeof(*ifm))
+		return -EINVAL;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_family != AF_BRIDGE)
+		return -EPFNOSUPPORT;
+
+	dev = __dev_get_by_index(net, ifm->ifi_index);
+	if (!dev) {
+		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+		return -ENODEV;
+	}
+
+	if (dev->master && dev->master->netdev_ops->ndo_bridge_setlink) {
+		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh);
+		if (err)
+			goto out;
+	}
+
+	if (dev->netdev_ops->ndo_bridge_setlink)
+		err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+
+out:
+	return err;
+}
+
 /* Protected by RTNL sempahore.  */
 static struct rtattr **rta_buf;
 static int rtattr_max;
@@ -2433,5 +2510,8 @@ void __init rtnetlink_init(void)
 	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
 	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
 	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
+
+	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
+	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
 }
 
-- 
cgit v1.1


From 2469ffd723f76ac2d3ce3d4f31ee31ee0a06cd38 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 24 Oct 2012 08:13:03 +0000
Subject: net: set and query VEB/VEPA bridge mode via PF_BRIDGE

Hardware switches may support enabling and disabling the
loopback switch which puts the device in a VEPA mode defined
in the IEEE 802.1Qbg specification. In this mode frames are
not switched in the hardware but sent directly to the switch.
SR-IOV capable NICs will likely support this mode I am
aware of at least two such devices. Also I am told (but don't
have any of this hardware available) that there are devices
that only support VEPA modes. In these cases it is important
at a minimum to be able to query these attributes.

This patch adds an additional IFLA_BRIDGE_MODE attribute that can be
set and dumped via the PF_BRIDGE:{SET|GET}LINK operations. Also
anticipating bridge attributes that may be common for both embedded
bridges and software bridges this adds a flags attribute
IFLA_BRIDGE_FLAGS currently used to determine if the command or event
is being generated to/from an embedded bridge or software bridge.
Finally, the event generation is pulled out of the bridge module and
into rtnetlink proper.

For example using the macvlan driver in VEPA mode on top of
an embedded switch requires putting the embedded switch into
a VEPA mode to get the expected results.

	--------  --------
        | VEPA |  | VEPA |       <-- macvlan vepa edge relays
        --------  --------
           |        |
           |        |
        ------------------
        |      VEPA      |       <-- embedded switch in NIC
        ------------------
                |
                |
        -------------------
        | external switch |      <-- shiny new physical
	-------------------          switch with VEPA support

A packet sent from the macvlan VEPA at the top could be
loopbacked on the embedded switch and never seen by the
external switch. So in order for this to work the embedded
switch needs to be set in the VEPA state via the above
described commands.

By making these attributes nested in IFLA_AF_SPEC we allow
future extensions to be made as needed.

CC: Lennert Buytenhek <buytenh@wantstofly.org>
CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a068666..8d2af0f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2295,13 +2295,60 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static inline size_t bridge_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+		+ nla_total_size(IFNAMSIZ)	/* IFLA_IFNAME */
+		+ nla_total_size(MAX_ADDR_LEN)	/* IFLA_ADDRESS */
+		+ nla_total_size(sizeof(u32))	/* IFLA_MASTER */
+		+ nla_total_size(sizeof(u32))	/* IFLA_MTU */
+		+ nla_total_size(sizeof(u32))	/* IFLA_LINK */
+		+ nla_total_size(sizeof(u32))	/* IFLA_OPERSTATE */
+		+ nla_total_size(sizeof(u8))	/* IFLA_PROTINFO */
+		+ nla_total_size(sizeof(struct nlattr))	/* IFLA_AF_SPEC */
+		+ nla_total_size(sizeof(u16))	/* IFLA_BRIDGE_FLAGS */
+		+ nla_total_size(sizeof(u16));	/* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
+{
+	struct net *net = dev_net(dev);
+	struct net_device *master = dev->master;
+	struct sk_buff *skb;
+	int err = -EOPNOTSUPP;
+
+	skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+	if (!skb) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	if (!flags && master && master->netdev_ops->ndo_bridge_getlink)
+		err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+	else if (dev->netdev_ops->ndo_bridge_getlink)
+		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+
+	if (err < 0)
+		goto errout;
+
+	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+	return 0;
+errout:
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+	return err;
+}
+
 static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			       void *arg)
 {
 	struct net *net = sock_net(skb->sk);
 	struct ifinfomsg *ifm;
 	struct net_device *dev;
-	int err = -EINVAL;
+	struct nlattr *br_spec, *attr = NULL;
+	int rem, err = -EOPNOTSUPP;
+	u16 flags = 0;
 
 	if (nlmsg_len(nlh) < sizeof(*ifm))
 		return -EINVAL;
@@ -2316,15 +2363,45 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENODEV;
 	}
 
-	if (dev->master && dev->master->netdev_ops->ndo_bridge_setlink) {
+	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+	if (br_spec) {
+		nla_for_each_nested(attr, br_spec, rem) {
+			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+				flags = nla_get_u16(attr);
+				break;
+			}
+		}
+	}
+
+	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+		if (!dev->master ||
+		    !dev->master->netdev_ops->ndo_bridge_setlink) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
 		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh);
 		if (err)
 			goto out;
+
+		flags &= ~BRIDGE_FLAGS_MASTER;
 	}
 
-	if (dev->netdev_ops->ndo_bridge_setlink)
-		err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+	if ((flags & BRIDGE_FLAGS_SELF)) {
+		if (!dev->netdev_ops->ndo_bridge_setlink)
+			err = -EOPNOTSUPP;
+		else
+			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+
+		if (!err)
+			flags &= ~BRIDGE_FLAGS_SELF;
+	}
 
+	if (attr && nla_type(attr) == IFLA_BRIDGE_FLAGS)
+		memcpy(nla_data(attr), &flags, sizeof(flags));
+	/* Generate event to notify upper layer of bridge change */
+	if (!err)
+		err = rtnl_bridge_notify(dev, flags);
 out:
 	return err;
 }
-- 
cgit v1.1


From 815cccbf10b27115fb3e5827bef26768616e5e27 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 24 Oct 2012 08:13:09 +0000
Subject: ixgbe: add setlink, getlink support to ixgbe and ixgbevf

This adds support for the net device ops to manage the embedded
hardware bridge on ixgbe devices. With this patch the bridge
mode can be toggled between VEB and VEPA to support stacking
macvlan devices or using the embedded switch without any SW
component in 802.1Qbg/br environments.

Additionally, this adds source address pruning to the ixgbevf
driver to prune any frames sent back from a reflective relay on
the switch. This is required because the existing hardware does
not support this. Without it frames get pushed into the stack
with its own src mac which is invalid per 802.1Qbg VEPA
definition.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8d2af0f..51dc58f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2252,6 +2252,56 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+			    struct net_device *dev, u16 mode)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	struct nlattr *br_afspec;
+	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifi_family = AF_BRIDGE;
+	ifm->__ifi_pad = 0;
+	ifm->ifi_type = dev->type;
+	ifm->ifi_index = dev->ifindex;
+	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_change = 0;
+
+
+	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+	    nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+	    (dev->master &&
+	     nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) ||
+	    (dev->addr_len &&
+	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+	    (dev->ifindex != dev->iflink &&
+	     nla_put_u32(skb, IFLA_LINK, dev->iflink)))
+		goto nla_put_failure;
+
+	br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+	if (!br_afspec)
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
+	    nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+		nla_nest_cancel(skb, br_afspec);
+		goto nla_put_failure;
+	}
+	nla_nest_end(skb, br_afspec);
+
+	return nlmsg_end(skb, nlh);
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+
 static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-- 
cgit v1.1


From f3335031b9452baebfe49b8b5e55d3fe0c4677d1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 27 Oct 2012 02:26:17 +0000
Subject: net: filter: add vlan tag access

BPF filters lack ability to access skb->vlan_tci

This patch adds two new ancillary accessors :

SKF_AD_VLAN_TAG         (44) mapped to vlan_tx_tag_get(skb)

SKF_AD_VLAN_TAG_PRESENT (48) mapped to vlan_tx_tag_present(skb)

This allows libpcap/tcpdump to use a kernel filter instead of
having to fallback to accept all packets, then filter them in
user space.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Ani Sinha <ani@aristanetworks.com>
Suggested-by: Daniel Borkmann <danborkmann@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 3d92ebb..5a114d4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -39,6 +39,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/ratelimit.h>
 #include <linux/seccomp.h>
+#include <linux/if_vlan.h>
 
 /* No hurry in this branch
  *
@@ -341,6 +342,12 @@ load_b:
 		case BPF_S_ANC_CPU:
 			A = raw_smp_processor_id();
 			continue;
+		case BPF_S_ANC_VLAN_TAG:
+			A = vlan_tx_tag_get(skb);
+			continue;
+		case BPF_S_ANC_VLAN_TAG_PRESENT:
+			A = !!vlan_tx_tag_present(skb);
+			continue;
 		case BPF_S_ANC_NLATTR: {
 			struct nlattr *nla;
 
@@ -600,6 +607,8 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(RXHASH);
 			ANCILLARY(CPU);
 			ANCILLARY(ALU_XOR_X);
+			ANCILLARY(VLAN_TAG);
+			ANCILLARY(VLAN_TAG_PRESENT);
 			}
 		}
 		ftest->code = code;
-- 
cgit v1.1


From a8fc92778080c845eaadc369a0ecf5699a03bef0 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 1 Nov 2012 02:01:48 +0000
Subject: sk-filter: Add ability to get socket filter program (v2)

The SO_ATTACH_FILTER option is set only. I propose to add the get
ability by using SO_ATTACH_FILTER in getsockopt. To be less
irritating to eyes the SO_GET_FILTER alias to it is declared. This
ability is required by checkpoint-restore project to be able to
save full state of a socket.

There are two issues with getting filter back.

First, kernel modifies the sock_filter->code on filter load, thus in
order to return the filter element back to user we have to decode it
into user-visible constants. Fortunately the modification in question
is interconvertible.

Second, the BPF_S_ALU_DIV_K code modifies the command argument k to
speed up the run-time division by doing kernel_k = reciprocal(user_k).
Bad news is that different user_k may result in same kernel_k, so we
can't get the original user_k back. Good news is that we don't have
to do it. What we need to is calculate a user2_k so, that

  reciprocal(user2_k) == reciprocal(user_k) == kernel_k

i.e. if it's re-loaded back the compiled again value will be exactly
the same as it was. That said, the user2_k can be calculated like this

  user2_k = reciprocal(kernel_k)

with an exception, that if kernel_k == 0, then user2_k == 1.

The optlen argument is treated like this -- when zero, kernel returns
the amount of sock_fprog elements in filter, otherwise it should be
large enough for the sock_fprog array.

changes since v1:
* Declared SO_GET_FILTER in all arch headers
* Added decode of vlan-tag codes

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/core/sock.c   |   6 +++
 2 files changed, 136 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 5a114d4..c23543c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -760,3 +760,133 @@ int sk_detach_filter(struct sock *sk)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sk_detach_filter);
+
+static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
+{
+	static const u16 decodes[] = {
+		[BPF_S_ALU_ADD_K]	= BPF_ALU|BPF_ADD|BPF_K,
+		[BPF_S_ALU_ADD_X]	= BPF_ALU|BPF_ADD|BPF_X,
+		[BPF_S_ALU_SUB_K]	= BPF_ALU|BPF_SUB|BPF_K,
+		[BPF_S_ALU_SUB_X]	= BPF_ALU|BPF_SUB|BPF_X,
+		[BPF_S_ALU_MUL_K]	= BPF_ALU|BPF_MUL|BPF_K,
+		[BPF_S_ALU_MUL_X]	= BPF_ALU|BPF_MUL|BPF_X,
+		[BPF_S_ALU_DIV_X]	= BPF_ALU|BPF_DIV|BPF_X,
+		[BPF_S_ALU_MOD_K]	= BPF_ALU|BPF_MOD|BPF_K,
+		[BPF_S_ALU_MOD_X]	= BPF_ALU|BPF_MOD|BPF_X,
+		[BPF_S_ALU_AND_K]	= BPF_ALU|BPF_AND|BPF_K,
+		[BPF_S_ALU_AND_X]	= BPF_ALU|BPF_AND|BPF_X,
+		[BPF_S_ALU_OR_K]	= BPF_ALU|BPF_OR|BPF_K,
+		[BPF_S_ALU_OR_X]	= BPF_ALU|BPF_OR|BPF_X,
+		[BPF_S_ALU_XOR_K]	= BPF_ALU|BPF_XOR|BPF_K,
+		[BPF_S_ALU_XOR_X]	= BPF_ALU|BPF_XOR|BPF_X,
+		[BPF_S_ALU_LSH_K]	= BPF_ALU|BPF_LSH|BPF_K,
+		[BPF_S_ALU_LSH_X]	= BPF_ALU|BPF_LSH|BPF_X,
+		[BPF_S_ALU_RSH_K]	= BPF_ALU|BPF_RSH|BPF_K,
+		[BPF_S_ALU_RSH_X]	= BPF_ALU|BPF_RSH|BPF_X,
+		[BPF_S_ALU_NEG]		= BPF_ALU|BPF_NEG,
+		[BPF_S_LD_W_ABS]	= BPF_LD|BPF_W|BPF_ABS,
+		[BPF_S_LD_H_ABS]	= BPF_LD|BPF_H|BPF_ABS,
+		[BPF_S_LD_B_ABS]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_PKTTYPE]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_IFINDEX]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_NLATTR]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_NLATTR_NEST]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_MARK]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_QUEUE]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_HATYPE]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_RXHASH]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_CPU]		= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_ALU_XOR_X]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_VLAN_TAG]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
+		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
+		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
+		[BPF_S_LD_B_IND]	= BPF_LD|BPF_B|BPF_IND,
+		[BPF_S_LD_IMM]		= BPF_LD|BPF_IMM,
+		[BPF_S_LDX_W_LEN]	= BPF_LDX|BPF_W|BPF_LEN,
+		[BPF_S_LDX_B_MSH]	= BPF_LDX|BPF_B|BPF_MSH,
+		[BPF_S_LDX_IMM]		= BPF_LDX|BPF_IMM,
+		[BPF_S_MISC_TAX]	= BPF_MISC|BPF_TAX,
+		[BPF_S_MISC_TXA]	= BPF_MISC|BPF_TXA,
+		[BPF_S_RET_K]		= BPF_RET|BPF_K,
+		[BPF_S_RET_A]		= BPF_RET|BPF_A,
+		[BPF_S_ALU_DIV_K]	= BPF_ALU|BPF_DIV|BPF_K,
+		[BPF_S_LD_MEM]		= BPF_LD|BPF_MEM,
+		[BPF_S_LDX_MEM]		= BPF_LDX|BPF_MEM,
+		[BPF_S_ST]		= BPF_ST,
+		[BPF_S_STX]		= BPF_STX,
+		[BPF_S_JMP_JA]		= BPF_JMP|BPF_JA,
+		[BPF_S_JMP_JEQ_K]	= BPF_JMP|BPF_JEQ|BPF_K,
+		[BPF_S_JMP_JEQ_X]	= BPF_JMP|BPF_JEQ|BPF_X,
+		[BPF_S_JMP_JGE_K]	= BPF_JMP|BPF_JGE|BPF_K,
+		[BPF_S_JMP_JGE_X]	= BPF_JMP|BPF_JGE|BPF_X,
+		[BPF_S_JMP_JGT_K]	= BPF_JMP|BPF_JGT|BPF_K,
+		[BPF_S_JMP_JGT_X]	= BPF_JMP|BPF_JGT|BPF_X,
+		[BPF_S_JMP_JSET_K]	= BPF_JMP|BPF_JSET|BPF_K,
+		[BPF_S_JMP_JSET_X]	= BPF_JMP|BPF_JSET|BPF_X,
+	};
+	u16 code;
+
+	code = filt->code;
+
+	to->code = decodes[code];
+	to->jt = filt->jt;
+	to->jf = filt->jf;
+
+	if (code == BPF_S_ALU_DIV_K) {
+		/*
+		 * When loaded this rule user gave us X, which was
+		 * translated into R = r(X). Now we calculate the
+		 * RR = r(R) and report it back. If next time this
+		 * value is loaded and RRR = r(RR) is calculated
+		 * then the R == RRR will be true.
+		 *
+		 * One exception. X == 1 translates into R == 0 and
+		 * we can't calculate RR out of it with r().
+		 */
+
+		if (filt->k == 0)
+			to->k = 1;
+		else
+			to->k = reciprocal_value(filt->k);
+
+		BUG_ON(reciprocal_value(to->k) != filt->k);
+	} else
+		to->k = filt->k;
+}
+
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len)
+{
+	struct sk_filter *filter;
+	int i, ret;
+
+	lock_sock(sk);
+	filter = rcu_dereference_protected(sk->sk_filter,
+			sock_owned_by_user(sk));
+	ret = 0;
+	if (!filter)
+		goto out;
+	ret = filter->len;
+	if (!len)
+		goto out;
+	ret = -EINVAL;
+	if (len < filter->len)
+		goto out;
+
+	ret = -EFAULT;
+	for (i = 0; i < filter->len; i++) {
+		struct sock_filter fb;
+
+		sk_decode_filter(&filter->insns[i], &fb);
+		if (copy_to_user(&ubuf[i], &fb, sizeof(fb)))
+			goto out;
+	}
+
+	ret = filter->len;
+out:
+	release_sock(sk);
+	return ret;
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index 0a023b8..0628600 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1077,6 +1077,12 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_BINDTODEVICE:
 		v.val = sk->sk_bound_dev_if;
 		break;
+	case SO_GET_FILTER:
+		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+		if (len < 0)
+			return len;
+
+		goto lenout;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.1


From e19d6763cc300fcb706bd291b24ac06be71e1ce6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 1 Nov 2012 09:16:22 +0000
Subject: skb: report completion status for zero copy skbs

Even if skb is marked for zero copy, net core might still decide
to copy it later which is somewhat slower than a copy in user context:
besides copying the data we need to pin/unpin the pages.

Add a parameter reporting such cases through zero copy callback:
if this happens a lot, device can take this into account
and switch to copying in user context.

This patch updates all users but ignores the passed value for now:
it will be used by follow-up patches.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6e04b1f..4abdf71 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -519,7 +519,7 @@ static void skb_release_data(struct sk_buff *skb)
 
 			uarg = skb_shinfo(skb)->destructor_arg;
 			if (uarg->callback)
-				uarg->callback(uarg);
+				uarg->callback(uarg, true);
 		}
 
 		if (skb_has_frag_list(skb))
@@ -797,7 +797,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	for (i = 0; i < num_frags; i++)
 		skb_frag_unref(skb, i);
 
-	uarg->callback(uarg);
+	uarg->callback(uarg, false);
 
 	/* skb frags point to kernel buffers */
 	for (i = num_frags - 1; i >= 0; i--) {
-- 
cgit v1.1


From 25121173f7b1e4ac3fc692df6e7b8c52ec36abba Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 1 Nov 2012 09:16:28 +0000
Subject: skb: api to report errors for zero copy skbs

Orphaning frags for zero copy skbs needs to allocate data in atomic
context so is has a chance to fail. If it does we currently discard
the skb which is safe, but we don't report anything to the caller,
so it can not recover by e.g. disabling zero copy.

Add an API to free skb reporting such errors: this is used
by tun in case orphaning frags fails.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4abdf71..d9addea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -635,6 +635,26 @@ void kfree_skb(struct sk_buff *skb)
 EXPORT_SYMBOL(kfree_skb);
 
 /**
+ *	skb_tx_error - report an sk_buff xmit error
+ *	@skb: buffer that triggered an error
+ *
+ *	Report xmit error if a device callback is tracking this skb.
+ *	skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+		struct ubuf_info *uarg;
+
+		uarg = skb_shinfo(skb)->destructor_arg;
+		if (uarg->callback)
+			uarg->callback(uarg, false);
+		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+	}
+}
+EXPORT_SYMBOL(skb_tx_error);
+
+/**
  *	consume_skb - free an skbuff
  *	@skb: buffer to free
  *
-- 
cgit v1.1


From 25b1e67921f448cdddf70042ba233ffe43d33a9c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 2 Nov 2012 12:56:52 +0000
Subject: net: Fix continued iteration in rtnl_bridge_getlink()

Commit e5a55a898720096f43bc24938f8875c0a1b34cd7 ('net: create generic
bridge ops') broke the handling of a non-zero starting index in
rtnl_bridge_getlink() (based on the old br_dump_ifinfo()).

When the starting index is non-zero, we need to increment the current
index for each entry that we are skipping.  Also, we need to check the
index before both cases, since we may previously have stopped
iteration between getting information about a device from its master
and from itself.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Tested-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 51dc58f..a0e3507 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2315,28 +2315,19 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 		const struct net_device_ops *ops = dev->netdev_ops;
 		struct net_device *master = dev->master;
 
-		if (idx < cb->args[0])
-			continue;
-
 		if (master && master->netdev_ops->ndo_bridge_getlink) {
-			const struct net_device_ops *bops = master->netdev_ops;
-			int err = bops->ndo_bridge_getlink(skb, portid,
-							   seq, dev);
-
-			if (err < 0)
+			if (idx >= cb->args[0] &&
+			    master->netdev_ops->ndo_bridge_getlink(
+				    skb, portid, seq, dev) < 0)
 				break;
-			else
-				idx++;
+			idx++;
 		}
 
 		if (ops->ndo_bridge_getlink) {
-			int err = ops->ndo_bridge_getlink(skb, portid,
-							  seq, dev);
-
-			if (err < 0)
+			if (idx >= cb->args[0] &&
+			    ops->ndo_bridge_getlink(skb, portid, seq, dev) < 0)
 				break;
-			else
-				idx++;
+			idx++;
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.1


From 398f382c0a5bd62f0f31871e844700e1b9fd1ad3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dxchgb@gmail.com>
Date: Sun, 28 Oct 2012 08:27:19 +0000
Subject: pktgen: clean up ktime_t helpers

Some years ago, the ktime_t helper functions ktime_now() and ktime_lt()
have been introduced. Instead of defining them inside pktgen.c, they
should either use ktime_t library functions or, if not available, they
should be defined in ktime.h, so that also others can benefit from them.
ktime_compare() is introduced with a similar notion as in timespec_compare().

Signed-off-by: Daniel Borkmann <daniel.borkmann@tik.ee.ethz.ch>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 41 +++++++++++++----------------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1d1c216..b29dacf 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -419,20 +419,6 @@ struct pktgen_thread {
 #define REMOVE 1
 #define FIND   0
 
-static inline ktime_t ktime_now(void)
-{
-	struct timespec ts;
-	ktime_get_ts(&ts);
-
-	return timespec_to_ktime(ts);
-}
-
-/* This works even if 32 bit because of careful byte order choice */
-static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2)
-{
-	return cmp1.tv64 < cmp2.tv64;
-}
-
 static const char version[] =
 	"Packet Generator for packet performance testing. "
 	"Version: " VERSION "\n";
@@ -675,7 +661,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\n");
 
 	/* not really stopped, more like last-running-at */
-	stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at;
+	stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
 	idle = pkt_dev->idle_acc;
 	do_div(idle, NSEC_PER_USEC);
 
@@ -2141,12 +2127,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 		return;
 	}
 
-	start_time = ktime_now();
+	start_time = ktime_get();
 	if (remaining < 100000) {
 		/* for small delays (<100us), just loop until limit is reached */
 		do {
-			end_time = ktime_now();
-		} while (ktime_lt(end_time, spin_until));
+			end_time = ktime_get();
+		} while (ktime_compare(end_time, spin_until) < 0);
 	} else {
 		/* see do_nanosleep */
 		hrtimer_init_sleeper(&t, current);
@@ -2162,7 +2148,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 			hrtimer_cancel(&t.timer);
 		} while (t.task && pkt_dev->running && !signal_pending(current));
 		__set_current_state(TASK_RUNNING);
-		end_time = ktime_now();
+		end_time = ktime_get();
 	}
 
 	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
@@ -2912,8 +2898,7 @@ static void pktgen_run(struct pktgen_thread *t)
 			pktgen_clear_counters(pkt_dev);
 			pkt_dev->running = 1;	/* Cranke yeself! */
 			pkt_dev->skb = NULL;
-			pkt_dev->started_at =
-				pkt_dev->next_tx = ktime_now();
+			pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
 
 			set_pkt_overhead(pkt_dev);
 
@@ -3072,7 +3057,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
 
 	kfree_skb(pkt_dev->skb);
 	pkt_dev->skb = NULL;
-	pkt_dev->stopped_at = ktime_now();
+	pkt_dev->stopped_at = ktime_get();
 	pkt_dev->running = 0;
 
 	show_results(pkt_dev, nr_frags);
@@ -3091,7 +3076,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
 			continue;
 		if (best == NULL)
 			best = pkt_dev;
-		else if (ktime_lt(pkt_dev->next_tx, best->next_tx))
+		else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
 			best = pkt_dev;
 	}
 	if_unlock(t);
@@ -3176,14 +3161,14 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
 
 static void pktgen_resched(struct pktgen_dev *pkt_dev)
 {
-	ktime_t idle_start = ktime_now();
+	ktime_t idle_start = ktime_get();
 	schedule();
-	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
 }
 
 static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
 {
-	ktime_t idle_start = ktime_now();
+	ktime_t idle_start = ktime_get();
 
 	while (atomic_read(&(pkt_dev->skb->users)) != 1) {
 		if (signal_pending(current))
@@ -3194,7 +3179,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
 		else
 			cpu_relax();
 	}
-	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
 }
 
 static void pktgen_xmit(struct pktgen_dev *pkt_dev)
@@ -3216,7 +3201,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	 * "never transmit"
 	 */
 	if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
-		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX);
+		pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
 		return;
 	}
 
-- 
cgit v1.1


From c38e01b8b958cb6606bcc156d3d00c3ee99a13f8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Fri, 2 Nov 2012 16:32:36 +0000
Subject: net: fix bridge notify hook to manage flags correctly

The bridge notify hook rtnl_bridge_notify() was not handling the
case where the master flags was set or with both flags set. First
flags are not being passed correctly and second the logic to parse
them is broken.

This patch passes the original flags value and fixes the
logic.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a0e3507..04a201a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2364,13 +2364,19 @@ static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
 		goto errout;
 	}
 
-	if (!flags && master && master->netdev_ops->ndo_bridge_getlink)
+	if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) &&
+	    master && master->netdev_ops->ndo_bridge_getlink) {
 		err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
-	else if (dev->netdev_ops->ndo_bridge_getlink)
-		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+		if (err < 0)
+			goto errout;
+	}
 
-	if (err < 0)
-		goto errout;
+	if ((flags & BRIDGE_FLAGS_SELF) &&
+	    dev->netdev_ops->ndo_bridge_getlink) {
+		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+		if (err < 0)
+			goto errout;
+	}
 
 	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
 	return 0;
@@ -2389,7 +2395,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net_device *dev;
 	struct nlattr *br_spec, *attr = NULL;
 	int rem, err = -EOPNOTSUPP;
-	u16 flags = 0;
+	u16 oflags, flags = 0;
+	bool have_flags = false;
 
 	if (nlmsg_len(nlh) < sizeof(*ifm))
 		return -EINVAL;
@@ -2408,12 +2415,15 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (br_spec) {
 		nla_for_each_nested(attr, br_spec, rem) {
 			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+				have_flags = true;
 				flags = nla_get_u16(attr);
 				break;
 			}
 		}
 	}
 
+	oflags = flags;
+
 	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
 		if (!dev->master ||
 		    !dev->master->netdev_ops->ndo_bridge_setlink) {
@@ -2438,11 +2448,11 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			flags &= ~BRIDGE_FLAGS_SELF;
 	}
 
-	if (attr && nla_type(attr) == IFLA_BRIDGE_FLAGS)
+	if (have_flags)
 		memcpy(nla_data(attr), &flags, sizeof(flags));
 	/* Generate event to notify upper layer of bridge change */
 	if (!err)
-		err = rtnl_bridge_notify(dev, flags);
+		err = rtnl_bridge_notify(dev, oflags);
 out:
 	return err;
 }
-- 
cgit v1.1


From 62532da9d5f47a7ced3b965aa73ffd5b1afbeb79 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Thu, 15 Nov 2012 08:49:10 +0000
Subject: net: Add generic packet offload infrastructure.

Create a new data structure to contain the GRO/GSO callbacks and add
a new registration mechanism.

Singed-off-by: Vlad Yasevich <vyasevic@redhat.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 83232a1..6884f87 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -176,8 +176,10 @@
 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 
 static DEFINE_SPINLOCK(ptype_lock);
+static DEFINE_SPINLOCK(offload_lock);
 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 static struct list_head ptype_all __read_mostly;	/* Taps */
+static struct list_head offload_base __read_mostly;
 
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -470,6 +472,82 @@ void dev_remove_pack(struct packet_type *pt)
 }
 EXPORT_SYMBOL(dev_remove_pack);
 
+
+/**
+ *	dev_add_offload - register offload handlers
+ *	@po: protocol offload declaration
+ *
+ *	Add protocol offload handlers to the networking stack. The passed
+ *	&proto_offload is linked into kernel lists and may not be freed until
+ *	it has been removed from the kernel lists.
+ *
+ *	This call does not sleep therefore it can not
+ *	guarantee all CPU's that are in middle of receiving packets
+ *	will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+	struct list_head *head = &offload_base;
+
+	spin_lock(&offload_lock);
+	list_add_rcu(&po->list, head);
+	spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ *	__dev_remove_offload	 - remove offload handler
+ *	@po: packet offload declaration
+ *
+ *	Remove a protocol offload handler that was previously added to the
+ *	kernel offload handlers by dev_add_offload(). The passed &offload_type
+ *	is removed from the kernel lists and can be freed or reused once this
+ *	function returns.
+ *
+ *      The packet type might still be in use by receivers
+ *	and must not be freed until after all the CPU's have gone
+ *	through a quiescent state.
+ */
+void __dev_remove_offload(struct packet_offload *po)
+{
+	struct list_head *head = &offload_base;
+	struct packet_offload *po1;
+
+	spin_lock(&ptype_lock);
+
+	list_for_each_entry(po1, head, list) {
+		if (po == po1) {
+			list_del_rcu(&po->list);
+			goto out;
+		}
+	}
+
+	pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+	spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(__dev_remove_offload);
+
+/**
+ *	dev_remove_offload	 - remove packet offload handler
+ *	@po: packet offload declaration
+ *
+ *	Remove a packet offload handler that was previously added to the kernel
+ *	offload handlers by dev_add_offload(). The passed &offload_type is
+ *	removed from the kernel lists and can be freed or reused once this
+ *	function returns.
+ *
+ *	This call sleeps to guarantee that no CPU is looking at the packet
+ *	type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+	__dev_remove_offload(po);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
 /******************************************************************************
 
 		      Device Boot-time Settings Routines
@@ -6661,6 +6739,8 @@ static int __init net_dev_init(void)
 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&ptype_base[i]);
 
+	INIT_LIST_HEAD(&offload_base);
+
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
-- 
cgit v1.1


From 22061d8014455b01eb018bd6c35a1b3040ccc230 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Thu, 15 Nov 2012 08:49:11 +0000
Subject: net: Switch to using the new packet offload infrustructure

Convert to using the new GSO/GRO registration mechanism and new
packet offload structure.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6884f87..cf843a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2072,7 +2072,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
 	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
 	int vlan_depth = ETH_HLEN;
 	int err;
@@ -2101,9 +2101,8 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype,
-			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+	list_for_each_entry_rcu(ptype, &offload_base, list) {
+		if (ptype->type == type && ptype->gso_segment) {
 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 				err = ptype->gso_send_check(skb);
 				segs = ERR_PTR(err);
@@ -3522,9 +3521,9 @@ static void flush_backlog(void *arg)
 
 static int napi_gro_complete(struct sk_buff *skb)
 {
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	struct list_head *head = &offload_base;
 	int err = -ENOENT;
 
 	if (NAPI_GRO_CB(skb)->count == 1) {
@@ -3534,7 +3533,7 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
+		if (ptype->type != type || !ptype->gro_complete)
 			continue;
 
 		err = ptype->gro_complete(skb);
@@ -3584,9 +3583,9 @@ EXPORT_SYMBOL(napi_gro_flush);
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	struct list_head *head = &offload_base;
 	int same_flow;
 	int mac_len;
 	enum gro_result ret;
@@ -3599,7 +3598,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
+		if (ptype->type != type || !ptype->gro_receive)
 			continue;
 
 		skb_set_network_header(skb, skb_gro_offset(skb));
-- 
cgit v1.1


From f191a1d17f227032c159e5499809f545402b6dc6 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Thu, 15 Nov 2012 08:49:23 +0000
Subject: net: Remove code duplication between offload structures

Move the offload callbacks into its own structure.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index cf843a2..cf105e8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2102,16 +2102,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, &offload_base, list) {
-		if (ptype->type == type && ptype->gso_segment) {
+		if (ptype->type == type && ptype->callbacks.gso_segment) {
 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
-				err = ptype->gso_send_check(skb);
+				err = ptype->callbacks.gso_send_check(skb);
 				segs = ERR_PTR(err);
 				if (err || skb_gso_ok(skb, features))
 					break;
 				__skb_push(skb, (skb->data -
 						 skb_network_header(skb)));
 			}
-			segs = ptype->gso_segment(skb, features);
+			segs = ptype->callbacks.gso_segment(skb, features);
 			break;
 		}
 	}
@@ -3533,10 +3533,10 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || !ptype->gro_complete)
+		if (ptype->type != type || !ptype->callbacks.gro_complete)
 			continue;
 
-		err = ptype->gro_complete(skb);
+		err = ptype->callbacks.gro_complete(skb);
 		break;
 	}
 	rcu_read_unlock();
@@ -3598,7 +3598,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || !ptype->gro_receive)
+		if (ptype->type != type || !ptype->callbacks.gro_receive)
 			continue;
 
 		skb_set_network_header(skb, skb_gro_offset(skb));
@@ -3608,7 +3608,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 		NAPI_GRO_CB(skb)->flush = 0;
 		NAPI_GRO_CB(skb)->free = 0;
 
-		pp = ptype->gro_receive(&napi->gro_list, skb);
+		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
 		break;
 	}
 	rcu_read_unlock();
-- 
cgit v1.1


From c53aa5058ad5ca8876a47d6639ad4d4f2c5ed584 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 16 Nov 2012 08:08:23 +0000
Subject: net: use right lock in __dev_remove_offload

offload_base is protected by offload_lock, not ptype_lock

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Yasevich <vyasevic@redhat.com>
Acked-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index cf105e8..2705a2a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -513,7 +513,7 @@ void __dev_remove_offload(struct packet_offload *po)
 	struct list_head *head = &offload_base;
 	struct packet_offload *po1;
 
-	spin_lock(&ptype_lock);
+	spin_lock(&offload_lock);
 
 	list_for_each_entry(po1, head, list) {
 		if (po == po1) {
@@ -524,7 +524,7 @@ void __dev_remove_offload(struct packet_offload *po)
 
 	pr_warn("dev_remove_offload: %p not found\n", po);
 out:
-	spin_unlock(&ptype_lock);
+	spin_unlock(&offload_lock);
 }
 EXPORT_SYMBOL(__dev_remove_offload);
 
-- 
cgit v1.1


From 2407dc25f741cf73853e0521ce9257531c9fc61d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:02:56 +0000
Subject: netns: Deduplicate and fix copy_net_ns when !CONFIG_NET_NS

The copy of copy_net_ns used when the network stack is not
built is broken as it does not return -EINVAL when attempting
to create a new network namespace.  We don't even have
a previous network namespace.

Since we need a copy of copy_net_ns in net/net_namespace.h that is
available when the networking stack is not built at all move the
correct version of copy_net_ns from net_namespace.c into net_namespace.h
Leaving us with just 2 versions of copy_net_ns.  One version for when
we compile in network namespace suport and another stub for all other
occasions.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 42f1e1c..2c1c590 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -347,13 +347,6 @@ struct net *get_net_ns_by_fd(int fd)
 }
 
 #else
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
-{
-	if (flags & CLONE_NEWNET)
-		return ERR_PTR(-EINVAL);
-	return old_net;
-}
-
 struct net *get_net_ns_by_fd(int fd)
 {
 	return ERR_PTR(-EINVAL);
-- 
cgit v1.1


From d328b836823cd4a76611a45f52e208f8ce3d75d7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:02:57 +0000
Subject: userns: make each net (net_ns) belong to a user_ns

The user namespace which creates a new network namespace owns that
namespace and all resources created in it.  This way we can target
capability checks for privileged operations against network resources to
the user_ns which created the network namespace in which the resource
lives.  Privilege to the user namespace which owns the network
namespace, or any parent user namespace thereof, provides the same
privilege to the network resource.

This patch is reworked from a version originally by
Serge E. Hallyn <serge.hallyn@canonical.com>

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c1c590..6456439 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/file.h>
 #include <linux/export.h>
+#include <linux/user_namespace.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -145,7 +146,7 @@ static void ops_free_list(const struct pernet_operations *ops,
 /*
  * setup_net runs the initializers for the network namespace object.
  */
-static __net_init int setup_net(struct net *net)
+static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 {
 	/* Must be called with net_mutex held */
 	const struct pernet_operations *ops, *saved_ops;
@@ -155,6 +156,7 @@ static __net_init int setup_net(struct net *net)
 	atomic_set(&net->count, 1);
 	atomic_set(&net->passive, 1);
 	net->dev_base_seq = 1;
+	net->user_ns = user_ns;
 
 #ifdef NETNS_REFCNT_DEBUG
 	atomic_set(&net->use_count, 0);
@@ -232,7 +234,8 @@ void net_drop_ns(void *p)
 		net_free(ns);
 }
 
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+struct net *copy_net_ns(unsigned long flags,
+			struct user_namespace *user_ns, struct net *old_net)
 {
 	struct net *net;
 	int rv;
@@ -243,8 +246,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 	net = net_alloc();
 	if (!net)
 		return ERR_PTR(-ENOMEM);
+
+	get_user_ns(user_ns);
+
 	mutex_lock(&net_mutex);
-	rv = setup_net(net);
+	rv = setup_net(net, user_ns);
 	if (rv == 0) {
 		rtnl_lock();
 		list_add_tail_rcu(&net->list, &net_namespace_list);
@@ -252,6 +258,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
 	}
 	mutex_unlock(&net_mutex);
 	if (rv < 0) {
+		put_user_ns(user_ns);
 		net_drop_ns(net);
 		return ERR_PTR(rv);
 	}
@@ -308,6 +315,7 @@ static void cleanup_net(struct work_struct *work)
 	/* Finally it is safe to free my network namespace structure */
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 		list_del_init(&net->exit_list);
+		put_user_ns(net->user_ns);
 		net_drop_ns(net);
 	}
 }
@@ -395,7 +403,7 @@ static int __init net_ns_init(void)
 	rcu_assign_pointer(init_net.gen, ng);
 
 	mutex_lock(&net_mutex);
-	if (setup_net(&init_net))
+	if (setup_net(&init_net, &init_user_ns))
 		panic("Could not setup the initial network namespace");
 
 	rtnl_lock();
-- 
cgit v1.1


From 464dc801c76aa0db88e16e8f5f47c6879858b9b2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:02:59 +0000
Subject: net: Don't export sysctls to unprivileged users

In preparation for supporting the creation of network namespaces
by unprivileged users, modify all of the per net sysctl exports
and refuse to allow them to unprivileged users.

This makes it safe for unprivileged users in general to access
per net sysctls, and allows sysctls to be exported to unprivileged
users on an individual basis as they are deemed safe.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c       | 4 ++++
 net/core/sysctl_net_core.c | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2257148..f1c0c2e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2987,6 +2987,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
 	}
 
+	/* Don't export sysctls to unprivileged users */
+	if (neigh_parms_net(p)->user_ns != &init_user_ns)
+		t->neigh_vars[0].procname = NULL;
+
 	snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
 		p_name, dev_name_source);
 	t->sysctl_header =
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a7c3684..d1b0804 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -216,6 +216,11 @@ static __net_init int sysctl_core_net_init(struct net *net)
 			goto err_dup;
 
 		tbl[0].data = &net->core.sysctl_somaxconn;
+
+		/* Don't export any sysctls to unprivileged users */
+		if (net->user_ns != &init_user_ns) {
+			tbl[0].procname = NULL;
+		}
 	}
 
 	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
-- 
cgit v1.1


From dfc47ef8639facd77210e74be831943c2fdd9c74 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:03:00 +0000
Subject: net: Push capable(CAP_NET_ADMIN) into the rtnl methods

- In rtnetlink_rcv_msg convert the capable(CAP_NET_ADMIN) check
  to ns_capable(net->user-ns, CAP_NET_ADMIN).  Allowing unprivileged
  users to make netlink calls to modify their local network
  namespace.

- In the rtnetlink doit methods add capable(CAP_NET_ADMIN) so
  that calls that are not safe for unprivileged users are still
  protected.

Later patches will remove the extra capable calls from methods
that are safe for unprivilged users.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c |  6 ++++++
 net/core/neighbour.c |  9 +++++++++
 net/core/rtnetlink.c | 17 ++++++++++++++++-
 3 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 58a4ba2..bf5b5b8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -275,6 +275,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL, unresolved = 0;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
@@ -424,6 +427,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f1c0c2e..7adcdaf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1620,6 +1620,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err = -EINVAL;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	ASSERT_RTNL();
 	if (nlmsg_len(nlh) < sizeof(*ndm))
 		goto out;
@@ -1684,6 +1687,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	ASSERT_RTNL();
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
 	if (err < 0)
@@ -1962,6 +1968,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[NDTA_MAX+1];
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
 			  nl_neightbl_policy);
 	if (err < 0)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a810f6a..a40c10b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1547,6 +1547,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	char ifname[IFNAMSIZ];
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		goto errout;
@@ -1590,6 +1593,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	int err;
 	LIST_HEAD(list_kill);
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1720,6 +1726,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 #ifdef CONFIG_MODULES
 replay:
 #endif
@@ -2057,6 +2066,9 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	u8 *addr;
 	int err;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
 	if (err < 0)
 		return err;
@@ -2123,6 +2135,9 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	int err = -EINVAL;
 	__u8 *addr;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (nlmsg_len(nlh) < sizeof(*ndm))
 		return -EINVAL;
 
@@ -2488,7 +2503,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	sz_idx = type>>2;
 	kind = type&3;
 
-	if (kind != 2 && !capable(CAP_NET_ADMIN))
+	if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
-- 
cgit v1.1


From 00f70de09c418bfb028d03f046e39c1d301db7b2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:03:03 +0000
Subject: net: Allow userns root to force the scm creds

If the user calling sendmsg has the appropriate privieleges
in their user namespace allow them to set the uid, gid, and
pid in the SCM_CREDENTIALS control message to any valid value.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/scm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/scm.c b/net/core/scm.c
index ab57084..57fb1ee 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -51,11 +51,11 @@ static __inline__ int scm_check_creds(struct ucred *creds)
 	if (!uid_valid(uid) || !gid_valid(gid))
 		return -EINVAL;
 
-	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
+	if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) &&
 	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) ||
-	      uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) &&
+	      uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) &&
 	    ((gid_eq(gid, cred->gid)   || gid_eq(gid, cred->egid) ||
-	      gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) {
+	      gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) {
 	       return 0;
 	}
 	return -EPERM;
-- 
cgit v1.1


From 5e1fccc0bfac4946932b36e4535c03957d35113d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:03:04 +0000
Subject: net: Allow userns root control of the core of the network stack.

Allow an unpriviled user who has created a user namespace, and then
created a network namespace to effectively use the new network
namespace, by reducing capable(CAP_NET_ADMIN) and
capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns,
CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls.

Settings that merely control a single network device are allowed.
Either the network device is a logical network device where
restrictions make no difference or the network device is hardware NIC
that has been explicity moved from the initial network namespace.

In general policy and network stack state changes are allowed
while resource control is left unchanged.

Allow ethtool ioctls.

Allow binding to network devices.
Allow setting the socket mark.
Allow setting the socket priority.

Allow setting the network device alias via sysfs.
Allow setting the mtu via sysfs.
Allow changing the network device flags via sysfs.
Allow setting the network device group via sysfs.

Allow the following network device ioctls.
SIOCGMIIPHY
SIOCGMIIREG
SIOCSIFNAME
SIOCSIFFLAGS
SIOCSIFMETRIC
SIOCSIFMTU
SIOCSIFHWADDR
SIOCSIFSLAVE
SIOCADDMULTI
SIOCDELMULTI
SIOCSIFHWBROADCAST
SIOCSMIIREG
SIOCBONDENSLAVE
SIOCBONDRELEASE
SIOCBONDSETHWADDR
SIOCBONDCHANGEACTIVE
SIOCBRADDIF
SIOCBRDELIF
SIOCSHWTSTAMP

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 17 +++++++++++++----
 net/core/ethtool.c   |  2 +-
 net/core/net-sysfs.c | 15 ++++++++++-----
 net/core/sock.c      |  7 ++++---
 4 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 974199d..0afae8b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5279,7 +5279,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSIFNAME:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
@@ -5300,16 +5300,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	 *	- require strict serialization.
 	 *	- do not return a value
 	 */
+	case SIOCSIFMAP:
+	case SIOCSIFTXQLEN:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		/* fall through */
+	/*
+	 *	These ioctl calls:
+	 *	- require local superuser power.
+	 *	- require strict serialization.
+	 *	- do not return a value
+	 */
 	case SIOCSIFFLAGS:
 	case SIOCSIFMETRIC:
 	case SIOCSIFMTU:
-	case SIOCSIFMAP:
 	case SIOCSIFHWADDR:
 	case SIOCSIFSLAVE:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCSIFHWBROADCAST:
-	case SIOCSIFTXQLEN:
 	case SIOCSMIIREG:
 	case SIOCBONDENSLAVE:
 	case SIOCBONDRELEASE:
@@ -5318,7 +5327,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCBRADDIF:
 	case SIOCBRDELIF:
 	case SIOCSHWTSTAMP:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		/* fall through */
 	case SIOCBONDSLAVEINFOQUERY:
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d64cc2..a870543 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1460,7 +1460,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GEEE:
 		break;
 	default:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 	}
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bcf02f6..c66b8c2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -73,11 +73,12 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t len,
 			    int (*set)(struct net_device *, unsigned long))
 {
-	struct net_device *net = to_net_dev(dev);
+	struct net_device *netdev = to_net_dev(dev);
+	struct net *net = dev_net(netdev);
 	unsigned long new;
 	int ret = -EINVAL;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	ret = kstrtoul(buf, 0, &new);
@@ -87,8 +88,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (dev_isalive(net)) {
-		if ((ret = (*set)(net, new)) == 0)
+	if (dev_isalive(netdev)) {
+		if ((ret = (*set)(netdev, new)) == 0)
 			ret = len;
 	}
 	rtnl_unlock();
@@ -264,6 +265,9 @@ static ssize_t store_tx_queue_len(struct device *dev,
 				  struct device_attribute *attr,
 				  const char *buf, size_t len)
 {
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
 }
 
@@ -271,10 +275,11 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t len)
 {
 	struct net_device *netdev = to_net_dev(dev);
+	struct net *net = dev_net(netdev);
 	size_t count = len;
 	ssize_t ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	/* ignore trailing newline */
diff --git a/net/core/sock.c b/net/core/sock.c
index 0628600..d4f7b58 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -515,7 +515,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 
 	/* Sorry... */
 	ret = -EPERM;
-	if (!capable(CAP_NET_RAW))
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		goto out;
 
 	ret = -EINVAL;
@@ -696,7 +696,8 @@ set_rcvbuf:
 		break;
 
 	case SO_PRIORITY:
-		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+		if ((val >= 0 && val <= 6) ||
+		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			sk->sk_priority = val;
 		else
 			ret = -EPERM;
@@ -813,7 +814,7 @@ set_rcvbuf:
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
 	case SO_MARK:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			ret = -EPERM;
 		else
 			sk->sk_mark = val;
-- 
cgit v1.1


From b51642f6d77b131dc85d1d71029c3cbb5b07c262 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:03:11 +0000
Subject: net: Enable a userns root rtnl calls that are safe for unprivilged
 users

- Only allow moving network devices to network namespaces you have
  CAP_NET_ADMIN privileges over.

- Enable creating/deleting/modifying interfaces
- Enable adding/deleting addresses
- Enable adding/setting/deleting neighbour entries
- Enable adding/removing routes
- Enable adding/removing fib rules
- Enable setting the forwarding state
- Enable adding/removing ipv6 address labels
- Enable setting bridge parameter

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c |  6 ------
 net/core/neighbour.c |  9 ---------
 net/core/rtnetlink.c | 13 ++++---------
 3 files changed, 4 insertions(+), 24 deletions(-)

(limited to 'net/core')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bf5b5b8..58a4ba2 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -275,9 +275,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL, unresolved = 0;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
@@ -427,9 +424,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7adcdaf..f1c0c2e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1620,9 +1620,6 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err = -EINVAL;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	ASSERT_RTNL();
 	if (nlmsg_len(nlh) < sizeof(*ndm))
 		goto out;
@@ -1687,9 +1684,6 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	ASSERT_RTNL();
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
 	if (err < 0)
@@ -1968,9 +1962,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[NDTA_MAX+1];
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
 			  nl_neightbl_policy);
 	if (err < 0)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a40c10b..575a6ee 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1316,6 +1316,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 			err = PTR_ERR(net);
 			goto errout;
 		}
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+			err = -EPERM;
+			goto errout;
+		}
 		err = dev_change_net_namespace(dev, net, ifname);
 		put_net(net);
 		if (err)
@@ -1547,9 +1551,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	char ifname[IFNAMSIZ];
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		goto errout;
@@ -1593,9 +1594,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	int err;
 	LIST_HEAD(list_kill);
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1726,9 +1724,6 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 #ifdef CONFIG_MODULES
 replay:
 #endif
-- 
cgit v1.1


From 1f743b07652f11100bee004e261b9931632beac1 Mon Sep 17 00:00:00 2001
From: Shan Wei <davidshan@tencent.com>
Date: Mon, 12 Nov 2012 15:51:54 +0000
Subject: net: core: use this_cpu_ptr per-cpu helper

flush_tasklet is a struct, not a pointer in percpu var.
so use this_cpu_ptr to get the member pointer.

Signed-off-by: Shan Wei <davidshan@tencent.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/flow.c b/net/core/flow.c
index e318c7e..b0901ee 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -327,11 +327,9 @@ static void flow_cache_flush_tasklet(unsigned long data)
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
-	int cpu;
 	struct tasklet_struct *tasklet;
 
-	cpu = smp_processor_id();
-	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
+	tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet);
 	tasklet->data = (unsigned long)info;
 	tasklet_schedule(tasklet);
 }
-- 
cgit v1.1


From 01f1c6b994bc4e0cf048534e67702e08ae69a890 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Nov 2012 10:59:21 +0000
Subject: net: remove unnecessary wireless includes

The wireless and wext includes in net-sysfs.c aren't
needed, so remove them.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c66b8c2..1f8a13c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,11 +18,9 @@
 #include <net/sock.h>
 #include <net/net_namespace.h>
 #include <linux/rtnetlink.h>
-#include <linux/wireless.h>
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 #include <linux/jiffies.h>
-#include <net/wext.h>
 
 #include "net-sysfs.h"
 
-- 
cgit v1.1


From 388dfc2d2d9c43c251921a397d6fe5ef7dc34731 Mon Sep 17 00:00:00 2001
From: Sachin Kamat <sachin.kamat@linaro.org>
Date: Tue, 20 Nov 2012 00:57:04 +0000
Subject: net: Remove redundant null check before kfree in dev.c

kfree on a null pointer is a no-op.

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0afae8b..7304ea8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1153,10 +1153,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 		return -EINVAL;
 
 	if (!len) {
-		if (dev->ifalias) {
-			kfree(dev->ifalias);
-			dev->ifalias = NULL;
-		}
+		kfree(dev->ifalias);
+		dev->ifalias = NULL;
 		return 0;
 	}
 
-- 
cgit v1.1


From c91f6df2db4972d3cc983e6988b9abf1ad02f5f9 Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Mon, 26 Nov 2012 05:21:08 +0000
Subject: sockopt: Change getsockopt() of SO_BINDTODEVICE to return an
 interface name

Instead of having the getsockopt() of SO_BINDTODEVICE return an index, which
will then require another call like if_indextoname() to get the actual interface
name, have it return the name directly.

This also matches the existing man page description on socket(7) which mentions
the argument being an interface name.

If the value has not been set, zero is returned and optlen will be set to zero
to indicate there is no interface name present.

Added a seqlock to protect this code path, and dev_ifname(), from someone
changing the device name via dev_change_name().

v2: Added seqlock protection while copying device name.

v3: Fixed word wrap in patch.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c  | 21 +++++++++++++++++--
 net/core/sock.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7304ea8..2a5f5586 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -203,6 +203,8 @@ static struct list_head offload_base __read_mostly;
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
+DEFINE_SEQLOCK(devnet_rename_seq);
+
 static inline void dev_base_seq_inc(struct net *net)
 {
 	while (++net->dev_base_seq == 0);
@@ -1091,22 +1093,31 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	if (dev->flags & IFF_UP)
 		return -EBUSY;
 
-	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+	write_seqlock(&devnet_rename_seq);
+
+	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+		write_sequnlock(&devnet_rename_seq);
 		return 0;
+	}
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
 	err = dev_get_valid_name(net, dev, newname);
-	if (err < 0)
+	if (err < 0) {
+		write_sequnlock(&devnet_rename_seq);
 		return err;
+	}
 
 rollback:
 	ret = device_rename(&dev->dev, dev->name);
 	if (ret) {
 		memcpy(dev->name, oldname, IFNAMSIZ);
+		write_sequnlock(&devnet_rename_seq);
 		return ret;
 	}
 
+	write_sequnlock(&devnet_rename_seq);
+
 	write_lock_bh(&dev_base_lock);
 	hlist_del_rcu(&dev->name_hlist);
 	write_unlock_bh(&dev_base_lock);
@@ -1124,6 +1135,7 @@ rollback:
 		/* err >= 0 after dev_alloc_name() or stores the first errno */
 		if (err >= 0) {
 			err = ret;
+			write_seqlock(&devnet_rename_seq);
 			memcpy(dev->name, oldname, IFNAMSIZ);
 			goto rollback;
 		} else {
@@ -4148,6 +4160,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 {
 	struct net_device *dev;
 	struct ifreq ifr;
+	unsigned seq;
 
 	/*
 	 *	Fetch the caller's info block.
@@ -4156,6 +4169,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
+retry:
+	seq = read_seqbegin(&devnet_rename_seq);
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
 	if (!dev) {
@@ -4165,6 +4180,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 
 	strcpy(ifr.ifr_name, dev->name);
 	rcu_read_unlock();
+	if (read_seqretry(&devnet_rename_seq, seq))
+		goto retry;
 
 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 		return -EFAULT;
diff --git a/net/core/sock.c b/net/core/sock.c
index d4f7b58..a692ef4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);
 
-static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+				int optlen)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
@@ -562,6 +563,59 @@ out:
 	return ret;
 }
 
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+				int __user *optlen, int len)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	char devname[IFNAMSIZ];
+	unsigned seq;
+
+	if (sk->sk_bound_dev_if == 0) {
+		len = 0;
+		goto zero;
+	}
+
+	ret = -EINVAL;
+	if (len < IFNAMSIZ)
+		goto out;
+
+retry:
+	seq = read_seqbegin(&devnet_rename_seq);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+	ret = -ENODEV;
+	if (!dev) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	strcpy(devname, dev->name);
+	rcu_read_unlock();
+	if (read_seqretry(&devnet_rename_seq, seq))
+		goto retry;
+
+	len = strlen(devname) + 1;
+
+	ret = -EFAULT;
+	if (copy_to_user(optval, devname, len))
+		goto out;
+
+zero:
+	ret = -EFAULT;
+	if (put_user(len, optlen))
+		goto out;
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 {
 	if (valbool)
@@ -589,7 +643,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	 */
 
 	if (optname == SO_BINDTODEVICE)
-		return sock_bindtodevice(sk, optval, optlen);
+		return sock_setbindtodevice(sk, optval, optlen);
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -1075,15 +1129,17 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_NOFCS:
 		v.val = sock_flag(sk, SOCK_NOFCS);
 		break;
+
 	case SO_BINDTODEVICE:
-		v.val = sk->sk_bound_dev_if;
-		break;
+		return sock_getbindtodevice(sk, optval, optlen, len);
+
 	case SO_GET_FILTER:
 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
 		if (len < 0)
 			return len;
 
 		goto lenout;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.1


From bb728820fe7c42fdb838ab2745fb5fe6b18b5ffa Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Wed, 28 Nov 2012 21:55:25 +0000
Subject: core: make GRO methods static.

This patch changes three methods to be static and removes their
EXPORT_SYMBOLs in core/dev.c and their external declaration in
netdevice.h. The methods, dev_gro_receive(), napi_frags_finish() and
napi_skb_finish(), which are in the GRO rx path, are not used
outside core/dev.c.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2a5f5586..2f94df2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3592,7 +3592,7 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
 	struct packet_offload *ptype;
@@ -3683,7 +3683,6 @@ normal:
 	ret = GRO_NORMAL;
 	goto pull;
 }
-EXPORT_SYMBOL(dev_gro_receive);
 
 static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
@@ -3710,7 +3709,7 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	switch (ret) {
 	case GRO_NORMAL:
@@ -3736,7 +3735,6 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_skb_finish);
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
 {
@@ -3788,7 +3786,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 			       gro_result_t ret)
 {
 	switch (ret) {
@@ -3813,7 +3811,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_frags_finish);
 
 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
-- 
cgit v1.1


From c07135633bee3f01a6454d15b6411f32cfbeb2fd Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Fri, 30 Nov 2012 01:08:47 +0000
Subject: rtnelink: remove unused parameter from rtnl_create_link().

This patch removes an unused parameter (src_net) from rtnl_create_link()
method and from the method single invocation, in veth.
This parameter was used in the past when calling
ops->get_tx_queues(src_net, tb) in rtnl_create_link().
The get_tx_queues() member of rtnl_link_ops was replaced by two methods,
get_num_tx_queues() and get_num_rx_queues(), which do not get any
parameter. This was done in commit d40156aa5ecbd51fed932ed4813df82b56e5ff4d by
Jiri Pirko ("rtnl: allow to specify different num for rx and tx queue count").

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 575a6ee..1868625 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1642,7 +1642,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
 }
 EXPORT_SYMBOL(rtnl_configure_link);
 
-struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
+struct net_device *rtnl_create_link(struct net *net,
 	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
 {
 	int err;
@@ -1840,7 +1840,7 @@ replay:
 		if (IS_ERR(dest_net))
 			return PTR_ERR(dest_net);
 
-		dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
+		dev = rtnl_create_link(dest_net, ifname, ops, tb);
 		if (IS_ERR(dev)) {
 			err = PTR_ERR(dev);
 			goto out;
-- 
cgit v1.1


From 4e66ae2ea371cf431283e2cb95480eb860432856 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge.hallyn@canonical.com>
Date: Mon, 3 Dec 2012 16:17:12 +0000
Subject: net: dev_change_net_namespace: send a KOBJ_REMOVED/KOBJ_ADD

When a new nic is created in namespace ns1, the kernel sends a KOBJ_ADD uevent
to ns1.  When the nic is moved to ns2, we only send a KOBJ_MOVE to ns2, and
nothing to ns1.

This patch changes that behavior so that when moving a nic from ns1 to ns2, we
send a KOBJ_REMOVED to ns1 and KOBJ_ADD to ns2.  (The KOBJ_MOVE is still
sent to ns2).

The effects of this can be seen when starting and stopping containers in
an upstart based host.  Lxc will create a pair of veth nics, the kernel
sends KOBJ_ADD, and upstart starts network-instance jobs for each.  When
one nic is moved to the container, because no KOBJ_REMOVED event is
received, the network-instance job for that veth never goes away.  This
was reported at https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/1065589
With this patch the networ-instance jobs properly go away.

The other oddness solved here is that if a nic is passed into a running
upstart-based container, without this patch no network-instance job is
started in the container.  But when the container creates a new nic
itself (ip link add new type veth) then network-interface jobs are
created.  With this patch, behavior comes in line with a regular host.

v2: also send KOBJ_ADD to new netns.  There will then be a
_MOVE event from the device_rename() call, but that should
be innocuous.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2f94df2..0aea3fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6418,6 +6418,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	dev_uc_flush(dev);
 	dev_mc_flush(dev);
 
+	/* Send a netdev-removed uevent to the old namespace */
+	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
 
@@ -6429,6 +6432,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 			dev->iflink = dev->ifindex;
 	}
 
+	/* Send a netdev-add uevent to the new namespace */
+	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+
 	/* Fixup kobjects */
 	err = device_rename(&dev->dev, dev->name);
 	WARN_ON(err);
-- 
cgit v1.1


From ce46cc64d47a8afaf13c300b09a7f9c29f4979b6 Mon Sep 17 00:00:00 2001
From: Shan Wei <davidshan@tencent.com>
Date: Tue, 4 Dec 2012 18:49:15 +0000
Subject: net: neighbour: prohibit negative value for unres_qlen_bytes
 parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

unres_qlen_bytes and unres_qlen are int type.
But multiple relation(unres_qlen_bytes = unres_qlen * SKB_TRUESIZE(ETH_FRAME_LEN))
will cause type overflow when seting unres_qlen. e.g.

$ echo 1027506 > /proc/sys/net/ipv4/neigh/eth1/unres_qlen
$ cat /proc/sys/net/ipv4/neigh/eth1/unres_qlen
1182657265
$ cat /proc/sys/net/ipv4/neigh/eth1/unres_qlen_bytes
-2147479756

The gutted value is not that we setting。
But user/administrator don't know this is caused by int type overflow.

what's more, it is meaningless and even dangerous that unres_qlen_bytes is set
with negative number. Because, for unresolved neighbour address, kernel will cache packets
without limit in __neigh_event_send()(e.g. (u32)-1 = 2GB).

Signed-off-by: Shan Wei <davidshan@tencent.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f1c0c2e..36fc692 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -62,6 +62,9 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
 static void neigh_update_notify(struct neighbour *neigh);
 static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
 
+static int zero;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
 static struct neigh_table *neigh_tables;
 #ifdef CONFIG_PROC_FS
 static const struct file_operations neigh_stat_seq_fops;
@@ -1787,8 +1790,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) ||
 	    /* approximative value for deprecated QUEUE_LEN (in packets) */
 	    nla_put_u32(skb, NDTPA_QUEUE_LEN,
-			DIV_ROUND_UP(parms->queue_len_bytes,
-				     SKB_TRUESIZE(ETH_FRAME_LEN))) ||
+			parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
 	    nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) ||
 	    nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) ||
 	    nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) ||
@@ -2777,9 +2779,13 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
 	int size, ret;
 	ctl_table tmp = *ctl;
 
+	tmp.extra1 = &zero;
+	tmp.extra2 = &unres_qlen_max;
 	tmp.data = &size;
-	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
-	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
 	if (write && !ret)
 		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
 	return ret;
@@ -2865,7 +2871,8 @@ static struct neigh_sysctl_table {
 			.procname	= "unres_qlen_bytes",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.proc_handler	= proc_dointvec,
+			.extra1		= &zero,
+			.proc_handler   = proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_PROXY_QLEN] = {
 			.procname	= "proxy_qlen",
-- 
cgit v1.1


From b93196dc5af7729ff7cc50d3d322ab1a364aa14f Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Thu, 6 Dec 2012 10:04:04 +0800
Subject: net: fix some compiler warning in net/core/neighbour.c

net/core/neighbour.c:65:12: warning: 'zero' defined but not used [-Wunused-variable]
net/core/neighbour.c:66:12: warning: 'unres_qlen_max' defined but not used [-Wunused-variable]

These variables are only used when CONFIG_SYSCTL is defined,
so move them under #ifdef CONFIG_SYSCTL.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Acked-by: Shan Wei <davidshan@tencent.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 36fc692..c815f28 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -62,9 +62,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
 static void neigh_update_notify(struct neighbour *neigh);
 static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
 
-static int zero;
-static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
-
 static struct neigh_table *neigh_tables;
 #ifdef CONFIG_PROC_FS
 static const struct file_operations neigh_stat_seq_fops;
@@ -2772,6 +2769,8 @@ EXPORT_SYMBOL(neigh_app_ns);
 #endif /* CONFIG_ARPD */
 
 #ifdef CONFIG_SYSCTL
+static int zero;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
 
 static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
 			   size_t *lenp, loff_t *ppos)
-- 
cgit v1.1


From e3d8fabee3b66ce158b2603f270479b84b6e4ba7 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 3 Dec 2012 01:16:32 +0000
Subject: net: call notifiers for mtu change even if iface is not up

Do the same thing as in set mac. Call notifiers every time.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0aea3fe..307142a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4971,7 +4971,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	else
 		dev->mtu = new_mtu;
 
-	if (!err && dev->flags & IFF_UP)
+	if (!err)
 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 	return err;
 }
-- 
cgit v1.1


From 6a674e9c75b17e7a88ff15b3c2e269eed54f7cfb Mon Sep 17 00:00:00 2001
From: Joseph Gasparakis <joseph.gasparakis@intel.com>
Date: Fri, 7 Dec 2012 14:14:14 +0000
Subject: net: Add support for hardware-offloaded encapsulation

This patch adds support in the kernel for offloading in the NIC Tx and Rx
checksumming for encapsulated packets (such as VXLAN and IP GRE).

For Tx encapsulation offload, the driver will need to set the right bits
in netdev->hw_enc_features. The protocol driver will have to set the
skb->encapsulation bit and populate the inner headers, so the NIC driver will
use those inner headers to calculate the csum in hardware.

For Rx encapsulation offload, the driver will need to set again the
skb->encapsulation flag and the skb->ip_csum to CHECKSUM_UNNECESSARY.
In that case the protocol driver should push the decapsulated packet up
to the stack, again with CHECKSUM_UNNECESSARY. In ether case, the protocol
driver should set the skb->encapsulation flag back to zero. Finally the
protocol driver should have NETIF_F_RXCSUM flag set in its features.

Signed-off-by: Joseph Gasparakis <joseph.gasparakis@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 880722e2..ccbabf5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -682,11 +682,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->transport_header	= old->transport_header;
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
+	new->inner_transport_header = old->inner_transport_header;
+	new->inner_network_header = old->inner_transport_header;
 	skb_dst_copy(new, old);
 	new->rxhash		= old->rxhash;
 	new->ooo_okay		= old->ooo_okay;
 	new->l4_rxhash		= old->l4_rxhash;
 	new->no_fcs		= old->no_fcs;
+	new->encapsulation	= old->encapsulation;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
@@ -892,6 +895,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->network_header   += offset;
 	if (skb_mac_header_was_set(new))
 		new->mac_header	      += offset;
+	new->inner_transport_header += offset;
+	new->inner_network_header   += offset;
 #endif
 	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
 	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
@@ -1089,6 +1094,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->network_header   += off;
 	if (skb_mac_header_was_set(skb))
 		skb->mac_header += off;
+	skb->inner_transport_header += off;
+	skb->inner_network_header += off;
 	/* Only adjust this if it actually is csum_start rather than csum */
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		skb->csum_start += nhead;
@@ -1188,6 +1195,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	n->network_header   += off;
 	if (skb_mac_header_was_set(skb))
 		n->mac_header += off;
+	n->inner_transport_header += off;
+	n->inner_network_header	   += off;
 #endif
 
 	return n;
-- 
cgit v1.1


From fc70fb640b159f1d6bf5ad2321cd55e874c8d1b8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 7 Dec 2012 14:14:15 +0000
Subject: net: Handle encapsulated offloads before fragmentation or handing to
 lower dev

This change allows the VXLAN to enable Tx checksum offloading even on
devices that do not support encapsulated checksum offloads. The
advantage to this is that it allows for the lower device to change due
to routing table changes without impacting features on the VXLAN itself.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 307142a..a4c4a1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2324,6 +2324,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			skb->vlan_tci = 0;
 		}
 
+		/* If encapsulation offload request, verify we are testing
+		 * hardware encapsulation features instead of standard
+		 * features for the netdev
+		 */
+		if (skb->encapsulation)
+			features &= dev->hw_enc_features;
+
 		if (netif_needs_gso(skb, features)) {
 			if (unlikely(dev_gso_segment(skb, features)))
 				goto out_kfree_skb;
@@ -2339,8 +2346,12 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			 * checksumming here.
 			 */
 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
-				skb_set_transport_header(skb,
-					skb_checksum_start_offset(skb));
+				if (skb->encapsulation)
+					skb_set_inner_transport_header(skb,
+						skb_checksum_start_offset(skb));
+				else
+					skb_set_transport_header(skb,
+						skb_checksum_start_offset(skb));
 				if (!(features & NETIF_F_ALL_CSUM) &&
 				     skb_checksum_help(skb))
 					goto out_kfree_skb;
-- 
cgit v1.1


From 4b5511ebc7e1cf94e4f13be19c2cf3e90edc3395 Mon Sep 17 00:00:00 2001
From: Abhijit Pawar <abhi.c.pawar@gmail.com>
Date: Sun, 9 Dec 2012 23:12:28 +0000
Subject: net: remove obsolete simple_strto<foo>

This patch replace the obsolete simple_strto<foo> with kstrto<foo>

Signed-off-by: Abhijit Pawar <abhi.c.pawar@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 77a0388..12c129f72 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -674,7 +674,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 		if ((delim = strchr(cur, '@')) == NULL)
 			goto parse_failed;
 		*delim = 0;
-		np->local_port = simple_strtol(cur, NULL, 10);
+		if (kstrtou16(cur, 10, &np->local_port))
+			goto parse_failed;
 		cur = delim;
 	}
 	cur++;
@@ -706,6 +707,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 		if (*cur == ' ' || *cur == '\t')
 			np_info(np, "warning: whitespace is not allowed\n");
 		np->remote_port = simple_strtol(cur, NULL, 10);
+		if (kstrtou16(cur, 10, &np->remote_port))
+			goto parse_failed;
 		cur = delim;
 	}
 	cur++;
-- 
cgit v1.1


From 89c5fa3369a47db0df904c45c1c26e64c0404430 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 10 Dec 2012 13:28:16 +0000
Subject: net: gro: dev_gro_receive() cleanup

__napi_gro_receive() is inlined from two call sites for no good reason.

Lets move the prep stuff in a function of its own, called only if/when
needed. This saves 300 bytes on x86 :

# size net/core/dev.o.after net/core/dev.o.before
   text	   data	    bss	    dec	    hex	filename
  51968	   1238	   1040	  54246	   d3e6	net/core/dev.o.before
  51664	   1238	   1040	  53942	   d2b6	net/core/dev.o.after

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a4c4a1b..4783850 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3603,6 +3603,28 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
+static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+{
+	struct sk_buff *p;
+	unsigned int maclen = skb->dev->hard_header_len;
+
+	for (p = napi->gro_list; p; p = p->next) {
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		if (maclen == ETH_HLEN)
+			diffs |= compare_ether_header(skb_mac_header(p),
+						      skb_gro_mac_header(skb));
+		else if (!diffs)
+			diffs = memcmp(skb_mac_header(p),
+				       skb_gro_mac_header(skb),
+				       maclen);
+		NAPI_GRO_CB(p)->same_flow = !diffs;
+		NAPI_GRO_CB(p)->flush = 0;
+	}
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
@@ -3619,6 +3641,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
+	gro_list_prepare(napi, skb);
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		if (ptype->type != type || !ptype->callbacks.gro_receive)
@@ -3695,30 +3719,6 @@ normal:
 	goto pull;
 }
 
-static inline gro_result_t
-__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
-	struct sk_buff *p;
-	unsigned int maclen = skb->dev->hard_header_len;
-
-	for (p = napi->gro_list; p; p = p->next) {
-		unsigned long diffs;
-
-		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
-		diffs |= p->vlan_tci ^ skb->vlan_tci;
-		if (maclen == ETH_HLEN)
-			diffs |= compare_ether_header(skb_mac_header(p),
-						      skb_gro_mac_header(skb));
-		else if (!diffs)
-			diffs = memcmp(skb_mac_header(p),
-				       skb_gro_mac_header(skb),
-				       maclen);
-		NAPI_GRO_CB(p)->same_flow = !diffs;
-		NAPI_GRO_CB(p)->flush = 0;
-	}
-
-	return dev_gro_receive(napi, skb);
-}
 
 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
@@ -3768,7 +3768,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	skb_gro_reset_offset(skb);
 
-	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
+	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
@@ -3866,7 +3866,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
 	if (!skb)
 		return GRO_DROP;
 
-	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 }
 EXPORT_SYMBOL(napi_gro_frags);
 
-- 
cgit v1.1


From a71258d79e3d05632e90c9f7db5ccf929d276529 Mon Sep 17 00:00:00 2001
From: Abhijit Pawar <abhi.c.pawar@gmail.com>
Date: Mon, 10 Dec 2012 19:30:52 +0000
Subject: net: remove obsolete simple_strto<foo>

This patch removes the redundant occurences of simple_strto<foo>

Signed-off-by: Abhijit Pawar <abhi.c.pawar@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 12c129f72..3151acf 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -706,7 +706,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 		*delim = 0;
 		if (*cur == ' ' || *cur == '\t')
 			np_info(np, "warning: whitespace is not allowed\n");
-		np->remote_port = simple_strtol(cur, NULL, 10);
 		if (kstrtou16(cur, 10, &np->remote_port))
 			goto parse_failed;
 		cur = delim;
-- 
cgit v1.1


From 75be437230b06fca87908a787f70de0ce7fbab8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 11 Dec 2012 08:38:29 +0000
Subject: net: gro: avoid double copy in skb_gro_receive()

__copy_skb_header(nskb, p) already copied p->cb[], no need to copy
it again.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ccbabf5..ac9e44a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3028,7 +3028,6 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	memcpy(skb_mac_header(nskb), skb_mac_header(p),
 	       p->data - skb_mac_header(p));
 
-	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
 	skb_shinfo(nskb)->frag_list = p;
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
 	pinfo->gso_size = 0;
-- 
cgit v1.1