From d83769a580f1132ac26439f50068a29b02be535e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Apr 2015 18:32:24 -0700 Subject: tcp: fix possible deadlock in tcp_send_fin() Using sk_stream_alloc_skb() in tcp_send_fin() is dangerous in case a huge process is killed by OOM, and tcp_mem[2] is hit. To be able to free memory, we need to make progress, so this patch allows FIN packets to not care about tcp_mem[2] if the skb allocation succeeded. In a follow-up patch, we might abort tcp_send_fin()'s infinite loop when TIF_MEMDIE is set on this thread, as the memory allocator has already done its best to get extra memory. This patch reverts d22e15371811 ("tcp: fix tcp fin memory accounting"). Fixes: d22e15371811 ("tcp: fix tcp fin memory accounting") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8d7e0..2ade67b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2812,6 +2812,21 @@ begin_fwd: } } +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could loop forever. + */ +static void sk_forced_wmem_schedule(struct sock *sk, int size) +{ + int amt, status; + + if (size <= sk->sk_forward_alloc) + return; + amt = sk_mem_pages(size); + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk_memory_allocated_add(sk, amt, &status); +} + /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -2834,11 +2849,14 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk->sk_allocation); if (skb) break; yield(); } + skb_reserve(skb, MAX_TCP_HEADER); + sk_forced_wmem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit v1.1 From 03c57747a7020a28a200e7e920fb48ecdc9b0fb8 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:37 +0100 Subject: mpls: Per-device MPLS state Add per-device MPLS state to supported interfaces. Use the presence of this state in mpls_route_add to determine that this is a supported interface. Use the presence of mpls_dev to drop packets that arrived on an unsupported interface; previously they were allowed through. Cc: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S.
Miller --- net/mpls/af_mpls.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- net/mpls/internal.h | 3 +++ 2 files changed, 51 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index db8a2ea..ad45017 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + static bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; @@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ + mdev = mpls_dev_get(dev); + if (!mdev) + goto drop; + if (skb->pkt_type != PACKET_HOST) goto drop; @@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg) if (!dev) goto errout; - /* For now just support ethernet devices */ + /* Ensure this is a supported device */ err = -EINVAL; - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + if (!mpls_dev_get(dev)) goto errout; err = -EINVAL; @@ -428,10 +438,27 @@ errout: return err; } +static struct mpls_dev *mpls_add_dev(struct net_device *dev) +{ + struct mpls_dev *mdev; + int err = -ENOMEM; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(err); + + rcu_assign_pointer(dev->mpls_ptr, mdev); + + return mdev; +} + static void mpls_ifdown(struct net_device *dev) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); + struct mpls_dev *mdev; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -443,14 +470,33 @@ static void mpls_ifdown(struct net_device *dev) continue; rt->rt_dev = NULL; } + + mdev = mpls_dev_get(dev); + if (!mdev) + return; + + RCU_INIT_POINTER(dev->mpls_ptr, NULL); + + kfree(mdev); } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mpls_dev *mdev; switch(event) { + case NETDEV_REGISTER: + /* For now just support ethernet devices */ + if ((dev->type == ARPHRD_ETHER) || + (dev->type == ARPHRD_LOOPBACK)) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); + } + break; + case NETDEV_UNREGISTER: mpls_ifdown(dev); break; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index fb6de920..8090cb3 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -22,6 +22,9 @@ struct mpls_entry_decoded { u8 bos; }; +struct mpls_dev { +}; + struct sk_buff; static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -- cgit v1.1 From 37bde79979c3862c79294c62ddcef7efc477e4bf Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:38 +0100 Subject: mpls: Per-device enabling of packet input An MPLS network is a single trust domain where the edges must be in control of what labels make their way into the core. 
The simplest way of ensuring this is for the edge device to always impose the labels, and not allow forwarding of labeled traffic from untrusted neighbours. This is achieved by allowing a per-device configuration of whether MPLS traffic input from that interface should be processed or not. To be secure by default, the default state is changed to MPLS being disabled on all interfaces unless explicitly enabled, and no global option is provided to change the default. Whilst this differs from other protocols (e.g. IPv6), network operators are used to explicitly enabling MPLS forwarding on interfaces, and with the number of links to the MPLS core typically fairly low, this doesn't present too much of a burden on operators. Cc: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-- net/mpls/internal.h | 3 +++ 2 files changed, 69 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index ad45017..9fdd94c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -150,7 +150,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); - if (!mdev) + if (!mdev || !mdev->input_enabled) goto drop; if (skb->pkt_type != PACKET_HOST) @@ -438,6 +438,60 @@ errout: return err; } +#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ + (&((struct mpls_dev *)0)->field) + +static const struct ctl_table mpls_dev_table[] = { + { + .procname = "input", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), + }, + { } +}; + +static int mpls_dev_sysctl_register(struct net_device *dev, + struct mpls_dev *mdev) +{ + char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct ctl_table *table; + int i; + + table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); + if (!table) + goto out; + + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute.
+ */ + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + table[i].data = (char *)mdev + (uintptr_t)table[i].data; + + snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); + + mdev->sysctl = register_net_sysctl(dev_net(dev), path, table); + if (!mdev->sysctl) + goto free; + + return 0; + +free: + kfree(table); +out: + return -ENOBUFS; +} + +static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev) +{ + struct ctl_table *table; + + table = mdev->sysctl->ctl_table_arg; + unregister_net_sysctl_table(mdev->sysctl); + kfree(table); +} + static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; @@ -449,9 +503,17 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (!mdev) return ERR_PTR(err); + err = mpls_dev_sysctl_register(dev, mdev); + if (err) + goto free; + rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; + +free: + kfree(mdev); + return ERR_PTR(err); } static void mpls_ifdown(struct net_device *dev) @@ -475,6 +537,8 @@ static void mpls_ifdown(struct net_device *dev) if (!mdev) return; + mpls_dev_sysctl_unregister(mdev); + RCU_INIT_POINTER(dev->mpls_ptr, NULL); kfree(mdev); @@ -958,7 +1022,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } -static struct ctl_table mpls_table[] = { +static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 8090cb3..693877d 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -23,6 +23,9 @@ struct mpls_entry_decoded { }; struct mpls_dev { + int input_enabled; + + struct ctl_table_header *sysctl; }; struct sk_buff; -- cgit v1.1 From 5a9ab0176198d91dfc153f5e6c5fdc5afa613607 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:39 +0100 Subject: mpls: Prevent use of implicit NULL label as outgoing label The reserved implicit-NULL label isn't allowed to appear in the label stack for packets, so make it an error for the control plane to specify it as an outgoing label. Suggested-by: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 9fdd94c..954810c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -646,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla, if ((dec.bos != bos) || dec.ttl || dec.tc) return -EINVAL; + switch (dec.label) { + case LABEL_IMPLICIT_NULL: + /* RFC3032: This is a label that an LSR may + * assign and distribute, but which never + * actually appears in the encapsulation. + */ + return -EINVAL; + } + label[i] = dec.label; } *labels = nla_labels; -- cgit v1.1 From 26349c71b4323e62f8de0fe45f2f06c4df535b9b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 16:56:16 +0200 Subject: ip6_gre: use netdev_alloc_pcpu_stats() The code there just open-codes the same, so use the provided macro instead. Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b5e6cc1..a38d3ac 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; - int i; tunnel = netdev_priv(dev); @@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_sw_netstats *ip6gre_tunnel_stats; - ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6gre_tunnel_stats->syncp); - } - return 0; } -- cgit v1.1 From 79930f5892e134c6da1254389577fffb8bd72c66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2015 07:33:36 -0700 Subject: net: do not deplete pfmemalloc reserve build_skb() should look at the page pfmemalloc status. If set, this means the page allocator allocated this page in the expectation it would help to free other pages. The networking stack can do that only if skb->pfmemalloc is also set. Also, we must refrain from using high-order pages from the pfmemalloc reserve, so __page_frag_refill() must also use __GFP_NOMEMALLOC for them. Under memory pressure, using order-0 pages is probably the best strategy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d1967da..456ead5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -311,7 +311,11 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; + if (frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -348,7 +352,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, gfp_t gfp = gfp_mask; if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); nc->frag.size = PAGE_SIZE << (page ? order : 0); } -- cgit v1.1 From def81f69bfbd70a3278a7592a4ab8717300cbac1 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 23 Apr 2015 09:37:38 -0400 Subject: tipc: fix topology server broken issue When a new topology server is launched in a new namespace, its listening socket is inserted into the "init ns" namespace's socket hash table rather than the one owned by the new namespace. Although the socket's namespace is forcibly changed to the new namespace later, the socket is still stored in the socket hash table of the "init ns" namespace. When a client created in the new namespace connects to its own topology server, the connection fails because its server's socket could not be found from its own namespace's socket table.
If __sock_create() is used instead of the original sock_create_kern() to create the server's socket, specifying the expected namespace, the socket will be inserted into that namespace's socket table, thereby avoiding the broken topology server issue. Fixes: 76100a8a64bc ("tipc: fix netns refcnt leak") Reported-by: Erik Hugne Signed-off-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index ab6183c..77ff03e 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - sk_release_kernel(sk); + sock_release(sock); con->sock = NULL; } @@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock); + ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1); if (ret < 0) return NULL; - - sk_change_net(sock->sk, s->net); - ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, (char *)&s->imp, sizeof(s->imp)); if (ret < 0) @@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) create_err: kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); return NULL; } -- cgit v1.1 From 9871b27f6705fc6e0ba633b136369a289b2bfb99 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 23 Apr 2015 09:37:39 -0400 Subject: tipc: fix random link reset problem In the function tipc_sk_rcv(), the stack variable 'err' is only initialized to TIPC_ERR_NO_PORT for the first iteration over the link input queue. If a chain of messages is received from a link, failure to look up the socket for any but the first message will cause the message to bounce back out on a random link. We fix this by properly initializing err. Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ee90d74..9074b5c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; - int err = -TIPC_ERR_NO_PORT; + int err; struct sk_buff *skb; struct tipc_sock *tsk; struct tipc_net *tn; struct sock *sk; while (skb_queue_len(inputq)) { + err = -TIPC_ERR_NO_PORT; skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); -- cgit v1.1 From 73a317377303b5ec14d4703d73ba87efffbb779d Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 23 Apr 2015 09:37:40 -0400 Subject: tipc: fix node refcount issue When link statistics are dumped over netlink, we iterate over the list of peer nodes and append each link's statistics to the netlink msg. In the case where the dump is resumed after filling up a nlmsg, the node refcnt is decremented without having been incremented previously, which may cause the node reference to be freed. When this happens, the following info/stacktrace will be generated, followed by a crash or undefined behavior. We fix this by removing the erroneous call to tipc_node_put inside the loop that iterates over nodes.
[ 384.312303] INFO: trying to register non-static key. [ 384.313110] the code is fine but needs lockdep annotation. [ 384.313290] turning off the locking correctness validator. [ 384.313290] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.0.0+ #13 [ 384.313290] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 384.313290] ffff88003c6d0290 ffff88003cc03ca8 ffffffff8170adf1 0000000000000007 [ 384.313290] ffffffff82728730 ffff88003cc03d38 ffffffff810a6a6d 00000000001d7200 [ 384.313290] ffff88003c6d0ab0 ffff88003cc03ce8 0000000000000285 0000000000000001 [ 384.313290] Call Trace: [ 384.313290] [] dump_stack+0x4c/0x65 [ 384.313290] [] __lock_acquire+0xf3d/0xf50 [ 384.313290] [] lock_acquire+0xd5/0x290 [ 384.313290] [] ? link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] ? link_state_event+0x4e0/0x4e0 [tipc] [ 384.313290] [] _raw_spin_lock_bh+0x40/0x80 [ 384.313290] [] ? link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] call_timer_fn+0xb8/0x490 [ 384.313290] [] ? process_timeout+0x10/0x10 [ 384.313290] [] run_timer_softirq+0x21c/0x420 [ 384.313290] [] ? link_state_event+0x4e0/0x4e0 [tipc] [ 384.313290] [] __do_softirq+0xf4/0x630 [ 384.313290] [] irq_exit+0x5d/0x60 [ 384.313290] [] smp_apic_timer_interrupt+0x41/0x50 [ 384.313290] [] apic_timer_interrupt+0x70/0x80 [ 384.313290] [] ? default_idle+0x20/0x210 [ 384.313290] [] ? default_idle+0x1e/0x210 [ 384.313290] [] arch_cpu_idle+0xa/0x10 [ 384.313290] [] cpu_startup_entry+0x2c3/0x530 [ 384.313290] [] ? clockevents_register_device+0x113/0x200 [ 384.313290] [] start_secondary+0x13f/0x170 Fixes: 8a0f6ebe8494 ("tipc: involve reference counter for node structure") Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index a6b30df..57be6e6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2143,7 +2143,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) err = __tipc_nl_add_node_links(net, &msg, node, &prev_link); tipc_node_unlock(node); - tipc_node_put(node); if (err) goto out; -- cgit v1.1 From d1ab39f17f8653021620d0355ee1cd24d7442a4f Mon Sep 17 00:00:00 2001 From: Jason Eastman Date: Wed, 22 Apr 2015 00:56:42 -0600 Subject: net: unix: garbage: fixed several comment and whitespace style issues fixed several comment and whitespace style issues Signed-off-by: Jason Eastman Signed-off-by: David S. Miller --- net/unix/garbage.c | 70 ++++++++++++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 99f7012..a73a226 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; - struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); - /* - * Socket ? - */ + /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; - /* - * PF_UNIX ? - */ + /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; } return u_sock; } -/* - * Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
*/ void unix_inflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -142,10 +139,13 @@ void unix_inflight(struct file *fp) void unix_notinflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); + if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; @@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* - * Do we have file descriptors ? - */ + /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; - /* - * Process the descriptors of this socket - */ + /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; + while (nfd--) { - /* - * Get the socket the fd matches - * if it indeed does so - */ + /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); + if (sk) { struct unix_sock *u = unix_sk(sk); - /* - * Ignore non-candidates, they could + /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; + func(u); } } @@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { - if (x->sk_state != TCP_LISTEN) + if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); - else { + } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); - /* - * For a listening socket collect the queued embryos + /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); - /* - * An embryo cannot be in-flight, so it's safe + /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); @@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk) static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); - /* - * If this still might be part of a cycle, move it to the end + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ @@ -263,8 +255,7 @@ static bool gc_in_progress; void wait_for_unix_gc(void) { - /* - * If number of inflight sockets is insane, + /* If number of inflight sockets is insane, * force a garbage collect right now. */ if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) @@ -288,8 +279,7 @@ void unix_gc(void) goto out; gc_in_progress = true; - /* - * First, select candidates for garbage collection. Only + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * @@ -320,15 +310,13 @@ void unix_gc(void) } } - /* - * Now remove all internal in-flight reference to children of + /* Now remove all internal in-flight reference to children of * the candidates. 
*/ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); - /* - * Restore the references for children of all candidates, + /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * @@ -350,8 +338,7 @@ void unix_gc(void) } list_del(&cursor); - /* - * not_cycle_list contains those sockets which do not make up a + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(&not_cycle_list)) { @@ -360,8 +347,7 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* - * Now gc_candidates contains only garbage. Restore original + /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ -- cgit v1.1 From 845704a535e9b3c76448f52af1b70e4422ea03fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2015 10:42:39 -0700 Subject: tcp: avoid looping in tcp_send_fin() The presence of an unbound loop in tcp_send_fin() had always been hard to explain when analyzing crash dumps involving gigantic dying processes with millions of sockets. Let's try a different strategy: In case of memory pressure, try to add the FIN flag to the last packet in the write queue, even if the packet was already sent. The TCP stack will be able to deliver this FIN after a timeout event. Note that since this FIN is delivered by a retransmit, it also carries a Push flag with our current implementation. By checking sk_under_memory_pressure(), we anticipate that cooking many FIN packets might deplete tcp memory. In the case where we could not allocate a packet, even with a __GFP_WAIT allocation, not sending a FIN seems quite reasonable if it allows us to get rid of this socket, free memory, and not block the process from eventually doing other useful work. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2ade67b..a369e8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2814,7 +2814,8 @@ begin_fwd: /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. - * Otherwise tcp_send_fin() could loop forever. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. */ static void sk_forced_wmem_schedule(struct sock *sk, int size) { @@ -2827,33 +2828,40 @@ static void sk_forced_wmem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt, &status); } -/* Send a fin. The caller locks the socket for us. This cannot be - * allowed to fail queueing a FIN frame under any circumstances. +/* Send a FIN. The caller locks the socket for us. + * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_write_queue_tail(sk); - int mss_now; - /* Optimization, tack on the FIN if we have a queue of - * unsent frames. But be careful about outgoing SACKS - * and IP options. + /* Optimization, tack on the FIN if we have one skb in write queue and + * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout, + * as TCP stack thinks it has already been transmitted. */ - mss_now = tcp_current_mss(sk); - - if (tcp_send_head(sk)) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; - TCP_SKB_CB(skb)->end_seq++; + if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; + if (!tcp_send_head(sk)) { + /* This means tskb was already sent. + * Pretend we included the FIN on previous transmit. + * We need to set tp->snd_nxt to the value it would have + * if FIN had been sent. This is because retransmit path + * does not change tp->snd_nxt. + */ + tp->snd_nxt++; + return; + } } else { - /* Socket is locked, keep trying until memory is available. */ - for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); - if (skb) - break; - yield(); + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; } skb_reserve(skb, MAX_TCP_HEADER); sk_forced_wmem_schedule(sk, skb->truesize); @@ -2862,7 +2870,7 @@ void tcp_send_fin(struct sock *sk) TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to -- cgit v1.1 From b357a364c57c940ddb932224542494363df37378 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2015 18:03:44 -0700 Subject: inet: fix possible panic in reqsk_queue_unlink() [ 3897.923145] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 [ 3897.931025] IP: [] reqsk_timer_handler+0x1a6/0x243 There is a race when reqsk_timer_handler() and tcp_check_req() call inet_csk_reqsk_queue_unlink() on the same req at the same time. Before commit fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer"), the listener spinlock was held and the race could not happen. To solve this bug, we change reqsk_queue_unlink() to not assume req must be found, and we return a status, to conditionally release a refcount on the request sock. This also means tcp_check_req() in the non-fastopen case might or might not consume the req refcount, so tcp_v6_hnd_req() & tcp_v4_hnd_req() have to properly handle this. (The same remark applies to dccp_check_req() and its callers.) inet_csk_reqsk_queue_drop() is now too big to be inlined, as it is called 4 times in tcp and 3 times in dccp. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Signed-off-by: David S.
Miller --- net/dccp/ipv4.c | 3 ++- net/dccp/ipv6.c | 3 ++- net/dccp/minisocks.c | 3 +-- net/ipv4/inet_connection_sock.c | 34 ++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 7 ++++--- net/ipv6/tcp_ipv6.c | 3 ++- 7 files changed, 47 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2b4f21d..ccf4c56 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) iph->saddr, iph->daddr); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9d05510..5165571 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &iph->daddr, inet6_iif(skb)); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 5f56666..30addee 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (child == NULL) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); out: return child; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5c3dd62..8976ca4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +/* return true if req was found in the syn_table[] */ +static bool reqsk_queue_unlink(struct request_sock_queue *queue, + struct request_sock *req) +{ + struct listen_sock *lopt = queue->listen_opt; + struct request_sock **prev; + bool found = false; + + spin_lock(&queue->syn_wait_lock); + + for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; + prev = &(*prev)->dl_next) { + if (*prev == req) { + *prev = req->dl_next; + found = true; + break; + } + } + + spin_unlock(&queue->syn_wait_lock); + if (del_timer(&req->rsk_timer)) + reqsk_put(req); + return found; +} + +void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + reqsk_put(req); + } +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3571f2b..fc1c658 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 63d6311..e5d7649 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto 
listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); - + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ return child; listen_overflow: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad51df8..b6575d6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, -- cgit v1.1 From 2ea2f62c8bda242433809c7f4e9eae1c52c40bbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Apr 2015 16:05:01 -0700 Subject: net: fix crash in build_skb() When I added pfmemalloc support in build_skb(), I forgot that netlink was using build_skb() with a vmalloc() area. In this patch I introduce __build_skb() for netlink use, and build_skb() is a wrapper handling both skb->head_frag and skb->pfmemalloc. This means netlink no longer has to hack skb->head_frag. [ 1567.700067] kernel BUG at arch/x86/mm/physaddr.c:26! [ 1567.700067] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [ 1567.700067] Dumping ftrace buffer: [ 1567.700067] (ftrace buffer empty) [ 1567.700067] Modules linked in: [ 1567.700067] CPU: 9 PID: 16186 Comm: trinity-c182 Not tainted 4.0.0-next-20150424-sasha-00037-g4796e21 #2167 [ 1567.700067] task: ffff880127efb000 ti: ffff880246770000 task.ti: ffff880246770000 [ 1567.700067] RIP: __phys_addr (arch/x86/mm/physaddr.c:26 (discriminator 3)) [ 1567.700067] RSP: 0018:ffff8802467779d8 EFLAGS: 00010202 [ 1567.700067] RAX: 000041000ed8e000 RBX: ffffc9008ed8e000 RCX: 000000000000002c [ 1567.700067] RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffffb3fd6049 [ 1567.700067] RBP: ffff8802467779f8 R08: 0000000000000019 R09: ffff8801d0168000 [ 1567.700067] R10: ffff8801d01680c7 R11: ffffed003a02d019 R12: ffffc9000ed8e000 [ 1567.700067] R13: 0000000000000f40 R14: 0000000000001180 R15: ffffc9000ed8e000 [ 1567.700067] FS: 00007f2a7da3f700(0000) GS:ffff8801d1000000(0000) knlGS:0000000000000000 [ 1567.700067] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1567.700067] CR2: 0000000000738308 CR3: 000000022e329000 CR4: 00000000000007e0 [ 1567.700067] Stack: [ 1567.700067] ffffc9000ed8e000 ffff8801d0168000 ffffc9000ed8e000 ffff8801d0168000 [ 1567.700067] ffff880246777a28 ffffffffad7c0a21 0000000000001080 ffff880246777c08 [ 1567.700067] ffff88060d302e68 ffff880246777b58 ffff880246777b88 ffffffffad9a6821 [ 1567.700067] Call Trace: [ 1567.700067] build_skb (include/linux/mm.h:508 net/core/skbuff.c:316) [ 1567.700067] netlink_sendmsg (net/netlink/af_netlink.c:1633 net/netlink/af_netlink.c:2329) [ 1567.774369] ? sched_clock_cpu (kernel/sched/clock.c:311) [ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273) [ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273) [ 1567.774369] sock_sendmsg (net/socket.c:614 net/socket.c:623) [ 1567.774369] sock_write_iter (net/socket.c:823) [ 1567.774369] ? sock_sendmsg (net/socket.c:806) [ 1567.774369] __vfs_write (fs/read_write.c:479 fs/read_write.c:491) [ 1567.774369] ? get_lock_stats (kernel/locking/lockdep.c:249) [ 1567.774369] ? default_llseek (fs/read_write.c:487) [ 1567.774369] ? vtime_account_user (kernel/sched/cputime.c:701) [ 1567.774369] ?
rw_verify_area (fs/read_write.c:406 (discriminator 4)) [ 1567.774369] vfs_write (fs/read_write.c:539) [ 1567.774369] SyS_write (fs/read_write.c:586 fs/read_write.c:577) [ 1567.774369] ? SyS_read (fs/read_write.c:577) [ 1567.774369] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63) [ 1567.774369] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2594 kernel/locking/lockdep.c:2636) [ 1567.774369] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42) [ 1567.774369] system_call_fastpath (arch/x86/kernel/entry_64.S:261) Fixes: 79930f5892e ("net: do not deplete pfmemalloc reserve") Signed-off-by: Eric Dumazet Reported-by: Sasha Levin Signed-off-by: David S. Miller --- net/core/skbuff.c | 31 ++++++++++++++++++++++--------- net/netlink/af_netlink.c | 6 ++---- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 456ead5..3cfff2a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -280,13 +280,14 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator. + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb); * before giving packet to stack. * RX rings only contains data buffers, not full skbs. 
*/ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -311,11 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - if (frag_size) { - skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) - skb->pfmemalloc = 1; - } atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -332,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) return skb; } + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head_frag and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } + return skb; +} EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 19909d0..ec4adbd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1629,13 +1629,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size, if (data == NULL) return NULL; - skb = build_skb(data, size); + skb = __build_skb(data, size); if (skb == NULL) vfree(data); - else { - skb->head_frag = 0; + else skb->destructor = netlink_skb_destructor; - } return skb; } -- cgit v1.1 From a31196b07f8034eba6a3487a1ad1bb5ec5cd58a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Apr 2015 09:35:24 -0700 Subject: net: rfs: fix crash in get_rps_cpus() Commit 567e4b79731c ("net: rfs: add hash collision detection") had one mistake: RPS_NO_CPU is no longer the marker for an invalid cpu in set_rps_cpu() and get_rps_cpu(), as @next_cpu was the result of an AND with rps_cpu_mask. This bug showed up on a host with 72 cpus: next_cpu was 0x7f, and the code was trying to access percpu data of a non-existent cpu. In a follow-up patch, we might get rid of compares against nr_cpu_ids, if we init the tables with 0. It is silly to test for a very unlikely condition that exists only shortly after table initialization, as we got rid of rps_reset_sock_flow() and similar functions that were writing this RPS_NO_CPU magic value at flow dismantle: when the table is old enough, it never contains this value anymore. Fixes: 567e4b79731c ("net: rfs: add hash collision detection") Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Ben Hutchings Signed-off-by: David S.
Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1796cef5..c7ba038 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3079,7 +3079,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } @@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) -- cgit v1.1 From 129d23a56623eea0947a05288158d76dc7f2f0ac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Apr 2015 13:20:34 -0400 Subject: netfilter: Add some missing default cases to switch statements in nft_reject. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes: ==================== net/netfilter/nft_reject.c: In function ‘nft_reject_dump’: net/netfilter/nft_reject.c:61:2: warning: enumeration value ‘NFT_REJECT_TCP_RST’ not handled in switch [-Wswitch] switch (priv->type) { ^ net/netfilter/nft_reject.c:61:2: warning: enumeration value ‘NFT_REJECT_ICMPX_UNREACH’ not handled in switch [-Wswi\ tch] net/netfilter/nft_reject_inet.c: In function ‘nft_reject_inet_dump’: net/netfilter/nft_reject_inet.c:105:2: warning: enumeration value ‘NFT_REJECT_TCP_RST’ not handled in switch [-Wswi\ tch] switch (priv->type) { ^ ==================== Signed-off-by: David S.
Miller --- net/netfilter/nft_reject.c | 2 ++ net/netfilter/nft_reject_inet.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 57d3e1a..0522fc9 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 62cabee..635dbba 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; -- cgit v1.1
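The warning silenced by this last patch comes from GCC's -Wswitch option (enabled by -Wall), which fires when a switch whose controlling expression has an enumerated type does not handle every enumerator and has no default label. The following standalone sketch (hypothetical enum and function, not code from the kernel tree) reproduces the warning and silences it the same way the patch does:

/* wswitch_demo.c: minimal illustration of the -Wswitch warning fixed
 * above. The enum and function here are hypothetical, not kernel code.
 * Build with: gcc -Wall -Wswitch -c wswitch_demo.c
 */
enum demo_reject_type {
	DEMO_REJECT_ICMP_UNREACH,
	DEMO_REJECT_TCP_RST,
	DEMO_REJECT_ICMPX_UNREACH,
};

int demo_dump(enum demo_reject_type type)
{
	switch (type) {
	case DEMO_REJECT_ICMP_UNREACH:
		return 1;
	/* Without the default label below, gcc emits:
	 *   warning: enumeration value 'DEMO_REJECT_TCP_RST' not
	 *   handled in switch [-Wswitch]
	 * and likewise for DEMO_REJECT_ICMPX_UNREACH. Adding a default
	 * case tells the compiler the omission is intentional.
	 */
	default:
		break;
	}
	return 0;
}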