From 4f031fa9f188b2b0641ac20087d9e16bcfb4e49d Mon Sep 17 00:00:00 2001
From: Ronald Wahl <ronald.wahl@raritan.com>
Date: Thu, 6 Nov 2014 11:52:13 +0100
Subject: mac80211: Fix regression that triggers a kernel BUG with CCMP

Commit 7ec7c4a9a686c608315739ab6a2b0527a240883c (mac80211: port CCMP to
cryptoapi's CCM driver) introduced a regression when decrypting empty
packets (data_len == 0). This will lead to backtraces like:

(scatterwalk_start) from [<c01312f4>] (scatterwalk_map_and_copy+0x2c/0xa8)
(scatterwalk_map_and_copy) from [<c013a5a0>] (crypto_ccm_decrypt+0x7c/0x25c)
(crypto_ccm_decrypt) from [<c032886c>] (ieee80211_aes_ccm_decrypt+0x160/0x170)
(ieee80211_aes_ccm_decrypt) from [<c031c628>] (ieee80211_crypto_ccmp_decrypt+0x1ac/0x238)
(ieee80211_crypto_ccmp_decrypt) from [<c032ef28>] (ieee80211_rx_handlers+0x870/0x1d24)
(ieee80211_rx_handlers) from [<c0330c7c>] (ieee80211_prepare_and_rx_handle+0x8a0/0x91c)
(ieee80211_prepare_and_rx_handle) from [<c0331260>] (ieee80211_rx+0x568/0x730)
(ieee80211_rx) from [<c01d3054>] (__carl9170_rx+0x94c/0xa20)
(__carl9170_rx) from [<c01d3324>] (carl9170_rx_stream+0x1fc/0x320)
(carl9170_rx_stream) from [<c01cbccc>] (carl9170_usb_tasklet+0x80/0xc8)
(carl9170_usb_tasklet) from [<c00199dc>] (tasklet_hi_action+0x88/0xcc)
(tasklet_hi_action) from [<c00193c8>] (__do_softirq+0xcc/0x200)
(__do_softirq) from [<c0019734>] (irq_exit+0x80/0xe0)
(irq_exit) from [<c0009c10>] (handle_IRQ+0x64/0x80)
(handle_IRQ) from [<c000c3a0>] (__irq_svc+0x40/0x4c)
(__irq_svc) from [<c0009d44>] (arch_cpu_idle+0x2c/0x34)

Such packets can appear for example when using the carl9170 wireless driver
because hardware sometimes generates garbage when the internal FIFO overruns.

This patch adds an additional length check.

Cc: stable@vger.kernel.org
Fixes: 7ec7c4a9a686 ("mac80211: port CCMP to cryptoapi's CCM driver")
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Ronald Wahl <ronald.wahl@raritan.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/aes_ccm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index ec24378..09d9caa 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -53,6 +53,9 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
 		__aligned(__alignof__(struct aead_request));
 	struct aead_request *aead_req = (void *) aead_req_data;
 
+	if (data_len == 0)
+		return -EINVAL;
+
 	memset(aead_req, 0, sizeof(aead_req_data));
 
 	sg_init_one(&pt, data, data_len);
-- 
cgit v1.1


From 6b96686ecffcbea85dcb502e4584e4a20a2bfb29 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 7 Nov 2014 15:34:54 +0100
Subject: netfilter: nft_masq: fix uninitialized range in nft_masq_{ipv4,
 ipv6}_eval

When transferring from the original range in nf_nat_masquerade_{ipv4,ipv6}()
we copy over values from stack in from min_proto/max_proto due to uninitialized
range variable in both, nft_masq_{ipv4,ipv6}_eval. As we only initialize
flags at this time from nft_masq struct, just zero out the rest.

Fixes: 9ba1f726bec09 ("netfilter: nf_tables: add new nft_masq expression")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nft_masq_ipv4.c | 1 +
 net/ipv6/netfilter/nft_masq_ipv6.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index c1023c4..665de06 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -24,6 +24,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 	struct nf_nat_range range;
 	unsigned int verdict;
 
+	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
 
 	verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 8a7ac68..529c119 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -25,6 +25,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 	struct nf_nat_range range;
 	unsigned int verdict;
 
+	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
 
 	verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
-- 
cgit v1.1


From 2196937e12b1b4ba139806d132647e1651d655df Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 10 Nov 2014 17:11:21 +0100
Subject: netfilter: ipset: small potential read beyond the end of buffer

We could be reading 8 bytes into a 4 byte buffer here.  It seems
harmless but adding a check is the right thing to do and it silences a
static checker warning.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 86f9d76..d259da3 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1863,6 +1863,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	if (*op < IP_SET_OP_VERSION) {
 		/* Check the version at the beginning of operations */
 		struct ip_set_req_version *req_version = data;
+
+		if (*len < sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+
 		if (req_version->version != IPSET_PROTOCOL) {
 			ret = -EPROTO;
 			goto done;
-- 
cgit v1.1


From 50656d9df63d69ce399c8be62d4473b039dac36a Mon Sep 17 00:00:00 2001
From: Calvin Owens <calvinowens@fb.com>
Date: Tue, 4 Nov 2014 16:37:40 -0800
Subject: ipvs: Keep skb->sk when allocating headroom on tunnel xmit

ip_vs_prepare_tunneled_skb() ignores ->sk when allocating a new
skb, either unconditionally setting ->sk to NULL or allowing
the uninitialized ->sk from a newly allocated skb to leak through
to the caller.

This patch properly copies ->sk and increments its reference count.

Signed-off-by: Calvin Owens <calvinowens@fb.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 437a366..bd90bf8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -846,6 +846,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 		new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb)
 			goto error;
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
 		consume_skb(skb);
 		skb = new_skb;
 	}
-- 
cgit v1.1


From 2daf1b4d18e3add229d1a3b5c554331d99ac6c7e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 7 Nov 2014 18:48:33 +0100
Subject: netfilter: nft_compat: use current net namespace

Instead of init_net when using xtables over nftables compat.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 9d6d6f6..b92f129 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -117,7 +117,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 			   struct xt_target *target, void *info,
 			   union nft_entry *entry, u8 proto, bool inv)
 {
-	par->net	= &init_net;
+	par->net	= ctx->net;
 	par->table	= ctx->table->name;
 	switch (ctx->afi->family) {
 	case AF_INET:
@@ -324,7 +324,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 			  struct xt_match *match, void *info,
 			  union nft_entry *entry, u8 proto, bool inv)
 {
-	par->net	= &init_net;
+	par->net	= ctx->net;
 	par->table	= ctx->table->name;
 	switch (ctx->afi->family) {
 	case AF_INET:
-- 
cgit v1.1


From c918687f5e3962375a19de6ded3c1be85ebdbcd6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 20:53:55 +0100
Subject: netfilter: nft_compat: relax chain type validation

Check for nat chain dependency only, which is the one that can
actually crash the kernel. Don't care if mangle, filter and security
specific match and targets are used out of their scope, they are
harmless.

This restores iptables-compat with mangle specific match/target when
used out of the OUTPUT chain, that are actually emulated through filter
chains, which broke when performing strict validation.

Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 32 ++------------------------------
 1 file changed, 2 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index b92f129..70dc965 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -21,45 +21,17 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_tables.h>
 
-static const struct {
-       const char	*name;
-       u8		type;
-} table_to_chaintype[] = {
-       { "filter",     NFT_CHAIN_T_DEFAULT },
-       { "raw",        NFT_CHAIN_T_DEFAULT },
-       { "security",   NFT_CHAIN_T_DEFAULT },
-       { "mangle",     NFT_CHAIN_T_ROUTE },
-       { "nat",        NFT_CHAIN_T_NAT },
-       { },
-};
-
-static int nft_compat_table_to_chaintype(const char *table)
-{
-	int i;
-
-	for (i = 0; table_to_chaintype[i].name != NULL; i++) {
-		if (strcmp(table_to_chaintype[i].name, table) == 0)
-			return table_to_chaintype[i].type;
-	}
-
-	return -1;
-}
-
 static int nft_compat_chain_validate_dependency(const char *tablename,
 						const struct nft_chain *chain)
 {
-	enum nft_chain_type type;
 	const struct nft_base_chain *basechain;
 
 	if (!tablename || !(chain->flags & NFT_BASE_CHAIN))
 		return 0;
 
-	type = nft_compat_table_to_chaintype(tablename);
-	if (type < 0)
-		return -EINVAL;
-
 	basechain = nft_base_chain(chain);
-	if (basechain->type->type != type)
+	if (strcmp(tablename, "nat") == 0 &&
+	    basechain->type->type != NFT_CHAIN_T_NAT)
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.1


From afefb6f928ed42d5db452ee9251ce6de62673c67 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 19:08:21 +0100
Subject: netfilter: nft_compat: use the match->table to validate dependencies

Instead of the match->name, which is of course not relevant.

Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 70dc965..265e190 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -346,7 +346,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	union nft_entry e = {};
 	int ret;
 
-	ret = nft_compat_chain_validate_dependency(match->name, ctx->chain);
+	ret = nft_compat_chain_validate_dependency(match->table, ctx->chain);
 	if (ret < 0)
 		goto err;
 
@@ -420,7 +420,7 @@ static int nft_match_validate(const struct nft_ctx *ctx,
 		if (!(hook_mask & match->hooks))
 			return -EINVAL;
 
-		ret = nft_compat_chain_validate_dependency(match->name,
+		ret = nft_compat_chain_validate_dependency(match->table,
 							   ctx->chain);
 		if (ret < 0)
 			return ret;
-- 
cgit v1.1


From b326dd37b94e29bf6a15940f4fa66aa21a678ab1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 21:14:12 +0100
Subject: netfilter: nf_tables: restore synchronous object release from
 commit/abort

The existing xtables matches and targets, when used from nft_compat, may
sleep from the destroy path, ie. when removing rules. Since the objects
are released via call_rcu from softirq context, this results in lockdep
splats and possible lockups that may be hard to reproduce.

Patrick also indicated that delayed object release via call_rcu can
cause us problems in the ordering of event notifications when anonymous
sets are in place.

So, this patch restores the synchronous object release from the commit
and abort paths. This includes a call to synchronize_rcu() to make sure
that no packets are walking on the objects that are going to be
released. This is slowier though, but it's simple and it resolves the
aforementioned problems.

This is a partial revert of c7c32e7 ("netfilter: nf_tables: defer all
object release via rcu") that was introduced in 3.16 to speed up
interaction with userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 11ab4b07..66e8425 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3484,13 +3484,8 @@ static void nft_chain_commit_update(struct nft_trans *trans)
 	}
 }
 
-/* Schedule objects for release via rcu to make sure no packets are accesing
- * removed rules.
- */
-static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+static void nf_tables_commit_release(struct nft_trans *trans)
 {
-	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
-
 	switch (trans->msg_type) {
 	case NFT_MSG_DELTABLE:
 		nf_tables_table_destroy(&trans->ctx);
@@ -3612,10 +3607,11 @@ static int nf_tables_commit(struct sk_buff *skb)
 		}
 	}
 
+	synchronize_rcu();
+
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		list_del(&trans->list);
-		trans->ctx.nla = NULL;
-		call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+		nf_tables_commit_release(trans);
 	}
 
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
@@ -3623,13 +3619,8 @@ static int nf_tables_commit(struct sk_buff *skb)
 	return 0;
 }
 
-/* Schedule objects for release via rcu to make sure no packets are accesing
- * aborted rules.
- */
-static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+static void nf_tables_abort_release(struct nft_trans *trans)
 {
-	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
-
 	switch (trans->msg_type) {
 	case NFT_MSG_NEWTABLE:
 		nf_tables_table_destroy(&trans->ctx);
@@ -3725,11 +3716,12 @@ static int nf_tables_abort(struct sk_buff *skb)
 		}
 	}
 
+	synchronize_rcu();
+
 	list_for_each_entry_safe_reverse(trans, next,
 					 &net->nft.commit_list, list) {
 		list_del(&trans->list);
-		trans->ctx.nla = NULL;
-		call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+		nf_tables_abort_release(trans);
 	}
 
 	return 0;
-- 
cgit v1.1


From 5195c14c8b27cc0b18220ddbf0e5ad3328a04187 Mon Sep 17 00:00:00 2001
From: bill bonaparte <programme110@gmail.com>
Date: Thu, 6 Nov 2014 14:36:48 +0100
Subject: netfilter: conntrack: fix race in __nf_conntrack_confirm against
 get_next_corpse

After removal of the central spinlock nf_conntrack_lock, in
commit 93bb0ceb75be2 ("netfilter: conntrack: remove central
spinlock nf_conntrack_lock"), it is possible to race against
get_next_corpse().

The race is against the get_next_corpse() cleanup on
the "unconfirmed" list (a per-cpu list with seperate locking),
which set the DYING bit.

Fix this race, in __nf_conntrack_confirm(), by removing the CT
from unconfirmed list before checking the DYING bit.  In case
race occured, re-add the CT to the dying list.

While at this, fix coding style of the comment that has been
updated.

Fixes: 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock")
Reported-by: bill bonaparte <programme110@gmail.com>
Signed-off-by: bill bonaparte <programme110@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5016a69..2c69975 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -611,12 +611,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-	/* We have to check the DYING flag inside the lock to prevent
-	   a race against nf_ct_get_next_corpse() possibly called from
-	   user context, else we insert an already 'dead' hash, blocking
-	   further use of that particular connection -JM */
+
+	/* We have to check the DYING flag after unlink to prevent
+	 * a race against nf_ct_get_next_corpse() possibly called from
+	 * user context, else we insert an already 'dead' hash, blocking
+	 * further use of that particular connection -JM.
+	 */
+	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
 	if (unlikely(nf_ct_is_dying(ct))) {
+		nf_ct_add_to_dying_list(ct);
 		nf_conntrack_double_unlock(hash, reply_hash);
 		local_bh_enable();
 		return NF_ACCEPT;
@@ -636,8 +640,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
 
-	nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
 	/* Timer relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
 	   weird delay cases. */
-- 
cgit v1.1


From ab64f16ff2e83371927c57a0380fd3c0fee5c1c1 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 11 Nov 2014 13:40:49 -0800
Subject: openvswitch: Fix memory leak.

Need to free memory in case of sample action error.

Introduced by commit 651887b0c22cffcfce7eb9c ("openvswitch: Sample
action without side effects").

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 006886d..00e447a 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -722,8 +722,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 		case OVS_ACTION_ATTR_SAMPLE:
 			err = sample(dp, skb, key, a);
-			if (unlikely(err)) /* skb already freed. */
-				return err;
 			break;
 		}
 
-- 
cgit v1.1


From 856447d0209c2214d806b30bd3b0d873db5998bd Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Tue, 11 Nov 2014 14:32:20 -0800
Subject: openvswitch: Fix checksum calculation when modifying ICMPv6 packets.

The checksum of ICMPv6 packets uses the IP pseudoheader as part of
the calculation, unlike ICMP in IPv4. This was not implemented,
which means that modifying the IP addresses of an ICMPv6 packet
would cause the checksum to no longer be correct as the psuedoheader
did not match.
Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action").

Reported-by: Neal Shrader <icosahedral@gmail.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 00e447a..8c4229b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -246,11 +246,11 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 {
 	int transport_len = skb->len - skb_transport_offset(skb);
 
-	if (l4_proto == IPPROTO_TCP) {
+	if (l4_proto == NEXTHDR_TCP) {
 		if (likely(transport_len >= sizeof(struct tcphdr)))
 			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
 						  addr, new_addr, 1);
-	} else if (l4_proto == IPPROTO_UDP) {
+	} else if (l4_proto == NEXTHDR_UDP) {
 		if (likely(transport_len >= sizeof(struct udphdr))) {
 			struct udphdr *uh = udp_hdr(skb);
 
@@ -261,6 +261,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 					uh->check = CSUM_MANGLED_0;
 			}
 		}
+	} else if (l4_proto == NEXTHDR_ICMP) {
+		if (likely(transport_len >= sizeof(struct icmp6hdr)))
+			inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
+						  skb, addr, new_addr, 1);
 	}
 }
 
-- 
cgit v1.1


From 19e7a3df7261c9b7ebced8163c383712d5b6ac6b Mon Sep 17 00:00:00 2001
From: Daniele Di Proietto <ddiproietto@vmware.com>
Date: Tue, 11 Nov 2014 14:51:22 -0800
Subject: openvswitch: Fix NDP flow mask validation

match_validate() enforce that a mask matching on NDP attributes has also an
exact match on ICMPv6 type.
The ICMPv6 type, which is 8-bit wide, is stored in the 'tp.src' field of
'struct sw_flow_key', which is 16-bit wide.
Therefore, an exact match on ICMPv6 type should only check the first 8 bits.

This commit fixes a bug that prevented flows with an exact match on NDP field
from being installed
Introduced by commit 03f0d916aa03 ("openvswitch: Mega flow implementation").

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/flow_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 939bcb3..dda040e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -145,7 +145,7 @@ static bool match_validate(const struct sw_flow_match *match,
 	if (match->key->eth.type == htons(ETH_P_ARP)
 			|| match->key->eth.type == htons(ETH_P_RARP)) {
 		key_expected |= 1 << OVS_KEY_ATTR_ARP;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && (match->mask->key.tp.src == htons(0xff)))
 			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
 	}
 
-- 
cgit v1.1


From 8ec609d8b561468691b60347ff594bd443ea58c0 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 11 Nov 2014 15:55:16 -0800
Subject: openvswitch: Convert dp rcu read operation to locked operations

dp read operations depends on ovs_dp_cmd_fill_info(). This API
needs to looup vport to find dp name, but vport lookup can
fail. Therefore to keep vport reference alive we need to
take ovs lock.

Introduced by commit 6093ae9abac1 ("openvswitch: Minimize
dp and vport critical sections").

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/datapath.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e6d7255..f9e556b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1265,7 +1265,7 @@ static size_t ovs_dp_cmd_msg_size(void)
 	return msgsize;
 }
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with ovs_mutex. */
 static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
 				u32 portid, u32 seq, u32 flags, u8 cmd)
 {
@@ -1555,7 +1555,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	if (!reply)
 		return -ENOMEM;
 
-	rcu_read_lock();
+	ovs_lock();
 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
 	if (IS_ERR(dp)) {
 		err = PTR_ERR(dp);
@@ -1564,12 +1564,12 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
 				   info->snd_seq, 0, OVS_DP_CMD_NEW);
 	BUG_ON(err < 0);
-	rcu_read_unlock();
+	ovs_unlock();
 
 	return genlmsg_reply(reply, info);
 
 err_unlock_free:
-	rcu_read_unlock();
+	ovs_unlock();
 	kfree_skb(reply);
 	return err;
 }
@@ -1581,8 +1581,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int skip = cb->args[0];
 	int i = 0;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
+	ovs_lock();
+	list_for_each_entry(dp, &ovs_net->dps, list_node) {
 		if (i >= skip &&
 		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
@@ -1590,7 +1590,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		i++;
 	}
-	rcu_read_unlock();
+	ovs_unlock();
 
 	cb->args[0] = i;
 
-- 
cgit v1.1


From fecaef85f7188ad1822210e2c7a7625c9a32a8e4 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Tue, 11 Nov 2014 14:36:30 -0800
Subject: openvswitch: Validate IPv6 flow key and mask values.

Reject flow label key and mask values with invalid bits set.
Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set'
action").

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/flow_netlink.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dda040e..fa4ec2e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -689,6 +689,13 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
 		}
+
+		if (ipv6_key->ipv6_label & htonl(0xFFF00000)) {
+			OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n",
+				  ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
+			return -EINVAL;
+		}
+
 		SW_FLOW_KEY_PUT(match, ipv6.label,
 				ipv6_key->ipv6_label, is_mask);
 		SW_FLOW_KEY_PUT(match, ip.proto,
-- 
cgit v1.1


From 49dd18ba4615eaa72f15c9087dea1c2ab4744cf5 Mon Sep 17 00:00:00 2001
From: Panu Matilainen <pmatilai@redhat.com>
Date: Fri, 14 Nov 2014 13:14:32 +0200
Subject: ipv4: Fix incorrect error code when adding an unreachable route

Trying to add an unreachable route incorrectly returns -ESRCH if
if custom FIB rules are present:

[root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4
RTNETLINK answers: Network is unreachable
[root@localhost ~]# ip rule add to 55.66.77.88 table 200
[root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4
RTNETLINK answers: No such process
[root@localhost ~]#

Commit 83886b6b636173b206f475929e58fac75c6f2446 ("[NET]: Change "not found"
return value for rule lookup") changed fib_rules_lookup()
to use -ESRCH as a "not found" code internally, but for user space it
should be translated into -ENETUNREACH. Handle the translation centrally in
ipv4-specific fib_lookup(), leaving the DECnet case alone.

On a related note, commit b7a71b51ee37d919e4098cd961d59a883fd272d8
("ipv4: removed redundant conditional") removed a similar translation from
ip_route_input_slow() prematurely AIUI.

Fixes: b7a71b51ee37 ("ipv4: removed redundant conditional")
Signed-off-by: Panu Matilainen <pmatilai@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_rules.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f2e1573..8f7bd56 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -62,6 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 	else
 		res->tclassid = 0;
 #endif
+
+	if (err == -ESRCH)
+		err = -ENETUNREACH;
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(__fib_lookup);
-- 
cgit v1.1


From 52cff74eef5dd7bdab759300e7d1ca36eba18254 Mon Sep 17 00:00:00 2001
From: Anish Bhatt <anish@chelsio.com>
Date: Fri, 14 Nov 2014 16:38:31 -0800
Subject: dcbnl : Disable software interrupts before taking dcb_lock

Solves possible lockup issues that can be seen from firmware DCB agents calling
into the DCB app api.

DCB firmware event queues can be tied in with NAPI so that dcb events are
generated in softIRQ context. This can results in calls to dcb_*app()
functions which try to take the dcb_lock.

If the the event triggers while we also have the dcb_lock because lldpad or
some other agent happened to be issuing a  get/set command we could see a cpu
lockup.

This code was not originally written with firmware agents in mind, hence
grabbing dcb_lock from softIRQ context was not considered.

Signed-off-by: Anish Bhatt <anish@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dcb/dcbnl.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index ca11d28..93ea801 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1080,13 +1080,13 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 	if (!app)
 		return -EMSGSIZE;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	list_for_each_entry(itr, &dcb_app_list, list) {
 		if (itr->ifindex == netdev->ifindex) {
 			err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
 					 &itr->app);
 			if (err) {
-				spin_unlock(&dcb_lock);
+				spin_unlock_bh(&dcb_lock);
 				return -EMSGSIZE;
 			}
 		}
@@ -1097,7 +1097,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 	else
 		dcbx = -EOPNOTSUPP;
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	nla_nest_end(skb, app);
 
 	/* get peer info if available */
@@ -1234,7 +1234,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 	/* local app */
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE);
 	if (!app)
 		goto dcb_unlock;
@@ -1271,7 +1271,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	else
 		dcbx = -EOPNOTSUPP;
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	/* features flags */
 	if (ops->getfeatcfg) {
@@ -1326,7 +1326,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	return 0;
 
 dcb_unlock:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 nla_put_failure:
 	return err;
 }
@@ -1762,10 +1762,10 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
 	struct dcb_app_type *itr;
 	u8 prio = 0;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
 		prio = itr->app.priority;
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	return prio;
 }
@@ -1789,7 +1789,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and replace */
 	if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
 		if (new->priority)
@@ -1804,7 +1804,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	if (new->priority)
 		err = dcb_app_add(new, dev->ifindex);
 out:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1823,10 +1823,10 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
 	struct dcb_app_type *itr;
 	u8 prio = 0;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
 		prio |= 1 << itr->app.priority;
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	return prio;
 }
@@ -1850,7 +1850,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and abort if found */
 	if (dcb_app_lookup(new, dev->ifindex, new->priority)) {
 		err = -EEXIST;
@@ -1859,7 +1859,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
 
 	err = dcb_app_add(new, dev->ifindex);
 out:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1882,7 +1882,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and remove it. */
 	if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) {
 		list_del(&itr->list);
@@ -1890,7 +1890,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
 		err = 0;
 	}
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1902,12 +1902,12 @@ static void dcb_flushapp(void)
 	struct dcb_app_type *app;
 	struct dcb_app_type *tmp;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	list_for_each_entry_safe(app, tmp, &dcb_app_list, list) {
 		list_del(&app->list);
 		kfree(app);
 	}
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 }
 
 static int __init dcbnl_init(void)
-- 
cgit v1.1


From feb91a02ccb09661507f170b2a444aec94f307f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 5 Nov 2014 20:27:38 +0100
Subject: ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs

It has been reported that generating an MLD listener report on
devices with large MTUs (e.g. 9000) and a high number of IPv6
addresses can trigger a skb_over_panic():

skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20
head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0
dev:port1
 ------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:100!
invalid opcode: 0000 [#1] SMP
Modules linked in: ixgbe(O)
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4
[...]
Call Trace:
 <IRQ>
 [<ffffffff80578226>] ? skb_put+0x3a/0x3b
 [<ffffffff80612a5d>] ? add_grhead+0x45/0x8e
 [<ffffffff80612e3a>] ? add_grec+0x394/0x3d4
 [<ffffffff80613222>] ? mld_ifc_timer_expire+0x195/0x20d
 [<ffffffff8061308d>] ? mld_dad_timer_expire+0x45/0x45
 [<ffffffff80255b5d>] ? call_timer_fn.isra.29+0x12/0x68
 [<ffffffff80255d16>] ? run_timer_softirq+0x163/0x182
 [<ffffffff80250e6f>] ? __do_softirq+0xe0/0x21d
 [<ffffffff8025112b>] ? irq_exit+0x4e/0xd3
 [<ffffffff802214bb>] ? smp_apic_timer_interrupt+0x3b/0x46
 [<ffffffff8063f10a>] ? apic_timer_interrupt+0x6a/0x70

mld_newpack() skb allocations are usually requested with dev->mtu
in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations")
we have changed the limit in order to be less likely to fail.

However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb)
macros, which determine if we may end up doing an skb_put() for
adding another record. To avoid possible fragmentation, we check
the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong
assumption as the actual max allocation size can be much smaller.

The IGMP case doesn't have this issue as commit 57e1ab6eaddc
("igmp: refine skb allocations") stores the allocation size in
the cb[].

Set a reserved_tailroom to make it fit into the MTU and use
skb_availroom() helper instead. This also allows to get rid of
igmp_skb_size().

Reported-by: Wei Liu <lw1a2.jing@gmail.com>
Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: David L Stevens <david.stevens@oracle.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 11 +++++------
 net/ipv6/mcast.c |  9 +++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index fb70e3e..bb15d0e 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
 	return scount;
 }
 
-#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
-
-static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
 {
 	struct sk_buff *skb;
 	struct rtable *rt;
@@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	struct flowi4 fl4;
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
+	unsigned int size = mtu;
 
 	while (1) {
 		skb = alloc_skb(size + hlen + tlen,
@@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 			return NULL;
 	}
 	skb->priority = TC_PRIO_CONTROL;
-	igmp_skb_size(skb) = size;
 
 	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
 				   0, 0,
@@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	skb_dst_set(skb, &rt->dst);
 	skb->dev = dev;
 
+	skb->reserved_tailroom = skb_end_offset(skb) -
+				 min(mtu, skb_end_offset(skb));
 	skb_reserve(skb, hlen);
 
 	skb_reset_network_header(skb);
@@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
 	return skb;
 }
 
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
-	skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb)	((skb) ? skb_availroom(skb) : 0)
 
 static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	int type, int gdeleted, int sdeleted)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 9648de2..ed2c4e4 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
 	hdr->daddr = *daddr;
 }
 
-static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
+static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
 {
 	struct net_device *dev = idev->dev;
 	struct net *net = dev_net(dev);
@@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
 	const struct in6_addr *saddr;
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
+	unsigned int size = mtu + hlen + tlen;
 	int err;
 	u8 ra[8] = { IPPROTO_ICMPV6, 0,
 		     IPV6_TLV_ROUTERALERT, 2, 0, 0,
 		     IPV6_TLV_PADN, 0 };
 
 	/* we assume size > sizeof(ra) here */
-	size += hlen + tlen;
 	/* limit our allocations to order-0 page */
 	size = min_t(int, size, SKB_MAX_ORDER(0, 0));
 	skb = sock_alloc_send_skb(sk, size, 1, &err);
@@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
 		return NULL;
 
 	skb->priority = TC_PRIO_CONTROL;
+	skb->reserved_tailroom = skb_end_offset(skb) -
+				 min(mtu, skb_end_offset(skb));
 	skb_reserve(skb, hlen);
 
 	if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
@@ -1690,8 +1692,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 	return skb;
 }
 
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
-	skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb)	((skb) ? skb_availroom(skb) : 0)
 
 static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 	int type, int gdeleted, int sdeleted, int crsend)
-- 
cgit v1.1


From 97840cb67ff5ac8add836684f011fd838518d698 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 14 Nov 2014 18:14:33 +0100
Subject: netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind

Make sure the netlink group exists, otherwise you can trigger an out
of bound array memory access from the netlink_bind() path. This splat
can only be triggered only by superuser.

[  180.203600] UBSan: Undefined behaviour in ../net/netfilter/nfnetlink.c:467:28
[  180.204249] index 9 is out of range for type 'int [9]'
[  180.204697] CPU: 0 PID: 1771 Comm: trinity-main Not tainted 3.18.0-rc4-mm1+ #122
[  180.205365] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
+04/01/2014
[  180.206498]  0000000000000018 0000000000000000 0000000000000009 ffff88007bdf7da8
[  180.207220]  ffffffff82b0ef5f 0000000000000092 ffffffff845ae2e0 ffff88007bdf7db8
[  180.207887]  ffffffff8199e489 ffff88007bdf7e18 ffffffff8199ea22 0000003900000000
[  180.208639] Call Trace:
[  180.208857] dump_stack (lib/dump_stack.c:52)
[  180.209370] ubsan_epilogue (lib/ubsan.c:174)
[  180.209849] __ubsan_handle_out_of_bounds (lib/ubsan.c:400)
[  180.210512] nfnetlink_bind (net/netfilter/nfnetlink.c:467)
[  180.210986] netlink_bind (net/netlink/af_netlink.c:1483)
[  180.211495] SYSC_bind (net/socket.c:1541)

Moreover, define the missing nf_tables and nf_acct multicast groups too.

Reported-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 6c5a915..13c2e17 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
 	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,
+	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,
 };
 
 void nfnl_lock(__u8 subsys_id)
@@ -464,7 +466,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int nfnetlink_bind(int group)
 {
 	const struct nfnetlink_subsystem *ss;
-	int type = nfnl_group2type[group];
+	int type;
+
+	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+		return -EINVAL;
+
+	type = nfnl_group2type[group];
 
 	rcu_read_lock();
 	ss = nfnetlink_get_subsys(type);
@@ -514,6 +521,9 @@ static int __init nfnetlink_init(void)
 {
 	int i;
 
+	for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
+		BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
+
 	for (i=0; i<NFNL_SUBSYS_COUNT; i++)
 		mutex_init(&table[i].mutex);
 
-- 
cgit v1.1


From f0b4eeced518c632210ef2aea44fc92cc9e86cce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Mon, 17 Nov 2014 12:20:28 +0100
Subject: bridge: fix netfilter/NF_BR_LOCAL_OUT for own, locally generated
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ebtables on the OUTPUT chain (NF_BR_LOCAL_OUT) would not work as expected
for both locally generated IGMP and MLD queries. The IP header specific
filter options are off by 14 Bytes for netfilter (actual output on
interfaces is fine).

NF_HOOK() expects the skb->data to point to the IP header, not the
ethernet one (while dev_queue_xmit() does not). Luckily there is an
br_dev_queue_push_xmit() helper function already - let's just use that.

Introduced by eb1d16414339a6e113d89e2cca2556005d7ce919
("bridge: Add core IGMP snooping support")

Ebtables example:

$ ebtables -I OUTPUT -p IPv6 -o eth1 --logical-out br0 \
	--log --log-level 6 --log-ip6 --log-prefix="~EBT: " -j DROP

before (broken):

~EBT:  IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \
	MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \
	SRC=64a4:39c2:86dd:6000:0000:0020:0001:fe80 IPv6 \
	DST=0000:0000:0000:0004:64ff:fea4:39c2:ff02, \
	IPv6 priority=0x3, Next Header=2

after (working):

~EBT:  IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \
	MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \
	SRC=fe80:0000:0000:0000:0004:64ff:fea4:39c2 IPv6 \
	DST=ff02:0000:0000:0000:0000:0000:0000:0001, \
	IPv6 priority=0x0, Next Header=0

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/br_multicast.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 648d79c..c465876 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -813,10 +813,9 @@ static void __br_multicast_send_query(struct net_bridge *br,
 		return;
 
 	if (port) {
-		__skb_push(skb, sizeof(struct ethhdr));
 		skb->dev = port->dev;
 		NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
-			dev_queue_xmit);
+			br_dev_queue_push_xmit);
 	} else {
 		br_multicast_select_own_querier(br, ip, skb);
 		netif_rx(skb);
-- 
cgit v1.1


From 280ba51d60be6f4ca3347eaa60783314f38df72e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 18 Nov 2014 22:35:31 +0100
Subject: mac80211: minstrel_ht: fix a crash in rate sorting

The commit 5935839ad73583781b8bbe8d91412f6826e218a4
"mac80211: improve minstrel_ht rate sorting by throughput & probability"

introduced a crash on rate sorting that occurs when the rate added to
the sorting array is faster than all the previous rates. Due to an
off-by-one error, it reads the rate index from tp_list[-1], which
contains uninitialized stack garbage, and then uses the resulting index
for accessing the group rate stats, leading to a crash if the garbage
value is big enough.

Cc: Thomas Huehn <thomas@net.t-labs.tu-berlin.de>
Reported-by: Jouni Malinen <j@w1.fi>
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index df90ce2..408fd8ab 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -252,19 +252,16 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index,
 	cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp;
 	cur_prob = mi->groups[cur_group].rates[cur_idx].probability;
 
-	tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
-	tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
-	tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp;
-	tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability;
-
-	while (j > 0 && (cur_thr > tmp_thr ||
-	      (cur_thr == tmp_thr && cur_prob > tmp_prob))) {
-		j--;
+	do {
 		tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
 		tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
 		tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp;
 		tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability;
-	}
+		if (cur_thr < tmp_thr ||
+		    (cur_thr == tmp_thr && cur_prob <= tmp_prob))
+			break;
+		j--;
+	} while (j > 0);
 
 	if (j < MAX_THR_RATES - 1) {
 		memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) *
-- 
cgit v1.1


From ffb1388a364d135810337182d6800a0c7ee44f48 Mon Sep 17 00:00:00 2001
From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Wed, 19 Nov 2014 09:35:39 +0800
Subject: ipv6: delete protocol and unregister rtnetlink when cleanup

pim6_protocol was added when initiation, but it not deleted.
Similarly, unregister RTNL_FAMILY_IP6MR rtnetlink.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Reviewed-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0171f08..1a01d79 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1439,6 +1439,10 @@ reg_pernet_fail:
 
 void ip6_mr_cleanup(void)
 {
+	rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE);
+#ifdef CONFIG_IPV6_PIMSM_V2
+	inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
+#endif
 	unregister_netdevice_notifier(&ip6_mr_notifier);
 	unregister_pernet_subsys(&ip6mr_net_ops);
 	kmem_cache_destroy(mrt_cachep);
-- 
cgit v1.1


From d3052bb5d306b29c1e7d9e5998c5ac4ca1ff0ca9 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 19 Nov 2014 13:54:49 -0800
Subject: openvswitch: Don't validate IPv6 label masks.

When userspace doesn't provide a mask, OVS datapath generates a fully
unwildcarded mask for the flow by copying the flow and setting all bits
in all fields. For IPv6 label, this creates a mask that matches on the
upper 12 bits, causing the following error:

openvswitch: netlink: Invalid IPv6 flow label value (value=ffffffff, max=fffff)

This patch ignores the label validation check for masks, avoiding this
error.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index fa4ec2e..089b195 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -690,7 +690,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 			return -EINVAL;
 		}
 
-		if (ipv6_key->ipv6_label & htonl(0xFFF00000)) {
+		if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) {
 			OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n",
 				  ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
 			return -EINVAL;
-- 
cgit v1.1


From 01462405f0c093b2f8dfddafcadcda6c9e4c5cdf Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Wed, 19 Nov 2014 23:05:49 +0100
Subject: ipx: fix locking regression in ipx_sendmsg and ipx_recvmsg

This fixes an old regression introduced by commit
b0d0d915 (ipx: remove the BKL).

When a recvmsg syscall blocks waiting for new data, no data can be sent on the
same socket with sendmsg because ipx_recvmsg() sleeps with the socket locked.

This breaks mars-nwe (NetWare emulator):
- the ncpserv process reads the request using recvmsg
- ncpserv forks and spawns nwconn
- ncpserv calls a (blocking) recvmsg and waits for new requests
- nwconn deadlocks in sendmsg on the same socket

Commit b0d0d915 has simply replaced BKL locking with
lock_sock/release_sock. Unlike now, BKL got unlocked while
sleeping, so a blocking recvmsg did not block a concurrent
sendmsg.

Only keep the socket locked while actually working with the socket data and
release it prior to calling skb_recv_datagram().

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipx/af_ipx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 91729b8..1b095ca 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1764,6 +1764,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct ipxhdr *ipx = NULL;
 	struct sk_buff *skb;
 	int copied, rc;
+	bool locked = true;
 
 	lock_sock(sk);
 	/* put the autobinding in */
@@ -1790,6 +1791,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
+	release_sock(sk);
+	locked = false;
 	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
 				flags & MSG_DONTWAIT, &rc);
 	if (!skb) {
@@ -1826,7 +1829,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 out_free:
 	skb_free_datagram(sk, skb);
 out:
-	release_sock(sk);
+	if (locked)
+		release_sock(sk);
 	return rc;
 }
 
-- 
cgit v1.1


From e7820e39b7d19b9fe1928fc19de9361b44150ca6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2014 11:47:16 -0800
Subject: net: Revert "net: avoid one atomic operation in skb_clone()"

Not sure what I was thinking, but doing anything after
releasing a refcount is suicidal or/and embarrassing.

By the time we set skb->fclone to SKB_FCLONE_FREE, another cpu
could have released last reference and freed whole skb.

We potentially corrupt memory or trap if CONFIG_DEBUG_PAGEALLOC is set.

Reported-by: Chris Mason <clm@fb.com>
Fixes: ce1a4ea3f1258 ("net: avoid one atomic operation in skb_clone()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c16615b..32e31c2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -552,20 +552,13 @@ static void kfree_skbmem(struct sk_buff *skb)
 	case SKB_FCLONE_CLONE:
 		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 
-		/* Warning : We must perform the atomic_dec_and_test() before
-		 * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
-		 * skb_clone() could set clone_ref to 2 before our decrement.
-		 * Anyway, if we are going to free the structure, no need to
-		 * rewrite skb->fclone.
+		/* The clone portion is available for
+		 * fast-cloning again.
 		 */
-		if (atomic_dec_and_test(&fclones->fclone_ref)) {
+		skb->fclone = SKB_FCLONE_FREE;
+
+		if (atomic_dec_and_test(&fclones->fclone_ref))
 			kmem_cache_free(skbuff_fclone_cache, fclones);
-		} else {
-			/* The clone portion is available for
-			 * fast-cloning again.
-			 */
-			skb->fclone = SKB_FCLONE_FREE;
-		}
 		break;
 	}
 }
@@ -887,11 +880,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb->fclone == SKB_FCLONE_ORIG &&
 	    n->fclone == SKB_FCLONE_FREE) {
 		n->fclone = SKB_FCLONE_CLONE;
-		/* As our fastclone was free, clone_ref must be 1 at this point.
-		 * We could use atomic_inc() here, but it is faster
-		 * to set the final value.
-		 */
-		atomic_set(&fclones->fclone_ref, 2);
+		atomic_inc(&fclones->fclone_ref);
 	} else {
 		if (skb_pfmemalloc(skb))
 			gfp_mask |= __GFP_MEMALLOC;
-- 
cgit v1.1


From 0c228e833c88e3aa029250f5db77d5968c5ce5b5 Mon Sep 17 00:00:00 2001
From: Calvin Owens <calvinowens@fb.com>
Date: Thu, 20 Nov 2014 15:09:53 -0800
Subject: tcp: Restore RFC5961-compliant behavior for SYN packets

Commit c3ae62af8e755 ("tcp: should drop incoming frames without ACK
flag set") was created to mitigate a security vulnerability in which a
local attacker is able to inject data into locally-opened sockets by
using TCP protocol statistics in procfs to quickly find the correct
sequence number.

This broke the RFC5961 requirement to send a challenge ACK in response
to spurious RST packets, which was subsequently fixed by commit
7b514a886ba50 ("tcp: accept RST without ACK flag").

Unfortunately, the RFC5961 requirement that spurious SYN packets be
handled in a similar manner remains broken.

RFC5961 section 4 states that:

   ... the handling of the SYN in the synchronized state SHOULD be
   performed as follows:

   1) If the SYN bit is set, irrespective of the sequence number, TCP
      MUST send an ACK (also referred to as challenge ACK) to the remote
      peer:

      <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>

      After sending the acknowledgment, TCP MUST drop the unacceptable
      segment and stop processing further.

   By sending an ACK, the remote peer is challenged to confirm the loss
   of the previous connection and the request to start a new connection.
   A legitimate peer, after restart, would not have a TCB in the
   synchronized state.  Thus, when the ACK arrives, the peer should send
   a RST segment back with the sequence number derived from the ACK
   field that caused the RST.

   This RST will confirm that the remote peer has indeed closed the
   previous connection.  Upon receipt of a valid RST, the local TCP
   endpoint MUST terminate its connection.  The local TCP endpoint
   should then rely on SYN retransmission from the remote end to
   re-establish the connection.

This patch lets SYN packets through the discard added in c3ae62af8e755,
so that spurious SYN packets are properly dealt with as per the RFC.

The challenge ACK is sent unconditionally and is rate-limited, so the
original vulnerability is not reintroduced by this patch.

Signed-off-by: Calvin Owens <calvinowens@fb.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88fa2d1..d107ee2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5231,7 +5231,7 @@ slow_path:
 	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
 		goto csum_error;
 
-	if (!th->ack && !th->rst)
+	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
 	/*
@@ -5650,7 +5650,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			goto discard;
 	}
 
-	if (!th->ack && !th->rst)
+	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
 	if (!tcp_validate_incoming(sk, skb, th, 0))
-- 
cgit v1.1


From 860a0d9e511f278bedab62d555a457c18e0841d5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Tue, 28 Oct 2014 14:24:12 -0400
Subject: sunrpc: add some tracepoints in svc_rqst handling functions

...just around svc_send, svc_recv and svc_process for now.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/svc.c      | 21 ++++++++++++---------
 net/sunrpc/svc_xprt.c | 31 +++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index ca8a795..371a8bb 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -28,6 +28,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/bc_xprt.h>
 
+#include <trace/events/sunrpc.h>
+
 #define RPCDBG_FACILITY	RPCDBG_SVCDSP
 
 static void svc_unregister(const struct svc_serv *serv, struct net *net);
@@ -1314,24 +1316,25 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_res.tail[0].iov_base = NULL;
 	rqstp->rq_res.tail[0].iov_len = 0;
 
-	rqstp->rq_xid = svc_getu32(argv);
-
 	dir  = svc_getnl(argv);
 	if (dir != 0) {
 		/* direction != CALL */
 		svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
 		serv->sv_stats->rpcbadfmt++;
-		svc_drop(rqstp);
-		return 0;
+		goto out_drop;
 	}
 
 	/* Returns 1 for send, 0 for drop */
-	if (svc_process_common(rqstp, argv, resv))
-		return svc_send(rqstp);
-	else {
-		svc_drop(rqstp);
-		return 0;
+	if (likely(svc_process_common(rqstp, argv, resv))) {
+		int ret = svc_send(rqstp);
+
+		trace_svc_process(rqstp, ret);
+		return ret;
 	}
+out_drop:
+	trace_svc_process(rqstp, 0);
+	svc_drop(rqstp);
+	return 0;
 }
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c179ca2..bbb3b04 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -15,6 +15,7 @@
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprt.h>
 #include <linux/module.h>
+#include <trace/events/sunrpc.h>
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
@@ -773,35 +774,43 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 
 	err = svc_alloc_arg(rqstp);
 	if (err)
-		return err;
+		goto out;
 
 	try_to_freeze();
 	cond_resched();
+	err = -EINTR;
 	if (signalled() || kthread_should_stop())
-		return -EINTR;
+		goto out;
 
 	xprt = svc_get_next_xprt(rqstp, timeout);
-	if (IS_ERR(xprt))
-		return PTR_ERR(xprt);
+	if (IS_ERR(xprt)) {
+		err = PTR_ERR(xprt);
+		goto out;
+	}
 
 	len = svc_handle_xprt(rqstp, xprt);
 
 	/* No data, incomplete (TCP) read, or accept() */
+	err = -EAGAIN;
 	if (len <= 0)
-		goto out;
+		goto out_release;
 
 	clear_bit(XPT_OLD, &xprt->xpt_flags);
 
 	rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp);
 	rqstp->rq_chandle.defer = svc_defer;
+	rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]);
 
 	if (serv->sv_stats)
 		serv->sv_stats->netcnt++;
+	trace_svc_recv(rqstp, len);
 	return len;
-out:
+out_release:
 	rqstp->rq_res.len = 0;
 	svc_xprt_release(rqstp);
-	return -EAGAIN;
+out:
+	trace_svc_recv(rqstp, err);
+	return err;
 }
 EXPORT_SYMBOL_GPL(svc_recv);
 
@@ -821,12 +830,12 @@ EXPORT_SYMBOL_GPL(svc_drop);
 int svc_send(struct svc_rqst *rqstp)
 {
 	struct svc_xprt	*xprt;
-	int		len;
+	int		len = -EFAULT;
 	struct xdr_buf	*xb;
 
 	xprt = rqstp->rq_xprt;
 	if (!xprt)
-		return -EFAULT;
+		goto out;
 
 	/* release the receive skb before sending the reply */
 	rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
@@ -849,7 +858,9 @@ int svc_send(struct svc_rqst *rqstp)
 	svc_xprt_release(rqstp);
 
 	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
-		return 0;
+		len = 0;
+out:
+	trace_svc_send(rqstp, len);
 	return len;
 }
 
-- 
cgit v1.1


From 3705ad64f123271b2b88dbff0c9891b7b90299d2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Tue, 28 Oct 2014 14:24:13 -0400
Subject: sunrpc: add new tracepoints in xprt handling code

...so we can keep track of when calls are sent and replies received.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprt.c     | 9 ++++++++-
 net/sunrpc/xprtsock.c | 8 +++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 56e4e15..1b2e5e6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -49,6 +49,8 @@
 #include <linux/sunrpc/metrics.h>
 #include <linux/sunrpc/bc_xprt.h>
 
+#include <trace/events/sunrpc.h>
+
 #include "sunrpc.h"
 
 /*
@@ -772,11 +774,14 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 	struct rpc_rqst *entry;
 
 	list_for_each_entry(entry, &xprt->recv, rq_list)
-		if (entry->rq_xid == xid)
+		if (entry->rq_xid == xid) {
+			trace_xprt_lookup_rqst(xprt, xid, 0);
 			return entry;
+		}
 
 	dprintk("RPC:       xprt_lookup_rqst did not find xid %08x\n",
 			ntohl(xid));
+	trace_xprt_lookup_rqst(xprt, xid, -ENOENT);
 	xprt->stat.bad_xids++;
 	return NULL;
 }
@@ -810,6 +815,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 
 	dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
 			task->tk_pid, ntohl(req->rq_xid), copied);
+	trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
 
 	xprt->stat.recvs++;
 	req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
@@ -926,6 +932,7 @@ void xprt_transmit(struct rpc_task *task)
 
 	req->rq_xtime = ktime_get();
 	status = xprt->ops->send_request(task);
+	trace_xprt_transmit(xprt, req->rq_xid, status);
 	if (status != 0) {
 		task->tk_status = status;
 		return;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3b305ab..b63e262 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1454,12 +1454,15 @@ static void xs_tcp_data_ready(struct sock *sk)
 	struct rpc_xprt *xprt;
 	read_descriptor_t rd_desc;
 	int read;
+	unsigned long total = 0;
 
 	dprintk("RPC:       xs_tcp_data_ready...\n");
 
 	read_lock_bh(&sk->sk_callback_lock);
-	if (!(xprt = xprt_from_sock(sk)))
+	if (!(xprt = xprt_from_sock(sk))) {
+		read = 0;
 		goto out;
+	}
 	/* Any data means we had a useful conversation, so
 	 * the we don't need to delay the next reconnect
 	 */
@@ -1471,8 +1474,11 @@ static void xs_tcp_data_ready(struct sock *sk)
 	do {
 		rd_desc.count = 65536;
 		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+		if (read > 0)
+			total += read;
 	} while (read > 0);
 out:
+	trace_xs_tcp_data_ready(xprt, read, total);
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
-- 
cgit v1.1


From 1a867a0898b2e366a1eb5b7fe21413a2b2b1629f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Tue, 28 Oct 2014 14:24:14 -0400
Subject: sunrpc: add tracepoints in xs_tcp_data_recv

Add tracepoints inside the main loop on xs_tcp_data_recv that allow
us to keep an eye on what's happening during each phase of it.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 61 ++-------------------------------------------------
 1 file changed, 2 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b63e262..31c0151 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -216,65 +216,6 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
 }
 #endif
 
-struct sock_xprt {
-	struct rpc_xprt		xprt;
-
-	/*
-	 * Network layer
-	 */
-	struct socket *		sock;
-	struct sock *		inet;
-
-	/*
-	 * State of TCP reply receive
-	 */
-	__be32			tcp_fraghdr,
-				tcp_xid,
-				tcp_calldir;
-
-	u32			tcp_offset,
-				tcp_reclen;
-
-	unsigned long		tcp_copied,
-				tcp_flags;
-
-	/*
-	 * Connection of transports
-	 */
-	struct delayed_work	connect_worker;
-	struct sockaddr_storage	srcaddr;
-	unsigned short		srcport;
-
-	/*
-	 * UDP socket buffer size parameters
-	 */
-	size_t			rcvsize,
-				sndsize;
-
-	/*
-	 * Saved socket callback addresses
-	 */
-	void			(*old_data_ready)(struct sock *);
-	void			(*old_state_change)(struct sock *);
-	void			(*old_write_space)(struct sock *);
-	void			(*old_error_report)(struct sock *);
-};
-
-/*
- * TCP receive state flags
- */
-#define TCP_RCV_LAST_FRAG	(1UL << 0)
-#define TCP_RCV_COPY_FRAGHDR	(1UL << 1)
-#define TCP_RCV_COPY_XID	(1UL << 2)
-#define TCP_RCV_COPY_DATA	(1UL << 3)
-#define TCP_RCV_READ_CALLDIR	(1UL << 4)
-#define TCP_RCV_COPY_CALLDIR	(1UL << 5)
-
-/*
- * TCP RPC flags
- */
-#define TCP_RPC_REPLY		(1UL << 6)
-
 static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
 {
 	return (struct rpc_xprt *) sk->sk_user_data;
@@ -1415,6 +1356,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
 
 	dprintk("RPC:       xs_tcp_data_recv started\n");
 	do {
+		trace_xs_tcp_data_recv(transport);
 		/* Read in a new fragment marker if necessary */
 		/* Can we ever really expect to get completely empty fragments? */
 		if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
@@ -1439,6 +1381,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
 		/* Skip over any trailing bytes on short reads */
 		xs_tcp_read_discard(transport, &desc);
 	} while (desc.count);
+	trace_xs_tcp_data_recv(transport);
 	dprintk("RPC:       xs_tcp_data_recv done\n");
 	return len - desc.count;
 }
-- 
cgit v1.1


From f895b252d4edf66b2895fb5a7b17a638665f3e1f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 17 Nov 2014 16:58:04 -0500
Subject: sunrpc: eliminate RPC_DEBUG

It's always set to whatever CONFIG_SUNRPC_DEBUG is, so just use that.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/auth.c                       | 4 ++--
 net/sunrpc/auth_generic.c               | 2 +-
 net/sunrpc/auth_gss/auth_gss.c          | 2 +-
 net/sunrpc/auth_gss/gss_generic_token.c | 2 +-
 net/sunrpc/auth_gss/gss_krb5_crypto.c   | 2 +-
 net/sunrpc/auth_gss/gss_krb5_keys.c     | 2 +-
 net/sunrpc/auth_gss/gss_krb5_mech.c     | 2 +-
 net/sunrpc/auth_gss/gss_krb5_seal.c     | 2 +-
 net/sunrpc/auth_gss/gss_krb5_seqnum.c   | 2 +-
 net/sunrpc/auth_gss/gss_krb5_unseal.c   | 2 +-
 net/sunrpc/auth_gss/gss_krb5_wrap.c     | 2 +-
 net/sunrpc/auth_gss/gss_mech_switch.c   | 2 +-
 net/sunrpc/auth_gss/gss_rpc_xdr.h       | 2 +-
 net/sunrpc/auth_gss/svcauth_gss.c       | 2 +-
 net/sunrpc/auth_null.c                  | 4 ++--
 net/sunrpc/auth_unix.c                  | 2 +-
 net/sunrpc/backchannel_rqst.c           | 2 +-
 net/sunrpc/clnt.c                       | 6 +++---
 net/sunrpc/rpcb_clnt.c                  | 2 +-
 net/sunrpc/sched.c                      | 4 ++--
 net/sunrpc/sunrpc_syms.c                | 4 ++--
 net/sunrpc/svc.c                        | 2 +-
 net/sunrpc/sysctl.c                     | 2 +-
 net/sunrpc/xprt.c                       | 2 +-
 net/sunrpc/xprtrdma/rpc_rdma.c          | 4 ++--
 net/sunrpc/xprtrdma/transport.c         | 8 ++++----
 net/sunrpc/xprtrdma/verbs.c             | 8 ++++----
 net/sunrpc/xprtsock.c                   | 8 ++++----
 28 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 383eb91..47f38be 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -16,7 +16,7 @@
 #include <linux/sunrpc/gss_api.h>
 #include <linux/spinlock.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
@@ -646,7 +646,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 	cred->cr_auth = auth;
 	cred->cr_ops = ops;
 	cred->cr_expire = jiffies;
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	cred->cr_magic = RPCAUTH_CRED_MAGIC;
 #endif
 	cred->cr_uid = acred->uid;
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 6f6b829..41248b1 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -14,7 +14,7 @@
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/sched.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 53ed8d3..dace13d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -66,7 +66,7 @@ static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
 #define GSS_KEY_EXPIRE_TIMEO 240
 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO;
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index c586e92..254defe 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -38,7 +38,7 @@
 #include <linux/sunrpc/gss_asn1.h>
 
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index f5ed9f6..b5408e8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -45,7 +45,7 @@
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/sunrpc/xdr.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 24589bd..234fa8d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -61,7 +61,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/lcm.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 0d3c158..28db442 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -45,7 +45,7 @@
 #include <linux/crypto.h>
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 42768e5..1d74d65 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -64,7 +64,7 @@
 #include <linux/random.h>
 #include <linux/crypto.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 62ac90c..20d55c7 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -35,7 +35,7 @@
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/crypto.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index 6c981dd..dcf9515 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -62,7 +62,7 @@
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/crypto.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 4b614c6..ca7e92a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -35,7 +35,7 @@
 #include <linux/pagemap.h>
 #include <linux/crypto.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 92d5ab9..7063d85 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -46,7 +46,7 @@
 #include <linux/sunrpc/gss_api.h>
 #include <linux/sunrpc/clnt.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h
index 685a688..9d88c62 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.h
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h
@@ -25,7 +25,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index c548ab2..de856dd 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -51,7 +51,7 @@
 #include "gss_rpc_upcall.h"
 
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 712c123..c2a2b58 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/sunrpc/clnt.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
@@ -138,7 +138,7 @@ struct rpc_cred null_cred = {
 	.cr_ops		= &null_credops,
 	.cr_count	= ATOMIC_INIT(1),
 	.cr_flags	= 1UL << RPCAUTH_CRED_UPTODATE,
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	.cr_magic	= RPCAUTH_CRED_MAGIC,
 #endif
 };
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index d5d6923..4feda2d 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -25,7 +25,7 @@ struct unx_cred {
 
 #define UNX_WRITESLACK		(21 + (UNX_MAXNODENAME >> 2))
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 9761a0d..651f49a 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -27,7 +27,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <linux/export.h>
 #include <linux/sunrpc/bc_xprt.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 #define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9acd6ce..36c64ef 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -42,7 +42,7 @@
 #include "sunrpc.h"
 #include "netns.h"
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_CALL
 #endif
 
@@ -1396,7 +1396,7 @@ rpc_restart_call(struct rpc_task *task)
 }
 EXPORT_SYMBOL_GPL(rpc_restart_call);
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char *rpc_proc_name(const struct rpc_task *task)
 {
 	const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -2421,7 +2421,7 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int
 }
 EXPORT_SYMBOL_GPL(rpc_call_null);
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
 	printk(KERN_INFO "-pid- flgs status -client- --rqstp- "
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 1891a10..0520201 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -32,7 +32,7 @@
 
 #include "netns.h"
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_BIND
 #endif
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fe3441a..574b297 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -24,7 +24,7 @@
 
 #include "sunrpc.h"
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 #define RPCDBG_FACILITY		RPCDBG_SCHED
 #endif
 
@@ -258,7 +258,7 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key)
 	return 0;
 }
 
-#if defined(RPC_DEBUG) || defined(RPC_TRACEPOINTS)
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined(RPC_TRACEPOINTS)
 static void rpc_task_set_debuginfo(struct rpc_task *task)
 {
 	static atomic_t rpc_pid;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index cd30120..f632e47 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -97,7 +97,7 @@ init_sunrpc(void)
 	err = register_rpc_pipefs();
 	if (err)
 		goto out4;
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	rpc_register_sysctl();
 #endif
 	svc_init_xprt_sock();	/* svc sock transport */
@@ -123,7 +123,7 @@ cleanup_sunrpc(void)
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
 	unregister_pernet_subsys(&sunrpc_net_ops);
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	rpc_unregister_sysctl();
 #endif
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 371a8bb..2783fd8 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1042,7 +1042,7 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
 /*
  * dprintk the given error with the address of the client that caused it.
  */
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static __printf(2, 3)
 void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
 {
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index c99c58e..887f018 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -37,7 +37,7 @@ EXPORT_SYMBOL_GPL(nfsd_debug);
 unsigned int	nlm_debug;
 EXPORT_SYMBOL_GPL(nlm_debug);
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
 static struct ctl_table_header *sunrpc_table_header;
 static struct ctl_table sunrpc_table[];
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1b2e5e6..894d071 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -57,7 +57,7 @@
  * Local variables
  */
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_XPRT
 #endif
 
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 6166c98..df01d12 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -49,11 +49,11 @@
 
 #include <linux/highmem.h>
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
 	"pure inline",	/* no chunks */
 	" read chunk",	/* some argument via rdma read */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 6a4615d..ef58eba 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -55,7 +55,7 @@
 
 #include "xprt_rdma.h"
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
@@ -75,7 +75,7 @@ static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
                 int xprt_rdma_pad_optimize = 0;
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
 static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
@@ -705,7 +705,7 @@ static void __exit xprt_rdma_cleanup(void)
 	int rc;
 
 	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (sunrpc_table_header) {
 		unregister_sysctl_table(sunrpc_table_header);
 		sunrpc_table_header = NULL;
@@ -736,7 +736,7 @@ static int __init xprt_rdma_init(void)
 	dprintk("\tPadding %d\n\tMemreg %d\n",
 		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (!sunrpc_table_header)
 		sunrpc_table_header = register_sysctl_table(sunrpc_table);
 #endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 61c4129..b92b040 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -57,7 +57,7 @@
  * Globals/Macros
  */
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
@@ -313,7 +313,7 @@ rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
 	rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
 }
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char * const conn[] = {
 	"address resolved",
 	"address error",
@@ -344,7 +344,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	struct rpcrdma_xprt *xprt = id->context;
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct rpcrdma_ep *ep = &xprt->rx_ep;
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
 #endif
 	struct ib_qp_attr attr;
@@ -408,7 +408,7 @@ connected:
 		break;
 	}
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (connstate == 1) {
 		int ird = attr.max_dest_rd_atomic;
 		int tird = ep->rep_remote_cma.responder_resources;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 31c0151..87ce7e8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -75,7 +75,7 @@ static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
  * someone else's file names!
  */
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
 static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
@@ -186,7 +186,7 @@ static struct ctl_table sunrpc_table[] = {
  */
 #define XS_IDLE_DISC_TO		(5U * 60 * HZ)
 
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # undef  RPC_DEBUG_DATA
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
@@ -2991,7 +2991,7 @@ static struct xprt_class	xs_bc_tcp_transport = {
  */
 int init_socket_xprt(void)
 {
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (!sunrpc_table_header)
 		sunrpc_table_header = register_sysctl_table(sunrpc_table);
 #endif
@@ -3010,7 +3010,7 @@ int init_socket_xprt(void)
  */
 void cleanup_socket_xprt(void)
 {
-#ifdef RPC_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	if (sunrpc_table_header) {
 		unregister_sysctl_table(sunrpc_table_header);
 		sunrpc_table_header = NULL;
-- 
cgit v1.1


From 1306729b0d4f4a0bd0d098711ed3d938dc5a1a28 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 17 Nov 2014 16:58:05 -0500
Subject: sunrpc: eliminate RPC_TRACEPOINTS

It's always set to the same value as CONFIG_TRACEPOINTS, so we can just
use that instead.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 574b297..d20f232 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -258,7 +258,7 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key)
 	return 0;
 }
 
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined(RPC_TRACEPOINTS)
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
 static void rpc_task_set_debuginfo(struct rpc_task *task)
 {
 	static atomic_t rpc_pid;
-- 
cgit v1.1


From 92b98361f119a6919061d067c6584de9ae2de9f4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:12 -0500
Subject: xprtrdma: Return an errno from rpcrdma_register_external()

The RPC/RDMA send_request method and the chunk registration code
expects an errno from the registration function. This allows
the upper layers to distinguish between a recoverable failure
(for example, temporary memory exhaustion) and a hard failure
(for example, a bug in the registration logic).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 61c4129..6ea2942 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1918,10 +1918,10 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		break;
 
 	default:
-		return -1;
+		return -EIO;
 	}
 	if (rc)
-		return -1;
+		return rc;
 
 	return nsegs;
 }
-- 
cgit v1.1


From e7104a2a96069975d489c60a30564372c6273a85 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:20 -0500
Subject: xprtrdma: Cap req_cqinit

Recent work made FRMR registration and invalidation completions
unsignaled. This greatly reduces the adapter interrupt rate.

Every so often, however, a posted send Work Request is allowed to
signal. Otherwise, the provider's Work Queue will wrap and the
workload will hang.

The number of Work Requests that are allowed to remain unsignaled is
determined by the value of req_cqinit. Currently, this is set to the
size of the send Work Queue divided by two, minus 1.

For FRMR, the send Work Queue is the maximum number of concurrent
RPCs (currently 32) times the maximum number of Work Requests an
RPC might use (currently 7, though some adapters may need more).

For mlx4, this is 224 entries. This leaves completion signaling
disabled for 111 send Work Requests.

Some providers hold back dispatching Work Requests until a CQE is
generated.  If completions are disabled, then no CQEs are generated
for quite some time, and that can stall the Work Queue.

I've seen this occur running xfstests generic/113 over NFSv4, where
eventually, posting a FAST_REG_MR Work Request fails with -ENOMEM
because the Work Queue has overflowed. The connection is dropped
and re-established.

Cap the rep_cqinit setting so completions are not left turned off
for too long.

BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=269
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c     | 4 +++-
 net/sunrpc/xprtrdma/xprt_rdma.h | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 6ea2942..af45cf3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -733,7 +733,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
 	/* set trigger for requesting send completion */
 	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
-	if (ep->rep_cqinit <= 2)
+	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
+		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
+	else if (ep->rep_cqinit <= 2)
 		ep->rep_cqinit = 0;
 	INIT_CQCOUNT(ep);
 	ep->rep_ia = ia;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ac7fc9a..b799041 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -97,6 +97,12 @@ struct rpcrdma_ep {
 	struct ib_wc		rep_recv_wcs[RPCRDMA_POLLSIZE];
 };
 
+/*
+ * Force a signaled SEND Work Request every so often,
+ * in case the provider needs to do some housekeeping.
+ */
+#define RPCRDMA_MAX_UNSIGNALED_SENDS	(32)
+
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
-- 
cgit v1.1


From 467c9674bccc073684ee34f4bd205cf1b135d76e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:29 -0500
Subject: xprtrdma: unmap all FMRs during transport disconnect

When using RPCRDMA_MTHCAFMR memory registration, after a few
transport disconnect / reconnect cycles, ib_map_phys_fmr() starts to
return EINVAL because the provider has exhausted its map pool.

Make sure that all FMRs are unmapped during transport disconnect,
and that ->send_request remarshals them during an RPC retransmit.
This resets the transport's MRs to ensure that none are leaked
during a disconnect.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/transport.c |  2 +-
 net/sunrpc/xprtrdma/verbs.c     | 42 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 6a4615d..cfe9a81 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -599,7 +599,7 @@ xprt_rdma_send_request(struct rpc_task *task)
 
 	if (req->rl_niovs == 0)
 		rc = rpcrdma_marshal_req(rqst);
-	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+	else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
 		rc = rpcrdma_marshal_chunks(rqst, 0);
 	if (rc < 0)
 		goto failed_marshal;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index af45cf3..3c88276 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -62,6 +62,7 @@
 #endif
 
 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
 
 /*
  * internal functions
@@ -868,8 +869,19 @@ retry:
 		rpcrdma_ep_disconnect(ep, ia);
 		rpcrdma_flush_cqs(ep);
 
-		if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+		switch (ia->ri_memreg_strategy) {
+		case RPCRDMA_FRMR:
 			rpcrdma_reset_frmrs(ia);
+			break;
+		case RPCRDMA_MTHCAFMR:
+			rpcrdma_reset_fmrs(ia);
+			break;
+		case RPCRDMA_ALLPHYSICAL:
+			break;
+		default:
+			rc = -EIO;
+			goto out;
+		}
 
 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 		id = rpcrdma_create_id(xprt, ia,
@@ -1289,6 +1301,34 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	kfree(buf->rb_pool);
 }
 
+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	LIST_HEAD(l);
+	int rc;
+
+	list_for_each(pos, &buf->rb_all) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+		INIT_LIST_HEAD(&l);
+		list_add(&r->r.fmr->list, &l);
+		rc = ib_unmap_fmr(&l);
+		if (rc)
+			dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
+				__func__, rc);
+	}
+}
+
 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
  * an unusable state. Find FRMRs in this state and dereg / reg
  * each.  FRMRs that are VALID and attached to an rpcrdma_req are
-- 
cgit v1.1


From f1a03b76fecdb23983e2ad7e817e98d093ece9f7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:37 -0500
Subject: xprtrdma: Refactor tasklet scheduling

Restore the separate function that schedules the reply handling
tasklet. I need to call it from two different paths.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3c88276..478b2fd 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -107,6 +107,17 @@ rpcrdma_run_tasklet(unsigned long data)
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 
 static void
+rpcrdma_schedule_tasklet(struct list_head *sched_list)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_splice_tail(sched_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;
@@ -244,7 +255,6 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 	struct list_head sched_list;
 	struct ib_wc *wcs;
 	int budget, count, rc;
-	unsigned long flags;
 
 	INIT_LIST_HEAD(&sched_list);
 	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
@@ -262,10 +272,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 	rc = 0;
 
 out_schedule:
-	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
-	list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
-	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-	tasklet_schedule(&rpcrdma_tasklet_g);
+	rpcrdma_schedule_tasklet(&sched_list);
 	return rc;
 }
 
-- 
cgit v1.1


From 5c166bef4fa138da45d756dc77a8cb29fced1714 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:45 -0500
Subject: xprtrdma: Re-write rpcrdma_flush_cqs()

Currently rpcrdma_flush_cqs() attempts to avoid code duplication,
and simply invokes rpcrdma_recvcq_upcall and rpcrdma_sendcq_upcall.

1. rpcrdma_flush_cqs() can run concurrently with provider upcalls.
   Both flush_cqs() and the upcalls were invoking ib_poll_cq() in
   different threads using the same wc buffers (ep->rep_recv_wcs
   and ep->rep_send_wcs), added by commit 1c00dd077654 ("xprtrmda:
   Reduce calls to ib_poll_cq() in completion handlers").

   During transport disconnect processing, this sometimes resulted
   in the same reply getting added to the rpcrdma_tasklets_g list
   more than once, which corrupted the list.

2. The upcall functions drain only a limited number of CQEs,
   thanks to the poll budget added by commit 8301a2c047cc
   ("xprtrdma: Limit work done by completion handler").

Fixes: a7bc211ac926 ("xprtrdma: On disconnect, don't ignore ... ")
BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=276
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 478b2fd..e6ac964 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -317,8 +317,15 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
 static void
 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
 {
-	rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
-	rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+	struct ib_wc wc;
+	LIST_HEAD(sched_list);
+
+	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
+		rpcrdma_recvcq_process_wc(&wc, &sched_list);
+	if (!list_empty(&sched_list))
+		rpcrdma_schedule_tasklet(&sched_list);
+	while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
+		rpcrdma_sendcq_process_wc(&wc);
 }
 
 #ifdef RPC_DEBUG
-- 
cgit v1.1


From d5440e27d3e572272db957d6b9661a902b7c339f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:14:53 -0500
Subject: xprtrdma: Enable pad optimization

The Linux NFS/RDMA server used to reject NFSv3 WRITE requests when
pad optimization was enabled. That bug was fixed by commit
e560e3b510d2 ("svcrdma: Add zero padding if the client doesn't send
it").

We can now enable pad optimization on the client, which helps
performance and is supported now by both Linux and Solaris servers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index cfe9a81..8ed2576 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,7 +73,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
-                int xprt_rdma_pad_optimize = 0;
+		int xprt_rdma_pad_optimize = 1;
 
 #ifdef RPC_DEBUG
 
-- 
cgit v1.1


From 7ff11de1bae02a41cac6503f858218ac1b9a3cbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:15:01 -0500
Subject: xprtrdma: Display async errors

An async error upcall is a hard error, and should be reported in
the system log.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e6ac964..5783c1a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -106,6 +106,32 @@ rpcrdma_run_tasklet(unsigned long data)
 
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 
+static const char * const async_event[] = {
+	"CQ error",
+	"QP fatal error",
+	"QP request error",
+	"QP access error",
+	"communication established",
+	"send queue drained",
+	"path migration successful",
+	"path mig error",
+	"device fatal error",
+	"port active",
+	"port error",
+	"LID change",
+	"P_key change",
+	"SM change",
+	"SRQ error",
+	"SRQ limit reached",
+	"last WQE reached",
+	"client reregister",
+	"GID change",
+};
+
+#define ASYNC_MSG(status)					\
+	((status) < ARRAY_SIZE(async_event) ?			\
+		async_event[(status)] : "unknown async error")
+
 static void
 rpcrdma_schedule_tasklet(struct list_head *sched_list)
 {
@@ -122,8 +148,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;
 
-	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
-		__func__, event->event, event->device->name, context);
+	pr_err("RPC:       %s: %s on device %s ep %p\n",
+	       __func__, ASYNC_MSG(event->event),
+		event->device->name, context);
 	if (ep->rep_connected == 1) {
 		ep->rep_connected = -EIO;
 		ep->rep_func(ep);
@@ -136,8 +163,9 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;
 
-	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
-		__func__, event->event, event->device->name, context);
+	pr_err("RPC:       %s: %s on device %s ep %p\n",
+	       __func__, ASYNC_MSG(event->event),
+		event->device->name, context);
 	if (ep->rep_connected == 1) {
 		ep->rep_connected = -EIO;
 		ep->rep_func(ep);
-- 
cgit v1.1


From edef1297f33a4546559d905457b435a5ea160bab Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 8 Nov 2014 20:15:09 -0500
Subject: SUNRPC: serialize iostats updates

Occasionally mountstats reports a negative retransmission rate.
Ensure that two RPCs completing concurrently don't confuse the sums
in the transport's op_metrics array.

Since pNFS filelayout can invoke rpc_count_iostats() on another
transport from xprt_release(), we can't rely on simply holding the
transport_lock in xprt_release(). There's nothing for it but hard
serialization. One spin lock per RPC operation should make this as
painless as it can be.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/stats.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 5453049..9711a15 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -116,7 +116,15 @@ EXPORT_SYMBOL_GPL(svc_seq_show);
  */
 struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
 {
-	return kcalloc(clnt->cl_maxproc, sizeof(struct rpc_iostats), GFP_KERNEL);
+	struct rpc_iostats *stats;
+	int i;
+
+	stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL);
+	if (stats) {
+		for (i = 0; i < clnt->cl_maxproc; i++)
+			spin_lock_init(&stats[i].om_lock);
+	}
+	return stats;
 }
 EXPORT_SYMBOL_GPL(rpc_alloc_iostats);
 
@@ -135,20 +143,21 @@ EXPORT_SYMBOL_GPL(rpc_free_iostats);
  * rpc_count_iostats - tally up per-task stats
  * @task: completed rpc_task
  * @stats: array of stat structures
- *
- * Relies on the caller for serialization.
  */
 void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_iostats *op_metrics;
-	ktime_t delta;
+	ktime_t delta, now;
 
 	if (!stats || !req)
 		return;
 
+	now = ktime_get();
 	op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx];
 
+	spin_lock(&op_metrics->om_lock);
+
 	op_metrics->om_ops++;
 	op_metrics->om_ntrans += req->rq_ntrans;
 	op_metrics->om_timeouts += task->tk_timeouts;
@@ -161,8 +170,10 @@ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)
 
 	op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
 
-	delta = ktime_sub(ktime_get(), task->tk_start);
+	delta = ktime_sub(now, task->tk_start);
 	op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta);
+
+	spin_unlock(&op_metrics->om_lock);
 }
 EXPORT_SYMBOL_GPL(rpc_count_iostats);
 
-- 
cgit v1.1


From b4b9d2ccf0be61c69213f6ae4e33377c05194ef4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Wed, 26 Nov 2014 14:44:43 -0500
Subject: sunrpc: add debugfs file for displaying client rpc_task queue

It's possible to get a dump of the RPC task queue by writing a value to
/proc/sys/sunrpc/rpc_debug. If you write any value to that file, you get
a dump of the RPC client task list into the log buffer. This is a rather
inconvenient interface however, and makes it hard to get immediate info
about the task queue.

Add a new directory hierarchy under debugfs:

    sunrpc/
        rpc_clnt/
            <clientid>/

Within each clientid directory we create a new "tasks" file that will
dump info similar to what shows up in the log buffer, but with a few
small differences -- we avoid printing raw kernel addresses in favor of
symbolic names and the XID is also displayed.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/Kconfig       |   1 +
 net/sunrpc/Makefile      |   1 +
 net/sunrpc/clnt.c        |  10 ++-
 net/sunrpc/debugfs.c     | 191 +++++++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sunrpc_syms.c |   8 ++
 5 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 net/sunrpc/debugfs.c

(limited to 'net')

diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 0754d0f..fb78117 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -35,6 +35,7 @@ config RPCSEC_GSS_KRB5
 config SUNRPC_DEBUG
 	bool "RPC: Enable dprintk debugging"
 	depends on SUNRPC && SYSCTL
+	select DEBUG_FS
 	help
 	  This option enables a sysctl-based debugging interface
 	  that is be used by the 'rpcdebug' utility to turn on or off
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index e5a7a1c..15e6f6c 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -14,6 +14,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    addr.o rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
+sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
 sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 36c64ef..05da12a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -305,6 +305,10 @@ static int rpc_client_register(struct rpc_clnt *clnt,
 	struct super_block *pipefs_sb;
 	int err;
 
+	err = rpc_clnt_debugfs_register(clnt);
+	if (err)
+		return err;
+
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
 		err = rpc_setup_pipedir(pipefs_sb, clnt);
@@ -331,6 +335,7 @@ err_auth:
 out:
 	if (pipefs_sb)
 		rpc_put_sb_net(net);
+	rpc_clnt_debugfs_unregister(clnt);
 	return err;
 }
 
@@ -670,6 +675,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
 
 	rpc_unregister_client(clnt);
 	__rpc_clnt_remove_pipedir(clnt);
+	rpc_clnt_debugfs_unregister(clnt);
 
 	/*
 	 * A new transport was created.  "clnt" therefore
@@ -771,6 +777,7 @@ rpc_free_client(struct rpc_clnt *clnt)
 			rcu_dereference(clnt->cl_xprt)->servername);
 	if (clnt->cl_parent != clnt)
 		parent = clnt->cl_parent;
+	rpc_clnt_debugfs_unregister(clnt);
 	rpc_clnt_remove_pipedir(clnt);
 	rpc_unregister_client(clnt);
 	rpc_free_iostats(clnt->cl_metrics);
@@ -1397,7 +1404,8 @@ rpc_restart_call(struct rpc_task *task)
 EXPORT_SYMBOL_GPL(rpc_restart_call);
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static const char *rpc_proc_name(const struct rpc_task *task)
+const char
+*rpc_proc_name(const struct rpc_task *task)
 {
 	const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
 
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
new file mode 100644
index 0000000..3d77456
--- /dev/null
+++ b/net/sunrpc/debugfs.c
@@ -0,0 +1,191 @@
+/**
+ * debugfs interface for sunrpc
+ *
+ * (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include "netns.h"
+
+static struct dentry *topdir;
+static struct dentry *rpc_clnt_dir;
+
+struct rpc_clnt_iter {
+	struct rpc_clnt	*clnt;
+	loff_t		pos;
+};
+
+static int
+tasks_show(struct seq_file *f, void *v)
+{
+	u32 xid = 0;
+	struct rpc_task *task = v;
+	struct rpc_clnt *clnt = task->tk_client;
+	const char *rpc_waitq = "none";
+
+	if (RPC_IS_QUEUED(task))
+		rpc_waitq = rpc_qname(task->tk_waitqueue);
+
+	if (task->tk_rqstp)
+		xid = be32_to_cpu(task->tk_rqstp->rq_xid);
+
+	seq_printf(f, "%5u %04x %6d 0x%x 0x%x %8ld %ps %sv%u %s a:%ps q:%s\n",
+		task->tk_pid, task->tk_flags, task->tk_status,
+		clnt->cl_clid, xid, task->tk_timeout, task->tk_ops,
+		clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
+		task->tk_action, rpc_waitq);
+	return 0;
+}
+
+static void *
+tasks_start(struct seq_file *f, loff_t *ppos)
+	__acquires(&clnt->cl_lock)
+{
+	struct rpc_clnt_iter *iter = f->private;
+	loff_t pos = *ppos;
+	struct rpc_clnt *clnt = iter->clnt;
+	struct rpc_task *task;
+
+	iter->pos = pos + 1;
+	spin_lock(&clnt->cl_lock);
+	list_for_each_entry(task, &clnt->cl_tasks, tk_task)
+		if (pos-- == 0)
+			return task;
+	return NULL;
+}
+
+static void *
+tasks_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	struct rpc_clnt_iter *iter = f->private;
+	struct rpc_clnt *clnt = iter->clnt;
+	struct rpc_task *task = v;
+	struct list_head *next = task->tk_task.next;
+
+	++iter->pos;
+	++*pos;
+
+	/* If there's another task on list, return it */
+	if (next == &clnt->cl_tasks)
+		return NULL;
+	return list_entry(next, struct rpc_task, tk_task);
+}
+
+static void
+tasks_stop(struct seq_file *f, void *v)
+	__releases(&clnt->cl_lock)
+{
+	struct rpc_clnt_iter *iter = f->private;
+	struct rpc_clnt *clnt = iter->clnt;
+
+	spin_unlock(&clnt->cl_lock);
+}
+
+static const struct seq_operations tasks_seq_operations = {
+	.start	= tasks_start,
+	.next	= tasks_next,
+	.stop	= tasks_stop,
+	.show	= tasks_show,
+};
+
+static int tasks_open(struct inode *inode, struct file *filp)
+{
+	int ret = seq_open_private(filp, &tasks_seq_operations,
+					sizeof(struct rpc_clnt_iter));
+
+	if (!ret) {
+		struct seq_file *seq = filp->private_data;
+		struct rpc_clnt_iter *iter = seq->private;
+
+		iter->clnt = inode->i_private;
+
+		if (!atomic_inc_not_zero(&iter->clnt->cl_count)) {
+			seq_release_private(inode, filp);
+			ret = -EINVAL;
+		}
+	}
+
+	return ret;
+}
+
+static int
+tasks_release(struct inode *inode, struct file *filp)
+{
+	struct seq_file *seq = filp->private_data;
+	struct rpc_clnt_iter *iter = seq->private;
+
+	rpc_release_client(iter->clnt);
+	return seq_release_private(inode, filp);
+}
+
+static const struct file_operations tasks_fops = {
+	.owner		= THIS_MODULE,
+	.open		= tasks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tasks_release,
+};
+
+int
+rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
+{
+	int len;
+	char name[9]; /* 8 for hex digits + NULL terminator */
+
+	/* Already registered? */
+	if (clnt->cl_debugfs)
+		return 0;
+
+	len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
+	if (len >= sizeof(name))
+		return -EINVAL;
+
+	/* make the per-client dir */
+	clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir);
+	if (!clnt->cl_debugfs)
+		return -ENOMEM;
+
+	/* make tasks file */
+	if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
+				 clnt, &tasks_fops)) {
+		debugfs_remove_recursive(clnt->cl_debugfs);
+		clnt->cl_debugfs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void
+rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt)
+{
+	debugfs_remove_recursive(clnt->cl_debugfs);
+	clnt->cl_debugfs = NULL;
+}
+
+void __exit
+sunrpc_debugfs_exit(void)
+{
+	debugfs_remove_recursive(topdir);
+}
+
+int __init
+sunrpc_debugfs_init(void)
+{
+	topdir = debugfs_create_dir("sunrpc", NULL);
+	if (!topdir)
+		goto out;
+
+	rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
+	if (!rpc_clnt_dir)
+		goto out_remove;
+
+	return 0;
+out_remove:
+	debugfs_remove_recursive(topdir);
+	topdir = NULL;
+out:
+	return -ENOMEM;
+}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f632e47..e37fbed 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -97,6 +97,11 @@ init_sunrpc(void)
 	err = register_rpc_pipefs();
 	if (err)
 		goto out4;
+
+	err = sunrpc_debugfs_init();
+	if (err)
+		goto out5;
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	rpc_register_sysctl();
 #endif
@@ -104,6 +109,8 @@ init_sunrpc(void)
 	init_socket_xprt();	/* clnt sock transport */
 	return 0;
 
+out5:
+	unregister_rpc_pipefs();
 out4:
 	unregister_pernet_subsys(&sunrpc_net_ops);
 out3:
@@ -120,6 +127,7 @@ cleanup_sunrpc(void)
 	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	svc_cleanup_xprt_sock();
+	sunrpc_debugfs_exit();
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
 	unregister_pernet_subsys(&sunrpc_net_ops);
-- 
cgit v1.1


From 388f0c776781fe64ce951701bfe712b2182a31f2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Wed, 26 Nov 2014 14:44:44 -0500
Subject: sunrpc: add a debugfs rpc_xprt directory with an info file in it

Add a new directory heirarchy under the debugfs sunrpc/ directory:

    sunrpc/
        rpc_xprt/
            <xprt id>/

Within that directory, we can put files that give info about the
xprts. We do have the (minor) problem that there is no succinct,
unique identifier for rpc_xprts. So we generate them synthetically
with a static atomic_t counter.

For now, this directory just holds an "info" file, but we may add
other files to it in the future.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/debugfs.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++----
 net/sunrpc/xprt.c    |   8 ++++
 2 files changed, 116 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 3d77456..e811f39 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -11,6 +11,7 @@
 
 static struct dentry *topdir;
 static struct dentry *rpc_clnt_dir;
+static struct dentry *rpc_xprt_dir;
 
 struct rpc_clnt_iter {
 	struct rpc_clnt	*clnt;
@@ -131,8 +132,8 @@ static const struct file_operations tasks_fops = {
 int
 rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 {
-	int len;
-	char name[9]; /* 8 for hex digits + NULL terminator */
+	int len, err;
+	char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
 
 	/* Already registered? */
 	if (clnt->cl_debugfs)
@@ -148,14 +149,28 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 		return -ENOMEM;
 
 	/* make tasks file */
+	err = -ENOMEM;
 	if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
-				 clnt, &tasks_fops)) {
-		debugfs_remove_recursive(clnt->cl_debugfs);
-		clnt->cl_debugfs = NULL;
-		return -ENOMEM;
-	}
+				 clnt, &tasks_fops))
+		goto out_err;
+
+	err = -EINVAL;
+	rcu_read_lock();
+	len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
+			rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name);
+	rcu_read_unlock();
+	if (len >= sizeof(name))
+		goto out_err;
+
+	err = -ENOMEM;
+	if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name))
+		goto out_err;
 
 	return 0;
+out_err:
+	debugfs_remove_recursive(clnt->cl_debugfs);
+	clnt->cl_debugfs = NULL;
+	return err;
 }
 
 void
@@ -165,6 +180,88 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt)
 	clnt->cl_debugfs = NULL;
 }
 
+static int
+xprt_info_show(struct seq_file *f, void *v)
+{
+	struct rpc_xprt *xprt = f->private;
+
+	seq_printf(f, "netid: %s\n", xprt->address_strings[RPC_DISPLAY_NETID]);
+	seq_printf(f, "addr:  %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
+	seq_printf(f, "port:  %s\n", xprt->address_strings[RPC_DISPLAY_PORT]);
+	seq_printf(f, "state: 0x%lx\n", xprt->state);
+	return 0;
+}
+
+static int
+xprt_info_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct rpc_xprt *xprt = inode->i_private;
+
+	ret = single_open(filp, xprt_info_show, xprt);
+
+	if (!ret) {
+		if (!xprt_get(xprt)) {
+			single_release(inode, filp);
+			ret = -EINVAL;
+		}
+	}
+	return ret;
+}
+
+static int
+xprt_info_release(struct inode *inode, struct file *filp)
+{
+	struct rpc_xprt *xprt = inode->i_private;
+
+	xprt_put(xprt);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations xprt_info_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xprt_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= xprt_info_release,
+};
+
+int
+rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
+{
+	int len, id;
+	static atomic_t	cur_id;
+	char		name[9]; /* 8 hex digits + NULL term */
+
+	id = (unsigned int)atomic_inc_return(&cur_id);
+
+	len = snprintf(name, sizeof(name), "%x", id);
+	if (len >= sizeof(name))
+		return -EINVAL;
+
+	/* make the per-client dir */
+	xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir);
+	if (!xprt->debugfs)
+		return -ENOMEM;
+
+	/* make tasks file */
+	if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs,
+				 xprt, &xprt_info_fops)) {
+		debugfs_remove_recursive(xprt->debugfs);
+		xprt->debugfs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void
+rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
+{
+	debugfs_remove_recursive(xprt->debugfs);
+	xprt->debugfs = NULL;
+}
+
 void __exit
 sunrpc_debugfs_exit(void)
 {
@@ -182,6 +279,10 @@ sunrpc_debugfs_init(void)
 	if (!rpc_clnt_dir)
 		goto out_remove;
 
+	rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir);
+	if (!rpc_xprt_dir)
+		goto out_remove;
+
 	return 0;
 out_remove:
 	debugfs_remove_recursive(topdir);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 894d071..ebbefad 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1303,6 +1303,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
  */
 struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
+	int err;
 	struct rpc_xprt	*xprt;
 	struct xprt_class *t;
 
@@ -1343,6 +1344,12 @@ found:
 		return ERR_PTR(-ENOMEM);
 	}
 
+	err = rpc_xprt_debugfs_register(xprt);
+	if (err) {
+		xprt_destroy(xprt);
+		return ERR_PTR(err);
+	}
+
 	dprintk("RPC:       created transport %p with %u slots\n", xprt,
 			xprt->max_reqs);
 out:
@@ -1359,6 +1366,7 @@ static void xprt_destroy(struct rpc_xprt *xprt)
 	dprintk("RPC:       destroying transport %p\n", xprt);
 	del_timer_sync(&xprt->timer);
 
+	rpc_xprt_debugfs_unregister(xprt);
 	rpc_destroy_wait_queue(&xprt->binding);
 	rpc_destroy_wait_queue(&xprt->pending);
 	rpc_destroy_wait_queue(&xprt->sending);
-- 
cgit v1.1