Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking changes from David Miller: 1) GRE now works over ipv6, from Dmitry Kozlov. 2) Make SCTP more network namespace aware, from Eric Biederman. 3) TEAM driver now works with non-ethernet devices, from Jiri Pirko. 4) Make openvswitch network namespace aware, from Pravin B Shelar. 5) IPV6 NAT implementation, from Patrick McHardy. 6) Server side support for TCP Fast Open, from Jerry Chu and others. 7) Packet BPF filter supports MOD and XOR, from Eric Dumazet and Daniel Borkmann. 8) Increate the loopback default MTU to 64K, from Eric Dumazet. 9) Use a per-task rather than per-socket page fragment allocator for outgoing networking traffic. This benefits processes that have very many mostly idle sockets, which is quite common. From Eric Dumazet. 10) Use up to 32K for page fragment allocations, with fallbacks to smaller sizes when higher order page allocations fail. Benefits are a) less segments for driver to process b) less calls to page allocator c) less waste of space. From Eric Dumazet. 11) Allow GRO to be used on GRE tunnels, from Eric Dumazet. 12) VXLAN device driver, one way to handle VLAN issues such as the limitation of 4096 VLAN IDs yet still have some level of isolation. From Stephen Hemminger. 13) As usual there is a large boatload of driver changes, with the scale perhaps tilted towards the wireless side this time around. Fix up various fairly trivial conflicts, mostly caused by the user namespace changes. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1012 commits) hyperv: Add buffer for extended info after the RNDIS response message. hyperv: Report actual status in receive completion packet hyperv: Remove extra allocated space for recv_pkt_list elements hyperv: Fix page buffer handling in rndis_filter_send_request() hyperv: Fix the missing return value in rndis_filter_set_packet_filter() hyperv: Fix the max_xfer_size in RNDIS initialization vxlan: put UDP socket in correct namespace vxlan: Depend on CONFIG_INET sfc: Fix the reported priorities of different filter types sfc: Remove EFX_FILTER_FLAG_RX_OVERRIDE_IP sfc: Fix loopback self-test with separate_tx_channels=1 sfc: Fix MCDI structure field lookup sfc: Add parentheses around use of bitfield macro arguments sfc: Fix null function pointer in efx_sriov_channel_type vxlan: virtual extensible lan igmp: export symbol ip_mc_leave_group netlink: add attributes to fdb interface tg3: unconditionally select HWMON support when tg3 is enabled. Revert "net: ti cpsw ethernet: allow reading phy interface mode from DT" gre: fix sparse warning ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-02 13:38:27 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-02 13:38:27 -0700
commit: aecdc33e111b2c447b622e287c6003726daa1426 (patch)
tree: 3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /net/ipv4/netfilter/iptable_nat.c
parent: a20acf99f75e49271381d65db097c9763060a1e8 (diff)
parent: a3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff)
download: op-kernel-dev-aecdc33e111b2c447b622e287c6003726daa1426.zip
op-kernel-dev-aecdc33e111b2c447b622e287c6003726daa1426.tar.gz
1 files changed, 320 insertions, 0 deletions
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 0000000..9e0ffaf
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,320 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+	.name		= "nat",
+	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+};
+
+static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 */
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	unsigned int ret;
+
+	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
+	if (ret == NF_ACCEPT) {
+		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			ret = alloc_null_binding(ct, hooknum);
+	}
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_fn(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_nat *nat;
+	/* maniptype == SRC for postrouting. */
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	/* We never see fragments: conntrack defrags on pre-routing
+	 * and local-out, and nf_nat_out protects post-routing.
+	 */
+	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+	ct = nf_ct_get(skb, &ctinfo);
+	/* Can't track?  It's not due to stress, or conntrack would
+	 * have dropped it.  Hence it's the user's responsibilty to
+	 * packet filter it out, or implement conntrack/NAT for that
+	 * protocol. 8) --RR
+	 */
+	if (!ct)
+		return NF_ACCEPT;
+
+	/* Don't try to NAT if this packet is not conntracked */
+	if (nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (!nat) {
+		/* NAT module was loaded late. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL) {
+			pr_debug("failed to add NAT extension\n");
+			return NF_ACCEPT;
+		}
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			unsigned int ret;
+
+			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			if (ret != NF_ACCEPT)
+				return ret;
+		} else
+			pr_debug("Already setup manip %s for ct %p\n",
+				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+				 ct);
+		break;
+
+	default:
+		/* ESTABLISHED */
+		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+	}
+
+	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+}
+
+static unsigned int
+nf_nat_ipv4_in(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	__be32 daddr = ip_hdr(skb)->daddr;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    daddr != ip_hdr(skb)->daddr)
+		skb_dst_drop(skb);
+
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_out(unsigned int hooknum,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+#ifdef CONFIG_XFRM
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+#endif
+	unsigned int ret;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
+				ret = NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_local_fn(unsigned int hooknum,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int ret;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_in,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_out,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_local_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+};
+
+static int __net_init iptable_nat_net_init(struct net *net)
+{
+	struct ipt_replace *repl;
+
+	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+	if (repl == NULL)
+		return -ENOMEM;
+	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+	kfree(repl);
+	if (IS_ERR(net->ipv4.nat_table))
+		return PTR_ERR(net->ipv4.nat_table);
+	return 0;
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+	.init	= iptable_nat_net_init,
+	.exit	= iptable_nat_net_exit,
+};
+
+static int __init iptable_nat_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&iptable_nat_net_ops);
+	if (err < 0)
+		goto err1;
+
+	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	if (err < 0)
+		goto err2;
+	return 0;
+
+err2:
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+	return err;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-02 13:38:27 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-02 13:38:27 -0700
commit	aecdc33e111b2c447b622e287c6003726daa1426 (patch)
tree	3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /net/ipv4/netfilter/iptable_nat.c
parent	a20acf99f75e49271381d65db097c9763060a1e8 (diff)
parent	a3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff)
download	op-kernel-dev-aecdc33e111b2c447b622e287c6003726daa1426.zip op-kernel-dev-aecdc33e111b2c447b622e287c6003726daa1426.tar.gz