author		Linus Torvalds <torvalds@linux-foundation.org>	2012-10-02 13:38:27 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-02 13:38:27 -0700
commit		aecdc33e111b2c447b622e287c6003726daa1426 (patch)
tree		3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /net/core
parent		a20acf99f75e49271381d65db097c9763060a1e8 (diff)
parent		a3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
1) GRE now works over ipv6, from Dmitry Kozlov.
2) Make SCTP more network namespace aware, from Eric Biederman.
3) TEAM driver now works with non-ethernet devices, from Jiri Pirko.
4) Make openvswitch network namespace aware, from Pravin B Shelar.
5) IPV6 NAT implementation, from Patrick McHardy (a sketch of the new
   checksum helper it relies on follows the summary below).
6) Server side support for TCP Fast Open, from Jerry Chu and others
   (sketched below).
7) Packet BPF filter supports MOD and XOR, from Eric Dumazet and Daniel
   Borkmann (sketched below).
8) Increase the loopback default MTU to 64K, from Eric Dumazet.
9) Use a per-task rather than per-socket page fragment allocator for
   outgoing networking traffic. This benefits processes that have very
   many mostly idle sockets, which is quite common (the refill pattern
   is sketched below).
   From Eric Dumazet.
10) Use up to 32K for page fragment allocations, with fallbacks to
    smaller sizes when higher order page allocations fail. Benefits are:
    (a) fewer segments for the driver to process, (b) fewer calls to the
    page allocator, and (c) less wasted space.
    From Eric Dumazet.
11) Allow GRO to be used on GRE tunnels, from Eric Dumazet.
12) VXLAN device driver, one way to handle VLAN issues such as the
limitation of 4096 VLAN IDs yet still have some level of isolation.
From Stephen Hemminger.
13) As usual there is a large boatload of driver changes, with the scale
perhaps tilted towards the wireless side this time around.
Fix up various fairly trivial conflicts, mostly caused by the user
namespace changes.
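Item 5's IPv6 NAT work leans on a new checksum helper,
inet_proto_csum_replace16(), added in net/core/utils.c in the diff below.
A minimal sketch of a caller, assuming a hypothetical demo_nat66_dnat()
that rewrites a TCP packet's destination address; only the helper itself
comes from this pull:

/* Hedged sketch: incrementally patch th->check while rewriting an IPv6
 * destination address. The final argument (pseudohdr=1) says the address
 * participates in the pseudo-header checksum.
 */
static void demo_nat66_dnat(struct sk_buff *skb, struct ipv6hdr *ip6h,
			    struct tcphdr *th,
			    const struct in6_addr *new_daddr)
{
	inet_proto_csum_replace16(&th->check, skb,
				  ip6h->daddr.s6_addr32,
				  new_daddr->s6_addr32, 1);
	ip6h->daddr = *new_daddr;
}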
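Item 6 adds the server half of TCP Fast Open. A minimal userspace sketch
of how a listener might opt in, assuming the TCP_FASTOPEN socket option
this series introduces (the port and queue-length values are illustrative,
and error handling is trimmed):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23		/* value from this series' headers */
#endif

int make_tfo_listener(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int qlen = 16;	/* illustrative cap on pending TFO requests */
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port   = htons(port),
	};

	bind(fd, (struct sockaddr *)&sa, sizeof(sa));
	/* Must precede listen(); clients may then send data in the SYN */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
	listen(fd, 128);
	return fd;
}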
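Item 7 extends the classic BPF interpreter with MOD and XOR. A userspace
sketch of a socket filter exercising both, assuming the BPF_MOD/BPF_XOR
opcode macros this series adds to linux/filter.h (the keep-one-in-four
sampling policy is made up purely for illustration):

#include <linux/filter.h>
#include <sys/socket.h>

#ifndef BPF_MOD
#define BPF_MOD	0x90	/* from this series' headers */
#endif
#ifndef BPF_XOR
#define BPF_XOR	0xa0	/* from this series' headers */
#endif

static struct sock_filter sample_prog[] = {
	BPF_STMT(BPF_LD  | BPF_W | BPF_LEN, 0),		/* A = skb->len */
	BPF_STMT(BPF_ALU | BPF_XOR | BPF_K, 0x5a),	/* A ^= 0x5a    */
	BPF_STMT(BPF_ALU | BPF_MOD | BPF_K, 4),		/* A %= 4       */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),	/* keep if A==0 */
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),		/* keep packet  */
	BPF_STMT(BPF_RET | BPF_K, 0),			/* drop packet  */
};

static int attach_sampler(int fd)
{
	struct sock_fprog prog = {
		.len	= sizeof(sample_prog) / sizeof(sample_prog[0]),
		.filter	= sample_prog,
	};
	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}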
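Items 9 and 10 replace the old per-socket sk_sndmsg_page with a struct
page_frag plus sk_page_frag_refill() (see the net/core/sock.c and
net/core/skbuff.c hunks below). A hedged kernel-style sketch of the
consumer pattern; demo_copy_to_frag() is hypothetical, while
sk_page_frag() and sk_page_frag_refill() are the helpers this pull adds:

/* Refill the socket's cached fragment (up to 32KB, falling back to
 * smaller orders under memory pressure), copy into it, and advance the
 * offset so the remainder is reused on the next call.
 */
static int demo_copy_to_frag(struct sock *sk, const char *data, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* memory pressure was signalled */

	len = min_t(int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, data, len);
	pfrag->offset += len;
	return len;
}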
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1012 commits)
hyperv: Add buffer for extended info after the RNDIS response message.
hyperv: Report actual status in receive completion packet
hyperv: Remove extra allocated space for recv_pkt_list elements
hyperv: Fix page buffer handling in rndis_filter_send_request()
hyperv: Fix the missing return value in rndis_filter_set_packet_filter()
hyperv: Fix the max_xfer_size in RNDIS initialization
vxlan: put UDP socket in correct namespace
vxlan: Depend on CONFIG_INET
sfc: Fix the reported priorities of different filter types
sfc: Remove EFX_FILTER_FLAG_RX_OVERRIDE_IP
sfc: Fix loopback self-test with separate_tx_channels=1
sfc: Fix MCDI structure field lookup
sfc: Add parentheses around use of bitfield macro arguments
sfc: Fix null function pointer in efx_sriov_channel_type
vxlan: virtual extensible lan
igmp: export symbol ip_mc_leave_group
netlink: add attributes to fdb interface
tg3: unconditionally select HWMON support when tg3 is enabled.
Revert "net: ti cpsw ethernet: allow reading phy interface mode from DT"
gre: fix sparse warning
...
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/dev.c			| 109
-rw-r--r--	net/core/dev_addr_lists.c	|  40
-rw-r--r--	net/core/dst.c			|   2
-rw-r--r--	net/core/ethtool.c		|  12
-rw-r--r--	net/core/fib_rules.c		|   6
-rw-r--r--	net/core/filter.c		|  27
-rw-r--r--	net/core/link_watch.c		|   8
-rw-r--r--	net/core/neighbour.c		|   8
-rw-r--r--	net/core/net-sysfs.c		|  18
-rw-r--r--	net/core/netpoll.c		|   5
-rw-r--r--	net/core/netprio_cgroup.c	|  41
-rw-r--r--	net/core/request_sock.c		|  95
-rw-r--r--	net/core/rtnetlink.c		|  38
-rw-r--r--	net/core/scm.c			|  17
-rw-r--r--	net/core/secure_seq.c		|   1
-rw-r--r--	net/core/skbuff.c		|  86
-rw-r--r--	net/core/sock.c			|  64
-rw-r--r--	net/core/sock_diag.c		|   3
-rw-r--r--	net/core/utils.c		|  20
19 files changed, 394 insertions(+), 206 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 17e912f..1e0a184 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -959,18 +959,30 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);
 
-static int dev_get_valid_name(struct net_device *dev, const char *name)
+static int dev_alloc_name_ns(struct net *net,
+			     struct net_device *dev,
+			     const char *name)
 {
-	struct net *net;
+	char buf[IFNAMSIZ];
+	int ret;
 
-	BUG_ON(!dev_net(dev));
-	net = dev_net(dev);
+	ret = __dev_alloc_name(net, name, buf);
+	if (ret >= 0)
+		strlcpy(dev->name, buf, IFNAMSIZ);
+	return ret;
+}
+
+static int dev_get_valid_name(struct net *net,
+			      struct net_device *dev,
+			      const char *name)
+{
+	BUG_ON(!net);
 
 	if (!dev_valid_name(name))
 		return -EINVAL;
 
 	if (strchr(name, '%'))
-		return dev_alloc_name(dev, name);
+		return dev_alloc_name_ns(net, dev, name);
 	else if (__dev_get_by_name(net, name))
 		return -EEXIST;
 	else if (dev->name != name)
@@ -1006,7 +1018,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
-	err = dev_get_valid_name(dev, newname);
+	err = dev_get_valid_name(net, dev, newname);
 	if (err < 0)
 		return err;
 
@@ -1109,11 +1121,23 @@ void netdev_state_change(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_state_change);
 
-int netdev_bonding_change(struct net_device *dev, unsigned long event)
+/**
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netdev_notify_peers(struct net_device *dev)
 {
-	return call_netdevice_notifiers(event, dev);
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+	rtnl_unlock();
 }
-EXPORT_SYMBOL(netdev_bonding_change);
+EXPORT_SYMBOL(netdev_notify_peers);
 
 /**
  *	dev_load - load a network module
@@ -1394,7 +1418,6 @@ rollback:
 			nb->notifier_call(nb, NETDEV_DOWN, dev);
 		}
 		nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
-		nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
 	}
 }
 
@@ -1436,7 +1459,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 				nb->notifier_call(nb, NETDEV_DOWN, dev);
 			}
 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
-			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
 		}
 	}
 unlock:
@@ -2175,9 +2197,7 @@ EXPORT_SYMBOL(netif_skb_features);
 /*
  * Returns true if either:
  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
- *	2. skb is fragmented and the device does not support SG, or if
- *	   at least one of fragments is in highmem and device does not
- *	   support DMA from it.
+ *	2. skb is fragmented and the device does not support SG.
  */
 static inline int skb_needs_linearize(struct sk_buff *skb,
 				      int features)
@@ -2206,9 +2226,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 			skb_dst_drop(skb);
 
-		if (!list_empty(&ptype_all))
-			dev_queue_xmit_nit(skb, dev);
-
 		features = netif_skb_features(skb);
 
 		if (vlan_tx_tag_present(skb) &&
@@ -2243,6 +2260,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			}
 		}
 
+		if (!list_empty(&ptype_all))
+			dev_queue_xmit_nit(skb, dev);
+
 		skb_len = skb->len;
 		rc = ops->ndo_start_xmit(skb, dev);
 		trace_net_dev_xmit(skb, rc, dev, skb_len);
@@ -2265,6 +2285,9 @@ gso:
 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 			skb_dst_drop(nskb);
 
+		if (!list_empty(&ptype_all))
+			dev_queue_xmit_nit(nskb, dev);
+
 		skb_len = nskb->len;
 		rc = ops->ndo_start_xmit(nskb, dev);
 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
@@ -2374,8 +2397,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 #endif
 }
 
-static struct netdev_queue *dev_pick_tx(struct net_device *dev,
-					struct sk_buff *skb)
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+				    struct sk_buff *skb)
 {
 	int queue_index;
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -2549,7 +2572,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 	skb_update_prio(skb);
 
-	txq = dev_pick_tx(dev, skb);
+	txq = netdev_pick_tx(dev, skb);
 	q = rcu_dereference_bh(txq->qdisc);
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -2622,6 +2645,8 @@ EXPORT_SYMBOL(dev_queue_xmit);
   =======================================================================*/
 
 int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */
@@ -5239,12 +5264,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
  */
 static int dev_new_index(struct net *net)
 {
-	static int ifindex;
+	int ifindex = net->ifindex;
 	for (;;) {
 		if (++ifindex <= 0)
 			ifindex = 1;
 		if (!__dev_get_by_index(net, ifindex))
-			return ifindex;
+			return net->ifindex = ifindex;
 	}
 }
 
@@ -5322,10 +5347,6 @@ static void rollback_registered_many(struct list_head *head)
 		netdev_unregister_kobject(dev);
 	}
 
-	/* Process any work delayed until the end of the batch */
-	dev = list_first_entry(head, struct net_device, unreg_list);
-	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
-
 	synchronize_net();
 
 	list_for_each_entry(dev, head, unreg_list)
@@ -5583,7 +5604,7 @@ int register_netdevice(struct net_device *dev)
 
 	dev->iflink = -1;
 
-	ret = dev_get_valid_name(dev, dev->name);
+	ret = dev_get_valid_name(net, dev, dev->name);
 	if (ret < 0)
 		goto out;
 
@@ -5597,7 +5618,12 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
-	dev->ifindex = dev_new_index(net);
+	ret = -EBUSY;
+	if (!dev->ifindex)
+		dev->ifindex = dev_new_index(net);
+	else if (__dev_get_by_index(net, dev->ifindex))
+		goto err_uninit;
+
 	if (dev->iflink == -1)
 		dev->iflink = dev->ifindex;
 
@@ -5640,6 +5666,8 @@ int register_netdevice(struct net_device *dev)
 
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 
+	linkwatch_init_dev(dev);
+
 	dev_init_scheduler(dev);
 	dev_hold(dev);
 	list_netdevice(dev);
@@ -5773,9 +5801,12 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 			/* Rebroadcast unregister notification */
 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
-			 * should have already handle it the first time */
+
+			__rtnl_unlock();
+			rcu_barrier();
+			rtnl_lock();
+
+			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 				     &dev->state)) {
 				/* We must not have linkwatch events
@@ -5837,9 +5868,8 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
-	/* Wait for rcu callbacks to finish before attempting to drain
-	 * the device list. This usually avoids a 250ms wait.
-	 */
+
+	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
 
@@ -5848,6 +5878,10 @@ void netdev_run_todo(void)
 			= list_first_entry(&list, struct net_device, todo_list);
 		list_del(&dev->todo_list);
 
+		rtnl_lock();
+		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+		__rtnl_unlock();
+
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 			pr_err("network todo '%s' but state %d\n",
 			       dev->name, dev->reg_state);
@@ -5943,6 +5977,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 	return queue;
 }
 
+static const struct ethtool_ops default_ethtool_ops;
+
 /**
  *	alloc_netdev_mqs - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
@@ -6030,6 +6066,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	strcpy(dev->name, name);
 	dev->group = INIT_NETDEV_GROUP;
+	if (!dev->ethtool_ops)
+		dev->ethtool_ops = &default_ethtool_ops;
 	return dev;
 
 free_all:
@@ -6214,7 +6252,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 		/* We get here if we can't use the current device name */
 		if (!pat)
 			goto out;
-		if (dev_get_valid_name(dev, pat) < 0)
+		if (dev_get_valid_name(net, dev, pat) < 0)
 			goto out;
 	}
 
@@ -6242,7 +6280,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	   the device is just moving and can keep their slaves up.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+	rcu_barrier();
+	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
 	/*
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index c4cc2bc..87cc17d 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -22,7 +22,7 @@
  */
 
 static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
-			       unsigned char *addr, int addr_len,
+			       const unsigned char *addr, int addr_len,
 			       unsigned char addr_type, bool global)
 {
 	struct netdev_hw_addr *ha;
@@ -46,7 +46,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
 }
 
 static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
-			    unsigned char *addr, int addr_len,
+			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global)
 {
 	struct netdev_hw_addr *ha;
@@ -72,14 +72,15 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
 	return __hw_addr_create_ex(list, addr, addr_len, addr_type, global);
 }
 
-static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
-			 int addr_len, unsigned char addr_type)
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+			 const unsigned char *addr, int addr_len,
+			 unsigned char addr_type)
 {
 	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
 }
 
 static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
-			    unsigned char *addr, int addr_len,
+			    const unsigned char *addr, int addr_len,
 			    unsigned char addr_type, bool global)
 {
 	struct netdev_hw_addr *ha;
@@ -104,8 +105,9 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
 	return -ENOENT;
 }
 
-static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
-			 int addr_len, unsigned char addr_type)
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+			 const unsigned char *addr, int addr_len,
+			 unsigned char addr_type)
 {
 	return __hw_addr_del_ex(list, addr, addr_len, addr_type, false);
 }
@@ -278,7 +280,7 @@ EXPORT_SYMBOL(dev_addr_init);
  *
  *	The caller must hold the rtnl_mutex.
  */
-int dev_addr_add(struct net_device *dev, unsigned char *addr,
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type)
 {
 	int err;
@@ -303,7 +305,7 @@ EXPORT_SYMBOL(dev_addr_add);
  *
  *	The caller must hold the rtnl_mutex.
  */
-int dev_addr_del(struct net_device *dev, unsigned char *addr,
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type)
 {
 	int err;
@@ -390,7 +392,7 @@ EXPORT_SYMBOL(dev_addr_del_multiple);
  *	@dev: device
  *	@addr: address to add
  */
-int dev_uc_add_excl(struct net_device *dev, unsigned char *addr)
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
 	struct netdev_hw_addr *ha;
 	int err;
@@ -421,7 +423,7 @@ EXPORT_SYMBOL(dev_uc_add_excl);
  *	Add a secondary unicast address to the device or increase
  *	the reference count if it already exists.
  */
-int dev_uc_add(struct net_device *dev, unsigned char *addr)
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
 {
 	int err;
@@ -443,7 +445,7 @@ EXPORT_SYMBOL(dev_uc_add);
  *	Release reference to a secondary unicast address and remove it
  *	from the device if the reference count drops to zero.
  */
-int dev_uc_del(struct net_device *dev, unsigned char *addr)
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
 {
 	int err;
@@ -543,7 +545,7 @@ EXPORT_SYMBOL(dev_uc_init);
  *	@dev: device
  *	@addr: address to add
  */
-int dev_mc_add_excl(struct net_device *dev, unsigned char *addr)
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
 {
 	struct netdev_hw_addr *ha;
 	int err;
@@ -566,7 +568,7 @@ out:
 }
 EXPORT_SYMBOL(dev_mc_add_excl);
 
-static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
 			bool global)
 {
 	int err;
@@ -587,7 +589,7 @@ static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
  *	Add a multicast address to the device or increase
  *	the reference count if it already exists.
  */
-int dev_mc_add(struct net_device *dev, unsigned char *addr)
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
 {
 	return __dev_mc_add(dev, addr, false);
 }
@@ -600,13 +602,13 @@ EXPORT_SYMBOL(dev_mc_add);
  *
  *	Add a global multicast address to the device.
  */
-int dev_mc_add_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
 {
 	return __dev_mc_add(dev, addr, true);
 }
 EXPORT_SYMBOL(dev_mc_add_global);
 
-static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
 			bool global)
 {
 	int err;
@@ -628,7 +630,7 @@ static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
  *	Release reference to a multicast address and remove it
  *	from the device if the reference count drops to zero.
  */
-int dev_mc_del(struct net_device *dev, unsigned char *addr)
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
 {
 	return __dev_mc_del(dev, addr, false);
 }
@@ -642,7 +644,7 @@ EXPORT_SYMBOL(dev_mc_del);
  *	Release reference to a multicast address and remove it
  *	from the device if the reference count drops to zero.
  */
-int dev_mc_del_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
 {
 	return __dev_mc_del(dev, addr, true);
 }
diff --git a/net/core/dst.c b/net/core/dst.c
index b8d7c70..ee6153e 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -374,7 +374,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
 	struct dst_entry *dst, *last = NULL;
 
 	switch (event) {
-	case NETDEV_UNREGISTER:
+	case NETDEV_UNREGISTER_FINAL:
 	case NETDEV_DOWN:
 		mutex_lock(&dst_gc_mutex);
 		for (dst = dst_busy_list; dst; dst = dst->next) {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index cbf033d..4d64cc2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1426,18 +1426,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
-	if (!dev->ethtool_ops) {
-		/* A few commands do not require any driver support,
-		 * are unprivileged, and do not change anything, so we
-		 * can take a shortcut to them. */
-		if (ethcmd == ETHTOOL_GDRVINFO)
-			return ethtool_get_drvinfo(dev, useraddr);
-		else if (ethcmd == ETHTOOL_GET_TS_INFO)
-			return ethtool_get_ts_info(dev, useraddr);
-		else
-			return -EOPNOTSUPP;
-	}
-
 	/* Allow some commands to be done by anyone */
 	switch (ethcmd) {
 	case ETHTOOL_GSET:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ab7db83..58a4ba2 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -402,7 +402,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (unresolved)
 		ops->unresolved_rules++;
 
-	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
 	flush_route_cache(ops);
 	rules_ops_put(ops);
 	return 0;
@@ -500,7 +500,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		}
 
 		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
-				   NETLINK_CB(skb).pid);
+				   NETLINK_CB(skb).portid);
 		if (ops->delete)
 			ops->delete(rule);
 		fib_rule_put(rule);
@@ -601,7 +601,7 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
 		if (idx < cb->args[1])
 			goto skip;
 
-		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
 				     cb->nlh->nlmsg_seq, RTM_NEWRULE,
 				     NLM_F_MULTI, ops) < 0)
 			break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 907efd2..3d92ebb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -167,6 +167,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 		case BPF_S_ALU_DIV_K:
 			A = reciprocal_divide(A, K);
 			continue;
+		case BPF_S_ALU_MOD_X:
+			if (X == 0)
+				return 0;
+			A %= X;
+			continue;
+		case BPF_S_ALU_MOD_K:
+			A %= K;
+			continue;
 		case BPF_S_ALU_AND_X:
 			A &= X;
 			continue;
@@ -179,6 +187,13 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 		case BPF_S_ALU_OR_K:
 			A |= K;
 			continue;
+		case BPF_S_ANC_ALU_XOR_X:
+		case BPF_S_ALU_XOR_X:
+			A ^= X;
+			continue;
+		case BPF_S_ALU_XOR_K:
+			A ^= K;
+			continue;
 		case BPF_S_ALU_LSH_X:
 			A <<= X;
 			continue;
@@ -326,9 +341,6 @@ load_b:
 		case BPF_S_ANC_CPU:
 			A = raw_smp_processor_id();
 			continue;
-		case BPF_S_ANC_ALU_XOR_X:
-			A ^= X;
-			continue;
 		case BPF_S_ANC_NLATTR: {
 			struct nlattr *nla;
@@ -469,10 +481,14 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K,
 		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X,
 		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X,
+		[BPF_ALU|BPF_MOD|BPF_K]  = BPF_S_ALU_MOD_K,
+		[BPF_ALU|BPF_MOD|BPF_X]  = BPF_S_ALU_MOD_X,
 		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K,
 		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X,
 		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K,
 		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X,
+		[BPF_ALU|BPF_XOR|BPF_K]  = BPF_S_ALU_XOR_K,
+		[BPF_ALU|BPF_XOR|BPF_X]  = BPF_S_ALU_XOR_X,
 		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K,
 		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X,
 		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K,
@@ -531,6 +547,11 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 				return -EINVAL;
 			ftest->k = reciprocal_value(ftest->k);
 			break;
+		case BPF_S_ALU_MOD_K:
+			/* check for division by zero */
+			if (ftest->k == 0)
+				return -EINVAL;
+			break;
 		case BPF_S_LD_MEM:
 		case BPF_S_LDX_MEM:
 		case BPF_S_ST:
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 8e397a6..8f82a5c 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -76,6 +76,14 @@ static void rfc2863_policy(struct net_device *dev)
 }
 
 
+void linkwatch_init_dev(struct net_device *dev)
+{
+	/* Handle pre-registration link state changes */
+	if (!netif_carrier_ok(dev) || netif_dormant(dev))
+		rfc2863_policy(dev);
+}
+
+
 static bool linkwatch_urgent_event(struct net_device *dev)
 {
 	if (!netif_running(dev))
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 112c6e2..baca771 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2102,7 +2102,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tidx < tbl_skip || (family && tbl->family != family))
 			continue;
 
-		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
+		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
 				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
 				       NLM_F_MULTI) <= 0)
 			break;
@@ -2115,7 +2115,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 				goto next;
 
 			if (neightbl_fill_param_info(skb, tbl, p,
-						     NETLINK_CB(cb->skb).pid,
+						     NETLINK_CB(cb->skb).portid,
 						     cb->nlh->nlmsg_seq,
 						     RTM_NEWNEIGHTBL,
 						     NLM_F_MULTI) <= 0)
@@ -2244,7 +2244,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 				continue;
 			if (idx < s_idx)
 				goto next;
-			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH,
 					    NLM_F_MULTI) <= 0) {
@@ -2281,7 +2281,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			continue;
 			if (idx < s_idx)
 				goto next;
-			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWNEIGH, NLM_F_MULTI, tbl) <= 0) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7260717..bcf02f6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -166,9 +166,21 @@ static ssize_t show_duplex(struct device *dev,
 	if (netif_running(netdev)) {
 		struct ethtool_cmd cmd;
-		if (!__ethtool_get_settings(netdev, &cmd))
-			ret = sprintf(buf, "%s\n",
-				      cmd.duplex ? "full" : "half");
+		if (!__ethtool_get_settings(netdev, &cmd)) {
+			const char *duplex;
+			switch (cmd.duplex) {
+			case DUPLEX_HALF:
+				duplex = "half";
+				break;
+			case DUPLEX_FULL:
+				duplex = "full";
+				break;
+			default:
+				duplex = "unknown";
+				break;
+			}
+			ret = sprintf(buf, "%s\n", duplex);
+		}
 	}
 	rtnl_unlock();
 	return ret;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e4ba3e7..77a0388 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -328,7 +328,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
 		struct netdev_queue *txq;
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		txq = netdev_pick_tx(dev, skb);
 
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
@@ -380,6 +380,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 	struct udphdr *udph;
 	struct iphdr *iph;
 	struct ethhdr *eth;
+	static atomic_t ip_ident;
 
 	udp_len = len + sizeof(*udph);
 	ip_len = udp_len + sizeof(*iph);
@@ -415,7 +416,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 	put_unaligned(0x45, (unsigned char *)iph);
 	iph->tos      = 0;
 	put_unaligned(htons(ip_len), &(iph->tot_len));
-	iph->id       = 0;
+	iph->id       = htons(atomic_inc_return(&ip_ident));
 	iph->frag_off = 0;
 	iph->ttl      = 64;
 	iph->protocol = IPPROTO_UDP;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 39e7e4d..4a83fb3 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -73,7 +73,6 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len)
 			((sizeof(u32) * new_len));
 	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
 	struct netprio_map *old_priomap;
-	int i;
 
 	old_priomap  = rtnl_dereference(dev->priomap);
 
@@ -82,10 +81,10 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len)
 		return -ENOMEM;
 	}
 
-	for (i = 0;
-	     old_priomap && (i < old_priomap->priomap_len);
-	     i++)
-		new_priomap->priomap[i] = old_priomap->priomap[i];
+	if (old_priomap)
+		memcpy(new_priomap->priomap, old_priomap->priomap,
+		       old_priomap->priomap_len *
+		       sizeof(old_priomap->priomap[0]));
 
 	new_priomap->priomap_len = new_len;
 
@@ -109,32 +108,6 @@ static int write_update_netdev_table(struct net_device *dev)
 	return ret;
 }
 
-static int update_netdev_tables(void)
-{
-	int ret = 0;
-	struct net_device *dev;
-	u32 max_len;
-	struct netprio_map *map;
-
-	rtnl_lock();
-	max_len = atomic_read(&max_prioidx) + 1;
-	for_each_netdev(&init_net, dev) {
-		map = rtnl_dereference(dev->priomap);
-		/*
-		 * don't allocate priomap if we didn't
-		 * change net_prio.ifpriomap (map == NULL),
-		 * this will speed up skb_update_prio.
-		 */
-		if (map && map->priomap_len < max_len) {
-			ret = extend_netdev_table(dev, max_len);
-			if (ret < 0)
-				break;
-		}
-	}
-	rtnl_unlock();
-	return ret;
-}
-
 static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
 {
 	struct cgroup_netprio_state *cs;
@@ -153,12 +126,6 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
 		goto out;
 	}
 
-	ret = update_netdev_tables();
-	if (ret < 0) {
-		put_prioidx(cs->prioidx);
-		goto out;
-	}
-
 	return &cs->css;
 out:
 	kfree(cs);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 9b570a6..c31d9e8 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/tcp.h>
 #include <linux/vmalloc.h>
 
 #include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	kfree(lopt);
 }
 
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and unreference listener
+ * socket. treq->listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+			   bool reset)
+{
+	struct sock *lsk = tcp_rsk(req)->listener;
+	struct fastopen_queue *fastopenq =
+	    inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+	BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
+
+	tcp_sk(sk)->fastopen_rsk = NULL;
+	spin_lock_bh(&fastopenq->lock);
+	fastopenq->qlen--;
+	tcp_rsk(req)->listener = NULL;
+	if (req->sk)	/* the child socket hasn't been accepted yet */
+		goto out;
+
+	if (!reset || lsk->sk_state != TCP_LISTEN) {
+		/* If the listener has been closed don't bother with the
+		 * special RST handling below.
+		 */
+		spin_unlock_bh(&fastopenq->lock);
+		sock_put(lsk);
+		reqsk_free(req);
+		return;
+	}
+	/* Wait for 60secs before removing a req that has triggered RST.
+	 * This is a simple defense against TFO spoofing attack - by
+	 * counting the req against fastopen.max_qlen, and disabling
+	 * TFO when the qlen exceeds max_qlen.
+	 *
+	 * For more details see CoNext'11 "TCP Fast Open" paper.
+	 */
+	req->expires = jiffies + 60*HZ;
+	if (fastopenq->rskq_rst_head == NULL)
+		fastopenq->rskq_rst_head = req;
+	else
+		fastopenq->rskq_rst_tail->dl_next = req;
+
+	req->dl_next = NULL;
+	fastopenq->rskq_rst_tail = req;
+	fastopenq->qlen++;
+out:
+	spin_unlock_bh(&fastopenq->lock);
+	sock_put(lsk);
+	return;
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2c5a0a0..76d4c2c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -618,7 +618,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		       long expires, u32 error)
 {
 	struct rta_cacheinfo ci = {
-		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
+		.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
 		.rta_used = dst->__use,
 		.rta_clntref = atomic_read(&(dst->__refcnt)),
 		.rta_error = error,
@@ -1081,7 +1081,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			if (idx < s_idx)
 				goto cont;
 			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
-					     NETLINK_CB(cb->skb).pid,
+					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq, 0,
 					     NLM_F_MULTI,
 					     ext_filter_mask) <= 0)
@@ -1812,8 +1812,6 @@ replay:
 			return -ENODEV;
 		}
 
-		if (ifm->ifi_index)
-			return -EOPNOTSUPP;
 		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
 			return -EOPNOTSUPP;
 
@@ -1839,10 +1837,14 @@ replay:
 			return PTR_ERR(dest_net);
 
 		dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
-
-		if (IS_ERR(dev))
+		if (IS_ERR(dev)) {
 			err = PTR_ERR(dev);
-		else if (ops->newlink)
+			goto out;
+		}
+
+		dev->ifindex = ifm->ifi_index;
+
+		if (ops->newlink)
 			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
@@ -1897,14 +1899,14 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (nskb == NULL)
 		return -ENOBUFS;
 
-	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
+	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(nskb);
 	} else
-		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
+		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
 
 	return err;
 }
@@ -2088,7 +2090,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
 	    (dev->priv_flags & IFF_BRIDGE_PORT)) {
 		master = dev->master;
-		err = master->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+		err = master->netdev_ops->ndo_fdb_add(ndm, tb,
+						      dev, addr,
 						      nlh->nlmsg_flags);
 		if (err)
 			goto out;
@@ -2098,7 +2101,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	/* Embedded bridge, macvlan, and any other device support */
 	if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) {
-		err = dev->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+		err = dev->netdev_ops->ndo_fdb_add(ndm, tb,
+						   dev, addr,
 						   nlh->nlmsg_flags);
 
 		if (!err) {
@@ -2178,9 +2182,9 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
 {
 	struct netdev_hw_addr *ha;
 	int err;
-	u32 pid, seq;
+	u32 portid, seq;
 
-	pid = NETLINK_CB(cb->skb).pid;
+	portid = NETLINK_CB(cb->skb).portid;
 	seq = cb->nlh->nlmsg_seq;
 
 	list_for_each_entry(ha, &list->list, list) {
@@ -2188,7 +2192,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
 			goto skip;
 
 		err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
-					      pid, seq, 0, NTF_SELF);
+					      portid, seq, 0, NTF_SELF);
 		if (err < 0)
 			return err;
 skip:
@@ -2356,7 +2360,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_PRE_TYPE_CHANGE:
 	case NETDEV_GOING_DOWN:
 	case NETDEV_UNREGISTER:
-	case NETDEV_UNREGISTER_BATCH:
+	case NETDEV_UNREGISTER_FINAL:
 	case NETDEV_RELEASE:
 	case NETDEV_JOIN:
 		break;
@@ -2379,9 +2383,10 @@ static int __net_init rtnetlink_net_init(struct net *net)
 		.groups		= RTNLGRP_MAX,
 		.input		= rtnetlink_rcv,
 		.cb_mutex	= &rtnl_mutex,
+		.flags		= NL_CFG_F_NONROOT_RECV,
 	};
 
-	sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg);
+	sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
 	if (!sk)
 		return -ENOMEM;
 	net->rtnl = sk;
@@ -2414,7 +2419,6 @@ void __init rtnetlink_init(void)
 	if (register_pernet_subsys(&rtnetlink_net_ops))
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
 
-	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
 	register_netdevice_notifier(&rtnetlink_dev_notifier);
 
 	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
diff --git a/net/core/scm.c b/net/core/scm.c
index 6ab491d..9c1c63d 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -155,19 +155,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			break;
 		case SCM_CREDENTIALS:
 		{
+			struct ucred creds;
 			kuid_t uid;
 			kgid_t gid;
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
 				goto error;
-			memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
-			err = scm_check_creds(&p->creds);
+			memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+			err = scm_check_creds(&creds);
 			if (err)
 				goto error;
 
-			if (!p->pid || pid_vnr(p->pid) != p->creds.pid) {
+			p->creds.pid = creds.pid;
+			if (!p->pid || pid_vnr(p->pid) != creds.pid) {
 				struct pid *pid;
 				err = -ESRCH;
-				pid = find_get_pid(p->creds.pid);
+				pid = find_get_pid(creds.pid);
 				if (!pid)
 					goto error;
 				put_pid(p->pid);
@@ -175,11 +177,14 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			}
 
 			err = -EINVAL;
-			uid = make_kuid(current_user_ns(), p->creds.uid);
-			gid = make_kgid(current_user_ns(), p->creds.gid);
+			uid = make_kuid(current_user_ns(), creds.uid);
+			gid = make_kgid(current_user_ns(), creds.gid);
 			if (!uid_valid(uid) || !gid_valid(gid))
 				goto error;
 
+			p->creds.uid = uid;
+			p->creds.gid = gid;
+
 			if (!p->cred ||
 			    !uid_eq(p->cred->euid, uid) ||
 			    !gid_eq(p->cred->egid, gid)) {
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 99b2596..e61a8bb 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -76,6 +76,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 
 	return hash[0];
 }
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e33ebae..cdc2859 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,43 +340,57 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 EXPORT_SYMBOL(build_skb);
 
 struct netdev_alloc_cache {
-	struct page *page;
-	unsigned int offset;
-	unsigned int pagecnt_bias;
+	struct page_frag	frag;
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_count every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
 };
 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
 
-#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
+#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
+#define NETDEV_FRAG_PAGE_MAX_SIZE  (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
+#define NETDEV_PAGECNT_MAX_BIAS	   NETDEV_FRAG_PAGE_MAX_SIZE
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
 	struct netdev_alloc_cache *nc;
 	void *data = NULL;
+	int order;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	nc = &__get_cpu_var(netdev_alloc_cache);
-	if (unlikely(!nc->page)) {
+	if (unlikely(!nc->frag.page)) {
 refill:
-		nc->page = alloc_page(gfp_mask);
-		if (unlikely(!nc->page))
-			goto end;
+		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
+			gfp_t gfp = gfp_mask;
+
+			if (order)
+				gfp |= __GFP_COMP | __GFP_NOWARN;
+			nc->frag.page = alloc_pages(gfp, order);
+			if (likely(nc->frag.page))
+				break;
+			if (--order < 0)
+				goto end;
+		}
+		nc->frag.size = PAGE_SIZE << order;
 recycle:
-		atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS);
-		nc->pagecnt_bias = NETDEV_PAGECNT_BIAS;
-		nc->offset = 0;
+		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+		nc->frag.offset = 0;
 	}
 
-	if (nc->offset + fragsz > PAGE_SIZE) {
+	if (nc->frag.offset + fragsz > nc->frag.size) {
 		/* avoid unnecessary locked operations if possible */
-		if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) ||
-		    atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count))
+		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
+		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
 			goto recycle;
 		goto refill;
 	}
 
-	data = page_address(nc->page) + nc->offset;
-	nc->offset += fragsz;
+	data = page_address(nc->frag.page) + nc->frag.offset;
+	nc->frag.offset += fragsz;
 	nc->pagecnt_bias--;
 end:
 	local_irq_restore(flags);
@@ -1655,38 +1669,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
 				   unsigned int *offset,
 				   struct sk_buff *skb, struct sock *sk)
 {
-	struct page *p = sk->sk_sndmsg_page;
-	unsigned int off;
-
-	if (!p) {
-new_page:
-		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
-		if (!p)
-			return NULL;
+	struct page_frag *pfrag = sk_page_frag(sk);
 
-		off = sk->sk_sndmsg_off = 0;
-		/* hold one ref to this page until it's full */
-	} else {
-		unsigned int mlen;
-
-		/* If we are the only user of the page, we can reset offset */
-		if (page_count(p) == 1)
-			sk->sk_sndmsg_off = 0;
-		off = sk->sk_sndmsg_off;
-		mlen = PAGE_SIZE - off;
-		if (mlen < 64 && mlen < *len) {
-			put_page(p);
-			goto new_page;
-		}
+	if (!sk_page_frag_refill(sk, pfrag))
+		return NULL;
 
-		*len = min_t(unsigned int, *len, mlen);
-	}
+	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
 
-	memcpy(page_address(p) + off, page_address(page) + *offset, *len);
-	sk->sk_sndmsg_off += *len;
-	*offset = off;
+	memcpy(page_address(pfrag->page) + pfrag->offset,
+	       page_address(page) + *offset, *len);
+	*offset = pfrag->offset;
+	pfrag->offset += *len;
 
-	return p;
+	return pfrag->page;
 }
 
 static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
@@ -3488,8 +3483,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
 			return false;
 
-		delta = from->truesize -
-			SKB_TRUESIZE(skb_end_pointer(from) - from->head);
+		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
 	}
 
 	WARN_ON_ONCE(delta < len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 12cddd0..8a146cf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1221,7 +1221,7 @@ void sock_update_classid(struct sock *sk)
 	rcu_read_lock();  /* doing current task, which cannot vanish. */
 	classid = task_cls_classid(current);
 	rcu_read_unlock();
-	if (classid && classid != sk->sk_classid)
+	if (classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
 EXPORT_SYMBOL(sock_update_classid);
@@ -1458,19 +1458,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
-void __init sk_init(void)
-{
-	if (totalram_pages <= 4096) {
-		sysctl_wmem_max = 32767;
-		sysctl_rmem_max = 32767;
-		sysctl_wmem_default = 32767;
-		sysctl_rmem_default = 32767;
-	} else if (totalram_pages >= 131072) {
-		sysctl_wmem_max = 131071;
-		sysctl_rmem_max = 131071;
-	}
-}
-
 /*
  *	Simple resource managers for sockets.
  */
@@ -1738,6 +1725,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER	get_order(32768)
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+	int order;
+
+	if (pfrag->page) {
+		if (atomic_read(&pfrag->page->_count) == 1) {
+			pfrag->offset = 0;
+			return true;
+		}
+		if (pfrag->offset < pfrag->size)
+			return true;
+		put_page(pfrag->page);
+	}
+
+	/* We restrict high order allocations to users that can afford to wait */
+	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
+
+	do {
+		gfp_t gfp = sk->sk_allocation;
+
+		if (order)
+			gfp |= __GFP_COMP | __GFP_NOWARN;
+		pfrag->page = alloc_pages(gfp, order);
+		if (likely(pfrag->page)) {
+			pfrag->offset = 0;
+			pfrag->size = PAGE_SIZE << order;
+			return true;
+		}
+	} while (--order >= 0);
+
+	sk_enter_memory_pressure(sk);
+	sk_stream_moderate_sndbuf(sk);
+	return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
 static void __lock_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
@@ -2167,8 +2193,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_error_report	=	sock_def_error_report;
 	sk->sk_destruct		=	sock_def_destruct;
 
-	sk->sk_sndmsg_page	=	NULL;
-	sk->sk_sndmsg_off	=	0;
+	sk->sk_frag.page	=	NULL;
+	sk->sk_frag.offset	=	0;
 	sk->sk_peek_off		=	-1;
 
 	sk->sk_peer_pid		=	NULL;
@@ -2411,6 +2437,12 @@ void sk_common_release(struct sock *sk)
 	xfrm_sk_free_policy(sk);
 
 	sk_refcnt_debug_release(sk);
+
+	if (sk->sk_frag.page) {
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+	}
+
 	sock_put(sk);
 }
 EXPORT_SYMBOL(sk_common_release);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 9d8755e..602cd63 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -172,8 +172,7 @@ static int __net_init diag_net_init(struct net *net)
 		.input	= sock_diag_rcv,
 	};
 
-	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG,
-					       THIS_MODULE, &cfg);
+	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
 	return net->diag_nlsk == NULL ? -ENOMEM : 0;
 }
diff --git a/net/core/utils.c b/net/core/utils.c
index 39895a6..f5613d5 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -294,6 +294,26 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(inet_proto_csum_replace4);
 
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+			       const __be32 *from, const __be32 *to,
+			       int pseudohdr)
+{
+	__be32 diff[] = {
+		~from[0], ~from[1], ~from[2], ~from[3],
+		to[0], to[1], to[2], to[3],
+	};
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		*sum = csum_fold(csum_partial(diff, sizeof(diff),
+				 ~csum_unfold(*sum)));
+		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+			skb->csum = ~csum_partial(diff, sizeof(diff),
+						  ~skb->csum);
+	} else if (pseudohdr)
+		*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+				  csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
 int mac_pton(const char *s, u8 *mac)
 {
 	int i;