summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig12
-rw-r--r--net/bridge/br_device.c13
-rw-r--r--net/bridge/br_multicast.c70
-rw-r--r--net/bridge/br_private.h2
-rw-r--r--net/bridge/br_sysfs_br.c26
-rw-r--r--net/core/dev.c49
-rw-r--r--net/core/net-procfs.c16
-rw-r--r--net/core/skbuff.c3
-rw-r--r--net/core/sysctl_net_core.c104
-rw-r--r--net/ipv4/tcp.c98
-rw-r--r--net/ipv4/tcp_input.c69
-rw-r--r--net/ipv4/tcp_ipv4.c10
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv6/addrconf.c67
-rw-r--r--net/sched/sch_tbf.c47
15 files changed, 375 insertions, 217 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc904..08de901 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,18 @@ config BPF_JIT
packet sniffing (libpcap/tcpdump). Note : Admin should enable
this feature changing /proc/sys/net/core/bpf_jit_enable
+config NET_FLOW_LIMIT
+ boolean
+ depends on RPS
+ default y
+ ---help---
+ The network stack has to drop packets when a receive processing CPU's
+ backlog reaches netdev_max_backlog. If a few out of many active flows
+ generate the vast majority of load, drop their traffic earlier to
+ maintain capacity for the other flows. This feature provides servers
+ with many clients some protection against DoS by a single (spoofed)
+ flow that greatly exceeds average workload.
+
menu "Network testing"
config NET_PKTGEN
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 9673128..75f3239 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -22,6 +22,9 @@
#include <asm/uaccess.h>
#include "br_private.h"
+#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM)
+
/* net device transmit always called with BH disabled */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -346,12 +349,10 @@ void br_dev_setup(struct net_device *dev)
dev->tx_queue_len = 0;
dev->priv_flags = IFF_EBRIDGE;
- dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
- NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX |
- NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX;
- dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
- NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
- NETIF_F_HW_VLAN_CTAG_TX;
+ dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
+ NETIF_F_HW_VLAN_CTAG_TX;
+ dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX;
+ dev->vlan_features = COMMON_FEATURES;
br->dev = dev;
spin_lock_init(&br->lock);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 81f2389..37a4676 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -23,6 +23,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/timer.h>
+#include <linux/inetdevice.h>
#include <net/ip.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -381,7 +382,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->protocol = IPPROTO_IGMP;
- iph->saddr = 0;
+ iph->saddr = br->multicast_query_use_ifaddr ?
+ inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
@@ -615,8 +617,6 @@ rehash:
mp->br = br;
mp->addr = *group;
- setup_timer(&mp->timer, br_multicast_group_expired,
- (unsigned long)mp);
hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]);
mdb->size++;
@@ -654,7 +654,6 @@ static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- unsigned long now = jiffies;
int err;
spin_lock(&br->multicast_lock);
@@ -669,7 +668,6 @@ static int br_multicast_add_group(struct net_bridge *br,
if (!port) {
mp->mglist = true;
- mod_timer(&mp->timer, now + br->multicast_membership_interval);
goto out;
}
@@ -677,7 +675,7 @@ static int br_multicast_add_group(struct net_bridge *br,
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
if (p->port == port)
- goto found;
+ goto out;
if ((unsigned long)p->port < (unsigned long)port)
break;
}
@@ -688,8 +686,6 @@ static int br_multicast_add_group(struct net_bridge *br,
rcu_assign_pointer(*pp, p);
br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
-found:
- mod_timer(&p->timer, now + br->multicast_membership_interval);
out:
err = 0;
@@ -1129,6 +1125,10 @@ static int br_ip4_multicast_query(struct net_bridge *br,
if (!mp)
goto out;
+ setup_timer(&mp->timer, br_multicast_group_expired, (unsigned long)mp);
+ mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ mp->timer_armed = true;
+
max_delay *= br->multicast_last_member_count;
if (mp->mglist &&
@@ -1203,6 +1203,10 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (!mp)
goto out;
+ setup_timer(&mp->timer, br_multicast_group_expired, (unsigned long)mp);
+ mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ mp->timer_armed = true;
+
max_delay *= br->multicast_last_member_count;
if (mp->mglist &&
(timer_pending(&mp->timer) ?
@@ -1246,6 +1250,32 @@ static void br_multicast_leave_group(struct net_bridge *br,
if (!mp)
goto out;
+ if (br->multicast_querier &&
+ !timer_pending(&br->multicast_querier_timer)) {
+ __br_multicast_send_query(br, port, &mp->addr);
+
+ time = jiffies + br->multicast_last_member_count *
+ br->multicast_last_member_interval;
+ mod_timer(port ? &port->multicast_query_timer :
+ &br->multicast_query_timer, time);
+
+ for (p = mlock_dereference(mp->ports, br);
+ p != NULL;
+ p = mlock_dereference(p->next, br)) {
+ if (p->port != port)
+ continue;
+
+ if (!hlist_unhashed(&p->mglist) &&
+ (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, time) :
+ try_to_del_timer_sync(&p->timer) >= 0)) {
+ mod_timer(&p->timer, time);
+ }
+
+ break;
+ }
+ }
+
if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
struct net_bridge_port_group __rcu **pp;
@@ -1261,7 +1291,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
call_rcu_bh(&p->rcu, br_multicast_free_pg);
br_mdb_notify(br->dev, port, group, RTM_DELMDB);
- if (!mp->ports && !mp->mglist &&
+ if (!mp->ports && !mp->mglist && mp->timer_armed &&
netif_running(br->dev))
mod_timer(&mp->timer, jiffies);
}
@@ -1273,30 +1303,12 @@ static void br_multicast_leave_group(struct net_bridge *br,
br->multicast_last_member_interval;
if (!port) {
- if (mp->mglist &&
+ if (mp->mglist && mp->timer_armed &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, time) :
try_to_del_timer_sync(&mp->timer) >= 0)) {
mod_timer(&mp->timer, time);
}
-
- goto out;
- }
-
- for (p = mlock_dereference(mp->ports, br);
- p != NULL;
- p = mlock_dereference(p->next, br)) {
- if (p->port != port)
- continue;
-
- if (!hlist_unhashed(&p->mglist) &&
- (timer_pending(&p->timer) ?
- time_after(p->timer.expires, time) :
- try_to_del_timer_sync(&p->timer) >= 0)) {
- mod_timer(&p->timer, time);
- }
-
- break;
}
out:
@@ -1618,6 +1630,7 @@ void br_multicast_init(struct net_bridge *br)
br->multicast_router = 1;
br->multicast_querier = 0;
+ br->multicast_query_use_ifaddr = 0;
br->multicast_last_member_count = 2;
br->multicast_startup_query_count = 2;
@@ -1671,6 +1684,7 @@ void br_multicast_stop(struct net_bridge *br)
hlist_for_each_entry_safe(mp, n, &mdb->mhash[i],
hlist[ver]) {
del_timer(&mp->timer);
+ mp->timer_armed = false;
call_rcu_bh(&mp->rcu, br_multicast_free_group);
}
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d2c043a..1b0ac95 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -112,6 +112,7 @@ struct net_bridge_mdb_entry
struct timer_list timer;
struct br_ip addr;
bool mglist;
+ bool timer_armed;
};
struct net_bridge_mdb_htable
@@ -249,6 +250,7 @@ struct net_bridge
u8 multicast_disabled:1;
u8 multicast_querier:1;
+ u8 multicast_query_use_ifaddr:1;
u32 hash_elasticity;
u32 hash_max;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 8baa9c0..394bb96 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -375,6 +375,31 @@ static ssize_t store_multicast_snooping(struct device *d,
static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR,
show_multicast_snooping, store_multicast_snooping);
+/* sysfs "show" callback for the bridge's multicast_query_use_ifaddr flag:
+ * prints the current flag value as a decimal number followed by a newline.
+ */
+static ssize_t show_multicast_query_use_ifaddr(struct device *d,
+						struct device_attribute *attr,
+						char *buf)
+{
+	return sprintf(buf, "%d\n", to_bridge(d)->multicast_query_use_ifaddr);
+}
+
+/* Setter handed to store_bridge_parm(): collapses any non-zero @val to 1 and
+ * stores it in the bridge's multicast_query_use_ifaddr flag.  Always returns 0.
+ */
+static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_query_use_ifaddr = val ? 1 : 0;
+	return 0;
+}
+
+/* sysfs "store" callback for multicast_query_use_ifaddr: delegates parsing of
+ * @buf to store_bridge_parm(), which applies the value via
+ * set_query_use_ifaddr(), and returns whatever that helper returns.
+ */
+static ssize_t
+store_multicast_query_use_ifaddr(struct device *d,
+				 struct device_attribute *attr,
+				 const char *buf, size_t len)
+{
+	ssize_t result = store_bridge_parm(d, buf, len, set_query_use_ifaddr);
+
+	return result;
+}
+static DEVICE_ATTR(multicast_query_use_ifaddr, S_IRUGO | S_IWUSR,
+		   show_multicast_query_use_ifaddr, store_multicast_query_use_ifaddr);
+
static ssize_t show_multicast_querier(struct device *d,
struct device_attribute *attr,
char *buf)
@@ -734,6 +759,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_router.attr,
&dev_attr_multicast_snooping.attr,
&dev_attr_multicast_querier.attr,
+ &dev_attr_multicast_query_use_ifaddr.attr,
&dev_attr_hash_elasticity.attr,
&dev_attr_hash_max.attr,
&dev_attr_multicast_last_member_count.attr,
diff --git a/net/core/dev.c b/net/core/dev.c
index fc1e289..7229bc3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1629,7 +1629,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
return NET_RX_DROP;
}
skb->skb_iif = 0;
- skb->dev = dev;
skb_dst_drop(skb);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
@@ -3065,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 0;
}
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+/* skb_flow_limit - decide whether @skb should be dropped by the flow limiter
+ * @skb:  packet about to be queued on this CPU's backlog
+ * @qlen: current length of that backlog queue
+ *
+ * Does nothing (returns false) while the queue is below half of
+ * netdev_max_backlog, or when this CPU has no flow-limit table installed.
+ * Otherwise the packet's rxhash bucket is pushed into a ring of the last
+ * FLOW_LIMIT_HISTORY packets, the bucket that falls out of the ring is
+ * decremented, and the packet's own bucket is incremented.  Returns true,
+ * and bumps the table's drop counter, when that bucket now accounts for
+ * more than half of the history.
+ *
+ * Always returns false when CONFIG_NET_FLOW_LIMIT is not set.
+ */
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct softnet_data *this_sd;
+	struct sd_flow_limit *limit;
+	unsigned int evicted, incoming;
+	bool over_limit = false;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	this_sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	limit = rcu_dereference(this_sd->flow_limit);
+	if (limit) {
+		incoming = skb_get_rxhash(skb) & (limit->num_buckets - 1);
+
+		/* Replace the oldest history slot with the new bucket index. */
+		evicted = limit->history[limit->history_head];
+		limit->history[limit->history_head] = incoming;
+
+		limit->history_head++;
+		limit->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(limit->buckets[evicted]))
+			limit->buckets[evicted]--;
+
+		if (++limit->buckets[incoming] > (FLOW_LIMIT_HISTORY >> 1)) {
+			limit->count++;
+			over_limit = true;
+		}
+	}
+	rcu_read_unlock();
+
+	return over_limit;
+#else
+	return false;
+#endif
+}
+
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
@@ -3074,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
{
struct softnet_data *sd;
unsigned long flags;
+ unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
- if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6270,6 +6311,10 @@ static int __init net_dev_init(void)
sd->backlog.weight = weight_p;
sd->backlog.gro_list = NULL;
sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ sd->flow_limit = NULL;
+#endif
}
dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct softnet_data *sd = v;
+ unsigned int flow_limit_count = 0;
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl)
+ flow_limit_count = fl->count;
+ rcu_read_unlock();
+#endif
+
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- sd->cpu_collision, sd->received_rps);
+ sd->cpu_collision, sd->received_rps, flow_limit_count);
return 0;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index af9185d..d629891 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2853,7 +2853,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
doffset + tnl_hlen);
if (fskb != skb_shinfo(skb)->frag_list)
- continue;
+ goto perform_csum_check;
if (!sg) {
nskb->ip_summed = CHECKSUM_NONE;
@@ -2917,6 +2917,7 @@ skip_fraglist:
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
+perform_csum_check:
if (!csum) {
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..741db5fc 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
}
#endif /* CONFIG_RPS */
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+/* flow_limit_cpu_sysctl - proc handler for the "flow_limit_cpu_bitmap" sysctl
+ * @table:  sysctl table entry (unused here)
+ * @write:  non-zero when userspace is writing a new CPU mask
+ * @buffer: userspace buffer holding the mask (write) or receiving it (read)
+ * @lenp:   in: size of @buffer; out: number of bytes produced on read
+ * @ppos:   file position; reads at a non-zero position return no data
+ *
+ * Write: parses a CPU mask and, under flow_limit_update_mutex, brings every
+ * possible CPU in line with it.  A CPU that has a table but is absent from
+ * the mask has its pointer cleared and the table freed after an RCU grace
+ * period; a CPU present in the mask without a table gets a zeroed table
+ * with netdev_flow_limit_table_len buckets.  An allocation failure returns
+ * -ENOMEM without undoing the changes already made.
+ *
+ * Read: reports the mask of CPUs that currently have a table.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				/* Unpublish, wait for readers, then free. */
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		/* NOTE(review): fixed-size staging buffer; a mask that does
+		 * not fit is truncated by cpumask_scnprintf().
+		 */
+		char kbuf[128];
+
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		/* @buffer is a userspace pointer and must not be written
+		 * directly: format into kernel memory, then copy out.
+		 */
+		len = min_t(size_t, sizeof(kbuf) - 1, *lenp);
+		len = cpumask_scnprintf(kbuf, len, mask);
+		if (!len) {
+			*lenp = 0;
+			goto done;
+		}
+		/* Terminate the line when the caller's buffer has room. */
+		if (len < *lenp)
+			kbuf[len++] = '\n';
+		if (copy_to_user(buffer, kbuf, len)) {
+			ret = -EFAULT;
+			goto done;
+		}
+		/* Report exactly the number of bytes copied out. */
+		*lenp = len;
+		*ppos += len;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+/* flow_limit_table_len_sysctl - proc handler for "flow_limit_table_len"
+ *
+ * Runs proc_dointvec() on the value behind @table->data while holding
+ * flow_limit_update_mutex.  After a successful write, a value that is not
+ * a power of two is rejected: the previous value is restored and -EINVAL
+ * is returned.  Otherwise returns the result of proc_dointvec().
+ */
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int *len_ptr = table->data;
+	unsigned int saved;
+	int err;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	saved = *len_ptr;
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (err == 0 && write && !is_power_of_2(*len_ptr)) {
+		/* Roll back the rejected value. */
+		*len_ptr = saved;
+		err = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return err;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
.proc_handler = rps_sock_flow_sysctl
},
#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ab450c0..d87ce72 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3115,9 +3115,8 @@ int tcp_gro_complete(struct sk_buff *skb)
EXPORT_SYMBOL(tcp_gro_complete);
#ifdef CONFIG_TCP_MD5SIG
-static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
-static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_MUTEX(tcp_md5sig_mutex);
static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
@@ -3132,30 +3131,14 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
free_percpu(pool);
}
-void tcp_free_md5sig_pool(void)
-{
- struct tcp_md5sig_pool __percpu *pool = NULL;
-
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (--tcp_md5sig_users == 0) {
- pool = tcp_md5sig_pool;
- tcp_md5sig_pool = NULL;
- }
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- if (pool)
- __tcp_free_md5sig_pool(pool);
-}
-EXPORT_SYMBOL(tcp_free_md5sig_pool);
-
-static struct tcp_md5sig_pool __percpu *
-__tcp_alloc_md5sig_pool(struct sock *sk)
+static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
struct tcp_md5sig_pool __percpu *pool;
pool = alloc_percpu(struct tcp_md5sig_pool);
if (!pool)
- return NULL;
+ return;
for_each_possible_cpu(cpu) {
struct crypto_hash *hash;
@@ -3166,53 +3149,27 @@ __tcp_alloc_md5sig_pool(struct sock *sk)
per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
}
- return pool;
+ /* before setting tcp_md5sig_pool, we must commit all writes
+ * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ */
+ smp_wmb();
+ tcp_md5sig_pool = pool;
+ return;
out_free:
__tcp_free_md5sig_pool(pool);
- return NULL;
}
-struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
+bool tcp_alloc_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *pool;
- bool alloc = false;
-
-retry:
- spin_lock_bh(&tcp_md5sig_pool_lock);
- pool = tcp_md5sig_pool;
- if (tcp_md5sig_users++ == 0) {
- alloc = true;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- } else if (!pool) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- cpu_relax();
- goto retry;
- } else
- spin_unlock_bh(&tcp_md5sig_pool_lock);
-
- if (alloc) {
- /* we cannot hold spinlock here because this may sleep. */
- struct tcp_md5sig_pool __percpu *p;
-
- p = __tcp_alloc_md5sig_pool(sk);
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (!p) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- return NULL;
- }
- pool = tcp_md5sig_pool;
- if (pool) {
- /* oops, it has already been assigned. */
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- __tcp_free_md5sig_pool(p);
- } else {
- tcp_md5sig_pool = pool = p;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- }
+ if (unlikely(!tcp_md5sig_pool)) {
+ mutex_lock(&tcp_md5sig_mutex);
+
+ if (!tcp_md5sig_pool)
+ __tcp_alloc_md5sig_pool();
+
+ mutex_unlock(&tcp_md5sig_mutex);
}
- return pool;
+ return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -3229,28 +3186,15 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
struct tcp_md5sig_pool __percpu *p;
local_bh_disable();
-
- spin_lock(&tcp_md5sig_pool_lock);
- p = tcp_md5sig_pool;
- if (p)
- tcp_md5sig_users++;
- spin_unlock(&tcp_md5sig_pool_lock);
-
+ p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- return this_cpu_ptr(p);
+ return __this_cpu_ptr(p);
local_bh_enable();
return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);
-void tcp_put_md5sig_pool(void)
-{
- local_bh_enable();
- tcp_free_md5sig_pool();
-}
-EXPORT_SYMBOL(tcp_put_md5sig_pool);
-
int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
const struct tcphdr *th)
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c62257..8230cd6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -360,9 +360,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
if (mss > 1460)
icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
- rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
- while (tcp_win_from_space(rcvmem) < mss)
- rcvmem += 128;
+ rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER);
rcvmem *= icwnd;
@@ -1257,8 +1255,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->scoreboard_skb_hint)
- tp->scoreboard_skb_hint = prev;
if (skb == tp->lost_skb_hint) {
tp->lost_skb_hint = prev;
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
@@ -1966,20 +1962,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
return true;
}
-static inline int tcp_skb_timedout(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
-}
-
-static inline int tcp_head_timedout(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- return tp->packets_out &&
- tcp_skb_timedout(sk, tcp_write_queue_head(sk));
-}
-
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
@@ -2086,12 +2068,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
- /* Trick#3 : when we use RFC2988 timer restart, fast
- * retransmit can be triggered by timeout of queue head.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk))
- return true;
-
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
@@ -2128,44 +2104,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
return false;
}
-/* New heuristics: it is possible only after we switched to restart timer
- * each time when something is ACKed. Hence, we can detect timed out packets
- * during fast retransmit without falling to slow start.
- *
- * Usefulness of this as is very questionable, since we should know which of
- * the segments is the next to timeout which is relatively expensive to find
- * in general case unless we add some data structure just for that. The
- * current approach certainly won't find the right one too often and when it
- * finally does find _something_ it usually marks large part of the window
- * right away (because a retransmission with a larger timestamp blocks the
- * loop from advancing). -ij
- */
-static void tcp_timeout_skbs(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
- return;
-
- skb = tp->scoreboard_skb_hint;
- if (tp->scoreboard_skb_hint == NULL)
- skb = tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
-
- tp->scoreboard_skb_hint = skb;
-
- tcp_verify_left_out(tp);
-}
-
/* Detect loss in event "A" above by marking head of queue up as lost.
* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
@@ -2251,8 +2189,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
else if (fast_rexmit)
tcp_mark_head_lost(sk, 1, 1);
}
-
- tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2846,7 +2782,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
fast_rexmit = 1;
}
- if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
+ if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
tcp_xmit_retransmit_queue(sk);
@@ -3079,7 +3015,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7196523..d20ede0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1026,7 +1026,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key = sock_kmalloc(sk, sizeof(*key), gfp);
if (!key)
return -ENOMEM;
- if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
+ if (!tcp_alloc_md5sig_pool()) {
sock_kfree_s(sk, key, sizeof(*key));
return -ENOMEM;
}
@@ -1044,9 +1044,7 @@ EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct tcp_md5sig_info *md5sig;
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
if (!key)
@@ -1054,10 +1052,6 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
- md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
- if (hlist_empty(&md5sig->head))
- tcp_free_md5sig_pool();
return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
@@ -1071,8 +1065,6 @@ static void tcp_clear_md5_list(struct sock *sk)
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
- if (!hlist_empty(&md5sig->head))
- tcp_free_md5sig_pool();
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0f01788..ab1c086 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -317,7 +317,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
key = tp->af_specific->md5_lookup(sk, sk);
if (key != NULL) {
tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
- if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL)
+ if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
BUG();
}
} while (0);
@@ -358,10 +358,8 @@ void tcp_twsk_destructor(struct sock *sk)
#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key) {
- tcp_free_md5sig_pool();
+ if (twsk->tw_md5_key)
kfree_rcu(twsk->tw_md5_key, rcu);
- }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d1ab6ab..432e084 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1126,8 +1126,7 @@ retry:
ift = !max_addresses ||
ipv6_count_addresses(idev) < max_addresses ?
- ipv6_add_addr(idev, &addr, tmp_plen,
- ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK,
+ ipv6_add_addr(idev, &addr, tmp_plen, ipv6_addr_scope(&addr),
addr_flags) : NULL;
if (IS_ERR_OR_NULL(ift)) {
in6_ifa_put(ifp);
@@ -2402,6 +2401,7 @@ err_exit:
* Manual configuration of address on an interface
*/
static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx,
+ const struct in6_addr *peer_pfx,
unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
__u32 valid_lft)
{
@@ -2457,6 +2457,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *p
ifp->valid_lft = valid_lft;
ifp->prefered_lft = prefered_lft;
ifp->tstamp = jiffies;
+ if (peer_pfx)
+ ifp->peer_addr = *peer_pfx;
spin_unlock_bh(&ifp->lock);
addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
@@ -2526,7 +2528,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
return -EFAULT;
rtnl_lock();
- err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr,
+ err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
ireq.ifr6_prefixlen, IFA_F_PERMANENT,
INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
rtnl_unlock();
@@ -3610,18 +3612,20 @@ restart:
rcu_read_unlock_bh();
}
-static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local)
+static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
+ struct in6_addr **peer_pfx)
{
struct in6_addr *pfx = NULL;
+ *peer_pfx = NULL;
+
if (addr)
pfx = nla_data(addr);
if (local) {
if (pfx && nla_memcmp(local, pfx, sizeof(*pfx)))
- pfx = NULL;
- else
- pfx = nla_data(local);
+ *peer_pfx = pfx;
+ pfx = nla_data(local);
}
return pfx;
@@ -3639,7 +3643,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx;
+ struct in6_addr *pfx, *peer_pfx;
int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
@@ -3647,7 +3651,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
+ pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
if (pfx == NULL)
return -EINVAL;
@@ -3705,7 +3709,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx;
+ struct in6_addr *pfx, *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
@@ -3717,7 +3721,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
+ pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
if (pfx == NULL)
return -EINVAL;
@@ -3745,7 +3749,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
* It would be best to check for !NLM_F_CREATE here but
* userspace alreay relies on not having to provide this.
*/
- return inet6_addr_add(net, ifm->ifa_index, pfx,
+ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
ifm->ifa_prefixlen, ifa_flags,
preferred_lft, valid_lft);
}
@@ -3802,6 +3806,7 @@ static inline int rt_scope(int ifa_scope)
static inline int inet6_ifaddr_msgsize(void)
{
return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(16) /* IFA_LOCAL */
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo));
}
@@ -3840,13 +3845,22 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
valid = INFINITY_LIFE_TIME;
}
- if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 ||
- put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
+ if (!ipv6_addr_any(&ifa->peer_addr)) {
+ if (nla_put(skb, IFA_LOCAL, 16, &ifa->addr) < 0 ||
+ nla_put(skb, IFA_ADDRESS, 16, &ifa->peer_addr) < 0)
+ goto error;
+ } else
+ if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0)
+ goto error;
+
+ if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
+ goto error;
return nlmsg_end(skb, nlh);
+
+error:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
@@ -4046,7 +4060,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh)
struct net *net = sock_net(in_skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *addr = NULL;
+ struct in6_addr *addr = NULL, *peer;
struct net_device *dev = NULL;
struct inet6_ifaddr *ifa;
struct sk_buff *skb;
@@ -4056,7 +4070,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
- addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
+ addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
if (addr == NULL) {
err = -EINVAL;
goto errout;
@@ -4564,11 +4578,26 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
ip6_ins_rt(ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ addrconf_prefix_route(&ifp->peer_addr, 128,
+ ifp->idev->dev, 0, 0);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
addrconf_leave_anycast(ifp);
addrconf_leave_solict(ifp->idev, &ifp->addr);
+ if (!ipv6_addr_any(&ifp->peer_addr)) {
+ struct rt6_info *rt;
+ struct net_device *dev = ifp->idev->dev;
+
+ rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL,
+ dev->ifindex, 1);
+ if (rt) {
+ dst_hold(&rt->dst);
+ if (ip6_del_rt(rt))
+ dst_free(&rt->dst);
+ }
+ }
dst_hold(&ifp->rt->dst);
if (ip6_del_rt(ifp->rt))
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c8388f3..38008b0 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -116,14 +116,57 @@ struct tbf_sched_data {
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
+
+/* tbf_segment - segment an oversized GSO packet and enqueue the pieces
+ * @skb: GSO packet whose qdisc length exceeds q->max_size
+ * @sch: the TBF qdisc
+ *
+ * GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time.
+ *
+ * The packet is segmented with all GSO feature bits masked off.  Each
+ * segment that fits within q->max_size is enqueued on the inner qdisc; a
+ * segment that is still too large is itself passed to
+ * qdisc_reshape_fail().  The original @skb is released once, after the
+ * loop.
+ *
+ * Returns NET_XMIT_SUCCESS when at least one segment was accepted and
+ * NET_XMIT_DROP otherwise.  If segmentation fails, returns the result of
+ * qdisc_reshape_fail() on @skb.
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *segs, *nskb;
+	netdev_features_t features = netif_skb_features(skb);
+	int ret, nb;
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+	if (IS_ERR_OR_NULL(segs))
+		return qdisc_reshape_fail(skb, sch);
+
+	nb = 0;
+	while (segs) {
+		/* Detach the head segment before handing it off. */
+		nskb = segs->next;
+		segs->next = NULL;
+		if (likely(segs->len <= q->max_size)) {
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			ret = qdisc_enqueue(segs, q->qdisc);
+		} else {
+			/* Reject the segment, not the original GSO skb:
+			 * passing skb here would lose this detached
+			 * segment and dispose of skb a second time in
+			 * addition to the consume_skb() below.
+			 */
+			ret = qdisc_reshape_fail(segs, sch);
+		}
+		if (ret != NET_XMIT_SUCCESS) {
+			if (net_xmit_drop_count(ret))
+				sch->qstats.drops++;
+		} else {
+			nb++;
+		}
+		segs = nskb;
+	}
+	sch->q.qlen += nb;
+	/* NOTE(review): presumably corrects ancestor qlen for one packet
+	 * having become nb packets (negative decrease) - confirm.
+	 */
+	if (nb > 1)
+		qdisc_tree_decrease_qlen(sch, 1 - nb);
+	consume_skb(skb);
+	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
- if (qdisc_pkt_len(skb) > q->max_size)
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb))
+ return tbf_segment(skb, sch);
return qdisc_reshape_fail(skb, sch);
-
+ }
ret = qdisc_enqueue(skb, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
OpenPOWER on IntegriCloud