Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/ip_output.c    | 29
-rw-r--r--  net/ipv4/route.c        |  3
-rw-r--r--  net/ipv4/tcp_bic.c      |  2
-rw-r--r--  net/ipv4/tcp_cong.c     | 32
-rw-r--r--  net/ipv4/tcp_cubic.c    | 39
-rw-r--r--  net/ipv4/tcp_ipv4.c     | 37
-rw-r--r--  net/ipv4/tcp_scalable.c |  3
-rw-r--r--  net/ipv4/tcp_veno.c     |  2
-rw-r--r--  net/ipv4/tcp_yeah.c     |  2
9 files changed, 80 insertions, 69 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b50861b..c373c07 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1506,23 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
/*
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
- *
- * Use a fake percpu inet socket to avoid false sharing and contention.
*/
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
- .sk = {
- .__sk_common = {
- .skc_refcnt = ATOMIC_INIT(1),
- },
- .sk_wmem_alloc = ATOMIC_INIT(1),
- .sk_allocation = GFP_ATOMIC,
- .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
- },
- .pmtudisc = IP_PMTUDISC_WANT,
- .uc_ttl = -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
@@ -1532,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct sk_buff *nskb;
- struct sock *sk;
- struct inet_sock *inet;
int err;
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1565,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
if (IS_ERR(rt))
return;
- inet = &get_cpu_var(unicast_sock);
+ inet_sk(sk)->tos = arg->tos;

- inet->tos = arg->tos;
- sk = &inet->sk;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
- sock_net_set(sk, net);
- __skb_queue_head_init(&sk->sk_write_queue);
sk->sk_sndbuf = sysctl_wmem_default;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1589,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_orphan(nskb);
skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
- put_cpu_var(unicast_sock);
-
ip_rt_put(rt);
}
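Callers must now supply a real socket. A minimal caller-side sketch, mirroring the tcp_ipv4.c hunks further down, where each netns owns one control socket per CPU:

	/* The reply path picks the local CPU's per-netns control socket in
	 * place of the old fake percpu unicast_sock; with a real socket
	 * owning the queued skbs, the skb_orphan() call and the manual
	 * write-queue setup above also go away.
	 */
	struct sock *ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);

	ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);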
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d58dd0e..52e1f2b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (dst->dev->mtu < mtu)
return;
+ if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+ return;
+
if (mtu < ip_rt_min_pmtu)
mtu = ip_rt_min_pmtu;
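The added check makes the cached path MTU monotonically non-increasing for a route: once a smaller PMTU has been learned, a later report advertising a larger one is ignored (rt_pmtu == 0 means nothing is cached yet). An illustrative call sequence, values hypothetical:

	__ip_rt_update_pmtu(rt, &fl4, 1400);	/* cached: rt_pmtu = 1400 */
	__ip_rt_update_pmtu(rt, &fl4, 1480);	/* larger: the new guard returns early */
	__ip_rt_update_pmtu(rt, &fl4, 1280);	/* smaller: still honoured, subject to ip_rt_min_pmtu */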
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index bb395d4..c037644 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt);
+ tcp_cong_avoid_ai(tp, ca->cnt, 1);
}
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0d..8670e68 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,26 +291,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp, u32 acked)
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
u32 cwnd = tp->snd_cwnd + acked;
if (cwnd > tp->snd_ssthresh)
cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
-/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
+ tp->snd_cwnd_cnt += acked;
if (tp->snd_cwnd_cnt >= w) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else {
- tp->snd_cwnd_cnt++;
+ u32 delta = tp->snd_cwnd_cnt / w;
+
+ tp->snd_cwnd_cnt -= delta * w;
+ tp->snd_cwnd += delta;
}
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
@@ -329,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
/* In dangerous area, increase slowly. */
- else
- tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
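Taken together, these hunks change the contract between slow start and additive increase: tcp_slow_start() now consumes only as many ACKed segments as fit below ssthresh and returns the remainder, while tcp_cong_avoid_ai() banks that remainder in snd_cwnd_cnt and converts every full window's worth of credits into cwnd growth. Modules not yet converted to stretch-ACK accounting (BIC, Scalable, Veno and Yeah below) simply pass acked = 1 to keep their old one-credit-per-call behaviour. A standalone userspace sketch of the arithmetic (mini_tp and the chosen values are hypothetical, not kernel code):

	#include <stdio.h>

	struct mini_tp {
		unsigned int snd_cwnd, snd_ssthresh, snd_cwnd_cnt, snd_cwnd_clamp;
	};

	/* Mirrors the patched tcp_slow_start(): grow cwnd by the ACKed
	 * count, cap it at ssthresh + 1, return the unconsumed segments.
	 */
	static unsigned int slow_start(struct mini_tp *tp, unsigned int acked)
	{
		unsigned int cwnd = tp->snd_cwnd + acked;

		if (cwnd > tp->snd_ssthresh)
			cwnd = tp->snd_ssthresh + 1;
		acked -= cwnd - tp->snd_cwnd;
		tp->snd_cwnd = cwnd < tp->snd_cwnd_clamp ? cwnd : tp->snd_cwnd_clamp;
		return acked;
	}

	/* Mirrors the patched tcp_cong_avoid_ai(): bank credits and turn
	 * each full window's worth into +1 on cwnd, clamped at the end.
	 */
	static void cong_avoid_ai(struct mini_tp *tp, unsigned int w, unsigned int acked)
	{
		tp->snd_cwnd_cnt += acked;
		if (tp->snd_cwnd_cnt >= w) {
			unsigned int delta = tp->snd_cwnd_cnt / w;

			tp->snd_cwnd_cnt -= delta * w;
			tp->snd_cwnd += delta;
		}
		if (tp->snd_cwnd > tp->snd_cwnd_clamp)
			tp->snd_cwnd = tp->snd_cwnd_clamp;
	}

	int main(void)
	{
		struct mini_tp tp = { .snd_cwnd = 14, .snd_ssthresh = 16,
				      .snd_cwnd_cnt = 0, .snd_cwnd_clamp = 65535 };
		/* A stretch ACK covers 6 segments: slow start lifts cwnd
		 * from 14 to 17 (ssthresh + 1) and returns 3 leftovers,
		 * which become additive-increase credits.
		 */
		unsigned int acked = slow_start(&tp, 6);

		cong_avoid_ai(&tp, tp.snd_cwnd, acked);
		printf("cwnd=%u cnt=%u\n", tp.snd_cwnd, tp.snd_cwnd_cnt);	/* cwnd=17 cnt=3 */
		return 0;
	}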
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6b60024..4b276d1 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -93,9 +93,7 @@ struct bictcp {
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
-#define ACK_RATIO_SHIFT 4
-#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
- u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+ u16 unused;
u8 sample_cnt; /* number of samples to decide curr_rtt */
u8 found; /* the exit point is found? */
u32 round_start; /* beginning of each round */
@@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->bic_K = 0;
ca->delay_min = 0;
ca->epoch_start = 0;
- ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
ca->ack_cnt = 0;
ca->tcp_cwnd = 0;
ca->found = 0;
@@ -205,23 +202,30 @@ static u32 cubic_root(u64 a)
/*
* Compute congestion window to use.
*/
-static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
u32 delta, bic_target, max_cnt;
u64 offs, t;
- ca->ack_cnt++; /* count the number of ACKs */
+ ca->ack_cnt += acked; /* count the number of ACKed packets */
if (ca->last_cwnd == cwnd &&
(s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
return;
+ /* The CUBIC function can update ca->cnt at most once per jiffy.
+ * On all cwnd reduction events, ca->epoch_start is set to 0,
+ * which will force a recalculation of ca->cnt.
+ */
+ if (ca->epoch_start && tcp_time_stamp == ca->last_time)
+ goto tcp_friendliness;
+
ca->last_cwnd = cwnd;
ca->last_time = tcp_time_stamp;
if (ca->epoch_start == 0) {
ca->epoch_start = tcp_time_stamp; /* record beginning */
- ca->ack_cnt = 1; /* start counting */
+ ca->ack_cnt = acked; /* start counting */
ca->tcp_cwnd = cwnd; /* syn with cubic */
if (ca->last_max_cwnd <= cwnd) {
@@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
if (ca->last_max_cwnd == 0 && ca->cnt > 20)
ca->cnt = 20; /* increase cwnd 5% per RTT */
+tcp_friendliness:
/* TCP Friendly */
if (tcp_friendliness) {
u32 scale = beta_scale;
@@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
}
- ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
if (ca->cnt == 0) /* cannot be zero */
ca->cnt = 1;
}
@@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp, acked);
- } else {
- bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt);
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ bictcp_update(ca, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay)
*/
static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
- if (icsk->icsk_ca_state == TCP_CA_Open) {
- u32 ratio = ca->delayed_ack;
-
- ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ratio += cnt;
-
- ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
- }
-
/* Some calls are for duplicates without timestamps */
if (rtt_us < 0)
return;
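Two simplifications land together here: the delayed-ACK ratio estimator (the ACK_RATIO_SHIFT scaling of ca->cnt) becomes redundant once tcp_cong_avoid_ai() is credited with the exact number of ACKed packets, and the cubic slope recalculation is rate-limited to once per jiffy while the TCP-friendliness bookkeeping still runs on every call. The resulting flow in bictcp_cong_avoid(), restated with comments (a sketch of the hunk above, not new logic):

	acked = tcp_slow_start(tp, acked);	/* leftover segments, if any */
	if (!acked)
		return;				/* slow start consumed them all */
	bictcp_update(ca, tp->snd_cwnd, acked);	/* slope recalc at most once per jiffy */
	tcp_cong_avoid_ai(tp, ca->cnt, acked);	/* per-packet credit, no ACK-ratio scaling */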
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7..d22f544 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.bound_dev_if = sk->sk_bound_dev_if;
arg.tos = ip_hdr(skb)->tos;
- ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
- ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
};
EXPORT_SYMBOL(tcp_prot);
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+ free_percpu(net->ipv4.tcp_sk);
+}
+
static int __net_init tcp_sk_init(struct net *net)
{
+ int res, cpu;
+
+ net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.tcp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, net);
+ if (res)
+ goto fail;
+ *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+ }
net->ipv4.sysctl_tcp_ecn = 2;
return 0;
-}
-static void __net_exit tcp_sk_exit(struct net *net)
-{
+fail:
+ tcp_sk_exit(net);
+
+ return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
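tcp_sk_init() follows the allocate-then-unwind idiom: if socket creation fails for any CPU, it jumps to fail and lets tcp_sk_exit() walk every possible CPU. A generic userspace sketch of the same idiom, with hypothetical names; note that the unwind helper must tolerate slots that were never populated, which free(NULL) satisfies here:

	#include <stdlib.h>

	#define NSLOTS 4

	static void slots_destroy(void **slots)
	{
		int i;

		for (i = 0; i < NSLOTS; i++)
			free(slots[i]);		/* free(NULL) is a no-op */
		free(slots);
	}

	static void **slots_create(void)
	{
		void **slots = calloc(NSLOTS, sizeof(*slots));	/* zeroed, like alloc_percpu() */
		int i;

		if (!slots)
			return NULL;
		for (i = 0; i < NSLOTS; i++) {
			slots[i] = malloc(64);	/* stand-in for inet_ctl_sock_create() */
			if (!slots[i]) {
				slots_destroy(slots);	/* unwind everything built so far */
				return NULL;
			}
		}
		return slots;
	}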
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6824afb..333bcb2 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp, acked);
else
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ 1);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index a4d2d2d..112151e 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
- tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index cd72732..17d3566 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.