From 67b67e365f07d6dc70f3bb266af3268bac0a4836 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:36 +0000 Subject: ccid: ccid-2/3 code cosmetics This patch collects cosmetics-only changes to separate these from code changes: * update with regard to CodingStyle and whitespace changes, * documentation: - adding/revising comments, - remove CCID-3 RX socket documentation which is either duplicate or refers to fields that no longer exist, * expand embedded tfrc_tx_info struct inline for consistency, removing indirections via #define. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 4 ++-- net/dccp/ccids/ccid3.c | 30 ++++++++++++++++++------------ net/dccp/ccids/ccid3.h | 19 +++++++------------ 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9b3ae99..f564211 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -193,8 +193,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; /* clear state about stuff we sent */ hc->tx_seqt = hc->tx_seqh; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 95f7529..a666f3f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -218,9 +218,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. */ - if (hc->tx_t_rto == 0 || /* no feedback received yet */ - hc->tx_p == 0) { + if (hc->tx_t_rto == 0 || hc->tx_p == 0) { /* halve send rate directly */ hc->tx_x = max(hc->tx_x / 2, @@ -256,7 +256,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ + if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); @@ -372,7 +372,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -386,7 +386,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - opt_recv = &hc->tx_options_received; now = ktime_get_real(); /* Estimate RTT from history if ACK number is valid */ @@ -489,11 +488,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, int rc = 0; const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - opt_recv = &hc->tx_options_received; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { opt_recv->ccid3or_seqno = dp->dccps_gsr; opt_recv->ccid3or_loss_event_rate = ~0; @@ -582,6 +579,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_tx_sock *hc; + struct tfrc_tx_info tfrc; const void *val; /* Listen socks doesn't have a private CCID block */ @@ -591,10 +589,17 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hc->tx_tfrc)) + if (len < sizeof(tfrc)) return -EINVAL; - len = sizeof(hc->tx_tfrc); - val = &hc->tx_tfrc; + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; + tfrc.tfrctx_rtt = hc->tx_rtt; + tfrc.tfrctx_p = hc->tx_p; + tfrc.tfrctx_rto = hc->tx_t_rto; + tfrc.tfrctx_ipi = hc->tx_t_ipi; + len = sizeof(tfrc); + val = &tfrc; break; default: return -ENOPROTOOPT; @@ -749,10 +754,11 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); - if ((x_recv = hc->rx_x_recv) == 0) { + if (hc->rx_x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } + x_recv = hc->rx_x_recv; } fval = scaled_div(hc->rx_s, hc->rx_rtt); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0326357..b186424 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -95,14 +95,13 @@ enum ccid3_hc_tx_states { * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - struct tfrc_tx_info tx_tfrc; -#define tx_x tx_tfrc.tfrctx_x -#define tx_x_recv tx_tfrc.tfrctx_x_recv -#define tx_x_calc tx_tfrc.tfrctx_x_calc -#define tx_rtt tx_tfrc.tfrctx_rtt -#define tx_p tx_tfrc.tfrctx_p -#define tx_t_rto tx_tfrc.tfrctx_rto -#define tx_t_ipi tx_tfrc.tfrctx_ipi + u64 tx_x; + u64 tx_x_recv; + u32 tx_x_calc; + u32 tx_rtt; + u32 tx_p; + u32 tx_t_rto; + u32 tx_t_ipi; u16 tx_s; enum ccid3_hc_tx_states tx_state:8; u8 tx_last_win_count; @@ -131,16 +130,12 @@ enum ccid3_hc_rx_states { /** * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3) - * @rx_rtt: Receiver estimate of rtt (non-standard) - * @rx_p: Current loss event rate (RFC 3448 5.4) * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) * @rx_state: Receiver state, one of %ccid3_hc_rx_states * @rx_bytes_recv: Total sum of DCCP payload bytes * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) * @rx_rtt: Receiver estimate of RTT * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_tstamp_last_ack: Time at which last feedback was sent * @rx_hist: Packet history (loss detection + RTT sampling) * @rx_li_hist: Loss Interval database * @rx_s: Received packet size in bytes -- cgit v1.1 From 51c22bb510fefbb1a87c02dbd835383e6e7e3d36 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:37 +0000 Subject: dccp ccid-3: No more CCID control blocks in LISTEN state The CCIDs are activated as last of the features, at the end of the handshake, were the LISTEN state of the master socket is inherited into the server state of the child socket. Thus, the only states visible to CCIDs now are OPEN/PARTOPEN, and the closing states. This allows to remove tests which were previously necessary to protect against referencing a socket in the listening state (in CCID-3), but which now have become redundant. As a further byproduct of enabling the CCIDs only after the connection has been fully established, several typecast-initialisations of ccid3_hc_{rx,tx}_sock can now be eliminated: * the CCID is loaded, so it is not necessary to test if it is NULL, * if it is possible to load a CCID and leave the private area NULL, then this is a bug, which should crash loudly - and earlier, * the test for state==OPEN || state==PARTOPEN now reduces only to the closing phase (e.g. when the node has received an unexpected Reset). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- net/dccp/ccids/ccid3.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index a666f3f..4340672 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -564,29 +564,17 @@ static void ccid3_hc_tx_exit(struct sock *sk) static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - struct ccid3_hc_tx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hc->tx_t_rto; - info->tcpi_rtt = hc->tx_rtt; + info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hc; + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct tfrc_tx_info tfrc; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) @@ -706,14 +694,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - hc = ccid3_hc_rx_sk(sk); - if (dccp_packet_without_ack(skb)) return 0; @@ -876,30 +862,18 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - const struct ccid3_hc_rx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hc->rx_state; + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hc->rx_rtt; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) -- cgit v1.1 From 30564e355511b434613aa42375317b5a07fc9f23 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:38 +0000 Subject: dccp ccid-2: Remove redundant sanity tests This removes the ccid2_hc_tx_check_sanity function: it is redundant. Details: The tx_check_sanity function performs three tests: 1) it checks that the circular TX list is sorted - in ascending order of sequence number (ccid2s_seq) - and time (ccid2s_sent), - in the direction from `tail' (hctx_seqt) to `head' (hctx_seqh); 2) it ensures that the entire list has the length seqbufc * CCID2_SEQBUF_LEN; 3) it ensures that pipe equals the number of packets that were not marked `acked' (ccid2s_acked) between `tail' and `head'. The following argues that each of these tests is redundant, this can be verified by going through the code. (1) is not necessary, since both time and GSS increase from one packet to the next, so that subsequent insertions in tx_packet_sent (which advance the `head' pointer) will be in ascending order of time and sequence number. In (2), the length of the list is always equal to seqbufc times CCID2_SEQBUF_LEN (set to 1024) unless allocation caused an earlier failure, because: * at initialisation (tx_init), there is one chunk of size 1024 and seqbufc=1; * subsequent calls to tx_alloc_seq take place whenever head->next == tail in tx_packet_sent; then a new chunk of size 1024 is inserted between head and tail, and seqbufc is incremented by one. To show that (3) is redundant requires looking at two cases. The `pipe' variable of the TX socket is incremented only in tx_packet_sent, and decremented in tx_packet_recv. When head == tail (TX history empty) then pipe should be 0, which is the case directly after initialisation and after a retransmission timeout has occurred (ccid2_hc_tx_rto_expire). The first case involves parsing Ack Vectors for packets recorded in the live portion of the buffer, between tail and head. For each packet marked by the receiver as received (state 0) or ECN-marked (state 1), pipe is decremented by one, so for all such packets the BUG_ON in tx_check_sanity will not trigger. The second case is the loss detection in the second half of tx_packet_recv, below the comment "Check for NUMDUPACK". The first while-loop here ensures that the sequence number of `seqp' is either above or equal to `high_ack', or otherwise equal to the highest sequence number sent so far (of the entry head->prev, as head points to the next unsent entry). The next while-loop ("while (1)") counts the number of acked packets starting from that position of seqp, going backwards in the direction from head->prev to tail. If NUMDUPACK=3 such packets were counted within this loop, `seqp' points to the last acknowledged packet of these, and the "if (done == NUMDUPACK)" block is entered next. The while-loop contained within that block in turn traverses the list backwards, from head to tail; the position of `seqp' is saved in the variable `last_acked'. For each packet not marked as `acked', a congestion event is triggered within the loop, and pipe is decremented. The loop terminates when `seqp' has reached `tail', whereupon tail is set to the position previously stored in `last_acked'. Thus, between `last_acked' and the previous position of `tail', - pipe has been decremented earlier if the packet was marked as state 0 or 1; - pipe was decremented if the packet was not marked as acked. That is, pipe has been decremented by the number of packets between `last_acked' and the previous position of `tail'. As a consequence, pipe now again reflects the number of packets which have not (yet) been acked between the new position of tail (at `last_acked') and head->prev, or 0 if head==tail. The result is that the BUG_ON condition in check_sanity will also not be triggered, hence the test (3) is also redundant. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 52 -------------------------------------------------- 1 file changed, 52 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f564211..28499e0 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -33,51 +33,8 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hc->tx_seqh; - - /* there is data in the chain */ - if (seqp != hc->tx_seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hc->tx_seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hc->tx_pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hc->tx_seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) -#define ccid2_hc_tx_check_sanity(hc) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) @@ -178,8 +135,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hc); - /* back-off timer */ hc->tx_rto <<= 1; @@ -204,7 +159,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hc); out: bh_unlock_sock(sk); sock_put(sk); @@ -312,7 +266,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hc); #endif } @@ -510,7 +463,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hc); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -694,8 +646,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } - - ccid2_hc_tx_check_sanity(hc); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -730,8 +680,6 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_last_cong = jiffies; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - - ccid2_hc_tx_check_sanity(hc); return 0; } -- cgit v1.1 From c38c92a84a9291a3d0eaf6a13650a11961ae964f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:39 +0000 Subject: dccp ccid-2: Simplify dec_pipe and rearming of RTO timer This removes the dec_pipe function and improves the way the RTO timer is rearmed when a new acknowledgment comes in. Details and justification for removal: -------------------------------------- 1) The BUG_ON in dec_pipe is never triggered: pipe is only decremented for TX history entries between tail and head, for which it had previously been incremented in tx_packet_sent; and it is not decremented twice for the same entry, since it is - either decremented when a corresponding Ack Vector cell in state 0 or 1 was received (and then ccid2s_acked==1), - or it is decremented when ccid2s_acked==0, as part of the loss detection in tx_packet_recv (and hence it can not have been decremented earlier). 2) Restarting the RTO timer happens for every single entry in each Ack Vector parsed by tx_packet_recv (according to RFC 4340, 11.4 this can happen up to 16192 times per Ack Vector). 3) The RTO timer should not be restarted when all outstanding data has been acknowledged. This is currently done similar to (2), in dec_pipe, when pipe has reached 0. The patch onsolidates the code which rearms the RTO timer, combining the segments from new_ack and dec_pipe. As a result, the code becomes clearer (compare with tcp_rearm_rto()). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 28499e0..f7f5069 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -413,23 +413,6 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_srtt, hc->tx_rttvar, hc->tx_rto, HZ, r); } - - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe == 0) - DCCP_BUG("pipe == 0"); - else - hc->tx_pipe--; - - if (hc->tx_pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -572,7 +555,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) { done = 1; @@ -629,7 +612,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) break; @@ -646,6 +629,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } + + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -- cgit v1.1 From 231cc2aaf14bad3b2325be0b19b8385ff5e75485 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:40 +0000 Subject: dccp ccid-2: Replace broken RTT estimator with better algorithm The current CCID-2 RTT estimator code is in parts broken and lags behind the suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR. That code is replaced by the present patch, which reuses the Linux TCP RTT estimator code. Further details: ---------------- 1. The minimum RTO of previously one second has been replaced with TCP's, since RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4) is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's concept of a default RTT (RFC 4340, 3.4). 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with RFC2988, (2.5). 3. De-inlined the function ccid2_new_ack(). 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will give the wrong estimate. It should be replaced with one sample per Ack. However, at the moment this can not be resolved easily, since - it depends on TX history code (which also needs some work), - the cleanest solution is not to use the `sent' time at all (saves 4 bytes per entry) and use DCCP timestamps / elapsed time to estimated the RTT, which however is non-trivial to get right (but needs to be done). Reasons for reusing the Linux TCP estimator algorithm: ------------------------------------------------------ Some time was spent to find a better alternative, using basic RFC2988 as a first step. Further analysis and experimentation showed that the Linux TCP RTO estimator is superior to a basic RFC2988 implementation. A summary is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/ In addition, this estimator fared well in a recent empirical evaluation: Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith. A Performance Study of Loss Detection/Recovery in Real-world TCP Implementations. Proceedings of 15th IEEE International Conference on Network Protocols (ICNP-07), 2007. Thus there is significant benefit in reusing the existing TCP code. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 169 +++++++++++++++++++++++++++---------------------- net/dccp/ccids/ccid2.h | 20 ++++-- 2 files changed, 108 insertions(+), 81 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f7f5069..7af3106 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -113,19 +113,12 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val) -{ - ccid2_pr_debug("change SRTT to %ld\n", val); - hc->tx_srtt = val; -} - static void ccid2_start_rto_timer(struct sock *sk); static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -137,10 +130,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) /* back-off timer */ hc->tx_rto <<= 1; - - s = hc->tx_rto / HZ; - if (s > 60) - hc->tx_rto = 60 * HZ; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; ccid2_start_rto_timer(sk); @@ -168,7 +159,7 @@ static void ccid2_start_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); + ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto); BUG_ON(timer_pending(&hc->tx_rtotimer)); sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); @@ -339,9 +330,86 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) ccid2_pr_debug("deleted RTO timer\n"); } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. + */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; + + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev); + hc->tx_rttvar = hc->tx_mdev_max; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; + + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = TCP_RTO_MIN; + } + } + + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". + */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; + + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; +} + +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); @@ -355,64 +423,15 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_cwnd += 1; hc->tx_packets_acked = 0; } - - /* update RTO */ - if (hc->tx_srtt == -1 || - time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hc->tx_srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hc, r); - hc->tx_rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hc->tx_srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hc->tx_rttvar *= 3; - hc->tx_rttvar >>= 2; - hc->tx_rttvar += tmp; - - /* SRTT */ - srtt = hc->tx_srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hc, srtt); - } - s = hc->tx_rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hc->tx_rto = hc->tx_srtt + s; - - /* must be at least a second */ - s = hc->tx_rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hc->tx_rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hc->tx_rto = HZ * 60; - - hc->tx_lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hc->tx_srtt, hc->tx_rttvar, - hc->tx_rto, HZ, r); - } + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -662,9 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; - hc->tx_rto = 3 * HZ; - ccid2_change_srtt(hc, -1); - hc->tx_rttvar = -1; + hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; hc->tx_last_cong = jiffies; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 1ec6a30..b017843 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,7 +42,12 @@ struct ccid2_seq { * struct ccid2_hc_tx_sock - CCID2 TX half connection * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_lastrtt: time RTT was last measured + * @tx_srtt: smoothed RTT estimate, scaled by 2^3 + * @tx_mdev: smoothed RTT variation, scaled by 2^2 + * @tx_mdev_max: maximum of @mdev during one flight + * @tx_rttvar: moving average/maximum of @mdev_max + * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq */ @@ -55,11 +60,16 @@ struct ccid2_hc_tx_sock { int tx_seqbufc; struct ccid2_seq *tx_seqh; struct ccid2_seq *tx_seqt; - long tx_rto; - long tx_srtt; - long tx_rttvar; - unsigned long tx_lastrtt; + + /* RTT measurement: variables/principles are the same as in TCP */ + u32 tx_srtt, + tx_mdev, + tx_mdev_max, + tx_rttvar, + tx_rto; + u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; + u64 tx_rpseq; int tx_rpdupack; unsigned long tx_last_cong; -- cgit v1.1 From d82b6f85c1d73340ef4a26bd0b247ac14610cd83 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:10 +0000 Subject: dccp ccid-2: Use u32 timestamps uniformly Since CCID-2 is de facto a mini implementation of TCP, it makes sense to share as much code as possible. Hence this patch aligns CCID-2 timestamping with TCP timestamping. This also halves the space consumption (on 64-bit systems). The necessary include file is already included by way of net/dccp.h. Redundant includes have been removed. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 14 ++++++-------- net/dccp/ccids/ccid2.h | 15 ++++++++++----- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 7af3106..0cff637 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,8 +25,6 @@ */ #include #include "../feat.h" -#include "../ccid.h" -#include "../dccp.h" #include "ccid2.h" @@ -175,7 +173,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hc->tx_seqh->ccid2s_seq = dp->dccps_gss; hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = jiffies; + hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ @@ -250,7 +248,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) struct ccid2_seq *seqp = hc->tx_seqt; while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; @@ -431,19 +429,19 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, * The cleanest solution is to not use the ccid2s_sent field at all * and instead use DCCP timestamps: requires changes in other places. */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -683,7 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); return 0; diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index b017843..9731c2d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -18,18 +18,23 @@ #ifndef _DCCP_CCID2_H_ #define _DCCP_CCID2_H_ -#include #include #include #include "../ccid.h" +#include "../dccp.h" + +/* + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +#define ccid2_time_stamp tcp_time_stamp + /* NUMDUPACK parameter from RFC 4341, p. 6 */ #define NUMDUPACK 3 -struct sock; - struct ccid2_seq { u64 ccid2s_seq; - unsigned long ccid2s_sent; + u32 ccid2s_sent; int ccid2s_acked; struct ccid2_seq *ccid2s_prev; struct ccid2_seq *ccid2s_next; @@ -72,7 +77,7 @@ struct ccid2_hc_tx_sock { u64 tx_rpseq; int tx_rpdupack; - unsigned long tx_last_cong; + u32 tx_last_cong; u64 tx_high_ack; }; -- cgit v1.1 From d26eeb07fd02de31848b59d19687daff0e93532f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:11 +0000 Subject: dccp ccid-2: Remove wrappers around sk_{reset,stop}_timer() This removes the wrappers around the sk timer functions, since not much is gained from using them: the BUG_ON in start_rto_timer will never trigger since that function is called only if: * the RTO timer expires (rto_expire, and then timer_pending() is false); * in tx_packet_sent only if !timer_pending() (BUG_ON is redundant here); * previously in new_ack, after stopping the timer (timer_pending() false). Removing the wrappers also clears the way for eventually replacing the RTO timer with the icsk-retransmission-timer, as it is already part of the DCCP socket. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 0cff637..8c95813 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -111,8 +111,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; @@ -131,7 +129,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; @@ -153,16 +151,6 @@ out: sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto); - - BUG_ON(timer_pending(&hc->tx_rtotimer)); - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -} - static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); @@ -239,9 +227,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } #endif - /* setup RTO timer */ - if (!timer_pending(&hc->tx_rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { @@ -320,14 +306,6 @@ out_invalid_option: return -1; } -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); -} - /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * This code is almost identical with TCP's tcp_rtt_estimator(), since @@ -692,7 +670,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hc->tx_rtotimer); for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); -- cgit v1.1 From 22b71c8f4f3db8df92f5e7b081c265bc56c0bd2f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:12 +0000 Subject: tcp/dccp: Consolidate common code for RFC 3390 conversion This patch consolidates initial-window code common to TCP and CCID-2: * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and * CCID-2 uses RFC 3390 in packet-oriented manner (RFC 4341). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 8c95813..b9c942a 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -641,12 +641,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hc->tx_ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); -- cgit v1.1 From 4886fcad6e12572afbd230dfab1b268eace20d6d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:13 +0000 Subject: dccp ccid-2: Share TCP's minimum RTO code Using a fixed RTO_MIN of 0.2 seconds was found to cause problems for CCID-2 over 802.11g: at least once per session there was a spurious timeout. It helped to then increase the the value of RTO_MIN over this link. Since the problem is the same as in TCP, this patch makes the solution from commit "05bb1fad1cde025a864a90cfeb98dcbefe78a44a" "[TCP]: Allow minimum RTO to be configurable via routing metrics." available to DCCP. This avoids reinventing the wheel, so that e.g. the following works in the expected way now also for CCID-2: > ip route change 10.0.0.2 rto_min 800 dev ath0 Luckily this useful rto_min function was recently moved to net/tcp.h, which simplifies sharing code originating from TCP. Documentation also updated (plus minor whitespace fixes). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index b9c942a..dc18172 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -325,8 +325,9 @@ static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) hc->tx_srtt = m << 3; hc->tx_mdev = m << 1; - hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev); + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); hc->tx_rttvar = hc->tx_mdev_max; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; } else { /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ @@ -367,7 +368,7 @@ static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) hc->tx_rttvar -= (hc->tx_rttvar - hc->tx_mdev_max) >> 2; hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = TCP_RTO_MIN; + hc->tx_mdev_max = tcp_rto_min(sk); } } -- cgit v1.1 From 89858ad14307a398961a0f1414b04053c1475e4f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:14 +0000 Subject: dccp ccid-3: use per-route RTO or TCP RTO as fallback This makes RTAX_RTO_MIN also available to CCID-3, replacing the compile-time RTO lower bound with a per-route tunable value. The original Kconfig option solved the problem that a very low RTT (in the order of HZ) can trigger too frequent and unnecessary reductions of the sending rate. This tunable does not affect the initial RTO value of 2 seconds specified in RFC 5348, section 4.2 and Appendix B. But like the hardcoded Kconfig value, it allows to adapt to network conditions. The same effect as the original Kconfig option of 100ms is now achieved by > ip route replace to unicast 192.168.0.0/24 rto_min 100j dev eth0 (assuming HZ=1000). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/Kconfig | 31 ------------------------------- net/dccp/ccids/ccid3.c | 11 +++++------ net/dccp/ccids/ccid3.h | 2 +- 3 files changed, 6 insertions(+), 38 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 8408398..0581143 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -config IP_DCCP_CCID3_RTO - int "Use higher bound for nofeedback timer" - default 100 - depends on IP_DCCP_CCID3 && EXPERIMENTAL - ---help--- - Use higher lower bound for nofeedback timer expiration. - - The TFRC nofeedback timer normally expires after the maximum of 4 - RTTs and twice the current send interval (RFC 3448, 4.3). On LANs - with a small RTT this can mean a high processing load and reduced - performance, since then the nofeedback timer is triggered very - frequently. - - This option enables to set a higher lower bound for the nofeedback - value. Values in units of milliseconds can be set here. - - A value of 0 disables this feature by enforcing the value specified - in RFC 3448. The following values have been suggested as bounds for - experimental use: - * 16-20ms to match the typical multimedia inter-frame interval - * 100ms as a reasonable compromise [default] - * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) - - The default of 100ms is a compromise between a large value for - efficient DCCP implementations, and a small value to avoid disrupting - the network in times of congestion. - - The purpose of the nofeedback timer is to slow DCCP down when there - is serious network congestion: experimenting with larger values should - therefore not be performed on WANs. - config IP_DCCP_TFRC_LIB def_bool y if IP_DCCP_CCID3 diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 4340672..278e170 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -460,13 +460,12 @@ done_computing_x: sk->sk_write_space(sk); /* - * Update timeout interval for the nofeedback timer. - * We use a configuration option to increase the lower bound. - * This can help avoid triggering the nofeedback timer too - * often ('spinning') on LANs with small RTTs. + * Update timeout interval for the nofeedback timer. In order to control + * rate halving on networks with very low RTTs (<= 1 ms), use per-route + * tunable RTAX_RTO_MIN value as the lower bound. */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, + USEC_PER_SEC/HZ * tcp_rto_min(sk)); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index b186424..b7e569c 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -42,7 +42,7 @@ #include "lib/tfrc.h" #include "../ccid.h" -/* Two seconds as per RFC 3448 4.2 */ +/* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ -- cgit v1.1 From 20cbd3e120a0c20bebe420e1fed0e816730bb988 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:16:59 +0200 Subject: dccp ccid-3: A lower bound for the inter-packet scheduling algorithm This fixes a subtle bug in the calculation of the inter-packet gap and shows that t_delta, as it is currently used, is not needed. The algorithm from RFC 5348, 8.3 below continually computes a send time t_nom, which is initialised with the current time t_now; t_gran = 1E6 / HZ specifies the scheduling granularity, s the packet size, and X the sending rate: t_distance = t_nom - t_now; // in microseconds t_delta = min(t_ipi, t_gran) / 2; // `delta' parameter in microseconds if (t_distance >= t_delta) { reschedule after (t_distance / 1000) milliseconds; } else { t_ipi = s / X; // inter-packet interval in usec t_nom += t_ipi; // compute the next send time send packet now; } Problem: -------- Rescheduling requires a conversion into milliseconds (sk_reset_timer()). The highest jiffy resolution with HZ=1000 is 1 millisecond, so using a higher granularity does not make much sense here. As a consequence, values of t_distance < 1000 are truncated to 0. This issue has so far been resolved by using instead if (t_distance >= t_delta + 1000) reschedule after (t_distance / 1000) milliseconds; This is unnecessarily large, a lower bound is t_delta' = max(t_delta, 1000). And it implies a further simplification: a) when HZ >= 500, then t_delta <= t_gran/2 = 10^6/(2*HZ) <= 1000, so that t_delta' = MAX(1000, t_delta) = 1000 (constant value); b) when HZ < 500, then t_delta = 1/2*MIN(rtt, t_ipi, t_gran) <= t_gran/2, so that 1000 <= t_delta' <= t_gran/2. The maximum error of using a constant t_delta in (b) is less than half a jiffy. Fix: ---- The patch replaces t_delta with a constant, whose value depends on CONFIG_HZ, changing the above algorithm to: if (t_distance >= t_delta') reschedule after (t_distance / 1000) milliseconds; where t_delta' = 10^6/(2*HZ) if HZ < 500, and t_delta' = 1000 otherwise. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 19 ++++++++----------- net/dccp/ccids/ccid3.h | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 278e170..e9ca098 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -91,19 +91,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) return scaled_div(w_init << 6, hc->tx_rtt); } -/* - * Recalculate t_ipi and delta (should be called whenever X changes) +/** + * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) { - /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ - hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); - - ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6)); + ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, + hc->tx_s, (unsigned)(hc->tx_x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) @@ -332,15 +329,15 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) delay = ktime_us_delta(hc->tx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* - * Scheduling of packet transmissions [RFC 3448, 4.6] + * Scheduling of packet transmissions (RFC 5348, 8.3) * * if (t_now > t_nom - delta) * // send the packet now * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay - (s64)hc->tx_delta >= 1000) - return (u32)delay / 1000L; + if (delay >= TFRC_T_DELTA) + return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hc, now); break; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index b7e569c..4a00174 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -45,12 +45,22 @@ /* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ -#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) - /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ #define TFRC_T_MBI 64 +/* + * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are + * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. + * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse + * resolution of HZ < 500 means that the error is below one timer tick (t_gran) + * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). + */ +#if (HZ >= 500) +# define TFRC_T_DELTA USEC_PER_MSEC +#else +# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) +#endif + enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, @@ -90,7 +100,6 @@ enum ccid3_hc_tx_states { * @tx_no_feedback_timer: Handle to no feedback timer * @tx_t_ld: Time last doubled during slow start * @tx_t_nom: Nominal send time of next packet - * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs * @tx_hist: Packet history * @tx_options_received: Parsed set of retrieved options */ @@ -109,7 +118,6 @@ struct ccid3_hc_tx_sock { struct timer_list tx_no_feedback_timer; ktime_t tx_t_ld; ktime_t tx_t_nom; - u32 tx_delta; struct tfrc_tx_hist_entry *tx_hist; struct ccid3_options_received tx_options_received; }; -- cgit v1.1 From d2c726309d88df3c5568486e4b5b9e4c3150903f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:18:00 +0200 Subject: dccp ccid-3: remove buggy RTT-sampling history lookup This removes the RTT-sampling function tfrc_tx_hist_rtt(), since 1. it suffered from complex passing of return values (the return value both indicated successful lookup while the value doubled as RTT sample); 2. when for some odd reason the sample value equalled 0, this triggered a bug warning about "bogus Ack", due to the ambiguity of the return value; 3. on a passive host which has not sent anything the TX history is empty and thus will lead to unwanted "bogus Ack" warnings such as ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-28197148 ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-26641606. The fix is to replace the implicit encoding by performing the steps manually. Furthermore, the "bogus Ack" warning has been removed, since it can actually be triggered due to several reasons (network reordering, old packet, (3) above), hence it is not very useful. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 34 ++++++++++++++++++-------------- net/dccp/ccids/lib/packet_history.c | 39 ------------------------------------- net/dccp/ccids/lib/packet_history.h | 22 ++++++++++++++++++--- 3 files changed, 38 insertions(+), 57 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index e9ca098..b2ddd20 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -370,6 +370,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; + struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -383,17 +384,24 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - now = ktime_get_real(); - - /* Estimate RTT from history if ACK number is valid */ - r_sample = tfrc_tx_hist_rtt(hc->tx_hist, - DCCP_SKB_CB(skb)->dccpd_ack_seq, now); - if (r_sample == 0) { - DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, - dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* + * Locate the acknowledged packet in the TX history. + * + * Returning "entry not found" here can for instance happen when + * - the host has not sent out anything (e.g. a passive server), + * - the Ack is outdated (packet with higher Ack number was received), + * - it is a bogus Ack (for a packet not sent on this connection). + */ + acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); + if (acked == NULL) return; - } + /* For the sake of RTT sampling, ignore/remove all older entries */ + tfrc_tx_hist_purge(&acked->next); + + /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ + now = ktime_get_real(); + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); + hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); /* Update receive rate in units of 64 * bytes/second */ hc->tx_x_recv = opt_recv->ccid3or_receive_rate; @@ -405,11 +413,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_p = 0; else /* can not exceed 100% */ hc->tx_p = scaled_div(1, pinv); - /* - * Validate new RTT sample and update moving average - */ - r_sample = dccp_sample_rtt(sk, r_sample); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); + /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 3a4f414..de8fe29 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -38,18 +38,6 @@ #include "packet_history.h" #include "../../dccp.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - /* * Transmitter History Routines */ @@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void) } } -static struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - - return head; -} - int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) *headp = NULL; } -u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, - const ktime_t now) -{ - u32 rtt = 0; - struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); - - if (packet != NULL) { - rtt = ktime_us_delta(now, packet->stamp); - /* - * Garbage-collect older (irrelevant) entries: - */ - tfrc_tx_hist_purge(&packet->next); - } - - return rtt; -} - - /* * Receiver History Routines */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 7df6c52..7ee4a9d 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,12 +40,28 @@ #include #include "tfrc.h" -struct tfrc_tx_hist_entry; +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + +static inline struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + return head; +} extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); -extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, - const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) -- cgit v1.1 From 37efb03fbd0935f5f85a0538c46b53be5cf40504 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:21:29 +0200 Subject: dccp ccid-3: Simplify and consolidate tx_parse_options This simplifies and consolidates the TX option-parsing code: 1. The Loss Intervals option is not currently used, so dead code related to this option is removed. I am aware of no plans to support the option, but if someone wants to implement it (e.g. for inter-op tests), it is better to start afresh than having to also update currently unused code. 2. The Loss Event and Receive Rate options have a lot of code in common (both are 32 bit, both have same length etc.), so this is consolidated. 3. The test against GSR is not necessary, because - on first loading CCID3, ccid_new() zeroes out all fields in the socket; - ccid3_hc_tx_packet_recv() treats 0 and ~0U equivalently, due to pinv = opt_recv->ccid3or_loss_event_rate; if (pinv == ~0U || pinv == 0) hctx->p = 0; - as a result, the sequence number field is removed from opt_recv. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 57 +++++++++++++------------------------------------- net/dccp/ccids/ccid3.h | 3 --- 2 files changed, 14 insertions(+), 46 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b2ddd20..ce80591 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -485,60 +485,31 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char *value) { - int rc = 0; - const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { - opt_recv->ccid3or_seqno = dp->dccps_gsr; - opt_recv->ccid3or_loss_event_rate = ~0; - opt_recv->ccid3or_loss_intervals_idx = 0; - opt_recv->ccid3or_loss_intervals_len = 0; - opt_recv->ccid3or_receive_rate = 0; - } - switch (option) { + case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_LOSS_EVENT_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_event_rate); + DCCP_WARN("%s(%p), invalid len %d for %u\n", + dccp_role(sk), sk, len, option); + return -EINVAL; } - break; - case TFRC_OPT_LOSS_INTERVALS: - opt_recv->ccid3or_loss_intervals_idx = idx; - opt_recv->ccid3or_loss_intervals_len = len; - ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_intervals_idx, - opt_recv->ccid3or_loss_intervals_len); - break; - case TFRC_OPT_RECEIVE_RATE: - if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_RECEIVE_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_receive_rate = ntohl(opt_val); + opt_val = ntohl(get_unaligned((__be32 *)value)); + + if (option == TFRC_OPT_RECEIVE_RATE) { + opt_recv->ccid3or_receive_rate = opt_val; ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_receive_rate); + dccp_role(sk), sk, opt_val); + } else { + opt_recv->ccid3or_loss_event_rate = opt_val; + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, opt_val); } - break; } - - return rc; + return 0; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 4a00174..9eb90b8 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -68,9 +68,6 @@ enum ccid3_options { }; struct ccid3_options_received { - u64 ccid3or_seqno:48, - ccid3or_loss_intervals_idx:16; - u16 ccid3or_loss_intervals_len; u32 ccid3or_loss_event_rate; u32 ccid3or_receive_rate; }; -- cgit v1.1 From 4874c131d79695e3d372042781a408a1a8a762d8 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:06:50 +0200 Subject: dccp: Add packet type information to CCID-specific option parsing This 1. adds packet type information to ccid_hc_{rx,tx}_parse_options(). This is necessary, since table 3 in RFC 4340, 5.8 leaves it to the CCIDs to state which options may (not) appear on what packet type. 2. adds such a check for CCID-3's {Loss Event, Receive} Rate as specified in RFC 4340 8.3 ("Receive Rate options MUST NOT be sent on DCCP-Data packets") and 8.5 ("Loss Event Rate options MUST NOT be sent on DCCP-Data packets"). 3. removes an unused argument `idx' from ccid_hc_{rx,tx}_parse_options(). This is also no longer necessary, since the CCID-specific option-parsing routines are passed every single parameter of the type-length-value option encoding. Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 46 +++++++++++++++++++++++----------------------- net/dccp/ccids/ccid3.c | 14 ++++++++------ net/dccp/options.c | 16 ++++------------ 3 files changed, 35 insertions(+), 41 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 6df6f8a..6d16a90 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -62,18 +62,14 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, @@ -168,27 +164,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } +/** + * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver + * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) + * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) + * @val: value of @opt + * @len: length of @val in bytes + */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, - value); - return rc; + if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } +/** + * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender + * Arguments are analogous to ccid_hc_tx_parse_options() + */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); - return rc; + if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index ce80591..be1b8ba 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -481,9 +481,8 @@ done_computing_x: jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, - unsigned char len, u16 idx, - unsigned char *value) +static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; @@ -492,12 +491,15 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, switch (option) { case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: - if (unlikely(len != 4)) { + /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ + if (packet_type == DCCP_PKT_DATA) + break; + if (unlikely(optlen != 4)) { DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, len, option); + dccp_role(sk), sk, optlen, option); return -EINVAL; } - opt_val = ntohl(get_unaligned((__be32 *)value)); + opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { opt_recv->ccid3or_receive_rate = opt_val; diff --git a/net/dccp/options.c b/net/dccp/options.c index bfda087..e4983e3 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -226,23 +226,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: { - const u16 idx = value - options; - + case 128 ... 191: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; - case 192 ... 255: { - const u16 idx = value - options; - + case 192 ... 255: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " -- cgit v1.1 From a18213d1d2a469956845b437f5d1d0401ab22e8b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:08:00 +0200 Subject: dccp: Replace magic CCID-specific numbers by symbolic constants The constants DCCPO_{MIN,MAX}_CCID_SPECIFIC are nowhere used in the code, but instead for the CCID-specific options numbers are used. This patch unifies the use of CCID-specific option numbers, by adding symbolic names reflecting the definitions in RFC 4340, 10.3. Signed-off-by: Gerrit Renker --- net/dccp/options.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/options.c b/net/dccp/options.c index e4983e3..9271851 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* - * CCID-Specific Options (from RFC 4340, sec. 10.3): - * - * Option numbers 128 through 191 are for options sent from the - * HC-Sender to the HC-Receiver; option numbers 192 through 255 - * are for options sent from the HC-Receiver to the HC-Sender. - * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. - * */ - if (dreq != NULL && (opt >= 128 || + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -226,12 +219,12 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: + case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; - case 192 ... 255: + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; -- cgit v1.1 From 80763dfbac4ed1e6dfe6ec08ef748e0e9aec3260 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:08:24 +0200 Subject: dccp ccid-3: remove dead states This patch is thanks to an investigation by Leandro Sales de Melo and his colleagues. They worked out two state diagrams which highlight the fact that the xxx_TERM states in CCID-3/4 are in fact not necessary. And this can be confirmed by in turn looking at the code: the xxx_TERM states are only ever set in ccid3_hc_{rx,tx}_exit(): when CCID-3 sets the state to xxx_TERM, it is at a time where no more processing should be going on, hence it is not necessary to introduce a dedicated exit state - this is already implied by unloading the CCID. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 37 +++++++++---------------------------- net/dccp/ccids/ccid3.h | 2 -- 2 files changed, 9 insertions(+), 30 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index be1b8ba..90bc0a7 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) [TFRC_SSTATE_NO_SENT] = "NO_SENT", [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", [TFRC_SSTATE_FBACK] = "FBACK", - [TFRC_SSTATE_TERM] = "TERM", }; return ccid3_state_names[state]; @@ -208,10 +207,13 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, ccid3_tx_state_name(hc->tx_state)); + /* Ignore and do not restart after leaving the established state */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + goto out; + + /* Reset feedback state to "no feedback received" */ if (hc->tx_state == TFRC_SSTATE_FBACK) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - else if (hc->tx_state != TFRC_SSTATE_NO_FBACK) - goto out; /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 @@ -287,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - switch (hc->tx_state) { - case TFRC_SSTATE_NO_SENT: + if (hc->tx_state == TFRC_SSTATE_NO_SENT) { sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hc->tx_last_win_count = 0; @@ -323,9 +324,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) ccid3_update_send_interval(hc); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - break; - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: + + } else { delay = ktime_us_delta(hc->tx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* @@ -340,10 +340,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hc, now); - break; - case TFRC_SSTATE_TERM: - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); - return -EINVAL; } /* prepare to send now (add options etc.) */ @@ -379,11 +375,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* ... and only in the established state */ - if (hc->tx_state != TFRC_SSTATE_FBACK && - hc->tx_state != TFRC_SSTATE_NO_FBACK) - return; - /* * Locate the acknowledged packet in the TX history. * @@ -529,9 +520,7 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); } @@ -590,7 +579,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) static const char *const ccid3_rx_state_names[] = { [TFRC_RSTATE_NO_DATA] = "NO_DATA", [TFRC_RSTATE_DATA] = "DATA", - [TFRC_RSTATE_TERM] = "TERM", }; return ccid3_rx_state_names[state]; @@ -616,14 +604,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - ktime_t now; + ktime_t now = ktime_get_real(); s64 delta = 0; - if (unlikely(hc->rx_state == TFRC_RSTATE_TERM)) - return; - - now = ktime_get_real(); - switch (fbtype) { case CCID3_FBACK_INITIAL: hc->rx_x_recv = 0; @@ -827,8 +810,6 @@ static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); - tfrc_rx_hist_purge(&hc->rx_hist); tfrc_lh_cleanup(&hc->rx_li_hist); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 9eb90b8..89b047b 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -77,7 +77,6 @@ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, TFRC_SSTATE_NO_FBACK, TFRC_SSTATE_FBACK, - TFRC_SSTATE_TERM, }; /** @@ -130,7 +129,6 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) enum ccid3_hc_rx_states { TFRC_RSTATE_NO_DATA = 1, TFRC_RSTATE_DATA, - TFRC_RSTATE_TERM = 127, }; /** -- cgit v1.1 From 792e6d3389061ad449429b9ba228eb758c247ea0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:10:52 +0200 Subject: dccp tfrc/ccid-3: computing the loss rate from the Loss Event Rate This adds a function to take care of the following, separate cases occurring in the computation of the Loss Rate p: * 1/(2^32-1) is mapped into 0% as per RFC 4342, 8.5; * 1/0 is mapped into 100%, the maximum; * to avoid that p = 1/x is rounded down to 0 when x is very large, since this means accidentally re-entering slow-start indicated by p == 0, the minimum resolution value of p is now returned instead; * a bug in ccid3_hc_rx_getsockopt is fixed: 1/0 was mapped into ~0U. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 9 ++++----- net/dccp/ccids/lib/tfrc.h | 1 + net/dccp/ccids/lib/tfrc_equation.c | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 90bc0a7..9715eeb 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -400,10 +400,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Update loss event rate (which is scaled by 1e6) */ pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + if (pinv == 0) hc->tx_p = 0; - else /* can not exceed 100% */ - hc->tx_p = scaled_div(1, pinv); + else + hc->tx_p = tfrc_invert_loss_event_rate(pinv); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 @@ -834,8 +834,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, return -EINVAL; rx_info.tfrcrx_x_recv = hc->rx_x_recv; rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U : - scaled_div(1, hc->rx_pinv); + rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); len = sizeof(rx_info); val = &rx_info; break; diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index 01bb48e..f8ee3f5 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); +extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 22ca1cf..a052a43 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } + +/** + * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * When @loss_event_rate is large, there is a chance that p is truncated to 0. + * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. + */ +u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) +{ + if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ + return 0; + if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ + return 1000000; + return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); +} -- cgit v1.1 From 536bb20b45dee3f9b77b0d250f8ed0733a5cb025 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:14:23 +0200 Subject: dccp ccid-3: Remove redundant 'options_received' struct The `options_received' struct is redundant, since it re-duplicates the existing `p' and `x_recv' fields. This patch removes the sub-struct and migrates the format conversion operations to ccid3_hc_tx_parse_options(). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 24 ++++++++---------------- net/dccp/ccids/ccid3.h | 7 ------- 2 files changed, 8 insertions(+), 23 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 9715eeb..c3f3a25 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -365,11 +365,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hc->tx_options_received; struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; - u32 pinv, r_sample; + u32 r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || @@ -394,17 +393,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - /* Update receive rate in units of 64 * bytes/second */ - hc->tx_x_recv = opt_recv->ccid3or_receive_rate; - hc->tx_x_recv <<= 6; - - /* Update loss event rate (which is scaled by 1e6) */ - pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == 0) - hc->tx_p = 0; - else - hc->tx_p = tfrc_invert_loss_event_rate(pinv); - /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ @@ -476,7 +464,6 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; switch (option) { @@ -493,11 +480,16 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { - opt_recv->ccid3or_receive_rate = opt_val; + /* Receive Rate is kept in units of 64 bytes/second */ + hc->tx_x_recv = opt_val; + hc->tx_x_recv <<= 6; + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", dccp_role(sk), sk, opt_val); } else { - opt_recv->ccid3or_loss_event_rate = opt_val; + /* Update the fixpoint Loss Event Rate fraction */ + hc->tx_p = tfrc_invert_loss_event_rate(opt_val); + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", dccp_role(sk), sk, opt_val); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 89b047b..1a9933c 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -67,11 +67,6 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; -struct ccid3_options_received { - u32 ccid3or_loss_event_rate; - u32 ccid3or_receive_rate; -}; - /* TFRC sender states */ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, @@ -97,7 +92,6 @@ enum ccid3_hc_tx_states { * @tx_t_ld: Time last doubled during slow start * @tx_t_nom: Nominal send time of next packet * @tx_hist: Packet history - * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { u64 tx_x; @@ -115,7 +109,6 @@ struct ccid3_hc_tx_sock { ktime_t tx_t_ld; ktime_t tx_t_nom; struct tfrc_tx_hist_entry *tx_hist; - struct ccid3_options_received tx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -- cgit v1.1 From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 20:43:57 +0000 Subject: net: return operator cleanup Change "return (EXPR);" to "return EXPR;" return is not a function, parentheses are not required. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ccids/lib/loss_interval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/dccp') diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 8fc3cbf..497723c 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_length = len; tfrc_lh_calc_i_mean(lh); - return (lh->i_mean < old_i_mean); + return lh->i_mean < old_i_mean; } /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -- cgit v1.1 From 1f4f0f645cc1d7f1187fcdb0ac22c2e69bd68050 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 5 Oct 2010 04:24:09 +0000 Subject: dccp: Kill dead code and add static markers. Remove dead code and make some functions static. Compile tested only. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/dccp/dccp.h | 2 -- net/dccp/feat.c | 10 ---------- net/dccp/feat.h | 1 - net/dccp/options.c | 4 +--- net/dccp/proto.c | 48 ++++++++++++++++++++++++------------------------ 5 files changed, 25 insertions(+), 40 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3ccef1b..019d6ff 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -246,7 +246,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk) extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); extern const char *dccp_packet_name(const int type); -extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); @@ -449,7 +448,6 @@ extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); extern u32 dccp_timestamp(void); extern void dccp_timestamping_init(void); -extern int dccp_insert_option_timestamp(struct sk_buff *skb); extern int dccp_insert_option(struct sk_buff *skb, unsigned char option, const void *value, unsigned char len); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index df7dd26..568def9 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 0, list, len); } -/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ -int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) -{ - /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_NN) - return -EINVAL; - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); -} /* * Tracking features whose value depend on the choice of CCID diff --git a/net/dccp/feat.h b/net/dccp/feat.h index f967216..e56a4e5 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -111,7 +111,6 @@ extern int dccp_feat_init(struct sock *sk); extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); -extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); diff --git a/net/dccp/options.c b/net/dccp/options.c index 9271851..d4b1ae0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -369,7 +369,7 @@ int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time) EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); -int dccp_insert_option_timestamp(struct sk_buff *skb) +static int dccp_insert_option_timestamp(struct sk_buff *skb) { __be32 now = htonl(dccp_timestamp()); /* yes this will overflow but that is the point as we want a @@ -378,8 +378,6 @@ int dccp_insert_option_timestamp(struct sk_buff *skb) return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } -EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); - static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, struct dccp_request_sock *dreq, struct sk_buff *skb) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 096250d..b054ba1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_state_name(const int state) +{ + static const char *const dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", + [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", + [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} +#endif + void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; @@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); -const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} - -EXPORT_SYMBOL_GPL(dccp_state_name); - int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); -- cgit v1.1 From 0b53d4604ac2b4f2faa9a62a04ea9b383ad2efe0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:35:40 +0200 Subject: dccp: fix the adjustments to AWL and SWL This fixes a problem and a potential loophole with regard to seqno/ackno validity: currently the initial adjustments to AWL/SWL are only performed once at the begin of the connection, during the handshake. Since the Sequence Window feature is always greater than Wmin=32 (7.5.2), it is however necessary to perform these adjustments at least for the first W/W' (variables as per 7.5.1) packets in the lifetime of a connection. This requirement is complicated by the fact that W/W' can change at any time during the lifetime of a connection. Therefore it is better to perform that safety check each time SWL/AWL are updated, as implemented by the patch. A second problem solved by this patch is that the remote/local Sequence Window feature values (which set the bounds for AWL/SWL/SWH) are undefined until the feature negotiation has completed. During the initial handshake we have more stringent sequence number protection; the changes added by this patch effect that {A,S}W{L,H} are within the correct bounds at the instant that feature negotiation completes (since the SeqWin feature activation handlers call dccp_update_gsr/gss()). Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 20 ++++++++++++++++++++ net/dccp/input.c | 18 ++++++------------ net/dccp/minisocks.c | 30 +++++++++--------------------- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 019d6ff..e051c77 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -414,6 +414,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + /* + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, + * 7.5.1 we perform this check beyond the initial handshake: W/W' are + * always > 32, so for the first W/W' packets in the lifetime of a + * connection we always have to adjust SWL. + * A second reason why we are doing this is that the window depends on + * the feature-remote value of Sequence Window: nothing stops the peer + * from updating this value while we are busy adjusting SWL for the + * first W packets (we would have to count from scratch again then). + * Therefore it is safer to always make sure that the Sequence Window + * is not artificially extended by a peer who grows SWL downwards by + * continually updating the feature-remote Sequence-Window. + * If sequence numbers wrap it is bad luck. But that will take a while + * (48 bit), and this measure prevents Sequence-number attacks. + */ + if (before48(dp->dccps_swl, dp->dccps_isr)) + dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } @@ -424,6 +441,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ + if (before48(dp->dccps_awl, dp->dccps_iss)) + dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } diff --git a/net/dccp/input.c b/net/dccp/input.c index 10c957a..aecc8c74 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -441,20 +441,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; - dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_update_gsr(sk, dp->dccps_isr); /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - * - * AWL was adjusted in dccp_v4_connect -acme + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH + * is done as part of activating the feature values below, since + * these settings depend on the local/remote Sequence Window + * features, which were undefined or not confirmed until now. */ - dccp_set_seqno(&dp->dccps_swl, - max48(dp->dccps_swl, dp->dccps_isr)); + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 128b089..d7041a0 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - */ - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; - dccp_update_gss(newsk, dreq->dreq_iss); - - newdp->dccps_isr = dreq->dreq_isr; - dccp_update_gsr(newsk, dreq->dreq_isr); - - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. + * Set S.ISR, S.GSR from packet (or Init Cookies) + * + * Setting AWL/AWH and SWL/SWH happens as part of the feature + * activation below, as these windows all depend on the local + * and remote Sequence Window feature values (7.5.2). */ - dccp_set_seqno(&newdp->dccps_swl, - max48(newdp->dccps_swl, newdp->dccps_isr)); - dccp_set_seqno(&newdp->dccps_awl, - max48(newdp->dccps_awl, newdp->dccps_iss)); + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; + newdp->dccps_gar = newdp->dccps_iss; + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; /* - * Activate features after initialising the sequence numbers, - * since CCID initialisation may depend on GSS, ISR, ISS etc. + * Activate features: initialise CCIDs, sequence windows etc. */ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { /* It is still raw copy of parent, so invalidate -- cgit v1.1 From 93344af44c0f649582bf1e3b5ecc45b3d19e98c2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:36:33 +0200 Subject: dccp: merge now-reduced connect_init() function After moving the assignment of GAR/ISS from dccp_connect_init() to dccp_transmit_skb(), the former function becomes very small, so that a merger with dccp_connect() suggests itself. Signed-off-by: Gerrit Renker --- net/dccp/output.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/output.c b/net/dccp/output.c index aadbdb5..6993a93 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -474,8 +474,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -static inline void dccp_connect_init(struct sock *sk) +int dccp_connect(struct sock *sk) { + struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -485,22 +486,12 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - icsk->icsk_retransmits = 0; -} - -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct inet_connection_sock *icsk = inet_csk(sk); - /* do not connect if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dccp_sk(sk))) return -EPROTO; - dccp_connect_init(sk); + /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ + dp->dccps_gar = dp->dccps_iss; skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) @@ -516,6 +507,7 @@ int dccp_connect(struct sock *sk) DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ + icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; -- cgit v1.1 From baf9e782e1dc4991edecfa3b8700cf8739c40259 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:37:38 +0200 Subject: dccp: remove unused argument in CCID tx function This removes the argument `more' from ccid_hc_tx_packet_sent, since it was nowhere used in the entire code. (Btw, this argument was not even used in the original KAME code where the function initially came from; compare the variable moreToSend in the freebsd61-dccp-kame-28.08.2006.patch kept by Emmanuel Lochin.) Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 6 +++--- net/dccp/ccids/ccid2.c | 2 +- net/dccp/ccids/ccid3.c | 3 +-- net/dccp/output.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 6d16a90..117fb09 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -73,7 +73,7 @@ struct ccid_operations { int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - int more, unsigned int len); + unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -144,10 +144,10 @@ static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - int more, unsigned int len) + unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index dc18172..d850e29 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -151,7 +151,7 @@ out: sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) +static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index c3f3a25..3060a60 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -351,8 +351,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, - unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index 6993a93..a988fe9 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -304,7 +304,7 @@ void dccp_write_xmit(struct sock *sk, int block) dcb->dccpd_type = DCCP_PKT_DATA; err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); if (err) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); -- cgit v1.1 From d196c9a5d4e150cdff675662214c80c69b906958 Mon Sep 17 00:00:00 2001 From: Ivo Calado Date: Mon, 11 Oct 2010 20:40:04 +0200 Subject: dccp: generalise data-loss condition This patch generalises the task of determining data loss from RFC 4340, 7.7.1. Let S_A, S_B be sequence numbers such that S_B is "after" S_A, and let N_B be the NDP count of packet S_B. Then, using modulo-2^48 arithmetic, D = S_B - S_A - 1 is an upper bound of the number of lost data packets, D - N_B is an approximation of the number of lost data packets (there are cases where this is not exact). The patch implements this as dccp_loss_count(S_A, S_B, N_B) := max(S_B - S_A - 1 - N_B, 0) Signed-off-by: Ivo Calado Signed-off-by: Erivaldo Xavier Signed-off-by: Leandro Sales Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e051c77..60f4f96 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -153,18 +153,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2) } /** - * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1 - * @s1: start sequence number - * @s2: end sequence number + * dccp_loss_count - Approximate the number of lost data packets in a burst loss + * @s1: last known sequence number before the loss ('hole') + * @s2: first sequence number seen after the 'hole' * @ndp: NDP count on packet with sequence number @s2 - * Returns true if the sequence range s1...s2 has no data loss. */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) +static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) { s64 delta = dccp_delta_seqno(s1, s2); WARN_ON(delta < 0); - return (u64)delta <= ndp + 1; + delta -= ndp + 1; + + return delta > 0 ? delta : 0; +} + +/** + * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 + */ +static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) +{ + return dccp_loss_count(s1, s2, ndp) == 0; } enum { -- cgit v1.1 From ecdfbdabbe4e0cf0443cbbea2df1bf51bf67f3f3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:41:13 +0200 Subject: dccp: schedule an Ack when receiving timestamps This schedules an Ack when receiving a timestamp, exploiting the existing inet_csk_schedule_ack() function, saving one case in the `dccp_ack_pending()' function. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 3 +-- net/dccp/options.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 60f4f96..3eb264b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -459,8 +459,7 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) static inline int dccp_ack_pending(const struct sock *sk) { const struct dccp_sock *dp = dccp_sk(sk); - return dp->dccps_timestamp_echo != 0 || - (dp->dccps_hc_rx_ackvec != NULL && + return (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || inet_csk_ack_scheduled(sk); } diff --git a/net/dccp/options.c b/net/dccp/options.c index d4b1ae0..cd30618 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -163,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* schedule an Ack in case this sender is quiescent */ + inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) -- cgit v1.1 From 2f34b32977ade4249601f35f7eb0cdd56b4e0f89 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:44:42 +0200 Subject: dccp: cosmetics - warning format This omits the redundant "DCCP:" in warning messages, since DCCP_WARN() already echoes the function name, avoiding messages like kernel: [10988.766503] dccp_close: DCCP: ABORT -- 209 bytes unread Signed-off-by: Gerrit Renker --- net/dccp/input.c | 2 +- net/dccp/proto.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/input.c b/net/dccp/input.c index aecc8c74..2659853 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -259,7 +259,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) sysctl_dccp_sync_ratelimit))) return 0; - DCCP_WARN("DCCP: Step 6 failed for %s packet, " + DCCP_WARN("Step 6 failed for %s packet, " "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " "sending SYNC...\n", dccp_packet_name(dh->dccph_type), diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b054ba1..7e5fc04 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ void dccp_close(struct sock *sk, long timeout) if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread); + DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { -- cgit v1.1 From 093d282321daeb19c107e5f1f16d7f68484f3ade Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 13:06:43 +0200 Subject: tproxy: fix hash locking issue when using port redirection in __inet_inherit_port() When __inet_inherit_port() is called on a tproxy connection the wrong locks are held for the inet_bind_bucket it is added to. __inet_inherit_port() made an implicit assumption that the listener's port number (and thus its bind bucket). Unfortunately, if you're using the TPROXY target to redirect skbs to a transparent proxy that assumption is not true anymore and things break. This patch adds code to __inet_inherit_port() so that it can handle this case by looking up or creating a new bind bucket for the child socket and updates callers of __inet_inherit_port() to gracefully handle __inet_inherit_port() failing. Reported by and original patch from Stephen Buck . See http://marc.info/?t=128169268200001&r=1&w=2 for the original discussion. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/dccp/ipv4.c | 10 +++++++--- net/dccp/ipv6.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net/dccp') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d4a166f..3f69ea1 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto exit; + goto exit_nonewsk; sk_setup_caps(newsk, dst); @@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, dccp_sync_mss(newsk, dst_mtu(dst)); + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6e3f325..dca711d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); out: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); if (opt != NULL && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return NULL; } -- cgit v1.1