summaryrefslogtreecommitdiffstats
path: root/sys/netinet/khelp/h_ertt.c
blob: 13d15e04545a937b08a24f5809f4f15952ecd591 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
/*-
 * Copyright (c) 2009-2010
 * 	Swinburne University of Technology, Melbourne, Australia
 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010-2011 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by David Hayes, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/hhook.h>
#include <sys/khelp.h>
#include <sys/module_khelp.h>
#include <sys/socket.h>
#include <sys/sockopt.h>

#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>

#include <netinet/khelp/h_ertt.h>

#include <vm/uma.h>

/*
 * UMA zone from which struct txseginfo records are allocated. It is created
 * in ertt_mod_init() and destroyed in ertt_mod_destroy().
 */
uma_zone_t txseginfo_zone;

/*
 * Smoothing factor for delayed ack guess. The per-connection dlyack_rx
 * counter is only incremented while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once timestamp_errors
 * reaches this value, TCP time stamps are no longer used to match acks.
 */
#define	MAX_TS_ERR	10

/* Helper hook invoked for received segments (see ertt_hooks[]). */
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
/* Helper hook invoked for transmitted segments (see ertt_hooks[]). */
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
/* Module init/destroy callbacks referenced from ertt_helper. */
static int ertt_mod_init(void);
static int ertt_mod_destroy(void);
/* Constructor/destructor for struct ertt, passed to KHELP_DECLARE_MOD_UMA. */
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void ertt_uma_dtor(void *mem, int size, void *arg);

/*
 * Contains information about the sent segment for comparison with the
 * corresponding ack. One record is queued per transmitted data segment by
 * ertt_add_tx_segment_info_hook() and consumed by
 * ertt_packet_measurement_hook().
 */
struct txseginfo {
	/*
	 * Segment length. For TXSI_TSO entries this is reduced as the entry
	 * is partially acknowledged.
	 */
	long		len;
	/*
	 * Segment sequence number. Advanced past the acked bytes when a
	 * TXSI_TSO entry is partially acknowledged.
	 */
	tcp_seq		seq;
	/* Time stamp indicating when the packet was sent. */
	uint32_t	tx_ts;
	/* Last received receiver ts (if the TCP option is used), else 0. */
	uint32_t	rx_ts;
	/* Bitwise OR of the TXSI_* flags below. */
	uint32_t	flags;
	/* Linkage on the per-connection e_t->txsegi_q tail queue. */
	TAILQ_ENTRY (txseginfo) txsegi_lnk;
};

/* Flags for struct txseginfo. */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */

/*
 * Khelp descriptor for the ERTT helper. It supplies the module-wide init and
 * destroy callbacks (which manage txseginfo_zone) and declares the helper's
 * flags and class.
 * NOTE(review): the exact semantics of HELPER_NEEDS_OSD and HELPER_CLASS_TCP
 * are defined by the khelp framework headers -- presumably "needs per-object
 * storage" and "applies to TCP objects"; confirm there.
 */
struct helper ertt_helper = {
	.mod_init = ertt_mod_init,
	.mod_destroy = ertt_mod_destroy,
	.h_flags = HELPER_NEEDS_OSD,
	.h_classes = HELPER_CLASS_TCP
};

/*
 * Define the helper hook info required by ERTT.
 *
 * Two TCP helper hooks are used: the HHOOK_TCP_EST_IN hook runs the RTT
 * measurement code on received segments, and the HHOOK_TCP_EST_OUT hook
 * records information about each transmitted segment. Neither hook uses
 * per-hook user data.
 */
struct hookinfo ertt_hooks[] = {
	{
		/* Input path: match acks against recorded segments. */
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_IN,
		.hook_udata = NULL,
		.hook_func = &ertt_packet_measurement_hook
	},
	{
		/* Output path: record a txseginfo for each data segment. */
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_OUT,
		.hook_udata = NULL,
		.hook_func = &ertt_add_tx_segment_info_hook
	}
};

/*
 * Mode flags passed to marked_packet_rtt() describing how the txsi being
 * processed relates to the received acknowledgement.
 */
#define	MULTI_ACK		0x01 /* Ack covers more than this txsi. */
#define	OLD_TXSI		0x02 /* Time stamps show this txsi is old. */
#define	CORRECT_ACK		0x04 /* Ack matches this txsi. */
#define	FORCED_MEASUREMENT	0x08 /* Take an RTT measurement regardless. */

/*
 * This function measures the RTT of a particular segment/ack pair, or the next
 * closest if this will yield an inaccurate result due to delayed acking or
 * other issues.
 *
 * Parameters:
 *   txsi              - transmit record currently being processed. It is only
 *                       dereferenced when mflag contains MULTI_ACK or OLD_TXSI,
 *                       or when mflag lacks FORCED_MEASUREMENT.
 *   e_t               - per-connection ERTT state that receives the result.
 *   tp                - TCP control block; snd_cwnd is sampled and TF_TSO may
 *                       be cleared.
 *   pmeasurenext      - in/out: send time stamp carried over from a txsi whose
 *                       measurement had to be deferred.
 *   pmeasurenext_len  - in/out: length of that deferred txsi.
 *   prtt_bytes_adjust - in/out: running total of bytes belonging to deferred
 *                       txsi records, subtracted from the byte count reported
 *                       for the marked RTT.
 *   mflag             - one of MULTI_ACK, OLD_TXSI, CORRECT_ACK or
 *                       FORCED_MEASUREMENT.
 */
static inline void
marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
    int mflag)
{

	/*
	 * If we can't measure this one properly due to delayed acking adjust
	 * byte counters and flag to measure next txsi. Note that since the
	 * marked packet's transmitted bytes are measured we need to subtract the
	 * transmitted bytes. Then pretend the next txsi was marked.
	 */
	if (mflag & (MULTI_ACK|OLD_TXSI)) {
		*pmeasurenext = txsi->tx_ts;
		*pmeasurenext_len = txsi->len;
		*prtt_bytes_adjust += *pmeasurenext_len;
	} else {
		if (mflag & FORCED_MEASUREMENT) {
			/*
			 * No matching txsi is available: measure from the
			 * time stamp and length saved when the measurement
			 * was deferred.
			 */
			e_t->markedpkt_rtt = tcp_ts_getticks() -
			    *pmeasurenext + 1;
			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
			    *pmeasurenext_len - *prtt_bytes_adjust;
		} else {
			/* Measure against the txsi that this ack matches. */
			e_t->markedpkt_rtt = tcp_ts_getticks() -
			    txsi->tx_ts + 1;
			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
			    *prtt_bytes_adjust;
		}
		e_t->marked_snd_cwnd = tp->snd_cwnd;

		/*
		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
		 * add_tx_segment_info that a new measurement should be started.
		 */
		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
		/*
		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
		 * algorithm that a new marked RTT measurement has been made
		 * and is available for use.
		 */
		e_t->flags |= ERTT_NEW_MEASUREMENT;

		if (tp->t_flags & TF_TSO) {
			/* Temporarily disable TSO to aid a new measurement. */
			tp->t_flags &= ~TF_TSO;
			/* Keep track that we've disabled it. */
			e_t->flags |= ERTT_TSO_DISABLED;
		}
	}
}

/*
 * Input-side helper hook (registered for HHOOK_TCP_EST_IN in ertt_hooks[]).
 *
 * Uses the small amount of state kept for each sent segment (the txseginfo
 * queue in struct ertt) to match incoming acknowledgements. This enables more
 * accurate and secure round trip time measurements. The resulting measurement
 * is used by congestion control algorithms which require a more accurate
 * time. It is called via the helper hook in tcp_input.c.
 *
 * ctx_data points to a struct tcp_hhook_data describing the received segment
 * and hdata points to the connection's struct ertt; both must be non-NULL.
 * The hhook_type, hhook_id, udata and hosd arguments are not used here.
 *
 * Always returns 0.
 */
static int
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
	uint32_t measurenext, rts;
	tcp_seq ack;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	/* Non-zero when the SACK hint records a sacked sequence number. */
	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
	/* Bytes newly covered by the cumulative acknowledgement. */
	acked = th->th_ack - tp->snd_una;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Packet has provided new acknowledgements. */
	if (acked > 0 || new_sacked_bytes) {
		if (acked == 0 && new_sacked_bytes) {
			/* Use last sacked data. */
			ack = tp->sackhint.last_sack_ack;
		} else
			ack = th->th_ack;

		/*
		 * Walk the queue from the oldest record. Records that are
		 * fully covered by the ack or that are stale are freed and the
		 * walk restarts at the new head; otherwise the loop exits via
		 * one of the break statements below.
		 */
		txsi = TAILQ_FIRST(&e_t->txsegi_q);
		while (txsi != NULL) {
			rts = 0;

			/* Acknowledgement is acking more than this txsi. */
			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
				if (txsi->flags & TXSI_RTT_MEASURE_START ||
				    measurenext) {
					marked_packet_rtt(txsi, e_t, tp,
					    &measurenext, &measurenext_len,
					    &rtt_bytes_adjust, MULTI_ACK);
				}
				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
				uma_zfree(txseginfo_zone, txsi);
				txsi = TAILQ_FIRST(&e_t->txsegi_q);
				continue;
			}

			/*
			 * Guess if delayed acks are being used by the receiver.
			 * dlyack_rx is a counter kept within
			 * [0, DLYACK_SMOOTH]: it is raised when an ack covers
			 * more than one segment and lowered when it covers
			 * exactly one.
			 *
			 * XXXDH: A simple heuristic that could be improved
			 */
			if (!new_sacked_bytes) {
				if (acked > tp->t_maxseg) {
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
					multiack = 1;
				} else if (acked > txsi->len) {
					multiack = 1;
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
				} else if (acked == tp->t_maxseg ||
					   acked == txsi->len) {
					e_t->dlyack_rx -=
					    (e_t->dlyack_rx > 0) ? 1 : 0;
				}
				/* Otherwise leave dlyack_rx the way it was. */
			}

			/*
			 * Time stamps are only to help match the txsi with the
			 * received acknowledgements.
			 */
			if (e_t->timestamp_errors < MAX_TS_ERR &&
			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
				/*
				 * Note: All packets sent with the offload will
				 * have the same time stamp. If we are sending
				 * on a fast interface and the t_maxseg is much
				 * smaller than one tick, this will be fine. The
				 * time stamp would be the same whether we were
				 * using tso or not. However, if the interface
				 * is slow, this will cause problems with the
				 * calculations. If the interface is slow, there
				 * is no reason to be using tso, and it should
				 * be turned off.
				 */
				/*
				 * If there are too many time stamp errors, time
				 * stamps won't be trusted (see the
				 * MAX_TS_ERR check above).
				 */
				rts = to->to_tsecr;
				/* Before this packet. */
				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
					/*
					 * When delayed acking is used, the
					 * reflected time stamp is of the first
					 * packet and thus may be before
					 * txsi->tx_ts; that is why this exit is
					 * only taken when dlyack_rx is zero.
					 */
					break;
				if (TSTMP_GT(rts, txsi->tx_ts)) {
					/*
					 * If reflected time stamp is later than
					 * tx_ts, then this txsi is old.
					 */
					if (txsi->flags & TXSI_RTT_MEASURE_START
					    || measurenext) {
						marked_packet_rtt(txsi, e_t, tp,
						    &measurenext, &measurenext_len,
						    &rtt_bytes_adjust, OLD_TXSI);
					}
					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
					    txsegi_lnk);
					uma_zfree(txseginfo_zone, txsi);
					txsi = TAILQ_FIRST(&e_t->txsegi_q);
					continue;
				}
				if (rts == txsi->tx_ts &&
				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
					/*
					 * Segment received before sent!
					 * Something is wrong with the received
					 * timestamps so increment errors. If
					 * this keeps up we will ignore
					 * timestamps.
					 */
					e_t->timestamp_errors++;
				}
			}
			/*
			 * Acknowledging a sequence number before this txsi.
			 * If it is an old txsi that may have had the same seq
			 * numbers, it should have been removed if time stamps
			 * are being used.
			 */
			if (SEQ_LEQ(ack, txsi->seq))
				break; /* Before first packet in txsi. */

			/*
			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
			 * past this point.
			 *
			 * If delayed acks are being used, an acknowledgement
			 * for a single segment will have been delayed by the
			 * receiver and will yield an inaccurate measurement. In
			 * this case, we only make the measurement if more than
			 * one segment is being acknowledged or sack is
			 * currently being used.
			 */
			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
				/* Make an accurate new measurement. */
				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;

				/* A stored value of 0 means "not yet set". */
				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
					e_t->minrtt = e_t->rtt;

				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
					e_t->maxrtt = e_t->rtt;
			}

			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
				marked_packet_rtt(txsi, e_t, tp,
				    &measurenext, &measurenext_len,
				    &rtt_bytes_adjust, CORRECT_ACK);

			if (txsi->flags & TXSI_TSO) {
				txsi->len -= acked;
				if (txsi->len > 0) {
					/*
					 * This presumes ack for first bytes in
					 * txsi, this may not be true but it
					 * shouldn't cause problems for the
					 * timing.
					 *
					 * We remeasure RTT even though we only
					 * have a single txsi. The rationale
					 * behind this is that it is better to
					 * have a slightly inaccurate
					 * measurement than no additional
					 * measurement for the rest of the bulk
					 * transfer. TSO is only used on high
					 * speed interface cards, so the
					 * packets should be transmitted at line
					 * rate back to back with little
					 * difference in transmission times (in
					 * ticks).
					 */
					txsi->seq += acked;
					/*
					 * Reset txsi measure flag so we don't
					 * use it for another RTT measurement.
					 */
					txsi->flags &= ~TXSI_RTT_MEASURE_START;
					/*
					 * There is still more data to be acked
					 * from tso bulk transmission, so we
					 * won't remove it from the TAILQ yet.
					 */
					break;
				}
			}

			/* This txsi has been fully acknowledged. */
			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
			uma_zfree(txseginfo_zone, txsi);
			break;
		}

		if (measurenext) {
			/*
			 * We need to do a RTT measurement. It won't be the best
			 * if we do it here.
			 *
			 * txsi may be NULL or already freed at this point;
			 * with FORCED_MEASUREMENT marked_packet_rtt() does not
			 * dereference it.
			 */
			marked_packet_rtt(txsi, e_t, tp,
			    &measurenext, &measurenext_len,
			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
		}
	}

	return (0);
}

/*
 * Add information about a transmitted segment to a list.
 * This is called via the helper hook in tcp_output.c
 */
static int
ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	long len;
	int tso;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	len = thdp->len;
	tso = thdp->tso;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (len > 0) {
		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
		if (txsi != NULL) {
			/* Construct txsi setting the necessary flags. */
			txsi->flags = 0; /* Needs to be initialised. */
			txsi->seq = ntohl(th->th_seq);
			txsi->len = len;
			if (tso)
				txsi->flags |= TXSI_TSO;
			else if (e_t->flags & ERTT_TSO_DISABLED) {
				tp->t_flags |= TF_TSO;
				e_t->flags &= ~ERTT_TSO_DISABLED;
			}

			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
				e_t->bytes_tx_in_rtt += len;
			} else {
				txsi->flags |= TXSI_RTT_MEASURE_START;
				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
				e_t->bytes_tx_in_rtt = len;
			}

			if (((tp->t_flags & TF_NOOPT) == 0) &&
			    (to->to_flags & TOF_TS)) {
				txsi->tx_ts = ntohl(to->to_tsval) -
				    tp->ts_offset;
				txsi->rx_ts = ntohl(to->to_tsecr);
			} else {
				txsi->tx_ts = tcp_ts_getticks();
				txsi->rx_ts = 0; /* No received time stamp. */
			}
			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
		}
	}

	return (0);
}

static int
ertt_mod_init(void)
{

	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
	    NULL, NULL, NULL, NULL, 0, 0);

	return (0);
}

static int
ertt_mod_destroy(void)
{

	uma_zdestroy(txseginfo_zone);

	return (0);
}

/*
 * UMA constructor for struct ertt: bring a freshly allocated item into a
 * known state with an empty txseginfo queue and all measurement fields and
 * flags cleared.
 *
 * mem points to the struct ertt to initialise; size, arg and flags are not
 * used. Always returns 0.
 */
static int
ertt_uma_ctor(void *mem, int size, void *arg, int flags)
{
	struct ertt *e_t;

	e_t = mem;

	TAILQ_INIT(&e_t->txsegi_q);
	e_t->timestamp_errors = 0;
	e_t->minrtt = 0;
	e_t->maxrtt = 0;
	e_t->rtt = 0;
	e_t->flags = 0;
	e_t->dlyack_rx = 0;
	e_t->bytes_tx_in_rtt = 0;
	e_t->markedpkt_rtt = 0;
	/*
	 * These two are otherwise only written by marked_packet_rtt(); clear
	 * them as well so a recycled item never exposes stale values before
	 * the first marked measurement completes.
	 */
	e_t->bytes_tx_in_marked_rtt = 0;
	e_t->marked_snd_cwnd = 0;

	return (0);
}

/*
 * UMA destructor for struct ertt: return every txseginfo record still queued
 * on the item to txseginfo_zone.
 *
 * mem points to the struct ertt being released; size and arg are not used.
 * The queue head itself is left untouched.
 */
static void
ertt_uma_dtor(void *mem, int size, void *arg)
{
	struct txseginfo *cur, *next;
	struct ertt *e_t;

	e_t = mem;
	for (cur = TAILQ_FIRST(&e_t->txsegi_q); cur != NULL; cur = next) {
		/* Fetch the successor before cur is handed back to the zone. */
		next = TAILQ_NEXT(cur, txsegi_lnk);
		uma_zfree(txseginfo_zone, cur);
	}
}

/*
 * Declare the ERTT khelp module: ties together the helper descriptor, its
 * hook table, and the constructor/destructor used for items of
 * sizeof(struct ertt).
 * NOTE(review): the literal 1 is presumably the module version -- confirm
 * against the KHELP_DECLARE_MOD_UMA definition in sys/module_khelp.h.
 */
KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
    ertt_uma_ctor, ertt_uma_dtor);
OpenPOWER on IntegriCloud