summaryrefslogtreecommitdiffstats
path: root/sys/netinet/tcp_lro.c
diff options
context:
space:
mode:
authorjfv <jfv@FreeBSD.org>2008-06-11 22:12:50 +0000
committerjfv <jfv@FreeBSD.org>2008-06-11 22:12:50 +0000
commitf471fdabc3eb133e156912fa14dedbbd997d65f8 (patch)
treedd23a7fa0bb4b89ae700917ee0c7907717c1e022 /sys/netinet/tcp_lro.c
parent7c347e5bb2a0dc44285ad5db804d24345e209176 (diff)
downloadFreeBSD-src-f471fdabc3eb133e156912fa14dedbbd997d65f8.zip
FreeBSD-src-f471fdabc3eb133e156912fa14dedbbd997d65f8.tar.gz
Add generic TCP LOR into netinet
Diffstat (limited to 'sys/netinet/tcp_lro.c')
-rw-r--r--sys/netinet/tcp_lro.c379
1 files changed, 379 insertions, 0 deletions
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
new file mode 100644
index 0000000..4f0a30e
--- /dev/null
+++ b/sys/netinet/tcp_lro.c
@@ -0,0 +1,379 @@
+/******************************************************************************
+
+Copyright (c) 2007, Myricom Inc.
+Copyright (c) 2008, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ 3. Neither the name of the Intel Corporation, nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD$
+***************************************************************************/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/endian.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <net/if_media.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+
+#include <machine/bus.h>
+#include <machine/in_cksum.h>
+
+
+static uint16_t do_csum_data(uint16_t *raw, int len)
+{
+ uint32_t csum;
+ csum = 0;
+ while (len > 0) {
+ csum += *raw;
+ raw++;
+ csum += *raw;
+ raw++;
+ len -= 4;
+ }
+ csum = (csum >> 16) + (csum & 0xffff);
+ csum = (csum >> 16) + (csum & 0xffff);
+ return (uint16_t)csum;
+}
+
+/*
+ * Allocate and init the LRO data structures
+ */
+int
+tcp_lro_init(struct lro_ctrl *cntl)
+{
+ struct lro_entry *lro;
+ int i, error = 0;
+
+ SLIST_INIT(&cntl->lro_free);
+ SLIST_INIT(&cntl->lro_active);
+
+ cntl->lro_bad_csum = 0;
+ cntl->lro_queued = 0;
+ cntl->lro_flushed = 0;
+
+ for (i = 0; i < LRO_ENTRIES; i++) {
+ lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (lro == NULL) {
+ if (i == 0)
+ error = ENOMEM;
+ break;
+ }
+ cntl->lro_cnt = i;
+ SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+ }
+
+ return (error);
+}
+
+void
+tcp_lro_free(struct lro_ctrl *cntl)
+{
+ struct lro_entry *entry;
+
+ while (!SLIST_EMPTY(&cntl->lro_free)) {
+ entry = SLIST_FIRST(&cntl->lro_free);
+ SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+ free(entry, M_DEVBUF);
+ }
+}
+
+void
+tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
+{
+ struct ifnet *ifp;
+ struct ip *ip;
+ struct tcphdr *tcp;
+ uint32_t *ts_ptr;
+ uint32_t tcplen, tcp_csum;
+
+
+ if (lro->append_cnt) {
+ /* incorporate the new len into the ip header and
+ * re-calculate the checksum */
+ ip = lro->ip;
+ ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
+ ip->ip_sum = 0;
+ ip->ip_sum = 0xffff ^
+ do_csum_data((uint16_t*)ip,
+ sizeof (*ip));
+
+ lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
+ CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ lro->m_head->m_pkthdr.csum_data = 0xffff;
+ lro->m_head->m_pkthdr.len = lro->len;
+
+ /* incorporate the latest ack into the tcp header */
+ tcp = (struct tcphdr *) (ip + 1);
+ tcp->th_ack = lro->ack_seq;
+ tcp->th_win = lro->window;
+ /* incorporate latest timestamp into the tcp header */
+ if (lro->timestamp) {
+ ts_ptr = (uint32_t *)(tcp + 1);
+ ts_ptr[1] = htonl(lro->tsval);
+ ts_ptr[2] = lro->tsecr;
+ }
+ /*
+ * update checksum in tcp header by re-calculating the
+ * tcp pseudoheader checksum, and adding it to the checksum
+ * of the tcp payload data
+ */
+ tcp->th_sum = 0;
+ tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
+ tcp_csum = lro->data_csum;
+ tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(tcplen + IPPROTO_TCP));
+ tcp_csum += do_csum_data((uint16_t*)tcp,
+ tcp->th_off << 2);
+ tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+ tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+ tcp->th_sum = 0xffff ^ tcp_csum;
+ }
+ ifp = cntl->ifp;
+ (*ifp->if_input)(cntl->ifp, lro->m_head);
+ cntl->lro_queued += lro->append_cnt + 1;
+ cntl->lro_flushed++;
+ lro->m_head = NULL;
+ lro->timestamp = 0;
+ lro->append_cnt = 0;
+ SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+}
+
+int
+tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
+{
+ struct ether_header *eh;
+ struct ip *ip;
+ struct tcphdr *tcp;
+ uint32_t *ts_ptr;
+ struct mbuf *m_nxt, *m_tail;
+ struct lro_entry *lro;
+ int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
+ int opt_bytes, trim;
+ uint32_t seq, tmp_csum, device_mtu;
+
+
+ eh = mtod(m_head, struct ether_header *);
+ if (eh->ether_type != htons(ETHERTYPE_IP))
+ return 1;
+ ip = (struct ip *) (eh + 1);
+ if (ip->ip_p != IPPROTO_TCP)
+ return 1;
+
+ /* ensure there are no options */
+ if ((ip->ip_hl << 2) != sizeof (*ip))
+ return -1;
+
+ /* .. and the packet is not fragmented */
+ if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
+ return -1;
+
+ /* verify that the IP header checksum is correct */
+ tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
+ if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
+ cntl->lro_bad_csum++;
+ return -1;
+ }
+
+ /* find the TCP header */
+ tcp = (struct tcphdr *) (ip + 1);
+
+ /* Get the TCP checksum if we dont have it */
+ if (!csum)
+ csum = tcp->th_sum;
+
+ /* ensure no bits set besides ack or psh */
+ if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
+ return -1;
+
+ /* check for timestamps. Since the only option we handle are
+ timestamps, we only have to handle the simple case of
+ aligned timestamps */
+
+ opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
+ tcp_hdr_len = sizeof (*tcp) + opt_bytes;
+ ts_ptr = (uint32_t *)(tcp + 1);
+ if (opt_bytes != 0) {
+ if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
+ (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
+ return -1;
+ }
+
+ ip_len = ntohs(ip->ip_len);
+ tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
+
+
+ /*
+ * If frame is padded beyond the end of the IP packet,
+ * then we must trim the extra bytes off the end.
+ */
+ tot_len = m_head->m_pkthdr.len;
+ trim = tot_len - (ip_len + ETHER_HDR_LEN);
+ if (trim != 0) {
+ if (trim < 0) {
+ /* truncated packet */
+ return -1;
+ }
+ m_adj(m_head, -trim);
+ tot_len = m_head->m_pkthdr.len;
+ }
+
+ m_nxt = m_head;
+ m_tail = NULL; /* -Wuninitialized */
+ while (m_nxt != NULL) {
+ m_tail = m_nxt;
+ m_nxt = m_tail->m_next;
+ }
+
+ hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
+ seq = ntohl(tcp->th_seq);
+
+ SLIST_FOREACH(lro, &cntl->lro_active, next) {
+ if (lro->source_port == tcp->th_sport &&
+ lro->dest_port == tcp->th_dport &&
+ lro->source_ip == ip->ip_src.s_addr &&
+ lro->dest_ip == ip->ip_dst.s_addr) {
+ /* Try to append it */
+
+ if (__predict_false(seq != lro->next_seq)) {
+ /* out of order packet */
+ SLIST_REMOVE(&cntl->lro_active, lro,
+ lro_entry, next);
+ tcp_lro_flush(cntl, lro);
+ return -1;
+ }
+
+ if (opt_bytes) {
+ uint32_t tsval = ntohl(*(ts_ptr + 1));
+ /* make sure timestamp values are increasing */
+ if (__predict_false(lro->tsval > tsval ||
+ *(ts_ptr + 2) == 0)) {
+ return -1;
+ }
+ lro->tsval = tsval;
+ lro->tsecr = *(ts_ptr + 2);
+ }
+
+ lro->next_seq += tcp_data_len;
+ lro->ack_seq = tcp->th_ack;
+ lro->window = tcp->th_win;
+ lro->append_cnt++;
+ if (tcp_data_len == 0) {
+ m_freem(m_head);
+ return 0;
+ }
+ /* subtract off the checksum of the tcp header
+ * from the hardware checksum, and add it to the
+ * stored tcp data checksum. Byteswap the checksum
+ * if the total length so far is odd
+ */
+ tmp_csum = do_csum_data((uint16_t*)tcp,
+ tcp_hdr_len);
+ csum = csum + (tmp_csum ^ 0xffff);
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ if (lro->len & 0x1) {
+ /* Odd number of bytes so far, flip bytes */
+ csum = ((csum << 8) | (csum >> 8)) & 0xffff;
+ }
+ csum = csum + lro->data_csum;
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ lro->data_csum = csum;
+
+ lro->len += tcp_data_len;
+
+ /* adjust mbuf so that m->m_data points to
+ the first byte of the payload */
+ m_adj(m_head, hlen);
+ /* append mbuf chain */
+ lro->m_tail->m_next = m_head;
+ /* advance the last pointer */
+ lro->m_tail = m_tail;
+ /* flush packet if required */
+ device_mtu = cntl->ifp->if_mtu;
+ if (lro->len > (65535 - device_mtu)) {
+ SLIST_REMOVE(&cntl->lro_active, lro,
+ lro_entry, next);
+ tcp_lro_flush(cntl, lro);
+ }
+ return 0;
+ }
+ }
+
+ if (SLIST_EMPTY(&cntl->lro_free))
+ return -1;
+
+ /* start a new chain */
+ lro = SLIST_FIRST(&cntl->lro_free);
+ SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+ SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
+ lro->source_port = tcp->th_sport;
+ lro->dest_port = tcp->th_dport;
+ lro->source_ip = ip->ip_src.s_addr;
+ lro->dest_ip = ip->ip_dst.s_addr;
+ lro->next_seq = seq + tcp_data_len;
+ lro->mss = tcp_data_len;
+ lro->ack_seq = tcp->th_ack;
+ lro->window = tcp->th_win;
+
+ /* save the checksum of just the TCP payload by
+ * subtracting off the checksum of the TCP header from
+ * the entire hardware checksum
+ * Since IP header checksum is correct, checksum over
+ * the IP header is -0. Substracting -0 is unnecessary.
+ */
+ tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
+ csum = csum + (tmp_csum ^ 0xffff);
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ lro->data_csum = csum;
+
+ lro->ip = ip;
+ /* record timestamp if it is present */
+ if (opt_bytes) {
+ lro->timestamp = 1;
+ lro->tsval = ntohl(*(ts_ptr + 1));
+ lro->tsecr = *(ts_ptr + 2);
+ }
+ lro->len = tot_len;
+ lro->m_head = m_head;
+ lro->m_tail = m_tail;
+ return 0;
+}
OpenPOWER on IntegriCloud