Diffstat (limited to 'sys/dev/xen/netfront/netfront.c')
-rw-r--r--    sys/dev/xen/netfront/netfront.c    648
1 file changed, 359 insertions, 289 deletions
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index 15cf455..a6fd9ea 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c

@@ -89,7 +89,7 @@ __FBSDID("$FreeBSD$");
 #include "xenbus_if.h"
-#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP)
+#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO)
 #define GRANT_INVALID_REF 0

@@ -124,7 +124,16 @@ static const int MODPARM_rx_copy = 1;
 static const int MODPARM_rx_flip = 0;
 #endif
-#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
+/**
+ * \brief The maximum allowed data fragments in a single transmit
+ *        request.
+ *
+ * This limit is imposed by the backend driver.  We assume here that
+ * we are dealing with a Linux driver domain and have set our limit
+ * to mirror the Linux MAX_SKB_FRAGS constant.
+ */
+#define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
+
 #define RX_COPY_THRESHOLD 256
 #define net_ratelimit() 0
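The arithmetic behind the new constant is worth spelling out. With 4 KiB pages, 65536 / PAGE_SIZE + 2 evaluates to 18 request slots: 16 pages for a maximal 64 KiB TSO payload, plus 2 extra, which mirrors the headroom Linux's MAX_SKB_FRAGS allows (commonly read as covering a payload whose first and last pages are only partially used; that reading is an inference, not something the commit states). A minimal standalone sketch, with PAGE_SIZE hardcoded to 4096 purely for illustration:

    #include <stdio.h>

    #define PAGE_SIZE 4096  /* stand-in for the kernel's page size */
    #define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)

    int
    main(void)
    {
        /* 65536/4096 = 16 pages of payload, +2 slots of headroom. */
        printf("MAX_TX_REQ_FRAGS = %d\n", MAX_TX_REQ_FRAGS); /* 18 */
        return (0);
    }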
@@ -140,6 +149,9 @@ static void xn_tick_locked(struct netfront_info *);
 static void xn_tick(void *);
 static void xn_intr(void *);
+static inline int xn_count_frags(struct mbuf *m);
+static int xn_assemble_tx_request(struct netfront_info *sc,
+    struct mbuf *m_head);
 static void xn_start_locked(struct ifnet *);
 static void xn_start(struct ifnet *);
 static int xn_ioctl(struct ifnet *, u_long, caddr_t);

@@ -174,8 +186,8 @@ static void xn_free_rx_ring(struct netfront_info *);
 static void xn_free_tx_ring(struct netfront_info *);
 static int xennet_get_responses(struct netfront_info *np,
-    struct netfront_rx_info *rinfo, RING_IDX rp, struct mbuf **list,
-    int *pages_flipped_p);
+    struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons,
+    struct mbuf **list, int *pages_flipped_p);
 #define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT)

@@ -187,11 +199,12 @@ static int xennet_get_responses(struct netfront_info *np,
  * not the other way around.  The size must track the free index arrays.
  */
 struct xn_chain_data {
-    struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1];
-    int xn_tx_chain_cnt;
-    struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1];
+    struct mbuf    *xn_tx_chain[NET_TX_RING_SIZE+1];
+    int             xn_tx_chain_cnt;
+    struct mbuf    *xn_rx_chain[NET_RX_RING_SIZE+1];
 };
+#define NUM_ELEMENTS(x) (sizeof(x)/sizeof(*x))
 struct net_device_stats {

@@ -255,17 +268,11 @@ struct netfront_info {
     int rx_max_target;
     int rx_target;
-    /*
-     * {tx,rx}_skbs store outstanding skbuffs.  The first entry in each
-     * array is an index into a chain of free entries.
-     */
-
     grant_ref_t gref_tx_head;
     grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
     grant_ref_t gref_rx_head;
     grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1];
-#define TX_MAX_TARGET min(NET_RX_RING_SIZE, 256)
     device_t xbdev;
     int tx_ring_ref;
     int rx_ring_ref;

@@ -288,7 +295,7 @@ struct netfront_info {
 #define XN_LOCK_INIT(_sc, _name) \
     mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \
     mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF); \
-    mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF) 
+    mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF)
 #define XN_RX_LOCK(_sc) mtx_lock(&(_sc)->rx_lock)
 #define XN_RX_UNLOCK(_sc) mtx_unlock(&(_sc)->rx_lock)

@@ -324,18 +331,22 @@ struct netfront_rx_info {
  */
 static inline void
-add_id_to_freelist(struct mbuf **list, unsigned short id)
+add_id_to_freelist(struct mbuf **list, uintptr_t id)
 {
-    KASSERT(id != 0, ("add_id_to_freelist: the head item (0) must always be free."));
+    KASSERT(id != 0,
+        ("%s: the head item (0) must always be free.", __func__));
     list[id] = list[0];
-    list[0] = (void *)(u_long)id;
+    list[0] = (struct mbuf *)id;
 }

 static inline unsigned short
 get_id_from_freelist(struct mbuf **list)
 {
-    u_int id = (u_int)(u_long)list[0];
-    KASSERT(id != 0, ("get_id_from_freelist: the head item (0) must always remain free."));
+    uintptr_t id;
+
+    id = (uintptr_t)list[0];
+    KASSERT(id != 0,
+        ("%s: the head item (0) must always remain free.", __func__));
     list[0] = list[id];
     return (id);
 }
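The freelist trick above is subtle: the tx/rx chain arrays do double duty, with slot 0 acting as the list head and each free slot storing the index of the next free slot cast to a pointer. Widening the id to uintptr_t keeps that cast round-trip lossless. A minimal userland sketch of the same mechanism (slots, RING_SIZE, and main() are illustrative, not driver code):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RING_SIZE 8

    /* Slot 0 is the freelist head; a free slot holds the next free
     * index disguised as a pointer, an in-use slot holds a real
     * object pointer. */
    static void *slots[RING_SIZE + 1];

    static void
    add_id_to_freelist(void **list, uintptr_t id)
    {
        assert(id != 0);            /* slot 0 is the list head */
        list[id] = list[0];
        list[0] = (void *)id;
    }

    static uintptr_t
    get_id_from_freelist(void **list)
    {
        uintptr_t id = (uintptr_t)list[0];

        assert(id != 0);
        list[0] = list[id];
        return (id);
    }

    int
    main(void)
    {
        uintptr_t i, id;

        /* Chain every slot onto the freelist, as create_netdev()
         * does further down in this diff. */
        for (i = 0; i < RING_SIZE; i++)
            slots[i] = (void *)(i + 1);
        slots[RING_SIZE] = (void *)0;

        id = get_id_from_freelist(slots);   /* takes index 1 */
        printf("allocated id %ju\n", (uintmax_t)id);
        add_id_to_freelist(slots, id);      /* returns it */
        return (0);
    }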
@@ -347,8 +358,7 @@ xennet_rxidx(RING_IDX idx)
 }
 static inline struct mbuf *
-xennet_get_rx_mbuf(struct netfront_info *np,
-    RING_IDX ri)
+xennet_get_rx_mbuf(struct netfront_info *np, RING_IDX ri)
 {
     int i = xennet_rxidx(ri);
     struct mbuf *m;

@@ -495,25 +505,25 @@ talk_to_backend(device_t dev, struct netfront_info *info)
         goto destroy_ring;
     }
     err = xenbus_printf(xbt, node, "tx-ring-ref","%u",
-            info->tx_ring_ref);
+        info->tx_ring_ref);
     if (err) {
         message = "writing tx ring-ref";
         goto abort_transaction;
     }
     err = xenbus_printf(xbt, node, "rx-ring-ref","%u",
-            info->rx_ring_ref);
+        info->rx_ring_ref);
     if (err) {
         message = "writing rx ring-ref";
         goto abort_transaction;
     }
     err = xenbus_printf(xbt, node,
-            "event-channel", "%u", irq_to_evtchn_port(info->irq));
+        "event-channel", "%u", irq_to_evtchn_port(info->irq));
     if (err) {
         message = "writing event-channel";
         goto abort_transaction;
     }
     err = xenbus_printf(xbt, node, "request-rx-copy", "%u",
-            info->copying_receiver);
+        info->copying_receiver);
     if (err) {
         message = "writing request-rx-copy";
         goto abort_transaction;

@@ -674,9 +684,9 @@ xn_free_rx_ring(struct netfront_info *sc)
     int i;
     for (i = 0; i < NET_RX_RING_SIZE; i++) {
-        if (sc->xn_cdata.xn_rx_chain[i] != NULL) {
-            m_freem(sc->xn_cdata.xn_rx_chain[i]);
-            sc->xn_cdata.xn_rx_chain[i] = NULL;
+        if (sc->xn_cdata.rx_mbufs[i] != NULL) {
+            m_freem(sc->rx_mbufs[i]);
+            sc->rx_mbufs[i] = NULL;
         }
     }

@@ -693,8 +703,8 @@ xn_free_tx_ring(struct netfront_info *sc)
     int i;
     for (i = 0; i < NET_TX_RING_SIZE; i++) {
-        if (sc->xn_cdata.xn_tx_chain[i] != NULL) {
-            m_freem(sc->xn_cdata.xn_tx_chain[i]);
+        if (sc->tx_mbufs[i] != NULL) {
+            m_freem(sc->tx_mbufs[i]);
             sc->xn_cdata.xn_tx_chain[i] = NULL;
         }
     }

@@ -703,39 +713,36 @@ xn_free_tx_ring(struct netfront_info *sc)
 #endif
 }
-/*
- * Do some brief math on the number of descriptors available to
- * determine how many slots are available.
- *
- * Firstly - wouldn't something with RING_FREE_REQUESTS() be more applicable?
- * Secondly - MAX_SKB_FRAGS is a Linux construct which may not apply here.
- * Thirdly - it isn't used here anyway; the magic constant '24' is possibly
- * wrong?
- * The "2" is presumably to ensure there are also enough slots available for
- * the ring entries used for "options" (eg, the TSO entry before a packet
- * is queued); I'm not sure why its 2 and not 1.  Perhaps to make sure there's
- * a "free" node in the tx mbuf list (node 0) to represent the freelist?
+/**
+ * \brief Verify that there is sufficient space in the Tx ring
+ *        buffer for a maximally sized request to be enqueued.
  *
- * This only figures out whether any xenbus ring descriptors are available;
- * it doesn't at all reflect how many tx mbuf ring descriptors are also
- * available.
+ * A transmit request requires a transmit descriptor for each packet
+ * fragment, plus up to 2 entries for "options" (e.g. TSO).
  */
 static inline int
-netfront_tx_slot_available(struct netfront_info *np)
+xn_tx_slot_available(struct netfront_info *np)
 {
-    return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
-        (TX_MAX_TARGET - /* MAX_SKB_FRAGS */ 24 - 2));
+    return (RING_FREE_REQUESTS(&np->tx) > (MAX_TX_REQ_FRAGS + 2));
 }
+
 static void
 netif_release_tx_bufs(struct netfront_info *np)
 {
-    struct mbuf *m;
     int i;

     for (i = 1; i <= NET_TX_RING_SIZE; i++) {
-        m = np->xn_cdata.xn_tx_chain[i];
+        struct mbuf *m;
+
+        m = np->tx_mbufs[i];

-        if (((u_long)m) < KERNBASE)
+        /*
+         * We assume that no kernel addresses are
+         * less than NET_TX_RING_SIZE.  Any entry
+         * in the table that is below this number
+         * must be an index from free-list tracking.
+         */
+        if (((uintptr_t)m) <= NET_TX_RING_SIZE)
             continue;
         gnttab_grant_foreign_access_ref(np->grant_tx_ref[i],
             xenbus_get_otherend_id(np->xbdev),
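The rewritten availability check answers the old comment's first question directly: ask the ring itself via RING_FREE_REQUESTS() instead of doing arithmetic on TX_MAX_TARGET and a magic 24. A self-contained model of the counters involved; the macro here is a simplified re-derivation of the one in Xen's ring.h, and the struct and values are illustrative only:

    #include <stdio.h>

    #define NET_TX_RING_SIZE 256
    #define MAX_TX_REQ_FRAGS 18

    struct tx_ring {
        unsigned int req_prod_pvt;  /* requests we have produced */
        unsigned int rsp_cons;      /* responses we have consumed */
    };

    /* Slots not occupied by in-flight requests (simplified from the
     * canonical Xen RING_FREE_REQUESTS macro). */
    #define RING_FREE_REQUESTS(r) \
        (NET_TX_RING_SIZE - ((r)->req_prod_pvt - (r)->rsp_cons))

    static int
    xn_tx_slot_available(struct tx_ring *tx)
    {
        /* Room for every fragment plus up to two option slots. */
        return (RING_FREE_REQUESTS(tx) > (MAX_TX_REQ_FRAGS + 2));
    }

    int
    main(void)
    {
        struct tx_ring tx = { .req_prod_pvt = 240, .rsp_cons = 10 };

        /* 256 - 230 = 26 free slots > 20 needed -> prints 1. */
        printf("slot available: %d\n", xn_tx_slot_available(&tx));
        return (0);
    }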
@@ -774,19 +781,25 @@ network_alloc_rx_buffers(struct netfront_info *sc)
         return;

     /*
-     * Allocate skbuffs greedily, even though we batch updates to the
+     * Allocate mbufs greedily, even though we batch updates to the
      * receive ring. This creates a less bursty demand on the memory
-     * allocator, so should reduce the chance of failed allocation
+     * allocator, and so should reduce the chance of failed allocation
      * requests both for ourselves and for other kernel subsystems.
+     *
+     * Here we attempt to maintain rx_target buffers in flight, counting
+     * buffers that we have yet to process in the receive ring.
      */
     batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons);
     for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) {
         MGETHDR(m_new, M_DONTWAIT, MT_DATA);
-        if (m_new == NULL)
+        if (m_new == NULL) {
+            printf("%s: MGETHDR failed\n", __func__);
             goto no_mbuf;
+        }

         m_cljget(m_new, M_DONTWAIT, MJUMPAGESIZE);
         if ((m_new->m_flags & M_EXT) == 0) {
+            printf("%s: m_cljget failed\n", __func__);
             m_freem(m_new);

 no_mbuf:
@@ -803,16 +816,29 @@ no_mbuf:
         mbufq_tail(&sc->xn_rx_batch, m_new);
     }

-    /* Is the batch large enough to be worthwhile? */
+    /*
+     * If we've allocated at least half of our target number of entries,
+     * submit them to the backend - we have enough to make the overhead
+     * of submission worthwhile.  Otherwise wait for more mbufs and
+     * request entries to become available.
+     */
     if (i < (sc->rx_target/2)) {
         if (req_prod > sc->rx.sring->req_prod)
             goto push;
         return;
     }
-    /* Adjust floating fill target if we risked running out of buffers. */
-    if ( ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) &&
-         ((sc->rx_target *= 2) > sc->rx_max_target) )
-        sc->rx_target = sc->rx_max_target;
+
+    /*
+     * Double floating fill target if we risked having the backend
+     * run out of empty buffers for receive traffic.  We define "running
+     * low" as having less than a fourth of our target buffers free
+     * at the time we refilled the queue.
+     */
+    if ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) {
+        sc->rx_target *= 2;
+        if (sc->rx_target > sc->rx_max_target)
+            sc->rx_target = sc->rx_max_target;
+    }

 refill:
     for (nr_flips = i = 0; ; i++) {
@@ -824,9 +850,8 @@ refill:
         id = xennet_rxidx(req_prod + i);

-        KASSERT(sc->xn_cdata.xn_rx_chain[id] == NULL,
-            ("non-NULL xm_rx_chain"));
-        sc->xn_cdata.xn_rx_chain[id] = m_new;
+        KASSERT(sc->rx_mbufs[id] == NULL, ("non-NULL xm_rx_chain"));
+        sc->rx_mbufs[id] = m_new;

         ref = gnttab_claim_grant_reference(&sc->gref_rx_head);
         KASSERT((short)ref >= 0, ("negative ref"));
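The untangled fill-target logic is easier to verify in isolation. The old single expression relied on a side effect inside the condition ((sc->rx_target *= 2) evaluated even when the clamp branch was not taken); the new form separates the doubling from the clamp. A sketch of the same policy with illustrative numbers (adjust_rx_target is a hypothetical helper, not a driver function):

    #include <stdio.h>

    /* Double the target when fewer than a quarter of the target
     * buffers remained free, clamping at the maximum. */
    static int
    adjust_rx_target(int rx_target, int rx_max_target, int buffers_free)
    {
        if (buffers_free < rx_target / 4) {
            rx_target *= 2;
            if (rx_target > rx_max_target)
                rx_target = rx_max_target;
        }
        return (rx_target);
    }

    int
    main(void)
    {
        printf("%d\n", adjust_rx_target(64, 256, 10));  /* -> 128 */
        printf("%d\n", adjust_rx_target(200, 256, 40)); /* -> 256 */
        return (0);
    }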
@@ -950,14 +975,13 @@ xn_rxeof(struct netfront_info *np)
         memset(extras, 0, sizeof(rinfo.extras));

         m = NULL;
-        err = xennet_get_responses(np, &rinfo, rp, &m,
+        err = xennet_get_responses(np, &rinfo, rp, &i, &m,
             &pages_flipped);

         if (unlikely(err)) {
             if (m)
                 mbufq_tail(&errq, m);
             np->stats.rx_errors++;
-            i = np->rx.rsp_cons;
             continue;
         }
@@ -979,7 +1003,7 @@ xn_rxeof(struct netfront_info *np)
         np->stats.rx_bytes += m->m_pkthdr.len;

         mbufq_tail(&rxq, m);
-        np->rx.rsp_cons = ++i;
+        np->rx.rsp_cons = i;
     }

     if (pages_flipped) {
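These hunks change who owns the receive-ring consumer index: instead of the helpers writing np->rx.rsp_cons back themselves, xn_rxeof() now passes its loop index down by pointer and commits it once per packet. A tiny sketch of why the by-pointer style keeps caller and callee in agreement (consume_extras is a hypothetical stand-in for the xennet_get_extras()/xennet_get_responses() refactoring):

    #include <stdio.h>

    typedef unsigned int RING_IDX;

    /* The helper advances the caller's consumer index directly,
     * one ring slot per extra descriptor it swallows. */
    static void
    consume_extras(RING_IDX *cons, int nextras)
    {
        while (nextras-- > 0)
            ++(*cons);
    }

    int
    main(void)
    {
        RING_IDX i = 42;

        consume_extras(&i, 2);
        printf("consumer index now %u\n", i);  /* 44 */
        return (0);
    }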
@@ -1084,9 +1108,16 @@ xn_txeof(struct netfront_info *np)
         if (txr->status == NETIF_RSP_NULL)
             continue;

+        if (txr->status != NETIF_RSP_OKAY) {
+            printf("%s: WARNING: response is %d!\n",
+                __func__, txr->status);
+        }
         id = txr->id;
-        m = np->xn_cdata.xn_tx_chain[id];
+        m = np->tx_mbufs[id];
         KASSERT(m != NULL, ("mbuf not found in xn_tx_chain"));
+        KASSERT((uintptr_t)m > NET_TX_RING_SIZE,
+            ("mbuf already on the free list, but we're "
+            "trying to free it again!"));
         M_ASSERTVALID(m);

         /*
@@ -1097,10 +1128,8 @@ xn_txeof(struct netfront_info *np)
         ifp->if_opackets++;
         if (unlikely(gnttab_query_foreign_access(
             np->grant_tx_ref[id]) != 0)) {
-            WPRINTK("network_tx_buf_gc: warning "
-                "-- grant still in use by backend "
-                "domain.\n");
-            goto out;
+            panic("grant id %u still in use by the backend",
+                id);
         }
         gnttab_end_foreign_access_ref(
             np->grant_tx_ref[id]);
@@ -1108,12 +1137,9 @@ xn_txeof(struct netfront_info *np)
             &np->gref_tx_head, np->grant_tx_ref[id]);
         np->grant_tx_ref[id] = GRANT_INVALID_REF;

-        np->xn_cdata.xn_tx_chain[id] = NULL;
-        add_id_to_freelist(np->xn_cdata.xn_tx_chain, id);
+        np->tx_mbufs[id] = NULL;
+        add_id_to_freelist(np->tx_mbufs, id);
         np->xn_cdata.xn_tx_chain_cnt--;
-        if (np->xn_cdata.xn_tx_chain_cnt < 0) {
-            panic("netif_release_tx_bufs: tx_chain_cnt must be >= 0");
-        }
         m_free(m);
         /* Only mark the queue active if we've freed up at least one slot to try */
         ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
@@ -1135,7 +1161,6 @@ xn_txeof(struct netfront_info *np)
         mb();
     } while (prod != np->tx.sring->rsp_prod);

- out:
     if (np->tx_full &&
         ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
         np->tx_full = 0;
@@ -1159,7 +1184,7 @@ xn_intr(void *xsc)
         ifp->if_drv_flags & IFF_DRV_RUNNING))
         return;
 #endif
-    if (np->tx.rsp_cons != np->tx.sring->rsp_prod) {
+    if (RING_HAS_UNCONSUMED_RESPONSES(&np->tx)) {
         XN_TX_LOCK(np);
         xn_txeof(np);
         XN_TX_UNLOCK(np);
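The new KASSERT in xn_txeof() and the comment added to netif_release_tx_bufs() lean on the same invariant: a tx_mbufs slot holds either a real kernel pointer or a small freelist index, and the two populations never overlap because no valid kernel address is numerically <= NET_TX_RING_SIZE. The discrimination test in isolation (entry_is_free_index is a hypothetical helper; the sample values are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define NET_TX_RING_SIZE 256

    /* True when the slot value is a freelist link rather than a
     * pointer to a live buffer. */
    static int
    entry_is_free_index(void *entry)
    {
        return ((uintptr_t)entry <= NET_TX_RING_SIZE);
    }

    int
    main(void)
    {
        int backing;

        printf("%d\n", entry_is_free_index((void *)(uintptr_t)5)); /* 1 */
        /* A real address compares far above the ring size on
         * typical platforms. */
        printf("%d\n", entry_is_free_index(&backing));             /* 0 */
        return (0);
    }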
+ */ +static int +xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head) +{ + struct ifnet *ifp; + struct mbuf *m; + u_int nfrags; + netif_extra_info_t *extra; + int otherend_id; - /* Determine how many fragments now exist */ - for (m = m_head, nfrags = 0; m; m = m->m_next) - nfrags++; + ifp = sc->xn_ifp; - /* - * Don't attempt to queue this packet if there aren't - * enough free entries in the chain. - * - * There isn't a 1:1 correspondance between the mbuf TX ring - * and the xenbus TX ring. - * xn_txeof() may need to be called to free up some slots. - * - * It is quite possible that this can be later eliminated if - * it turns out that partial * packets can be pushed into - * the ringbuffer, with fragments pushed in when further slots - * free up. - * - * It is also quite possible that the driver will lock up - * if the TX queue fills up with no RX traffic, and - * the mbuf ring is exhausted. The queue may need - * a swift kick to continue. - */ + /** + * Defragment the mbuf if necessary. + */ + nfrags = xn_count_frags(m_head); - /* - * It is not +1 like the allocation because we need to keep - * slot [0] free for the freelist head - */ - if (sc->xn_cdata.xn_tx_chain_cnt + nfrags >= NET_TX_RING_SIZE) { - WPRINTK("xn_start_locked: xn_tx_chain_cnt (%d) + nfrags %d >= NET_TX_RING_SIZE (%d); must be full!\n", - (int) sc->xn_cdata.xn_tx_chain_cnt, - (int) nfrags, (int) NET_TX_RING_SIZE); - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - break; + /* + * Check to see whether this request is longer than netback + * can handle, and try to defrag it. + */ + /** + * It is a bit lame, but the netback driver in Linux can't + * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of + * the Linux network stack. + */ + if (nfrags > MAX_TX_REQ_FRAGS) { + m = m_defrag(m_head, M_DONTWAIT); + if (!m) { + /* + * Defrag failed, so free the mbuf and + * therefore drop the packet. + */ + m_freem(m_head); + return (EMSGSIZE); } + m_head = m; + } - /* - * Make sure there's actually space available in the - * Xen TX ring for this. Overcompensate for the possibility - * of having a TCP offload fragment just in case for now - * (the +1) rather than adding logic to accurately calculate - * the required size. - */ - if (RING_FREE_REQUESTS(&sc->tx) < (nfrags + 1)) { - WPRINTK("xn_start_locked: free ring slots (%d) < (nfrags + 1) (%d); must be full!\n", - (int) RING_FREE_REQUESTS(&sc->tx), - (int) (nfrags + 1)); - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - break; - } + /* Determine how many fragments now exist */ + nfrags = xn_count_frags(m_head); - /* - * Start packing the mbufs in this chain into - * the fragment pointers. Stop when we run out - * of fragments or hit the end of the mbuf chain. 
- */ - m = m_head; - extra = NULL; - for (m = m_head; m; m = m->m_next) { - tx = RING_GET_REQUEST(&sc->tx, i); - id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); - if (id == 0) - panic("xn_start_locked: was allocated the freelist head!\n"); - sc->xn_cdata.xn_tx_chain_cnt++; - if (sc->xn_cdata.xn_tx_chain_cnt >= NET_TX_RING_SIZE+1) - panic("xn_start_locked: tx_chain_cnt must be < NET_TX_RING_SIZE+1\n"); - sc->xn_cdata.xn_tx_chain[id] = m; - tx->id = id; - ref = gnttab_claim_grant_reference(&sc->gref_tx_head); - KASSERT((short)ref >= 0, ("Negative ref")); - mfn = virt_to_mfn(mtod(m, vm_offset_t)); - gnttab_grant_foreign_access_ref(ref, otherend_id, - mfn, GNTMAP_readonly); - tx->gref = sc->grant_tx_ref[id] = ref; - tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); - tx->flags = 0; - if (m == m_head) { - /* - * The first fragment has the entire packet - * size, subsequent fragments have just the - * fragment size. The backend works out the - * true size of the first fragment by - * subtracting the sizes of the other - * fragments. - */ - tx->size = m->m_pkthdr.len; + /* + * Check to see whether the defragmented packet has too many + * segments for the Linux netback driver. + */ + /** + * The FreeBSD TCP stack, with TSO enabled, can produce a chain + * of mbufs longer than Linux can handle. Make sure we don't + * pass a too-long chain over to the other side by dropping the + * packet. It doesn't look like there is currently a way to + * tell the TCP stack to generate a shorter chain of packets. + */ + if (nfrags > MAX_TX_REQ_FRAGS) { + m_freem(m_head); + return (EMSGSIZE); + } - /* - * The first fragment contains the - * checksum flags and is optionally - * followed by extra data for TSO etc. - */ - if (m->m_pkthdr.csum_flags - & CSUM_DELAY_DATA) { - tx->flags |= (NETTXF_csum_blank - | NETTXF_data_validated); - } -#if __FreeBSD_version >= 700000 - if (m->m_pkthdr.csum_flags & CSUM_TSO) { - struct netif_extra_info *gso = - (struct netif_extra_info *) - RING_GET_REQUEST(&sc->tx, ++i); - - if (extra) - extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; - else - tx->flags |= NETTXF_extra_info; - - gso->u.gso.size = m->m_pkthdr.tso_segsz; - gso->u.gso.type = - XEN_NETIF_GSO_TYPE_TCPV4; - gso->u.gso.pad = 0; - gso->u.gso.features = 0; - - gso->type = XEN_NETIF_EXTRA_TYPE_GSO; - gso->flags = 0; - extra = gso; - } -#endif - } else { - tx->size = m->m_len; + /* + * This check should be redundant. We've already verified that we + * have enough slots in the ring to handle a packet of maximum + * size, and that our packet is less than the maximum size. Keep + * it in here as an assert for now just to make certain that + * xn_tx_chain_cnt is accurate. + */ + KASSERT((sc->xn_cdata.xn_tx_chain_cnt + nfrags) <= NET_TX_RING_SIZE, + ("%s: xn_tx_chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE " + "(%d)!", __func__, (int) sc->xn_cdata.xn_tx_chain_cnt, + (int) nfrags, (int) NET_TX_RING_SIZE)); + + /* + * Start packing the mbufs in this chain into + * the fragment pointers. Stop when we run out + * of fragments or hit the end of the mbuf chain. + */ + m = m_head; + extra = NULL; + otherend_id = xenbus_get_otherend_id(sc->xbdev); + for (m = m_head; m; m = m->m_next) { + netif_tx_request_t *tx; + uintptr_t id; + grant_ref_t ref; + u_long mfn; /* XXX Wrong type? 
@@ -1406,209 +1452,228 @@ xn_tick(void *xsc)
     XN_RX_UNLOCK(sc);
 }

-static void
-xn_start_locked(struct ifnet *ifp)
+
+/**
+ * \brief Count the number of fragments in an mbuf chain.
+ *
+ * Surprisingly, there isn't an M* macro for this.
+ */
+static inline int
+xn_count_frags(struct mbuf *m)
 {
-    int otherend_id;
-    unsigned short id;
-    struct mbuf *m_head, *m;
-    struct netfront_info *sc;
-    netif_tx_request_t *tx;
-    netif_extra_info_t *extra;
-    RING_IDX i;
-    grant_ref_t ref;
-    u_long mfn, tx_bytes;
-    int notify, nfrags;
+    int nfrags;

-    sc = ifp->if_softc;
-    otherend_id = xenbus_get_otherend_id(sc->xbdev);
-    tx_bytes = 0;
+    for (nfrags = 0; m != NULL; m = m->m_next)
+        nfrags++;

-    if (!netfront_carrier_ok(sc))
-        return;
-
-    for (i = sc->tx.req_prod_pvt; TRUE; i++) {
-        IF_DEQUEUE(&ifp->if_snd, m_head);
-        if (m_head == NULL)
-            break;
-
-        /*
-         * netfront_tx_slot_available() tries to do some math to
-         * ensure that there'll be enough xenbus ring slots available
-         * for the maximum number of packet fragments (and a couple more
-         * for what I guess are TSO and other ring entry items.)
-         */
-        if (!netfront_tx_slot_available(sc)) {
-            IF_PREPEND(&ifp->if_snd, m_head);
-            ifp->if_drv_flags |= IFF_DRV_OACTIVE;
-            break;
-        }
+    return (nfrags);
 }

-        /*
-         * Defragment the mbuf if necessary.
-         */
-        for (m = m_head, nfrags = 0; m; m = m->m_next)
-            nfrags++;
-        if (nfrags > MAX_SKB_FRAGS) {
-            m = m_defrag(m_head, M_DONTWAIT);
-            if (!m) {
-                m_freem(m_head);
-                break;
-            }
-            m_head = m;
-        }
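xn_count_frags() replaces the two inline counting loops of the old code, and its result drives the decision to call m_defrag() before anything touches the ring. A minimal stand-alone model of that decision (the three-element chain and the struct are illustrative stand-ins for kernel mbufs):

    #include <stdio.h>

    #define MAX_TX_REQ_FRAGS 18

    struct mbuf {               /* minimal stand-in for the kernel mbuf */
        struct mbuf *m_next;
    };

    static int
    xn_count_frags(struct mbuf *m)
    {
        int nfrags;

        for (nfrags = 0; m != NULL; m = m->m_next)
            nfrags++;
        return (nfrags);
    }

    int
    main(void)
    {
        struct mbuf chain[3] = {{ &chain[1] }, { &chain[2] }, { NULL }};

        /* 3 fragments <= 18, so this chain would be sent as-is; a
         * longer chain would first be run through m_defrag(). */
        printf("frags = %d\n", xn_count_frags(&chain[0]));
        return (0);
    }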
+ */ + while (xn_tx_slot_available(sc)) { + IF_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + + if (xn_assemble_tx_request(sc, m_head) != 0) + break; } - sc->tx.req_prod_pvt = i; RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->tx, notify); if (notify) notify_remote_via_irq(sc->irq); - xn_txeof(sc); - if (RING_FULL(&sc->tx)) { sc->tx_full = 1; #if 0 netif_stop_queue(dev); #endif } +} - return; -} static void xn_start(struct ifnet *ifp) @@ -1851,6 +1916,7 @@ network_connect(struct netfront_info *np) m = np->rx_mbufs[requeue_idx] = xennet_get_rx_mbuf(np, i); ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); + req = RING_GET_REQUEST(&np->rx, requeue_idx); pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT; @@ -1935,20 +2001,23 @@ create_netdev(device_t dev) np->tx_mbufs[i] = (void *) ((u_long) i+1); np->grant_tx_ref[i] = GRANT_INVALID_REF; } + np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0; + for (i = 0; i <= NET_RX_RING_SIZE; i++) { + np->rx_mbufs[i] = NULL; np->grant_rx_ref[i] = GRANT_INVALID_REF; } /* A grant for every tx ring slot */ - if (gnttab_alloc_grant_references(TX_MAX_TARGET, - &np->gref_tx_head) < 0) { + if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, + &np->gref_tx_head) != 0) { IPRINTK("#### netfront can't alloc tx grant refs\n"); err = ENOMEM; goto exit; } /* A grant for every rx ring slot */ if (gnttab_alloc_grant_references(RX_MAX_TARGET, - &np->gref_rx_head) < 0) { + &np->gref_rx_head) != 0) { WPRINTK("#### netfront can't alloc rx grant refs\n"); gnttab_free_grant_references(np->gref_tx_head); err = ENOMEM; @@ -1980,6 +2049,7 @@ create_netdev(device_t dev) ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; #if __FreeBSD_version >= 700000 + ifp->if_capabilities |= IFCAP_TSO4; if (xn_enable_lro) { int err = tcp_lro_init(&np->xn_lro); if (err) { |