Diffstat (limited to 'sys')
57 files changed, 1131 insertions, 802 deletions
diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c index c4ca677..72adb27 100644 --- a/sys/amd64/amd64/uma_machdep.c +++ b/sys/amd64/amd64/uma_machdep.c @@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; @@ -70,7 +70,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; diff --git a/sys/contrib/ipfilter/netinet/ip_state.c b/sys/contrib/ipfilter/netinet/ip_state.c index 9c6a244..ad2bf38 100644 --- a/sys/contrib/ipfilter/netinet/ip_state.c +++ b/sys/contrib/ipfilter/netinet/ip_state.c @@ -1,4 +1,4 @@ -/* $FreeBSD$ */ +/* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. @@ -1054,7 +1054,7 @@ ipf_state_putent(softc, softs, data) /* to pointers and adjusts running stats for the hash table as appropriate. */ /* */ /* This function can fail if the filter rule has had a population policy of */ -/* IP addresses used with stateful filteirng assigned to it. */ +/* IP addresses used with stateful filtering assigned to it. */ /* */ /* Locking: it is assumed that some kind of lock on ipf_state is held. */ /* Exits with is_lock initialised and held - *EVEN IF ERROR*. */ @@ -1081,7 +1081,7 @@ ipf_state_insert(softc, is, rev) } /* - * If we could trust is_hv, then the modulous would not be needed, + * If we could trust is_hv, then the modulus would not be needed, * but when running with IPFILTER_SYNC, this stops bad values. */ hv = is->is_hv % softs->ipf_state_size; @@ -1672,6 +1672,10 @@ ipf_state_add(softc, fin, stsave, flags) SBUMPD(ipf_state_stats, iss_bucket_full); return 4; } + + /* + * No existing state; create new + */ KMALLOC(is, ipstate_t *); if (is == NULL) { SBUMPD(ipf_state_stats, iss_nomem); @@ -1683,7 +1687,7 @@ ipf_state_add(softc, fin, stsave, flags) is->is_rule = fr; /* - * Do not do the modulous here, it is done in ipf_state_insert(). + * Do not do the modulus here, it is done in ipf_state_insert(). */ if (fr != NULL) { ipftq_t *tq; @@ -1711,7 +1715,7 @@ ipf_state_add(softc, fin, stsave, flags) /* * It may seem strange to set is_ref to 2, but if stsave is not NULL * then a copy of the pointer is being stored somewhere else and in - * the end, it will expect to be able to do osmething with it. + * the end, it will expect to be able to do something with it. */ is->is_me = stsave; if (stsave != NULL) { @@ -3652,7 +3656,6 @@ ipf_state_del(softc, is, why) softs->ipf_state_stats.iss_orphan++; return refs; } - MUTEX_EXIT(&is->is_lock); fr = is->is_rule; is->is_rule = NULL; @@ -3664,6 +3667,7 @@ ipf_state_del(softc, is, why) } is->is_ref = 0; + MUTEX_EXIT(&is->is_lock); if (is->is_tqehead[0] != NULL) { if (ipf_deletetimeoutqueue(is->is_tqehead[0]) == 0) diff --git a/sys/dev/msk/if_mskreg.h b/sys/dev/msk/if_mskreg.h index 9c55192..da69f2e 100644 --- a/sys/dev/msk/if_mskreg.h +++ b/sys/dev/msk/if_mskreg.h @@ -2175,13 +2175,8 @@ #define MSK_ADDR_LO(x) ((uint64_t) (x) & 0xffffffffUL) #define MSK_ADDR_HI(x) ((uint64_t) (x) >> 32) -/* - * At first I guessed 8 bytes, the size of a single descriptor, would be - * required alignment constraints. But, it seems that Yukon II have 4096 - * bytes boundary alignment constraints. 
- */ -#define MSK_RING_ALIGN 4096 -#define MSK_STAT_ALIGN 4096 +#define MSK_RING_ALIGN 32768 +#define MSK_STAT_ALIGN 32768 /* Rx descriptor data structure */ struct msk_rx_desc { diff --git a/sys/dev/puc/puc.c b/sys/dev/puc/puc.c index d7bfb63..2b320ca 100644 --- a/sys/dev/puc/puc.c +++ b/sys/dev/puc/puc.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/sysctl.h> #include <machine/bus.h> #include <machine/resource.h> @@ -70,6 +71,8 @@ const char puc_driver_name[] = "puc"; static MALLOC_DEFINE(M_PUC, "PUC", "PUC driver"); +SYSCTL_NODE(_hw, OID_AUTO, puc, CTLFLAG_RD, 0, "puc(9) driver configuration"); + struct puc_bar * puc_get_bar(struct puc_softc *sc, int rid) { @@ -324,7 +327,6 @@ puc_bfe_attach(device_t dev) if (bootverbose && sc->sc_ilr != 0) device_printf(dev, "using interrupt latch register\n"); - sc->sc_irid = 0; sc->sc_ires = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->sc_irid, RF_ACTIVE|RF_SHAREABLE); if (sc->sc_ires != NULL) { diff --git a/sys/dev/puc/puc_bfe.h b/sys/dev/puc/puc_bfe.h index 3cf0c53..9f2f3fb 100644 --- a/sys/dev/puc/puc_bfe.h +++ b/sys/dev/puc/puc_bfe.h @@ -66,6 +66,7 @@ struct puc_softc { int sc_fastintr:1; int sc_leaving:1; int sc_polled:1; + int sc_msi:1; int sc_ilr; @@ -94,4 +95,6 @@ int puc_bus_setup_intr(device_t, device_t, struct resource *, int, driver_filter_t *, driver_intr_t *, void *, void **); int puc_bus_teardown_intr(device_t, device_t, struct resource *, void *); +SYSCTL_DECL(_hw_puc); + #endif /* _DEV_PUC_BFE_H_ */ diff --git a/sys/dev/puc/puc_cfg.c b/sys/dev/puc/puc_cfg.c index 7de6ea3..99d9816 100644 --- a/sys/dev/puc/puc_cfg.c +++ b/sys/dev/puc/puc_cfg.c @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/rman.h> +#include <sys/sysctl.h> #include <dev/puc/puc_bus.h> #include <dev/puc/puc_cfg.h> diff --git a/sys/dev/puc/puc_pccard.c b/sys/dev/puc/puc_pccard.c index 370c6af..232741e 100644 --- a/sys/dev/puc/puc_pccard.c +++ b/sys/dev/puc/puc_pccard.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bus.h> #include <sys/conf.h> #include <sys/malloc.h> +#include <sys/sysctl.h> #include <machine/bus.h> #include <machine/resource.h> diff --git a/sys/dev/puc/puc_pci.c b/sys/dev/puc/puc_pci.c index 4ad1e3a..d6d5509 100644 --- a/sys/dev/puc/puc_pci.c +++ b/sys/dev/puc/puc_pci.c @@ -67,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bus.h> #include <sys/conf.h> #include <sys/malloc.h> +#include <sys/sysctl.h> #include <machine/bus.h> #include <machine/resource.h> @@ -78,6 +79,11 @@ __FBSDID("$FreeBSD$"); #include <dev/puc/puc_cfg.h> #include <dev/puc/puc_bfe.h> +static int puc_msi_disable; +TUNABLE_INT("hw.puc.msi_disable", &puc_msi_disable); +SYSCTL_INT(_hw_puc, OID_AUTO, msi_disable, CTLFLAG_RD | CTLFLAG_TUN, + &puc_msi_disable, 0, "Disable use of MSI interrupts by puc(9)"); + static const struct puc_cfg * puc_pci_match(device_t dev, const struct puc_cfg *desc) { @@ -120,11 +126,56 @@ puc_pci_probe(device_t dev) return (puc_bfe_probe(dev, desc)); } +static int +puc_pci_attach(device_t dev) +{ + struct puc_softc *sc; + int error, count; + + sc = device_get_softc(dev); + + if (!puc_msi_disable) { + count = 1; + + if (pci_alloc_msi(dev, &count) == 0) { + sc->sc_msi = 1; + sc->sc_irid = 1; + } + } + + error = puc_bfe_attach(dev); + + if (error != 0 && sc->sc_msi) + pci_release_msi(dev); + + return (error); +} + +static int +puc_pci_detach(device_t dev) +{ + struct puc_softc *sc; + int error; + + sc = device_get_softc(dev); + + 
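/*
 * Aside, not part of this commit: puc_pci_attach()/puc_pci_detach()
 * above follow the usual newbus MSI pattern -- request a single MSI
 * message before the generic attach runs, fall back to legacy INTx
 * (IRQ rid 0) if the allocation fails, and release the message only
 * after everything else is torn down.  A minimal sketch of the attach
 * half for a hypothetical foo(4) driver (foo_softc and
 * foo_attach_common are demo names; needs <sys/bus.h> and
 * <dev/pci/pcivar.h>):
 */
struct foo_softc {
        int     sc_msi;
        int     sc_irid;
};
static int foo_attach_common(device_t);         /* demo common attach */

static int
foo_pci_attach(device_t dev)
{
        struct foo_softc *sc = device_get_softc(dev);
        int count, error;

        count = 1;                      /* a single message suffices */
        if (pci_alloc_msi(dev, &count) == 0) {
                sc->sc_msi = 1;         /* remember to release it later */
                sc->sc_irid = 1;        /* MSI resources use rid >= 1 */
        }                               /* else sc_irid stays 0: INTx */
        error = foo_attach_common(dev); /* stand-in for puc_bfe_attach */
        if (error != 0 && sc->sc_msi)
                pci_release_msi(dev);   /* undo the allocation on failure */
        return (error);
}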
error = puc_bfe_detach(dev); + + if (error != 0) + return (error); + + if (sc->sc_msi) + error = pci_release_msi(dev); + + return (error); +} + + static device_method_t puc_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, puc_pci_probe), - DEVMETHOD(device_attach, puc_bfe_attach), - DEVMETHOD(device_detach, puc_bfe_detach), + DEVMETHOD(device_attach, puc_pci_attach), + DEVMETHOD(device_detach, puc_pci_detach), DEVMETHOD(bus_alloc_resource, puc_bus_alloc_resource), DEVMETHOD(bus_release_resource, puc_bus_release_resource), diff --git a/sys/dev/puc/pucdata.c b/sys/dev/puc/pucdata.c index 77953e0..4646fa4 100644 --- a/sys/dev/puc/pucdata.c +++ b/sys/dev/puc/pucdata.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/kernel.h> #include <sys/bus.h> +#include <sys/sysctl.h> #include <machine/resource.h> #include <machine/bus.h> diff --git a/sys/dev/usb/serial/u3g.c b/sys/dev/usb/serial/u3g.c index 1c8f3b5..1fe9b21 100644 --- a/sys/dev/usb/serial/u3g.c +++ b/sys/dev/usb/serial/u3g.c @@ -522,6 +522,7 @@ static const STRUCT_USB_HOST_ID u3g_devs[] = { U3G_DEV(SIERRA, MC5727_2, 0), U3G_DEV(SIERRA, MC5728, 0), U3G_DEV(SIERRA, MC7354, 0), + U3G_DEV(SIERRA, MC7355, 0), U3G_DEV(SIERRA, MC8700, 0), U3G_DEV(SIERRA, MC8755, 0), U3G_DEV(SIERRA, MC8755_2, 0), diff --git a/sys/dev/usb/usbdevs b/sys/dev/usb/usbdevs index 04cfd54..d649402 100644 --- a/sys/dev/usb/usbdevs +++ b/sys/dev/usb/usbdevs @@ -4010,6 +4010,7 @@ product SIERRA E6892 0x6892 E6892 product SIERRA E6893 0x6893 E6893 product SIERRA MC8700 0x68A3 MC8700 product SIERRA MC7354 0x68C0 MC7354 +product SIERRA MC7355 0x9041 MC7355 product SIERRA AC313U 0x68aa Sierra Wireless AirCard 313U product SIERRA TRUINSTALL 0x0fff Aircard Tru Installer diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index b661a0f..cc85e1c 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -121,6 +121,7 @@ const struct terminal_class vt_termclass = { static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD, 0, "vt(9) parameters"); VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)"); +VT_SYSCTL_INT(enable_bell, 1, "Enable bell"); VT_SYSCTL_INT(debug, 0, "vt(9) debug level"); VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode"); VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend"); @@ -935,6 +936,9 @@ vtterm_bell(struct terminal *tm) struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; + if (!vt_enable_bell) + return; + if (vd->vd_flags & VDF_QUIET_BELL) return; @@ -946,6 +950,9 @@ vtterm_beep(struct terminal *tm, u_int param) { u_int freq, period; + if (!vt_enable_bell) + return; + if ((param == 0) || ((param & 0xffff) == 0)) { vtterm_bell(tm); return; diff --git a/sys/dev/watchdog/watchdog.c b/sys/dev/watchdog/watchdog.c index 087d6e8..5c9c6a1 100644 --- a/sys/dev/watchdog/watchdog.c +++ b/sys/dev/watchdog/watchdog.c @@ -28,6 +28,8 @@ * */ +#include "opt_ddb.h" + #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -37,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <sys/uio.h> #include <sys/kernel.h> +#include <sys/kdb.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/sysctl.h> diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c index a71251d..312a077 100644 --- a/sys/dev/xen/blkfront/blkfront.c +++ b/sys/dev/xen/blkfront/blkfront.c @@ -84,6 +84,11 @@ static void xbd_startio(struct xbd_softc *sc); /*---------------------------- Global Static Data ----------------------------*/ 
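/*
 * Aside: the vt(4) hunks above introduce a kern.vt.enable_bell sysctl
 * and check it at the top of vtterm_bell() and vtterm_beep(), before
 * the existing VDF_QUIET_BELL test.  The shape of that pattern -- a
 * writable integer knob guarding an output path -- in a standalone
 * sketch (demo names, not the vt(4) code; needs <sys/sysctl.h> and
 * <sys/kernel.h>):
 */
static int demo_enable_bell = 1;
SYSCTL_INT(_kern, OID_AUTO, demo_enable_bell, CTLFLAG_RW,
    &demo_enable_bell, 0, "Enable the demo bell");

static void
demo_bell(void)
{

        if (!demo_enable_bell)
                return;         /* administratively muted */
        /* ring the bell here */
}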
static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); +static int xbd_enable_indirect = 1; +SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters"); +SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, + &xbd_enable_indirect, 0, "Enable xbd indirect segments"); + /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) @@ -156,44 +161,14 @@ xbd_free_command(struct xbd_command *cm) } static void -xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) +xbd_mksegarray(bus_dma_segment_t *segs, int nsegs, + grant_ref_t * gref_head, int otherend_id, int readonly, + grant_ref_t * sg_ref, blkif_request_segment_t * sg) { - struct xbd_softc *sc; - struct xbd_command *cm; - blkif_request_t *ring_req; - struct blkif_request_segment *sg; - struct blkif_request_segment *last_block_sg; - grant_ref_t *sg_ref; + struct blkif_request_segment *last_block_sg = sg + nsegs; vm_paddr_t buffer_ma; uint64_t fsect, lsect; int ref; - int op; - int block_segs; - - cm = arg; - sc = cm->cm_sc; - - if (error) { - cm->cm_bp->bio_error = EIO; - biodone(cm->cm_bp); - xbd_free_command(cm); - return; - } - - /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); - sc->xbd_ring.req_prod_pvt++; - ring_req->id = cm->cm_id; - ring_req->operation = cm->cm_operation; - ring_req->sector_number = cm->cm_sector_number; - ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; - ring_req->nr_segments = nsegs; - cm->cm_nseg = nsegs; - - block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_REQUEST); - sg = ring_req->seg; - last_block_sg = sg + block_segs; - sg_ref = cm->cm_sg_refs; while (sg < last_block_sg) { buffer_ma = segs->ds_addr; @@ -204,7 +179,7 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) "cross a page boundary")); /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&cm->cm_gref_head); + ref = gnttab_claim_grant_reference(gref_head); /* * GNTTAB_LIST_END == 0xffffffff, but it is private @@ -214,9 +189,9 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) gnttab_grant_foreign_access_ref( ref, - xenbus_get_otherend_id(sc->xbd_dev), + otherend_id, buffer_ma >> PAGE_SHIFT, - ring_req->operation == BLKIF_OP_WRITE); + readonly); *sg_ref = ref; *sg = (struct blkif_request_segment) { @@ -227,7 +202,66 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) sg++; sg_ref++; segs++; - nsegs--; + } +} + +static void +xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) +{ + struct xbd_softc *sc; + struct xbd_command *cm; + int op; + + cm = arg; + sc = cm->cm_sc; + + if (error) { + cm->cm_bp->bio_error = EIO; + biodone(cm->cm_bp); + xbd_free_command(cm); + return; + } + + KASSERT(nsegs <= sc->xbd_max_request_segments, + ("Too many segments in a blkfront I/O")); + + if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { + blkif_request_t *ring_req; + + /* Fill out a blkif_request_t structure. 
*/ + ring_req = (blkif_request_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, ring_req->seg); + } else { + blkif_request_indirect_t *ring_req; + + /* Fill out a blkif_request_indirect_t structure. */ + ring_req = (blkif_request_indirect_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->indirect_op = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, cm->cm_indirectionpages); + memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, + sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); } if (cm->cm_operation == BLKIF_OP_READ) @@ -1010,6 +1044,16 @@ xbd_free(struct xbd_softc *sc) cm->cm_sg_refs = NULL; } + if (cm->cm_indirectionpages != NULL) { + gnttab_end_foreign_access_references( + sc->xbd_max_request_indirectpages, + &cm->cm_indirectionrefs[0]); + contigfree(cm->cm_indirectionpages, PAGE_SIZE * + sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT); + cm->cm_indirectionpages = NULL; + } + bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); @@ -1034,7 +1078,6 @@ xbd_initialize(struct xbd_softc *sc) const char *node_path; uint32_t max_ring_page_order; int error; - int i; if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) { /* Initialization has already been performed. */ @@ -1047,9 +1090,6 @@ xbd_initialize(struct xbd_softc *sc) */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; - sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; - sc->xbd_max_request_size = - XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* * Protocol negotiation. @@ -1105,53 +1145,6 @@ xbd_initialize(struct xbd_softc *sc) sc->xbd_max_requests = XBD_MAX_REQUESTS; } - /* Allocate datastructures based on negotiated values. */ - error = bus_dma_tag_create( - bus_get_dma_tag(sc->xbd_dev), /* parent */ - 512, PAGE_SIZE, /* algnmnt, boundary */ - BUS_SPACE_MAXADDR, /* lowaddr */ - BUS_SPACE_MAXADDR, /* highaddr */ - NULL, NULL, /* filter, filterarg */ - sc->xbd_max_request_size, - sc->xbd_max_request_segments, - PAGE_SIZE, /* maxsegsize */ - BUS_DMA_ALLOCNOW, /* flags */ - busdma_lock_mutex, /* lockfunc */ - &sc->xbd_io_lock, /* lockarg */ - &sc->xbd_io_dmat); - if (error != 0) { - xenbus_dev_fatal(sc->xbd_dev, error, - "Cannot allocate parent DMA tag\n"); - return; - } - - /* Per-transaction data allocation. 
*/ - sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, - M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); - if (sc->xbd_shadow == NULL) { - bus_dma_tag_destroy(sc->xbd_io_dmat); - xenbus_dev_fatal(sc->xbd_dev, error, - "Cannot allocate request structures\n"); - return; - } - - for (i = 0; i < sc->xbd_max_requests; i++) { - struct xbd_command *cm; - - cm = &sc->xbd_shadow[i]; - cm->cm_sg_refs = malloc( - sizeof(grant_ref_t) * sc->xbd_max_request_segments, - M_XENBLOCKFRONT, M_NOWAIT); - if (cm->cm_sg_refs == NULL) - break; - cm->cm_id = i; - cm->cm_flags = XBDCF_INITIALIZER; - cm->cm_sc = sc; - if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) - break; - xbd_free_command(cm); - } - if (xbd_alloc_ring(sc) != 0) return; @@ -1210,6 +1203,7 @@ xbd_connect(struct xbd_softc *sc) unsigned long sectors, sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; + int i, j; if (sc->xbd_state == XBD_STATE_CONNECTED || sc->xbd_state == XBD_STATE_SUSPENDED) @@ -1240,6 +1234,88 @@ xbd_connect(struct xbd_softc *sc) if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; + err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), + "feature-max-indirect-segments", "%" PRIu32, + &sc->xbd_max_request_segments, NULL); + if ((err != 0) || (xbd_enable_indirect == 0)) + sc->xbd_max_request_segments = 0; + if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) + sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; + if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS)) + sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS); + sc->xbd_max_request_indirectpages = + XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); + if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) + sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + sc->xbd_max_request_size = + XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); + + /* Allocate datastructures based on negotiated values. */ + err = bus_dma_tag_create( + bus_get_dma_tag(sc->xbd_dev), /* parent */ + 512, PAGE_SIZE, /* algnmnt, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + sc->xbd_max_request_size, + sc->xbd_max_request_segments, + PAGE_SIZE, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + busdma_lock_mutex, /* lockfunc */ + &sc->xbd_io_lock, /* lockarg */ + &sc->xbd_io_dmat); + if (err != 0) { + xenbus_dev_fatal(sc->xbd_dev, err, + "Cannot allocate parent DMA tag\n"); + return; + } + + /* Per-transaction data allocation. 
*/ + sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, + M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); + if (sc->xbd_shadow == NULL) { + bus_dma_tag_destroy(sc->xbd_io_dmat); + xenbus_dev_fatal(sc->xbd_dev, ENOMEM, + "Cannot allocate request structures\n"); + return; + } + + for (i = 0; i < sc->xbd_max_requests; i++) { + struct xbd_command *cm; + void * indirectpages; + + cm = &sc->xbd_shadow[i]; + cm->cm_sg_refs = malloc( + sizeof(grant_ref_t) * sc->xbd_max_request_segments, + M_XENBLOCKFRONT, M_NOWAIT); + if (cm->cm_sg_refs == NULL) + break; + cm->cm_id = i; + cm->cm_flags = XBDCF_INITIALIZER; + cm->cm_sc = sc; + if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) + break; + if (sc->xbd_max_request_indirectpages > 0) { + indirectpages = contigmalloc( + PAGE_SIZE * sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT, M_ZERO, 0, ~0, PAGE_SIZE, 0); + } else { + indirectpages = NULL; + } + for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { + if (gnttab_grant_foreign_access( + xenbus_get_otherend_id(sc->xbd_dev), + (vtomach(indirectpages) >> PAGE_SHIFT) + j, + 1 /* grant read-only access */, + &cm->cm_indirectionrefs[j])) + break; + } + if (j < sc->xbd_max_request_indirectpages) + break; + cm->cm_indirectionpages = indirectpages; + xbd_free_command(cm); + } + if (sc->xbd_disk == NULL) { device_printf(dev, "%juMB <%s> at %s", (uintmax_t) sectors / (1048576 / sector_size), diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h index 3007118..28c6ff2 100644 --- a/sys/dev/xen/blkfront/block.h +++ b/sys/dev/xen/blkfront/block.h @@ -68,28 +68,32 @@ #define XBD_MAX_RING_PAGES 32 /** - * The maximum number of outstanding requests blocks (request headers plus - * additional segment blocks) we will allow in a negotiated block-front/back - * communication channel. + * The maximum number of outstanding requests we will allow in a negotiated + * block-front/back communication channel. */ #define XBD_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBD_MAX_RING_PAGES) /** - * The maximum mapped region size per request we will allow in a negotiated - * block-front/back communication channel. + * The maximum number of blkif segments which can be provided per indirect + * page in an indirect request. + */ +#define XBD_MAX_SEGMENTS_PER_PAGE \ + (PAGE_SIZE / sizeof(struct blkif_request_segment)) + +/** + * The maximum number of blkif segments which can be provided in an indirect + * request. */ -#define XBD_MAX_REQUEST_SIZE \ - MIN(MAXPHYS, XBD_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST)) +#define XBD_MAX_INDIRECT_SEGMENTS \ + (BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST * XBD_MAX_SEGMENTS_PER_PAGE) /** - * The maximum number of segments (within a request header and accompanying - * segment blocks) per request we will allow in a negotiated block-front/back - * communication channel. + * Compute the number of indirect segment pages required for an I/O with the + * specified number of indirect segments. 
*/ -#define XBD_MAX_SEGMENTS_PER_REQUEST \ - (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ - XBD_SIZE_TO_SEGS(XBD_MAX_REQUEST_SIZE))) +#define XBD_INDIRECT_SEGS_TO_PAGES(segs) \ + ((segs + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE) typedef enum { XBDCF_Q_MASK = 0xFF, @@ -121,6 +125,8 @@ struct xbd_command { blkif_sector_t cm_sector_number; int cm_status; xbd_cbcf_t *cm_complete; + void *cm_indirectionpages; + grant_ref_t cm_indirectionrefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; }; typedef enum { @@ -175,6 +181,7 @@ struct xbd_softc { uint32_t xbd_max_requests; uint32_t xbd_max_request_segments; uint32_t xbd_max_request_size; + uint32_t xbd_max_request_indirectpages; grant_ref_t xbd_ring_ref[XBD_MAX_RING_PAGES]; blkif_front_ring_t xbd_ring; xen_intr_handle_t xen_intr_handle; diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index b53065d..621a037 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -570,7 +570,7 @@ g_dev_done(struct bio *bp2) } mtx_unlock(&sc->sc_mtx); if (destroy) - g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); + g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); biodone(bp); } diff --git a/sys/geom/multipath/g_multipath.c b/sys/geom/multipath/g_multipath.c index 7593cca..0953d18 100644 --- a/sys/geom/multipath/g_multipath.c +++ b/sys/geom/multipath/g_multipath.c @@ -369,9 +369,9 @@ g_multipath_done(struct bio *bp) mtx_lock(&sc->sc_mtx); (*cnt)--; if (*cnt == 0 && (cp->index & MP_LOST)) { - cp->index |= MP_POSTED; + if (g_post_event(g_mpd, cp, M_NOWAIT, NULL) == 0) + cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); - g_post_event(g_mpd, cp, M_WAITOK, NULL); } else mtx_unlock(&sc->sc_mtx); g_std_done(bp); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 540eb65..299ee77 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -352,7 +352,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) -static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, + int wait); #endif static void pmap_set_pg(void); @@ -670,7 +671,7 @@ pmap_page_init(vm_page_t m) #if defined(PAE) || defined(PAE_TABLES) static void * -pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. 
*/ diff --git a/sys/ia64/ia64/uma_machdep.c b/sys/ia64/ia64/uma_machdep.c index 77791ae..b536820 100644 --- a/sys/ia64/ia64/uma_machdep.c +++ b/sys/ia64/ia64/uma_machdep.c @@ -40,7 +40,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { void *va; vm_page_t m; @@ -66,7 +66,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 9de8fa7..dae1d54 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1849,7 +1849,7 @@ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; - struct knote *kn; + struct knote *kn, *tkn; int error; if (list == NULL) @@ -1861,14 +1861,13 @@ knote(struct knlist *list, long hint, int lockflags) list->kl_lock(list->kl_lockarg); /* - * If we unlock the list lock (and set KN_INFLUX), we can eliminate - * the kqueue scheduling, but this will introduce four - * lock/unlock's for each knote to test. If we do, continue to use - * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is - * only safe if you want to remove the current item, which we are - * not doing. + * If we unlock the list lock (and set KN_INFLUX), we can + * eliminate the kqueue scheduling, but this will introduce + * four lock/unlock's for each knote to test. Also, marker + * would be needed to keep iteration position, since filters + * or other threads could remove events. */ - SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { + SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) { diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index e7b8016..c232a37 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -284,7 +284,7 @@ static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(void *); -static void *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int); +static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); @@ -389,7 +389,7 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); * pages. */ static void * -mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait) +mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. 
*/ diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c index a80a233..b0b1ba8 100644 --- a/sys/kern/subr_busdma_bufalloc.c +++ b/sys/kern/subr_busdma_bufalloc.c @@ -147,8 +147,8 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) } void * -busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag, - int wait) +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, + uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE @@ -166,7 +166,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag, } void -busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag) +busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag) { kmem_free(kernel_arena, (vm_offset_t)item, size); diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 8cc020a..389b7ee 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -608,7 +608,7 @@ static struct mtx_padalign vmem_bt_lock; * we are really out of KVA. */ static void * -vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { vmem_addr_t addr; diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 708457d..fa36849 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -2114,6 +2114,7 @@ sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, goto out; } } else if (fp->f_type == DTYPE_SHM) { + error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 17f72a7..48f7550 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -330,23 +330,27 @@ sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) int n_nchash; int count; +retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (!req->oldptr) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); - - /* Scan hash tables for applicable entries */ - for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { - CACHE_RLOCK(); - count = 0; - LIST_FOREACH(ncp, ncpp, nc_hash) { - count++; - } + cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); + CACHE_RLOCK(); + if (n_nchash != nchash + 1) { CACHE_RUNLOCK(); - error = SYSCTL_OUT(req, &count, sizeof(count)); - if (error) - return (error); + free(cntbuf, M_TEMP); + goto retry; } - return (0); + /* Scan hash tables counting entries */ + for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) + LIST_FOREACH(ncp, ncpp, nc_hash) + cntbuf[i]++; + CACHE_RUNLOCK(); + for (error = 0, i = 0; i < n_nchash; i++) + if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) + break; + free(cntbuf, M_TEMP); + return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", @@ -935,6 +939,44 @@ nchinit(void *dummy __unused) } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); +void +cache_changesize(int newmaxvnodes) +{ + struct nchashhead *new_nchashtbl, *old_nchashtbl; + u_long new_nchash, old_nchash; + struct namecache *ncp; + uint32_t hash; + int i; + + new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash); + /* If same hash table size, nothing to do */ + if (nchash == new_nchash) { + free(new_nchashtbl, M_VFSCACHE); + return; + } + /* + * Move everything from the old hash table to the new table. 
+ * None of the namecache entries in the table can be removed + * because to do so, they have to be removed from the hash table. + */ + CACHE_WLOCK(); + old_nchashtbl = nchashtbl; + old_nchash = nchash; + nchashtbl = new_nchashtbl; + nchash = new_nchash; + for (i = 0; i <= old_nchash; i++) { + while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { + hash = fnv_32_buf(nc_get_name(ncp), ncp->nc_nlen, + FNV1_32_INIT); + hash = fnv_32_buf(&ncp->nc_dvp, sizeof(ncp->nc_dvp), + hash); + LIST_REMOVE(ncp, nc_hash); + LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); + } + } + CACHE_WUNLOCK(); + free(old_nchashtbl, M_VFSCACHE); +} /* * Invalidate all entries to a particular vnode. diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c index 0271e49..1398a47 100644 --- a/sys/kern/vfs_hash.c +++ b/sys/kern/vfs_hash.c @@ -160,3 +160,40 @@ vfs_hash_rehash(struct vnode *vp, u_int hash) vp->v_hash = hash; mtx_unlock(&vfs_hash_mtx); } + +void +vfs_hash_changesize(int newmaxvnodes) +{ + struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl; + u_long vfs_hash_newmask, vfs_hash_oldmask; + struct vnode *vp; + int i; + + vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH, + &vfs_hash_newmask); + /* If same hash table size, nothing to do */ + if (vfs_hash_mask == vfs_hash_newmask) { + free(vfs_hash_newtbl, M_VFS_HASH); + return; + } + /* + * Move everything from the old hash table to the new table. + * None of the vnodes in the table can be recycled because to + * do so, they have to be removed from the hash table. + */ + rw_wlock(&vfs_hash_lock); + vfs_hash_oldtbl = vfs_hash_tbl; + vfs_hash_oldmask = vfs_hash_mask; + vfs_hash_tbl = vfs_hash_newtbl; + vfs_hash_mask = vfs_hash_newmask; + for (i = 0; i <= vfs_hash_oldmask; i++) { + while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) { + LIST_REMOVE(vp, v_hashlist); + LIST_INSERT_HEAD( + vfs_hash_bucket(vp->v_mount, vp->v_hash), + vp, v_hashlist); + } + } + rw_wunlock(&vfs_hash_lock); + free(vfs_hash_oldtbl, M_VFS_HASH); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 2c471af..a9e17f1 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -280,8 +280,25 @@ static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } * XXX desiredvnodes is historical cruft and should not exist. 
*/ int desiredvnodes; -SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, - &desiredvnodes, 0, "Maximum number of vnodes"); + +static int +sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) +{ + int error, old_desiredvnodes; + + old_desiredvnodes = desiredvnodes; + if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) + return (error); + if (old_desiredvnodes != desiredvnodes) { + vfs_hash_changesize(desiredvnodes); + cache_changesize(desiredvnodes); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, + sysctl_update_desiredvnodes, "I", "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 2c92445..d4c8693 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -188,7 +188,10 @@ vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, restart: fmode = *flagp; - if (fmode & O_CREAT) { + if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | + O_EXCL | O_DIRECTORY)) + return (EINVAL); + else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; /* * Set NOCACHE to avoid flushing the cache when diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c index 1c8e6c8..a1f5e5f 100644 --- a/sys/mips/mips/uma_machdep.c +++ b/sys/mips/mips/uma_machdep.c @@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; @@ -70,7 +70,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c index e0fd776..361f592 100644 --- a/sys/net/ieee8023ad_lacp.c +++ b/sys/net/ieee8023ad_lacp.c @@ -187,15 +187,15 @@ static const char *lacp_format_portid(const struct lacp_portid *, char *, static void lacp_dprintf(const struct lacp_port *, const char *, ...) __attribute__((__format__(__printf__, 2, 3))); -static int lacp_debug = 0; +static VNET_DEFINE(int, lacp_debug); +#define V_lacp_debug VNET(lacp_debug) SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD, 0, "ieee802.3ad"); -SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_TUN, - &lacp_debug, 0, "Enable LACP debug logging (1=debug, 2=trace)"); -TUNABLE_INT("net.link.lagg.lacp.debug", &lacp_debug); +SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RWTUN | CTLFLAG_VNET, + &VNET_NAME(lacp_debug), 0, "Enable LACP debug logging (1=debug, 2=trace)"); -#define LACP_DPRINTF(a) if (lacp_debug & 0x01) { lacp_dprintf a ; } -#define LACP_TRACE(a) if (lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); } -#define LACP_TPRINTF(a) if (lacp_debug & 0x04) { lacp_dprintf a ; } +#define LACP_DPRINTF(a) if (V_lacp_debug & 0x01) { lacp_dprintf a ; } +#define LACP_TRACE(a) if (V_lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); } +#define LACP_TPRINTF(a) if (V_lacp_debug & 0x04) { lacp_dprintf a ; } /* * partner administration variables. 
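/*
 * Aside, looking back at the vfs_cache.c/vfs_hash.c hunks earlier in
 * this diff: cache_changesize() and vfs_hash_changesize() follow the
 * same rehash recipe -- allocate the new table outside the lock, bail
 * out if the rounded size did not change, then swap the table pointers
 * under the exclusive lock and migrate every entry by recomputing its
 * bucket.  A generic sketch with demo types and names (the real code
 * holds the cache/hash lock around the swap and migration, elided
 * here):
 */
struct demo_entry {
        LIST_ENTRY(demo_entry)  de_hash;
        uint32_t                de_key;
};
LIST_HEAD(demo_head, demo_entry);

static void
demo_changesize(struct demo_head **tblp, u_long *maskp, int newsize)
{
        struct demo_head *newtbl, *oldtbl;
        struct demo_entry *de;
        u_long newmask, oldmask;
        int i;

        newtbl = hashinit(newsize, M_TEMP, &newmask);
        /* If same hash table size, nothing to do. */
        if (*maskp == newmask) {
                free(newtbl, M_TEMP);
                return;
        }
        oldtbl = *tblp;
        oldmask = *maskp;
        *tblp = newtbl;
        *maskp = newmask;
        for (i = 0; i <= oldmask; i++)
                while ((de = LIST_FIRST(&oldtbl[i])) != NULL) {
                        LIST_REMOVE(de, de_hash);
                        LIST_INSERT_HEAD(&newtbl[de->de_key & newmask],
                            de, de_hash);
                }
        free(oldtbl, M_TEMP);
}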
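/*
 * Aside: the lacp_debug conversion just above is the standard VIMAGE
 * recipe for virtualizing a global -- VNET_DEFINE() the variable,
 * provide a V_name accessor, and export it with CTLFLAG_VNET so each
 * vnet gets its own copy.  Code that runs without a vnet context, such
 * as the lacp_tick() callout above, brackets its work with
 * CURVNET_SET()/CURVNET_RESTORE().  A sketch with a demo knob (needs
 * <net/vnet.h> and <sys/sysctl.h>):
 */
static VNET_DEFINE(int, demo_debug);            /* one instance per vnet */
#define V_demo_debug    VNET(demo_debug)

SYSCTL_INT(_net, OID_AUTO, demo_debug, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(demo_debug), 0, "Per-vnet demo debug level");

static void
demo_timer(void *arg)
{
        struct ifnet *ifp = arg;

        CURVNET_SET(ifp->if_vnet);      /* enter the interface's vnet */
        if (V_demo_debug)
                if_printf(ifp, "timer fired\n");
        CURVNET_RESTORE();
}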
@@ -298,7 +298,7 @@ lacp_pdu_input(struct lacp_port *lp, struct mbuf *m) goto bad; } - if (lacp_debug > 0) { + if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu receive\n"); lacp_dump_lacpdu(du); } @@ -383,7 +383,7 @@ lacp_xmit_lacpdu(struct lacp_port *lp) sizeof(du->ldu_collector)); du->ldu_collector.lci_maxdelay = 0; - if (lacp_debug > 0) { + if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu transmit\n"); lacp_dump_lacpdu(du); } @@ -495,12 +495,14 @@ lacp_tick(void *arg) if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) continue; + CURVNET_SET(lp->lp_ifp->if_vnet); lacp_run_timers(lp); lacp_select(lp); lacp_sm_mux(lp); lacp_sm_tx(lp); lacp_sm_ptx_tx_schedule(lp); + CURVNET_RESTORE(); } callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); } @@ -517,7 +519,7 @@ lacp_port_create(struct lagg_port *lgp) int error; boolean_t active = TRUE; /* XXX should be configurable */ - boolean_t fast = FALSE; /* XXX should be configurable */ + boolean_t fast = FALSE; /* Configurable via ioctl */ bzero((char *)&sdl, sizeof(sdl)); sdl.sdl_len = sizeof(sdl); @@ -577,12 +579,13 @@ lacp_port_destroy(struct lagg_port *lgp) lacp_disable_distributing(lp); lacp_unselect(lp); + LIST_REMOVE(lp, lp_next); + LACP_UNLOCK(lsc); + /* The address may have already been removed by if_purgemaddrs() */ if (!lgp->lp_detaching) if_delmulti_ifma(lp->lp_ifma); - LIST_REMOVE(lp, lp_next); - LACP_UNLOCK(lsc); free(lp, M_DEVBUF); } @@ -743,58 +746,19 @@ lacp_transit_expire(void *vp) LACP_LOCK_ASSERT(lsc); + CURVNET_SET(lsc->lsc_softc->sc_ifp->if_vnet); LACP_TRACE(NULL); + CURVNET_RESTORE(); lsc->lsc_suppress_distributing = FALSE; } -static void -lacp_attach_sysctl(struct lacp_softc *lsc, struct sysctl_oid *p_oid) -{ - struct lagg_softc *sc = lsc->lsc_softc; - - SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(p_oid), OID_AUTO, - "lacp_strict_mode", - CTLFLAG_RW, - &lsc->lsc_strict_mode, - lsc->lsc_strict_mode, - "Enable LACP strict mode"); -} - -static void -lacp_attach_sysctl_debug(struct lacp_softc *lsc, struct sysctl_oid *p_oid) -{ - struct lagg_softc *sc = lsc->lsc_softc; - struct sysctl_oid *oid; - - /* Create a child of the parent lagg interface */ - oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(p_oid), - OID_AUTO, "debug", CTLFLAG_RD, NULL, "DEBUG"); - - SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "rx_test", - CTLFLAG_RW, - &lsc->lsc_debug.lsc_rx_test, - lsc->lsc_debug.lsc_rx_test, - "Bitmap of if_dunit entries to drop RX frames for"); - SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "tx_test", - CTLFLAG_RW, - &lsc->lsc_debug.lsc_tx_test, - lsc->lsc_debug.lsc_tx_test, - "Bitmap of if_dunit entries to drop TX frames for"); -} - -int +void lacp_attach(struct lagg_softc *sc) { struct lacp_softc *lsc; - struct sysctl_oid *oid; - lsc = malloc(sizeof(struct lacp_softc), - M_DEVBUF, M_NOWAIT|M_ZERO); - if (lsc == NULL) - return (ENOMEM); + lsc = malloc(sizeof(struct lacp_softc), M_DEVBUF, M_WAITOK | M_ZERO); sc->sc_psc = (caddr_t)lsc; lsc->lsc_softc = sc; @@ -806,35 +770,24 @@ lacp_attach(struct lagg_softc *sc) TAILQ_INIT(&lsc->lsc_aggregators); LIST_INIT(&lsc->lsc_ports); - /* Create a child of the parent lagg interface */ - oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->sc_oid), - OID_AUTO, "lacp", CTLFLAG_RD, NULL, "LACP"); - - /* Attach sysctl nodes */ - lacp_attach_sysctl(lsc, oid); - lacp_attach_sysctl_debug(lsc, oid); - callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_mtx, 0); callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_mtx, 0); /* if the lagg is already up then do the same */ if 
(sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) lacp_init(sc); - - return (0); } int -lacp_detach(struct lagg_softc *sc) +lacp_detach(void *psc) { - struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_softc *lsc = (struct lacp_softc *)psc; KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), ("aggregators still active")); KASSERT(lsc->lsc_active_aggregator == NULL, ("aggregator still attached")); - sc->sc_psc = NULL; callout_drain(&lsc->lsc_transit_callout); callout_drain(&lsc->lsc_callout); @@ -883,7 +836,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (NULL); } - if (sc->use_flowid && + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) hash = m->m_pkthdr.flowid >> sc->flowid_shift; else @@ -1425,7 +1378,7 @@ lacp_sm_mux(struct lacp_port *lp) enum lacp_selected selected = lp->lp_selected; struct lacp_aggregator *la; - if (lacp_debug > 1) + if (V_lacp_debug > 1) lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, " "p_sync= 0x%x, p_collecting= 0x%x\n", __func__, lp->lp_mux_state, selected, p_sync, p_collecting); diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h index ca5f76e..1573c0b 100644 --- a/sys/net/ieee8023ad_lacp.h +++ b/sys/net/ieee8023ad_lacp.h @@ -75,6 +75,7 @@ "\007DEFAULTED" \ "\010EXPIRED" +#ifdef _KERNEL /* * IEEE802.3 slow protocols * @@ -250,6 +251,7 @@ struct lacp_softc { u_int32_t lsc_tx_test; } lsc_debug; u_int32_t lsc_strict_mode; + boolean_t lsc_fast_timeout; /* if set, fast timeout */ }; #define LACP_TYPE_ACTORINFO 1 @@ -282,8 +284,8 @@ struct lacp_softc { struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -int lacp_attach(struct lagg_softc *); -int lacp_detach(struct lagg_softc *); +void lacp_attach(struct lagg_softc *); +int lacp_detach(void *); void lacp_init(struct lagg_softc *); void lacp_stop(struct lagg_softc *); int lacp_port_create(struct lagg_port *); @@ -336,3 +338,4 @@ lacp_isdistributing(struct lagg_port *lgp) #define LACP_LAGIDSTR_MAX \ (1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1) #define LACP_STATESTR_MAX (255) /* XXX */ +#endif /* _KERNEL */ diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 66669ca..f2a38c1 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -419,13 +419,8 @@ gif_transmit(struct ifnet *ifp, struct mbuf *m) } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; - if ((sc->gif_options & GIF_SEND_REVETHIP) != 0) { - eth->eip_ver = 0; - eth->eip_resvl = ETHERIP_VERSION; - } else { - eth->eip_ver = ETHERIP_VERSION; - eth->eip_resvl = 0; - } + eth->eip_ver = ETHERIP_VERSION; + eth->eip_resvl = 0; break; default: error = EAFNOSUPPORT; @@ -633,19 +628,10 @@ gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) if (m == NULL) goto drop; eip = mtod(m, struct etherip_header *); - /* - * GIF_ACCEPT_REVETHIP (enabled by default) intentionally - * accepts an EtherIP packet with revered version field in - * the header. This is a knob for backward compatibility - * with FreeBSD 7.2R or prior. 
- */ if (eip->eip_ver != ETHERIP_VERSION) { - if ((gif_options & GIF_ACCEPT_REVETHIP) == 0 || - eip->eip_resvl != ETHERIP_VERSION) { - /* discard unknown versions */ - m_freem(m); - goto drop; - } + /* discard unknown versions */ + m_freem(m); + goto drop; } m_adj(m, sizeof(struct etherip_header)); @@ -766,50 +752,32 @@ gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) goto bad; /* validate sa_len */ + /* check sa_family looks sane for the cmd */ switch (src->sa_family) { #ifdef INET case AF_INET: if (src->sa_len != sizeof(struct sockaddr_in)) goto bad; - break; -#endif -#ifdef INET6 - case AF_INET6: - if (src->sa_len != sizeof(struct sockaddr_in6)) + if (cmd != SIOCSIFPHYADDR) { + error = EAFNOSUPPORT; goto bad; - break; -#endif - default: - error = EAFNOSUPPORT; - goto bad; - } - /* check sa_family looks sane for the cmd */ - error = EAFNOSUPPORT; - switch (cmd) { -#ifdef INET - case SIOCSIFPHYADDR: - if (src->sa_family == AF_INET) - break; - goto bad; -#endif -#ifdef INET6 - case SIOCSIFPHYADDR_IN6: - if (src->sa_family == AF_INET6) - break; - goto bad; -#endif - } - error = EADDRNOTAVAIL; - switch (src->sa_family) { -#ifdef INET - case AF_INET: + } if (satosin(src)->sin_addr.s_addr == INADDR_ANY || - satosin(dst)->sin_addr.s_addr == INADDR_ANY) + satosin(dst)->sin_addr.s_addr == INADDR_ANY) { + error = EADDRNOTAVAIL; goto bad; + } break; #endif #ifdef INET6 case AF_INET6: + if (src->sa_len != sizeof(struct sockaddr_in6)) + goto bad; + if (cmd != SIOCSIFPHYADDR_IN6) { + error = EAFNOSUPPORT; + goto bad; + } + error = EADDRNOTAVAIL; if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr)) @@ -825,8 +793,12 @@ gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = sa6_embedscope(satosin6(dst), 0); if (error != 0) goto bad; + break; #endif - }; + default: + error = EAFNOSUPPORT; + goto bad; + } error = gif_set_tunnel(ifp, src, dst); break; case SIOCDIFPHYADDR: diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h index ed143e8..b71c8e8 100644 --- a/sys/net/if_gif.h +++ b/sys/net/if_gif.h @@ -126,10 +126,7 @@ int in6_gif_attach(struct gif_softc *); #define GIFGOPTS _IOWR('i', 150, struct ifreq) #define GIFSOPTS _IOW('i', 151, struct ifreq) -#define GIF_ACCEPT_REVETHIP 0x0001 #define GIF_IGNORE_SOURCE 0x0002 -#define GIF_SEND_REVETHIP 0x0010 -#define GIF_OPTMASK (GIF_ACCEPT_REVETHIP|GIF_SEND_REVETHIP| \ - GIF_IGNORE_SOURCE) +#define GIF_OPTMASK (GIF_IGNORE_SOURCE) #endif /* _NET_IF_GIF_H_ */ diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index ce1365b..2522fb6 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_types.h> #include <net/if_var.h> #include <net/bpf.h> +#include <net/vnet.h> #if defined(INET) || defined(INET6) #include <netinet/in.h> @@ -81,13 +82,21 @@ static struct { {0, NULL} }; -SLIST_HEAD(__trhead, lagg_softc) lagg_list; /* list of laggs */ -static struct mtx lagg_list_mtx; +VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ +#define V_lagg_list VNET(lagg_list) +static VNET_DEFINE(struct mtx, lagg_list_mtx); +#define V_lagg_list_mtx VNET(lagg_list_mtx) +#define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ + "if_lagg list", NULL, MTX_DEF) +#define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) +#define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) +#define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); 
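/*
 * Aside: the gif(4) ioctl rework above collapses three separate
 * switches (the sa_len check, the cmd/family pairing, and the
 * wildcard-address check) into one per-family block.  The INET half of
 * that shape as a standalone sketch (demo function name and error
 * choices; assumes the usual <netinet/in.h> and <sys/sockio.h>
 * definitions):
 */
static int
demo_check_inet_tunnel(u_long cmd, struct sockaddr *src, struct sockaddr *dst)
{

        if (src->sa_family != AF_INET)
                return (EAFNOSUPPORT);
        if (src->sa_len != sizeof(struct sockaddr_in))
                return (EINVAL);                /* malformed sockaddr */
        if (cmd != SIOCSIFPHYADDR)
                return (EAFNOSUPPORT);          /* wrong ioctl for AF_INET */
        if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
            satosin(dst)->sin_addr.s_addr == INADDR_ANY)
                return (EADDRNOTAVAIL);         /* wildcards not allowed */
        return (0);
}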
static void lagg_clone_destroy(struct ifnet *); -static struct if_clone *lagg_cloner; +static VNET_DEFINE(struct if_clone *, lagg_cloner); +#define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static void lagg_lladdr(struct lagg_softc *, uint8_t *); @@ -123,24 +132,23 @@ static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); -static int lagg_sysctl_active(SYSCTL_HANDLER_ARGS); /* Simple round robin */ -static int lagg_rr_attach(struct lagg_softc *); +static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_detach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ -static int lagg_fail_attach(struct lagg_softc *); +static void lagg_fail_attach(struct lagg_softc *); static int lagg_fail_detach(struct lagg_softc *); static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ -static int lagg_lb_attach(struct lagg_softc *); +static void lagg_lb_attach(struct lagg_softc *); static int lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); @@ -150,7 +158,7 @@ static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* 802.3ad LACP */ -static int lagg_lacp_attach(struct lagg_softc *); +static void lagg_lacp_attach(struct lagg_softc *); static int lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, @@ -160,9 +168,9 @@ static void lagg_lacp_lladdr(struct lagg_softc *); static void lagg_callout(void *); /* lagg protocol table */ -static const struct { - int ti_proto; - int (*ti_attach)(struct lagg_softc *); +static const struct lagg_proto { + lagg_proto ti_proto; + void (*ti_attach)(struct lagg_softc *); } lagg_protos[] = { { LAGG_PROTO_ROUNDROBIN, lagg_rr_attach }, { LAGG_PROTO_FAILOVER, lagg_fail_attach }, @@ -176,31 +184,55 @@ SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); -static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ -SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, - &lagg_failover_rx_all, 0, +/* Allow input on any failover links */ +static VNET_DEFINE(int, lagg_failover_rx_all); +#define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) +SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); -static int def_use_flowid = 1; /* Default value for using flowid */ -TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid); -SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW, - &def_use_flowid, 0, + +/* Default value for using M_FLOWID */ +static VNET_DEFINE(int, def_use_flowid) = 1; +#define V_def_use_flowid VNET(def_use_flowid) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, + &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); -static int 
def_flowid_shift = 16; /* Default value for using flow shift */ -TUNABLE_INT("net.link.lagg.default_flowid_shift", &def_flowid_shift); -SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RW, - &def_flowid_shift, 0, + +/* Default value for using M_FLOWID */ +static VNET_DEFINE(int, def_flowid_shift) = 16; +#define V_def_flowid_shift VNET(def_flowid_shift) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, + &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); +static void +vnet_lagg_init(const void *unused __unused) +{ + + LAGG_LIST_LOCK_INIT(); + SLIST_INIT(&V_lagg_list); + V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, + lagg_clone_destroy, 0); +} +VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_lagg_init, NULL); + +static void +vnet_lagg_uninit(const void *unused __unused) +{ + + if_clone_detach(V_lagg_cloner); + LAGG_LIST_LOCK_DESTROY(); +} +VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_lagg_uninit, NULL); + static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF); - SLIST_INIT(&lagg_list); - lagg_cloner = if_clone_simple(laggname, lagg_clone_create, - lagg_clone_destroy, 0); lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( @@ -210,10 +242,8 @@ lagg_modevent(module_t mod, int type, void *data) case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); - if_clone_detach(lagg_cloner); lagg_input_p = NULL; lagg_linkstate_p = NULL; - mtx_destroy(&lagg_list_mtx); break; default: return (EOPNOTSUPP); @@ -279,10 +309,8 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; - int i, error = 0; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ - struct sysctl_oid *oid; - char num[14]; /* sufficient for 32 bits */ + int i; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); @@ -296,29 +324,10 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) sc->sc_ibytes = counter_u64_alloc(M_WAITOK); sc->sc_obytes = counter_u64_alloc(M_WAITOK); - sysctl_ctx_init(&sc->ctx); - snprintf(num, sizeof(num), "%u", unit); - sc->use_flowid = def_use_flowid; - sc->flowid_shift = def_flowid_shift; - sc->sc_oid = oid = SYSCTL_ADD_NODE(&sc->ctx, - &SYSCTL_NODE_CHILDREN(_net_link, lagg), - OID_AUTO, num, CTLFLAG_RD, NULL, ""); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "use_flowid", CTLFLAG_RW, &sc->use_flowid, - sc->use_flowid, "Use flow id for load sharing"); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "flowid_shift", CTLFLAG_RW, &sc->flowid_shift, - sc->flowid_shift, - "Shift flowid bits to prevent multiqueue collisions"); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "count", CTLFLAG_RD, &sc->sc_count, sc->sc_count, - "Total number of ports"); - SYSCTL_ADD_PROC(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "active", CTLTYPE_INT|CTLFLAG_RD, sc, 0, lagg_sysctl_active, - "I", "Total number of active ports"); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "flapping", CTLFLAG_RD, &sc->sc_flapping, - sc->sc_flapping, "Total number of port change events"); + if (V_def_use_flowid) + sc->sc_opts |= LAGG_OPT_USE_FLOWID; + sc->flowid_shift = V_def_flowid_shift; + /* Hash all layers by default */ sc->sc_flags = 
LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4; @@ -326,11 +335,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) { if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) { sc->sc_proto = lagg_protos[i].ti_proto; - if ((error = lagg_protos[i].ti_attach(sc)) != 0) { - if_free(ifp); - free(sc, M_DEVBUF); - return (error); - } + lagg_protos[i].ti_attach(sc); break; } } @@ -375,9 +380,9 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ - mtx_lock(&lagg_list_mtx); - SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries); - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_LOCK(); + SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); + LAGG_LIST_UNLOCK(); callout_reset(&sc->sc_callout, hz, lagg_callout, sc); @@ -404,10 +409,9 @@ lagg_clone_destroy(struct ifnet *ifp) /* Unhook the aggregation protocol */ if (sc->sc_detach != NULL) (*sc->sc_detach)(sc); + else + LAGG_WUNLOCK(sc); - LAGG_WUNLOCK(sc); - - sysctl_ctx_free(&sc->ctx); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); @@ -421,9 +425,9 @@ lagg_clone_destroy(struct ifnet *ifp) counter_u64_free(sc->sc_ibytes); counter_u64_free(sc->sc_obytes); - mtx_lock(&lagg_list_mtx); - SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries); - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_LOCK(); + SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); + LAGG_LIST_UNLOCK(); taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); LAGG_LOCK_DESTROY(sc); @@ -435,15 +439,28 @@ static void lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) { struct ifnet *ifp = sc->sc_ifp; + struct lagg_port lp; if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; + LAGG_WLOCK_ASSERT(sc); + /* + * Set the link layer address on the lagg interface. + * sc_lladdr() notifies the MAC change to + * the aggregation protocol. iflladdr_event handler which + * may trigger gratuitous ARPs for INET will be handled in + * a taskqueue. + */ bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); - /* Let the protocol know the MAC has changed */ if (sc->sc_lladdr != NULL) (*sc->sc_lladdr)(sc); - EVENTHANDLER_INVOKE(iflladdr_event, ifp); + + bzero(&lp, sizeof(lp)); + lp.lp_ifp = sc->sc_ifp; + lp.lp_softc = sc; + + lagg_port_lladdr(&lp, lladdr); } static void @@ -491,11 +508,13 @@ lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) struct ifnet *ifp = lp->lp_ifp; struct lagg_llq *llq; int pending = 0; + int primary; LAGG_WLOCK_ASSERT(sc); - if (lp->lp_detaching || - memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + primary = (sc->sc_primary->lp_ifp == ifp) ? 1 : 0; + if (primary == 0 && (lp->lp_detaching || + memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)) return; /* Check to make sure its not already queued to be changed */ @@ -514,6 +533,7 @@ lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) /* Update the lladdr even if pending, it may have changed */ llq->llq_ifp = ifp; + llq->llq_primary = primary; bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); if (!pending) @@ -546,14 +566,20 @@ lagg_port_setlladdr(void *arg, int pending) for (llq = head; llq != NULL; llq = head) { ifp = llq->llq_ifp; - /* Set the link layer address */ CURVNET_SET(ifp->if_vnet); - error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); + if (llq->llq_primary == 0) { + /* + * Set the link layer address on the laggport interface. + * if_setlladdr() triggers gratuitous ARPs for INET. 
+ */ + error = if_setlladdr(ifp, llq->llq_lladdr, + ETHER_ADDR_LEN); + if (error) + printf("%s: setlladdr failed on %s\n", __func__, + ifp->if_xname); + } else + EVENTHANDLER_INVOKE(iflladdr_event, ifp); CURVNET_RESTORE(); - if (error) - printf("%s: setlladdr failed on %s\n", __func__, - ifp->if_xname); - head = SLIST_NEXT(llq, llq_entries); free(llq, M_DEVBUF); } @@ -585,34 +611,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) if (ifp->if_type != IFT_ETHER) return (EPROTONOSUPPORT); -#ifdef INET6 - /* - * The member interface should not have inet6 address because - * two interfaces with a valid link-local scope zone must not be - * merged in any form. This restriction is needed to - * prevent violation of link-local scope zone. Attempts to - * add a member interface which has inet6 addresses triggers - * removal of all inet6 addresses on the member interface. - */ - SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { - if (in6ifa_llaonifp(lp->lp_ifp)) { - in6_ifdetach(lp->lp_ifp); - if_printf(sc->sc_ifp, - "IPv6 addresses on %s have been removed " - "before adding it as a member to prevent " - "IPv6 address scope violation.\n", - lp->lp_ifp->if_xname); - } - } - if (in6ifa_llaonifp(ifp)) { - in6_ifdetach(ifp); - if_printf(sc->sc_ifp, - "IPv6 addresses on %s have been removed " - "before adding it as a member to prevent " - "IPv6 address scope violation.\n", - ifp->if_xname); - } -#endif /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; @@ -627,10 +625,10 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) return (ENOMEM); /* Check if port is a stacked lagg */ - mtx_lock(&lagg_list_mtx); - SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) { + LAGG_LIST_LOCK(); + SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ @@ -638,14 +636,14 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (E2BIG); } #endif } } - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); /* Change the interface type */ lp->lp_iftype = ifp->if_type; @@ -995,10 +993,12 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; + struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; + const struct lagg_proto *proto = NULL; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; @@ -1046,50 +1046,153 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = priv_check(td, PRIV_NET_LAGG); if (error) break; - if (ra->ra_proto >= LAGG_PROTO_MAX) { + for (proto = lagg_protos; proto->ti_proto != LAGG_PROTO_NONE; + proto++) { + if (proto->ti_proto == ra->ra_proto) { + if (sc->sc_ifflags & IFF_DEBUG) + printf("%s: using proto %u\n", + sc->sc_ifname, proto->ti_proto); + break; + } + } + if (proto->ti_proto == LAGG_PROTO_NONE) { error = EPROTONOSUPPORT; break; } + /* Set to LAGG_PROTO_NONE during the attach. 
*/ LAGG_WLOCK(sc); if (sc->sc_proto != LAGG_PROTO_NONE) { - /* Reset protocol first in case detach unlocks */ sc->sc_proto = LAGG_PROTO_NONE; - error = sc->sc_detach(sc); - sc->sc_detach = NULL; - sc->sc_start = NULL; - sc->sc_input = NULL; - sc->sc_port_create = NULL; - sc->sc_port_destroy = NULL; - sc->sc_linkstate = NULL; - sc->sc_init = NULL; - sc->sc_stop = NULL; - sc->sc_lladdr = NULL; - sc->sc_req = NULL; - sc->sc_portreq = NULL; - } else if (sc->sc_input != NULL) { - /* Still detaching */ - error = EBUSY; - } - if (error != 0) { + if (sc->sc_detach != NULL) + sc->sc_detach(sc); + else + LAGG_WUNLOCK(sc); + } else LAGG_WUNLOCK(sc); + proto->ti_attach(sc); + LAGG_WLOCK(sc); + sc->sc_proto = proto->ti_proto; + LAGG_WUNLOCK(sc); + break; + case SIOCGLAGGOPTS: + ro->ro_opts = sc->sc_opts; + if (sc->sc_proto == LAGG_PROTO_LACP) { + struct lacp_softc *lsc; + + lsc = (struct lacp_softc *)sc->sc_psc; + if (lsc->lsc_debug.lsc_tx_test != 0) + ro->ro_opts |= LAGG_OPT_LACP_TXTEST; + if (lsc->lsc_debug.lsc_rx_test != 0) + ro->ro_opts |= LAGG_OPT_LACP_RXTEST; + if (lsc->lsc_strict_mode != 0) + ro->ro_opts |= LAGG_OPT_LACP_STRICT; + if (lsc->lsc_fast_timeout != 0) + ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; + + ro->ro_active = sc->sc_active; + } else { + ro->ro_active = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + ro->ro_active += LAGG_PORTACTIVE(lp); + } + ro->ro_flapping = sc->sc_flapping; + ro->ro_flowid_shift = sc->flowid_shift; + break; + case SIOCSLAGGOPTS: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (ro->ro_opts == 0) + break; + /* + * Set options. LACP options are stored in sc->sc_psc, + * not in sc_opts. + */ + int valid, lacp; + + switch (ro->ro_opts) { + case LAGG_OPT_USE_FLOWID: + case -LAGG_OPT_USE_FLOWID: + case LAGG_OPT_FLOWIDSHIFT: + valid = 1; + lacp = 0; + break; + case LAGG_OPT_LACP_TXTEST: + case -LAGG_OPT_LACP_TXTEST: + case LAGG_OPT_LACP_RXTEST: + case -LAGG_OPT_LACP_RXTEST: + case LAGG_OPT_LACP_STRICT: + case -LAGG_OPT_LACP_STRICT: + case LAGG_OPT_LACP_TIMEOUT: + case -LAGG_OPT_LACP_TIMEOUT: + valid = lacp = 1; + break; + default: + valid = lacp = 0; break; } - for (int i = 0; i < (sizeof(lagg_protos) / - sizeof(lagg_protos[0])); i++) { - if (lagg_protos[i].ti_proto == ra->ra_proto) { - if (sc->sc_ifflags & IFF_DEBUG) - printf("%s: using proto %u\n", - sc->sc_ifname, - lagg_protos[i].ti_proto); - sc->sc_proto = lagg_protos[i].ti_proto; - if (sc->sc_proto != LAGG_PROTO_NONE) - error = lagg_protos[i].ti_attach(sc); - LAGG_WUNLOCK(sc); - return (error); + + LAGG_WLOCK(sc); + if (valid == 0 || + (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { + /* Invalid combination of options specified. */ + error = EINVAL; + LAGG_WUNLOCK(sc); + break; /* Return from SIOCSLAGGOPTS. */ + } + /* + * Store new options into sc->sc_opts except for + * FLOWIDSHIFT and LACP options. 
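A note on the option handling in this hunk: SIOCSLAGGOPTS deliberately encodes set versus clear in the sign of ro_opts, which is why the validity switch above lists both LAGG_OPT_LACP_STRICT and -LAGG_OPT_LACP_STRICT. A minimal userland sketch of driving the new ioctl, assuming only the struct lagg_reqopts and SIOCSLAGGOPTS definitions this patch adds to net/if_lagg.h; the interface name and error handling are illustrative:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_lagg.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    struct lagg_reqopts ro;
    int s;

    /* Any datagram socket serves as an ioctl handle; needs root. */
    if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
        err(1, "socket");

    memset(&ro, 0, sizeof(ro));
    strlcpy(ro.ro_ifname, "lagg0", sizeof(ro.ro_ifname));

    /* A positive value sets the flag; the lagg must be running LACP
     * for the LACP options, or the kernel answers EINVAL. */
    ro.ro_opts = LAGG_OPT_LACP_STRICT;
    if (ioctl(s, SIOCSLAGGOPTS, &ro) == -1)
        err(1, "SIOCSLAGGOPTS set");

    /* The negated value clears the same flag again. */
    ro.ro_opts = -LAGG_OPT_LACP_STRICT;
    if (ioctl(s, SIOCSLAGGOPTS, &ro) == -1)
        err(1, "SIOCSLAGGOPTS clear");

    close(s);
    return (0);
}

Note that ro_opts == 0 is a no-op by design, and FLOWIDSHIFT is the one pseudo option whose payload travels in ro_flowid_shift rather than in the bitmap itself.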
+ */ + if (lacp == 0) { + if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) + sc->flowid_shift = ro->ro_flowid_shift; + else if (ro->ro_opts > 0) + sc->sc_opts |= ro->ro_opts; + else + sc->sc_opts &= ~ro->ro_opts; + } else { + struct lacp_softc *lsc; + struct lacp_port *lp; + + lsc = (struct lacp_softc *)sc->sc_psc; + + switch (ro->ro_opts) { + case LAGG_OPT_LACP_TXTEST: + lsc->lsc_debug.lsc_tx_test = 1; + break; + case -LAGG_OPT_LACP_TXTEST: + lsc->lsc_debug.lsc_tx_test = 0; + break; + case LAGG_OPT_LACP_RXTEST: + lsc->lsc_debug.lsc_rx_test = 1; + break; + case -LAGG_OPT_LACP_RXTEST: + lsc->lsc_debug.lsc_rx_test = 0; + break; + case LAGG_OPT_LACP_STRICT: + lsc->lsc_strict_mode = 1; + break; + case -LAGG_OPT_LACP_STRICT: + lsc->lsc_strict_mode = 0; + break; + case LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state |= LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + lsc->lsc_fast_timeout = 1; + break; + case -LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state &= ~LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + lsc->lsc_fast_timeout = 0; + break; + } } LAGG_WUNLOCK(sc); - error = EPROTONOSUPPORT; break; case SIOCGLAGGFLAGS: rf->rf_flags = sc->sc_flags; @@ -1134,6 +1237,26 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = EINVAL; break; } +#ifdef INET6 + /* + * A laggport interface should not have an inet6 address + * because two interfaces with a valid link-local + * scope zone must not be merged in any form. This + * restriction is needed to prevent violation of + * link-local scope zone. Attempts to add a laggport + * interface which has inet6 addresses trigger + * removal of all inet6 addresses on the member + * interface. + */ + if (in6ifa_llaonifp(tpif)) { + in6_ifdetach(tpif); + if_printf(sc->sc_ifp, + "IPv6 addresses on %s have been removed " + "before adding it as a member to prevent " + "IPv6 address scope violation.\n", + tpif->if_xname); + } +#endif LAGG_WLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_WUNLOCK(sc); @@ -1419,7 +1542,7 @@ lagg_input(struct ifnet *ifp, struct mbuf *m) ETHER_BPF_MTAP(scifp, m); - m = (*sc->sc_input)(sc, lp, m); + m = (lp->lp_detaching == 0) ?
(*sc->sc_input)(sc, lp, m) : NULL; if (m != NULL) { counter_u64_add(sc->sc_ipackets, 1); @@ -1582,27 +1705,6 @@ lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) return (mtod(m, char *) + off); } -static int -lagg_sysctl_active(SYSCTL_HANDLER_ARGS) -{ - struct lagg_softc *sc = (struct lagg_softc *)arg1; - struct lagg_port *lp; - int error; - - /* LACP tracks active links automatically, the others do not */ - if (sc->sc_proto != LAGG_PROTO_LACP) { - sc->sc_active = 0; - SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) - sc->sc_active += LAGG_PORTACTIVE(lp); - } - - error = sysctl_handle_int(oidp, &sc->sc_active, 0, req); - if ((error) || (req->newptr == NULL)) - return (error); - - return (0); -} - uint32_t lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) { @@ -1715,18 +1817,16 @@ lagg_enqueue(struct ifnet *ifp, struct mbuf *m) /* * Simple round robin aggregation */ - -static int +static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_detach = lagg_rr_detach; sc->sc_start = lagg_rr_start; sc->sc_input = lagg_rr_input; + sc->sc_detach = NULL; sc->sc_port_create = NULL; sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; sc->sc_seq = 0; - - return (0); } static int @@ -1774,8 +1874,7 @@ lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) /* * Active failover */ - -static int +static void lagg_fail_attach(struct lagg_softc *sc) { sc->sc_detach = lagg_fail_detach; @@ -1783,8 +1882,7 @@ lagg_fail_attach(struct lagg_softc *sc) sc->sc_input = lagg_fail_input; sc->sc_port_create = NULL; sc->sc_port_destroy = NULL; - - return (0); + sc->sc_detach = NULL; } static int @@ -1814,7 +1912,7 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; - if (lp == sc->sc_primary || lagg_failover_rx_all) { + if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } @@ -1838,16 +1936,13 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) /* * Loadbalancing */ - -static int +static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; - if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb), - M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) - return (ENOMEM); + lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); sc->sc_detach = lagg_lb_detach; sc->sc_start = lagg_lb_start; @@ -1861,14 +1956,13 @@ lagg_lb_attach(struct lagg_softc *sc) SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); - - return (0); } static int lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + LAGG_WUNLOCK(sc); if (lb != NULL) free(lb, M_DEVBUF); return (0); @@ -1917,7 +2011,7 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp = NULL; uint32_t p = 0; - if (sc->use_flowid && + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else @@ -1952,12 +2046,10 @@ lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) /* * 802.3ad LACP */ - -static int +static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; - int error; sc->sc_detach = lagg_lacp_detach; sc->sc_port_create = lacp_port_create; @@ -1971,31 +2063,28 @@ lagg_lacp_attach(struct lagg_softc *sc) sc->sc_req = lacp_req; sc->sc_portreq = lacp_portreq; - error = lacp_attach(sc); - if (error) - return (error); + lacp_attach(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) 
lacp_port_create(lp); - - return (error); } static int lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; - int error; + void *psc; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); - /* unlocking is safe here */ + psc = sc->sc_psc; + sc->sc_psc = NULL; LAGG_WUNLOCK(sc); - error = lacp_detach(sc); - LAGG_WLOCK(sc); - return (error); + lacp_detach(psc); + + return (0); } static void diff --git a/sys/net/if_lagg.h b/sys/net/if_lagg.h index ff1ae2f..ac01032 100644 --- a/sys/net/if_lagg.h +++ b/sys/net/if_lagg.h @@ -47,17 +47,19 @@ "\05DISTRIBUTING\06DISABLED" /* Supported lagg PROTOs */ -#define LAGG_PROTO_NONE 0 /* no lagg protocol defined */ -#define LAGG_PROTO_ROUNDROBIN 1 /* simple round robin */ -#define LAGG_PROTO_FAILOVER 2 /* active failover */ -#define LAGG_PROTO_LOADBALANCE 3 /* loadbalance */ -#define LAGG_PROTO_LACP 4 /* 802.3ad lacp */ -#define LAGG_PROTO_ETHERCHANNEL 5 /* Cisco FEC */ -#define LAGG_PROTO_MAX 6 +typedef enum { + LAGG_PROTO_NONE = 0, /* no lagg protocol defined */ + LAGG_PROTO_ROUNDROBIN, /* simple round robin */ + LAGG_PROTO_FAILOVER, /* active failover */ + LAGG_PROTO_LOADBALANCE, /* loadbalance */ + LAGG_PROTO_LACP, /* 802.3ad lacp */ + LAGG_PROTO_ETHERCHANNEL,/* Cisco FEC */ + LAGG_PROTO_MAX, +} lagg_proto; struct lagg_protos { const char *lpr_name; - int lpr_proto; + lagg_proto lpr_proto; }; #define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER @@ -134,6 +136,31 @@ struct lagg_reqflags { #define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags) #define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags) +struct lagg_reqopts { + char ro_ifname[IFNAMSIZ]; /* name of the lagg */ + + int ro_opts; /* Option bitmap */ +#define LAGG_OPT_NONE 0x00 +#define LAGG_OPT_USE_FLOWID 0x01 /* use M_FLOWID */ +/* Pseudo flags which are used in ro_opts but not stored into sc_opts. 
*/ +#define LAGG_OPT_FLOWIDSHIFT 0x02 /* Set flowid */ +#define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */ +#define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ +#define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ +#define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */ +#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */ + u_int ro_count; /* number of ports */ + u_int ro_active; /* active port count */ + u_int ro_flapping; /* number of flapping */ + int ro_flowid_shift; /* shift the flowid */ +}; + +#define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) +#define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts) + +#define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \ + "\006LACP_TXTEST\007LACP_RXTEST" + #ifdef _KERNEL #include <sys/counter.h> @@ -183,6 +210,7 @@ struct lagg_mc { struct lagg_llq { struct ifnet *llq_ifp; uint8_t llq_lladdr[ETHER_ADDR_LEN]; + uint8_t llq_primary; SLIST_ENTRY(lagg_llq) llq_entries; }; @@ -229,9 +257,7 @@ struct lagg_softc { eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; struct callout sc_callout; - struct sysctl_ctx_list ctx; /* sysctl variables */ - struct sysctl_oid *sc_oid; /* sysctl tree oid */ - int use_flowid; /* enable use of flowid */ + u_int sc_opts; int flowid_shift; /* set flowid shift*/ }; diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 3c60274..188057d 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -640,7 +640,7 @@ typedef struct _ipfw_table_xentry { char iface[IF_NAMESIZE]; /* interface name */ } k; } ipfw_table_xentry; -#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ +#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ typedef struct _ipfw_table { u_int32_t size; /* size of entries in bytes */ diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 4c94ba8..231b269 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -323,8 +323,6 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: - case SIOCGDRLST_IN6: - case SIOCGPRLST_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); @@ -1254,11 +1252,13 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ - if (hostIsNew && in6if_do_dad(ifp)) - ia->ia6_flags |= IN6_IFF_TENTATIVE; - /* DAD should be performed after ND6_IFF_IFDISABLED is cleared. */ - if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) + /* + * DAD should be performed for a new address or addresses on + * an interface with ND6_IFF_IFDISABLED. + */ + if (in6if_do_dad(ifp) && + (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* @@ -1280,13 +1280,8 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, goto cleanup; } - /* - * Perform DAD, if needed. - * XXX It may be of use, if we can administratively disable DAD. - */ - if (in6if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && - (ia->ia6_flags & IN6_IFF_TENTATIVE)) - { + /* Perform DAD, if the address is TENTATIVE. */ + if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int mindelay, maxdelay; delay = 0; @@ -1619,6 +1614,10 @@ in6_purgeif(struct ifnet *ifp) * in the future. * RFC2373 defines interface id to be 64bit, but it allows non-RFC2374 * address encoding scheme.
(see figure on page 8) + * Notifies other subsystems about address change/arrival: + * 1) Notifies device handler on the first IPv6 address assignment + * 2) Handle routing table changes for P2P links and route + * 3) Handle routing table changes for address host route */ static int in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, @@ -2389,13 +2388,13 @@ in6if_do_dad(struct ifnet *ifp) * However, some interfaces can be up before the RUNNING * status. Additionaly, users may try to assign addresses * before the interface becomes up (or running). - * We simply skip DAD in such a case as a work around. - * XXX: we should rather mark "tentative" on such addresses, - * and do DAD after the interface becomes ready. + * This function returns EAGAIN in that case. + * The caller should mark "tentative" on the address instead of + * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) - return (0); + return (EAGAIN); return (1); } diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c index 5196101..2381dd3 100644 --- a/sys/netinet6/in6_ifattach.c +++ b/sys/netinet6/in6_ifattach.c @@ -595,12 +595,6 @@ in6_ifattach_loopback(struct ifnet *ifp) ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; - /* we don't need to perform DAD on loopback interfaces. */ - ifra.ifra_flags |= IN6_IFF_NODAD; - - /* skip registration to the prefix list. XXX should be temporary. */ - ifra.ifra_flags |= IN6_IFF_NOPFX; - /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h index e0f337f..30dd1e8 100644 --- a/sys/netinet6/in6_var.h +++ b/sys/netinet6/in6_var.h @@ -447,11 +447,6 @@ struct in6_rrenumreq { #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) -#define SIOCGDRLST_IN6 _IOWR('i', 74, struct in6_drlist) -#ifdef _KERNEL -/* XXX: SIOCGPRLST_IN6 is exposed in KAME but in6_oprlist is not. */ -#define SIOCGPRLST_IN6 _IOWR('i', 75, struct in6_oprlist) -#endif #ifdef _KERNEL #define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #endif @@ -499,14 +494,11 @@ struct in6_rrenumreq { #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address - * (used only at first SIOC* call) + * (obsolete) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */ #define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */ -#define IN6_IFF_NOPFX 0x8000 /* skip kernel prefix management. - * XXX: this should be temporary. 
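A note on the in6if_do_dad() change above: the function now has three outcomes rather than two, and because EAGAIN is nonzero, boolean callers such as the in6_update_ifa() hunk earlier still take the "mark IN6_IFF_TENTATIVE" path; the actual probe is then deferred until the interface is both IFF_UP and IFF_DRV_RUNNING (see the nd6_dad_start() hunk further down). A condensed standalone model of that contract, with simplified flags standing in for the real ifnet state:

#include <errno.h>
#include <stdio.h>

#define IFF_UP          0x1
#define IFF_DRV_RUNNING 0x2

/* 0 = never DAD (e.g. loopback), 1 = probe now, EAGAIN = defer. */
static int
model_in6if_do_dad(int flags, int is_loopback)
{
    if (is_loopback)
        return (0);
    if (!((flags & IFF_UP) && (flags & IFF_DRV_RUNNING)))
        return (EAGAIN);
    return (1);
}

int
main(void)
{
    /* Not up yet: the nonzero result only marks the address tentative. */
    printf("deferred: %s\n",
        model_in6if_do_dad(0, 0) == EAGAIN ? "yes" : "no");
    /* Up and running: probe immediately. */
    printf("probe now: %s\n",
        model_in6if_do_dad(IFF_UP | IFF_DRV_RUNNING, 0) == 1 ?
        "yes" : "no");
    return (0);
}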
- */ /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index 94add9a..d2aa188 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -1246,99 +1246,14 @@ nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { - struct in6_drlist *drl = (struct in6_drlist *)data; - struct in6_oprlist *oprl = (struct in6_oprlist *)data; struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; - struct nd_defrouter *dr; - struct nd_prefix *pr; - int i = 0, error = 0; + int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { - case SIOCGDRLST_IN6: - /* - * obsolete API, use sysctl under net.inet6.icmp6 - */ - bzero(drl, sizeof(*drl)); - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { - if (i >= DRLSTSIZ) - break; - drl->defrouter[i].rtaddr = dr->rtaddr; - in6_clearscope(&drl->defrouter[i].rtaddr); - - drl->defrouter[i].flags = dr->flags; - drl->defrouter[i].rtlifetime = dr->rtlifetime; - drl->defrouter[i].expire = dr->expire + - (time_second - time_uptime); - drl->defrouter[i].if_index = dr->ifp->if_index; - i++; - } - break; - case SIOCGPRLST_IN6: - /* - * obsolete API, use sysctl under net.inet6.icmp6 - * - * XXX the structure in6_prlist was changed in backward- - * incompatible manner. in6_oprlist is used for SIOCGPRLST_IN6, - * in6_prlist is used for nd6_sysctl() - fill_prlist(). - */ - /* - * XXX meaning of fields, especialy "raflags", is very - * differnet between RA prefix list and RR/static prefix list. - * how about separating ioctls into two? - */ - bzero(oprl, sizeof(*oprl)); - LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { - struct nd_pfxrouter *pfr; - int j; - - if (i >= PRLSTSIZ) - break; - oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr; - oprl->prefix[i].raflags = pr->ndpr_raf; - oprl->prefix[i].prefixlen = pr->ndpr_plen; - oprl->prefix[i].vltime = pr->ndpr_vltime; - oprl->prefix[i].pltime = pr->ndpr_pltime; - oprl->prefix[i].if_index = pr->ndpr_ifp->if_index; - if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) - oprl->prefix[i].expire = 0; - else { - time_t maxexpire; - - /* XXX: we assume time_t is signed. */ - maxexpire = (-1) & - ~((time_t)1 << - ((sizeof(maxexpire) * 8) - 1)); - if (pr->ndpr_vltime < - maxexpire - pr->ndpr_lastupdate) { - oprl->prefix[i].expire = - pr->ndpr_lastupdate + - pr->ndpr_vltime + - (time_second - time_uptime); - } else - oprl->prefix[i].expire = maxexpire; - } - - j = 0; - LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { - if (j < DRLSTSIZ) { -#define RTRADDR oprl->prefix[i].advrtr[j] - RTRADDR = pfr->router->rtaddr; - in6_clearscope(&RTRADDR); -#undef RTRADDR - } - j++; - } - oprl->prefix[i].advrtrs = j; - oprl->prefix[i].origin = PR_ORIG_RA; - - i++; - } - - break; case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ @@ -1398,22 +1313,19 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. 
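The SIOCGDRLST_IN6/SIOCGPRLST_IN6 removal above retires ioctls whose own comments had long pointed at the sysctl tree under net.inet6.icmp6 as the replacement. A hedged userland sketch of the substitute query; it assumes the net.inet6.icmp6.nd6_drlist MIB that ndp(8) consumes and the packed struct in6_defrouter records it returns:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/nd6.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    struct in6_defrouter *p, *ep;
    size_t len;
    char *buf;

    /* Standard two-step sysctl: probe the size, then fetch. */
    if (sysctlbyname("net.inet6.icmp6.nd6_drlist", NULL, &len,
        NULL, 0) == -1)
        err(1, "sysctlbyname size");
    if (len == 0) {
        printf("no default routers\n");
        return (0);
    }
    if ((buf = malloc(len)) == NULL)
        err(1, "malloc");
    if (sysctlbyname("net.inet6.icmp6.nd6_drlist", buf, &len,
        NULL, 0) == -1)
        err(1, "sysctlbyname fetch");

    p = (struct in6_defrouter *)buf;
    ep = (struct in6_defrouter *)(buf + len);
    for (; p < ep; p++)
        printf("router via if_index %u, lifetime %us\n",
            (unsigned)p->if_index, (unsigned)p->rtlifetime);
    free(buf);
    return (0);
}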
*/ - int duplicated_linklocal = 0; - IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && - IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { - duplicated_linklocal = 1; + IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; - } } IF_ADDR_RUNLOCK(ifp); - if (duplicated_linklocal) { + if (ifa != NULL) { + /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" @@ -1429,14 +1341,18 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - ia = (struct in6_ifaddr *)ifa; - ia->ia6_flags |= IN6_IFF_TENTATIVE; + if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, + ifa_link) { + if (ifa->ifa_addr->sa_family != + AF_INET6) + continue; + ia = (struct in6_ifaddr *)ifa; + ia->ia6_flags |= IN6_IFF_TENTATIVE; + } + IF_ADDR_RUNLOCK(ifp); } - IF_ADDR_RUNLOCK(ifp); } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { @@ -1454,20 +1370,19 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * address is assigned, and IFF_UP, try to * assign one. */ - int haslinklocal = 0; - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, + ifa_link) { + if (ifa->ifa_addr->sa_family != + AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; - if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { - haslinklocal = 1; + if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; - } } IF_ADDR_RUNLOCK(ifp); - if (!haslinklocal) + if (ifa == NULL) + /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index 2272cd0..8f287dc 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -576,7 +576,7 @@ nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6, /* * Add a Nonce option (RFC 3971) to detect looped back NS messages. * This behavior is documented as Enhanced Duplicate Address - * Detection in draft-ietf-6man-enhanced-dad-13. + * Detection in RFC 7527. * net.inet6.ip6.dad_enhanced=0 disables this.
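The RFC 7527 mechanism referenced just above is worth spelling out: the host remembers the random nonce it placed in its own DAD Neighbor Solicitation, and an incoming NS that carries that exact nonce is its own transmission reflected by the link, so it must be ignored rather than treated as an address collision. A standalone sketch of the comparison only; the kernel keeps the nonce in its per-address DAD state, not in a global:

#include <stdio.h>
#include <string.h>

#define NONCE_LEN 6    /* RFC 3971 nonce option payload length */

/* Nonce we transmitted with our own DAD probe. */
static unsigned char sent_nonce[NONCE_LEN] = { 9, 9, 9, 9, 9, 9 };

/* 1 if the received NS is our own looped-back probe. */
static int
is_looped_back(const unsigned char *rcvd)
{
    return (memcmp(rcvd, sent_nonce, NONCE_LEN) == 0);
}

int
main(void)
{
    unsigned char peer[NONCE_LEN] = { 1, 2, 3, 4, 5, 6 };

    printf("own echo ignored: %d\n", is_looped_back(sent_nonce));
    printf("real collision detected: %d\n", !is_looped_back(peer));
    return (0);
}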
*/ if (V_dad_enhanced != 0 && nonce != NULL) { @@ -1308,11 +1308,16 @@ nd6_dad_start(struct ifaddr *ifa, int delay) } if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); - if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { + if (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) { + ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } - if (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) + if (!(ifa->ifa_ifp->if_flags & IFF_UP) || + !(ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) || + (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED)) { + ia->ia6_flags |= IN6_IFF_TENTATIVE; return; + } if ((dp = nd6_dad_find(ifa, NULL)) != NULL) { /* DAD already in progress */ nd6_dad_rele(dp); diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index ef66064..6a6a10f 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1432,7 +1432,8 @@ retry: static mmu_t installed_mmu; static void * -moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, + int wait) { /* * This entire routine is a horrible hack to avoid bothering kmem diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c index 9d60b2b..89cfabf 100644 --- a/sys/powerpc/aim/slb.c +++ b/sys/powerpc/aim/slb.c @@ -473,7 +473,7 @@ slb_insert_user(pmap_t pm, struct slb *slb) } static void * -slb_uma_real_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { static vm_offset_t realmax = 0; void *va; diff --git a/sys/powerpc/aim/uma_machdep.c b/sys/powerpc/aim/uma_machdep.c index 255826e..33674e5 100644 --- a/sys/powerpc/aim/uma_machdep.c +++ b/sys/powerpc/aim/uma_machdep.c @@ -50,7 +50,7 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { void *va; vm_page_t m; @@ -82,7 +82,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c index 779b953..60e4a2f 100644 --- a/sys/sparc64/sparc64/vm_machdep.c +++ b/sys/sparc64/sparc64/vm_machdep.c @@ -502,7 +502,7 @@ swi_vm(void *v) } void * -uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; @@ -540,7 +540,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) } void -uma_small_free(void *mem, int size, u_int8_t flags) +uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; diff --git a/sys/sys/busdma_bufalloc.h b/sys/sys/busdma_bufalloc.h index f5ec32f..dfcb4b8 100644 --- a/sys/sys/busdma_bufalloc.h +++ b/sys/sys/busdma_bufalloc.h @@ -110,9 +110,10 @@ struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, * routines support pmap_page_set_memattr() and the VM_MEMATTR_UNCACHEABLE flag * you can probably use these when you need uncacheable buffers. 
*/ -void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, - u_int8_t *pflag, int wait); -void busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag); +void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, + uint8_t *pflag, int wait); +void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, + uint8_t pflag); #endif /* _MACHINE_BUSDMA_BUFALLOC_H_ */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index fe4c9ea..eb691b8 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -600,6 +600,7 @@ struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); /* cache_* may belong in namei.h. */ +void cache_changesize(int newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, @@ -836,6 +837,7 @@ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); +void vfs_hash_changesize(int newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); diff --git a/sys/teken/teken.c b/sys/teken/teken.c index 3002a88..8834390 100644 --- a/sys/teken/teken.c +++ b/sys/teken/teken.c @@ -29,12 +29,14 @@ #include <sys/cdefs.h> #if defined(__FreeBSD__) && defined(_KERNEL) #include <sys/param.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/systm.h> #define teken_assert(x) MPASS(x) #else /* !(__FreeBSD__ && _KERNEL) */ #include <sys/types.h> #include <assert.h> +#include <limits.h> #include <stdint.h> #include <stdio.h> #include <string.h> @@ -405,18 +407,24 @@ teken_state_numbers(teken_t *t, teken_char_t c) teken_assert(t->t_curnum < T_NUMSIZE); if (c >= '0' && c <= '9') { - /* - * Don't do math with the default value of 1 when a - * custom number is inserted. - */ if (t->t_stateflags & TS_FIRSTDIGIT) { + /* First digit. */ t->t_stateflags &= ~TS_FIRSTDIGIT; - t->t_nums[t->t_curnum] = 0; - } else { - t->t_nums[t->t_curnum] *= 10; + t->t_nums[t->t_curnum] = c - '0'; + } else if (t->t_nums[t->t_curnum] < UINT_MAX / 100) { + /* + * There is no need to continue parsing input + * once the value exceeds the size of the + * terminal. It would only allow for integer + * overflows when performing arithmetic on the + * cursor position. + * + * Ignore any further digits if the value is + * already UINT_MAX / 100. + */ + t->t_nums[t->t_curnum] = + t->t_nums[t->t_curnum] * 10 + c - '0'; } - - t->t_nums[t->t_curnum] += c - '0'; return (1); } else if (c == ';') { if (t->t_stateflags & TS_FIRSTDIGIT) diff --git a/sys/teken/teken_subr.h b/sys/teken/teken_subr.h index 2eee627..d458f2a 100644 --- a/sys/teken/teken_subr.h +++ b/sys/teken/teken_subr.h @@ -324,13 +324,13 @@ static void teken_subr_cursor_position(teken_t *t, unsigned int row, unsigned int col) { - t->t_cursor.tp_row = t->t_originreg.ts_begin + row - 1; - if (t->t_cursor.tp_row >= t->t_originreg.ts_end) - t->t_cursor.tp_row = t->t_originreg.ts_end - 1; + row = row - 1 + t->t_originreg.ts_begin; + t->t_cursor.tp_row = row < t->t_originreg.ts_end ? + row : t->t_originreg.ts_end - 1; - t->t_cursor.tp_col = col - 1; - if (t->t_cursor.tp_col >= t->t_winsize.tp_col) - t->t_cursor.tp_col = t->t_winsize.tp_col - 1; + col--; + t->t_cursor.tp_col = col < t->t_winsize.tp_col ? 
+ col : t->t_winsize.tp_col - 1; t->t_stateflags &= ~TS_WRAPPED; teken_funcs_cursor(t); @@ -583,9 +583,9 @@ static void teken_subr_horizontal_position_absolute(teken_t *t, unsigned int col) { - t->t_cursor.tp_col = col - 1; - if (t->t_cursor.tp_col >= t->t_winsize.tp_col) - t->t_cursor.tp_col = t->t_winsize.tp_col - 1; + col--; + t->t_cursor.tp_col = col < t->t_winsize.tp_col ? + col : t->t_winsize.tp_col - 1; t->t_stateflags &= ~TS_WRAPPED; teken_funcs_cursor(t); @@ -1297,9 +1297,9 @@ static void teken_subr_vertical_position_absolute(teken_t *t, unsigned int row) { - t->t_cursor.tp_row = t->t_originreg.ts_begin + row - 1; - if (t->t_cursor.tp_row >= t->t_originreg.ts_end) - t->t_cursor.tp_row = t->t_originreg.ts_end - 1; + row = row - 1 + t->t_originreg.ts_begin; + t->t_cursor.tp_row = row < t->t_originreg.ts_end ? + row : t->t_originreg.ts_end - 1; t->t_stateflags &= ~TS_WRAPPED; teken_funcs_cursor(t); diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 5ac391e..4e2c9ea 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -923,8 +923,7 @@ static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); -static bool softdep_excess_inodes(struct ufsmount *); -static bool softdep_excess_dirrem(struct ufsmount *); +static bool softdep_excess_items(struct ufsmount *, int); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); @@ -2212,7 +2211,7 @@ inodedep_lookup(mp, inum, flags, inodedeppp) * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ - if (softdep_excess_inodes(ump)) + if (softdep_excess_items(ump, D_INODEDEP)) schedule_cleanup(mp); else FREE_LOCK(ump); @@ -2307,7 +2306,12 @@ newblk_lookup(mp, newblkno, flags, newblkpp) return (1); if ((flags & DEPALLOC) == 0) return (0); - FREE_LOCK(ump); + if (softdep_excess_items(ump, D_NEWBLK) || + softdep_excess_items(ump, D_ALLOCDIRECT) || + softdep_excess_items(ump, D_ALLOCINDIR)) + schedule_cleanup(mp); + else + FREE_LOCK(ump); newblk = malloc(sizeof(union allblk), M_NEWBLK, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); @@ -2406,7 +2410,11 @@ softdep_initialize() { TAILQ_INIT(&softdepmounts); +#ifdef __LP64__ max_softdeps = desiredvnodes * 4; +#else + max_softdeps = desiredvnodes * 2; +#endif /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; @@ -9106,7 +9114,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * the number of freefile and freeblks structures. 
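Returning briefly to the teken_subr.h clamps above: they are not just a restyle. col is an unsigned int, so col-- on an input of 0 wraps to UINT_MAX, and the single upper-bound test then pins both oversized and wrapped values to the last column. Crucially, the comparison now happens in full unsigned int width before anything is stored; the old code stored col - 1 into the cursor field first, where truncation to a narrower cursor type (if any) could alias a huge parsed value back into range and slip past the clamp. A standalone demonstration:

#include <stdio.h>

#define WINSIZE 80u    /* stand-in for t->t_winsize.tp_col */

static unsigned int
clamp_col(unsigned int col)
{
    col--;    /* 1-based input; 0 wraps to UINT_MAX on purpose */
    return (col < WINSIZE ? col : WINSIZE - 1);
}

int
main(void)
{
    /* In-range, oversized, and underflowed inputs. */
    printf("%u %u %u\n", clamp_col(1), clamp_col(200), clamp_col(0));
    /* prints: 0 79 79 */
    return (0);
}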
*/ ACQUIRE_LOCK(ip->i_ump); - if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump)) + if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM)) schedule_cleanup(ITOV(dp)->v_mount); else FREE_LOCK(ip->i_ump); @@ -13244,20 +13252,12 @@ retry: } static bool -softdep_excess_inodes(struct ufsmount *ump) -{ - - return (dep_current[D_INODEDEP] > max_softdeps && - ump->softdep_curdeps[D_INODEDEP] > max_softdeps / - stat_flush_threads); -} - -static bool -softdep_excess_dirrem(struct ufsmount *ump) +softdep_excess_items(struct ufsmount *ump, int item) { - return (dep_current[D_DIRREM] > max_softdeps / 2 && - ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / + KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); + return (dep_current[item] > max_softdeps && + ump->softdep_curdeps[item] > max_softdeps / stat_flush_threads); } @@ -13313,15 +13313,25 @@ softdep_ast_cleanup_proc(void) for (;;) { req = false; ACQUIRE_LOCK(ump); - if (softdep_excess_inodes(ump)) { + if (softdep_excess_items(ump, D_INODEDEP)) { req = true; request_cleanup(mp, FLUSH_INODES); } - if (softdep_excess_dirrem(ump)) { + if (softdep_excess_items(ump, D_DIRREM)) { req = true; request_cleanup(mp, FLUSH_BLOCKS); } FREE_LOCK(ump); + if (softdep_excess_items(ump, D_NEWBLK) || + softdep_excess_items(ump, D_ALLOCDIRECT) || + softdep_excess_items(ump, D_ALLOCINDIR)) { + error = vn_start_write(NULL, &mp, V_WAIT); + if (error == 0) { + req = true; + VFS_SYNC(mp, MNT_WAIT); + vn_finished_write(mp); + } + } if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) break; } diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index c09dbc2..4deebd9 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -2354,8 +2354,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred) swap_pager_swapoff(sp); sp->sw_close(curthread, sp); - sp->sw_id = NULL; mtx_lock(&sw_dev_mtx); + sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { @@ -2541,13 +2541,39 @@ swapgeom_close_ev(void *arg, int flags) g_destroy_consumer(cp); } +/* + * Add a reference to the g_consumer for an inflight transaction. + */ +static void +swapgeom_acquire(struct g_consumer *cp) +{ + + mtx_assert(&sw_dev_mtx, MA_OWNED); + cp->index++; +} + +/* + * Remove a reference from the g_consumer. Post a close event if + * all references go away.
+ */ +static void +swapgeom_release(struct g_consumer *cp, struct swdevt *sp) +{ + + mtx_assert(&sw_dev_mtx, MA_OWNED); + cp->index--; + if (cp->index == 0) { + if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) + sp->sw_id = NULL; + } +} + static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; - int destroy; bp = bp2->bio_caller2; cp = bp2->bio_from; @@ -2557,16 +2583,11 @@ swapgeom_done(struct bio *bp2) bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); + sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); - destroy = ((--cp->index) == 0 && cp->private); - if (destroy) { - sp = bp2->bio_caller1; - sp->sw_id = NULL; - } + swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); - if (destroy) - g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static void @@ -2584,13 +2605,16 @@ swapgeom_strategy(struct buf *bp, struct swdevt *sp) bufdone(bp); return; } - cp->index++; + swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { + mtx_lock(&sw_dev_mtx); + swapgeom_release(cp, sp); + mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); @@ -2630,7 +2654,12 @@ swapgeom_orphan(struct g_consumer *cp) break; } } - cp->private = (void *)(uintptr_t)1; + /* + * Drop reference we were created with. Do directly since we're in a + * special context where we don't have to queue the call to + * swapgeom_close_ev(). + */ + cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; @@ -2691,8 +2720,8 @@ swapongeom_ev(void *arg, int flags) if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); - cp->index = 0; /* Number of active I/Os. */ - cp->private = NULL; /* Orphanization flag */ + cp->index = 1; /* Number of active I/Os, plus one for being active. */ + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Everytime you think you can improve the margin for diff --git a/sys/vm/uma.h b/sys/vm/uma.h index ed69e19..d3e0658 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -382,7 +382,8 @@ uma_zfree(uma_zone_t zone, void *item) * A pointer to the allocated memory or NULL on failure. */ -typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait); +typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, + int wait); /* * Backend page free routines @@ -395,7 +396,7 @@ typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait); * Returns: * None */ -typedef void (*uma_free)(void *item, int size, uint8_t pflag); +typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index d0df901..ee0b207 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -229,10 +229,10 @@ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; /* Prototypes.. 
*/ -static void *noobj_alloc(uma_zone_t, int, uint8_t *, int); -static void *page_alloc(uma_zone_t, int, uint8_t *, int); -static void *startup_alloc(uma_zone_t, int, uint8_t *, int); -static void page_free(void *, int, uint8_t); +static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); @@ -1038,7 +1038,7 @@ out: * the VM is ready. */ static void * -startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { uma_keg_t keg; uma_slab_t tmps; @@ -1098,7 +1098,7 @@ startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) +page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { void *p; /* Returned page */ @@ -1120,7 +1120,7 @@ page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait) +noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; @@ -1183,7 +1183,7 @@ noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait) * Nothing */ static void -page_free(void *mem, int size, uint8_t flags) +page_free(void *mem, vm_size_t size, uint8_t flags) { struct vmem *vmem; @@ -3269,7 +3269,7 @@ uma_zone_exhausted_nolock(uma_zone_t zone) } void * -uma_large_malloc(int size, int wait) +uma_large_malloc(vm_size_t size, int wait) { void *mem; uma_slab_t slab; diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index 1ffc7d5..ad2a405 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -341,7 +341,7 @@ zone_first_keg(uma_zone_t zone) #ifdef _KERNEL /* Internal prototypes */ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data); -void *uma_large_malloc(int size, int wait); +void *uma_large_malloc(vm_size_t size, int wait); void uma_large_free(uma_slab_t slab); /* Lock Macros */ @@ -424,8 +424,9 @@ vsetslab(vm_offset_t va, uma_slab_t slab) * if they can provide more effecient allocation functions. This is useful * for using direct mapped addresses. 
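The int-to-vm_size_t sweep that runs through this patch changes the contract of every custom UMA backend, so out-of-tree allocators must adopt the same shapes, presumably so that very large requests can no longer be truncated through an int parameter. A hypothetical kernel-side pair matching the new typedefs; the malloc type and the contigmalloc() backing are illustrative only, and real allocators typically record a UMA_SLAB_* hint in *pflag so the matching free routine knows how the memory was obtained:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/uma.h>

static MALLOC_DEFINE(M_EXAMPLE, "example", "example UMA backend pages");

/* Matches: void *(*uma_alloc)(uma_zone_t, vm_size_t, uint8_t *, int) */
static void *
example_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{

    *pflag = 0;    /* this alloc/free pair needs no UMA_SLAB_* bookkeeping */
    return (contigmalloc(bytes, M_EXAMPLE, wait, 0, ~(vm_paddr_t)0,
        PAGE_SIZE, 0));
}

/* Matches: void (*uma_free)(void *, vm_size_t, uint8_t) */
static void
example_free(void *mem, vm_size_t size, uint8_t pflag)
{

    contigfree(mem, size, M_EXAMPLE);
}

The pair would be installed with uma_zone_set_allocf() and uma_zone_set_freef() after uma_zcreate(); before this change the same code compiled against the int-sized prototypes.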
*/ -void *uma_small_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait); -void uma_small_free(void *mem, int size, uint8_t flags); +void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, + int wait); +void uma_small_free(void *mem, vm_size_t size, uint8_t flags); #endif /* _KERNEL */ #endif /* VM_UMA_INT_H */ diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 93db8d1..46e3089 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -3989,12 +3989,10 @@ RetryLookup:; vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } - if ((entry->eflags & MAP_ENTRY_USER_WIRED) && - (entry->eflags & MAP_ENTRY_COW) && - (fault_type & VM_PROT_WRITE)) { - vm_map_unlock_read(map); - return (KERN_PROTECTION_FAILURE); - } + KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & + (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != + (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), + ("entry %p flags %x", entry, entry->eflags)); if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { @@ -4148,10 +4146,6 @@ vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); - if ((entry->eflags & MAP_ENTRY_USER_WIRED) && - (entry->eflags & MAP_ENTRY_COW) && - (fault_type & VM_PROT_WRITE)) - return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 998cd37..6a56fd7 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -93,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sdt.h> #include <sys/signalvar.h> #include <sys/smp.h> +#include <sys/time.h> #include <sys/vnode.h> #include <sys/vmmeter.h> #include <sys/rwlock.h> @@ -170,7 +171,7 @@ static int vm_pageout_update_period; static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; -static int lowmem_ticks; +static time_t lowmem_uptime; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; @@ -932,7 +933,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && - (ticks - lowmem_ticks) / hz >= lowmem_period) { + (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ @@ -943,7 +944,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) * drained above. */ uma_reclaim(); - lowmem_ticks = ticks; + lowmem_uptime = time_uptime; } /* diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h index b725776..9c73ae7 100644 --- a/sys/xen/interface/io/blkif.h +++ b/sys/xen/interface/io/blkif.h @@ -97,6 +97,28 @@ * * The type of the backing device/object. * + * + * direct-io-safe + * Values: 0/1 (boolean) + * Default Value: 0 + * + * The underlying storage is not affected by the direct IO memory + * lifetime bug. See: + * http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html + * + * Therefore this option gives the backend permission to use + * O_DIRECT, notwithstanding that bug. + * + * That is, if this option is enabled, use of O_DIRECT is safe, + * in circumstances where we would normally have avoided it as a + * workaround for that bug. This option is not relevant for all + * backends, and even not necessarily supported for those for + * which it is relevant. A backend which knows that it is not + * affected by the bug can ignore this option. 
+ * + * This option doesn't require a backend to use O_DIRECT, so it + * should not be used to try to control the caching behaviour. + * *--------------------------------- Features --------------------------------- * * feature-barrier @@ -126,6 +148,34 @@ * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7 + * + * A value of "1" indicates that the backend can keep the grants used + * by the frontend driver mapped, so the same set of grants should be + * used in all transactions. The maximum number of grants the backend + * can map persistently depends on the implementation, but ideally it + * should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this + * feature the backend doesn't need to unmap each grant, preventing + * costly TLB flushes. The backend driver should only map grants + * persistently if the frontend supports it. If a backend driver chooses + * to use the persistent protocol when the frontend doesn't support it, + * it will probably hit the maximum number of persistently mapped grants + * (due to the fact that the frontend won't be reusing the same grants), + * and fall back to non-persistent mode. Backend implementations may + * shrink or expand the number of persistently mapped grants without + * notifying the frontend depending on memory constraints (this might + * cause a performance degradation). + * + * If a backend driver wants to limit the maximum number of persistently + * mapped grants to a value less than RING_SIZE * + * BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be used to + * discard the grants that are less commonly used. Using a LRU in the + * backend driver paired with a LIFO queue in the frontend will + * allow us to have better performance in this scenario. + * *----------------------- Request Transport Parameters ------------------------ * * max-ring-page-order @@ -145,33 +195,17 @@ * The maximum supported size of the request ring buffer in units of * machine pages. The value must be a power of 2. * - * max-requests <uint32_t> - * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE) - * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages) - * - * The maximum number of concurrent, logical requests supported by - * the backend. - * - * Note: A logical request may span multiple ring entries. - * - * max-request-segments - * Values: <uint8_t> - * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK - * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST - * - * The maximum value of blkif_request.nr_segments supported by - * the backend. - * - * max-request-size - * Values: <uint32_t> - * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE - * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE + *------------------------- Backend Device Properties ------------------------- * - * The maximum amount of data, in bytes, that can be referenced by a - * request type that accesses frontend memory (currently BLKIF_OP_READ, - * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER). + * discard-enable + * Values: 0/1 (boolean) + * Default Value: 1 * - *------------------------- Backend Device Properties ------------------------- + * This optional property, set by the toolstack, instructs the backend + * to offer discard to the frontend. If the property is missing the + * backend should offer discard if the backing storage actually supports + * it. 
* * discard-alignment * Values: <uint32_t> @@ -192,6 +226,7 @@ * discard-secure * Values: 0/1 (boolean) * Default Value: 0 + * Notes: 10 * * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD * requests with the BLKIF_DISCARD_SECURE flag set. @@ -206,13 +241,17 @@ * sector-size * Values: <uint32_t> * - * The size, in bytes, of the individually addressible data blocks - * on the backend device. + * The logical sector size, in bytes, of the backend device. + * + * physical-sector-size + * Values: <uint32_t> + * + * The physical sector size, in bytes, of the backend device. * * sectors * Values: <uint64_t> * - * The size of the backend device, expressed in units of its native + * The size of the backend device, expressed in units of its logical * sector size ("sector-size"). * ***************************************************************************** @@ -269,32 +308,26 @@ * The size of the frontend allocated request ring buffer in units of * machine pages. The value must be a power of 2. * - * max-requests - * Values: <uint32_t> - * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE) - * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages) - * - * The maximum number of concurrent, logical requests that will be - * issued by the frontend. - * - * Note: A logical request may span multiple ring entries. - * - * max-request-segments - * Values: <uint8_t> - * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK - * Maximum Value: MIN(255, backend/max-request-segments) - * - * The maximum value the frontend will set in the - * blkif_request.nr_segments field. - * - * max-request-size - * Values: <uint32_t> - * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE - * Maximum Value: max-request-segments * PAGE_SIZE - * - * The maximum amount of data, in bytes, that can be referenced by - * a request type that accesses frontend memory (currently BLKIF_OP_READ, - * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER). + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7, 8, 9 + * + * A value of "1" indicates that the frontend will reuse the same grants + * for all transactions, allowing the backend to map them with write + * access (even when it should be read-only). If the frontend hits the + * maximum number of allowed persistently mapped grants, it can fallback + * to non persistent mode. This will cause a performance degradation, + * since the backend driver will still try to map those grants + * persistently. Since the persistent grants protocol is compatible with + * the previous protocol, a frontend driver can choose to work in + * persistent mode even when the backend doesn't support it. + * + * It is recommended that the frontend driver stores the persistently + * mapped grants in a LIFO queue, so a subset of all persistently mapped + * grants gets used commonly. This is done in case the backend driver + * decides to limit the maximum number of persistently mapped grants + * to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. * *------------------------- Virtual Device Properties ------------------------- * @@ -315,17 +348,23 @@ * ----- * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer * PV drivers.
- * (2) Multi-page ring buffer scheme first used in some Red Hat distributions + * (2) Multi-page ring buffer scheme first used in some RedHat distributions * including a distribution deployed on certain nodes of the Amazon * EC2 cluster. * (3) Support for multi-page ring buffers was implemented independently, - * in slightly different forms, by both Citrix and Red Hat/Amazon. + * in slightly different forms, by both Citrix and RedHat/Amazon. * For full interoperability, block front and backends should publish * identical ring parameters, adjusted for unit differences, to the * XenStore nodes used in both schemes. - * (4) Devices that support discard functionality may internally allocate - * space (discardable extents) in units that are larger than the - * exported logical block size. + * (4) Devices that support discard functionality may internally allocate space + * (discardable extents) in units that are larger than the exported logical + * block size. If the backing device has such discardable extents the + * backend should provide both discard-granularity and discard-alignment. + * Providing just one of the two may be considered an error by the frontend. + * Backends supporting discard should include discard-granularity and + * discard-alignment even if they support discarding individual sectors. + * Frontends should assume discard-alignment == 0 and discard-granularity + * == sector size if these keys are missing. * (5) The discard-alignment parameter allows a physical device to be * partitioned into virtual devices that do not necessarily begin or * end on a discardable extent boundary. @@ -333,6 +372,19 @@ * 'ring-ref' is used to communicate the grant reference for this * page to the backend. When using a multi-page ring, the 'ring-ref' * node is not created. Instead 'ring-ref0' - 'ring-refN' are used. + * (7) When using persistent grants, data has to be copied from/to the page + * where the grant is currently mapped. The overhead of doing this copy + * however doesn't suppress the speed improvement of not having to unmap + * the grants. + * (8) The frontend driver has to allow the backend driver to map all grants + * with write access, even when they should be mapped read-only, since + * further requests may reuse these grants and require write permissions. + * (9) Linux implementation doesn't have a limit on the maximum number of + * grants that can be persistently mapped in the frontend driver, but + * due to the frontend driver implementation it should never be bigger + * than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. + *(10) The discard-secure property may be present and will be set to 1 if the + * backing device supports secure discard. */ /* @@ -457,16 +509,48 @@ #define BLKIF_OP_DISCARD 5 /* - * Maximum scatter/gather segments per request (header + segment blocks). + * Recognized if "feature-max-indirect-segments" is present in the backend + * xenbus info. The "feature-max-indirect-segments" node contains the maximum + * number of segments allowed by the backend per request. If the node is + * present, the frontend might use blkif_request_indirect structs in order to + * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The + * maximum number of indirect segments is fixed by the backend, but the + * frontend can issue requests with any number of indirect segments as long as + * it's less than the number provided by the backend.
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
+ */
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
/*
+ * Maximum number of indirect pages to use per request.
+ */
+#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
+
+/*
 * NB. first_sect and last_sect in blkif_request_segment, as well as
 * sector_number in blkif_request, are always expressed in 512-byte units.
 * However they must be properly aligned to the real sector size of the
- * physical disk, which is reported in the "sector-size" node in the backend
- * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ * physical disk, which is reported in the "physical-sector-size" node in
+ * the backend xenbus info. Also the xenbus "sectors" node is expressed in
+ * 512-byte units.
 */
struct blkif_request_segment {
    grant_ref_t gref;        /* reference to I/O buffer frame */
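To make the alignment rule concrete: the fields count 512-byte units, but a
request must start and end on boundaries of the advertised physical sector
size. A sketch of the check a frontend might apply; blkif_req_aligned is a
hypothetical helper, not part of the protocol:

    /* sector_number, first_sect and last_sect count 512-byte units; psect
     * is the "physical-sector-size" value read from xenstore (e.g. 4096). */
    static int
    blkif_req_aligned(blkif_sector_t sector_number, uint8_t first_sect,
        uint8_t last_sect, uint32_t psect)
    {
        uint32_t units = psect / 512;  /* 512-byte units per physical sector */

        return ((sector_number % units) == 0 &&
            (first_sect % units) == 0 &&
            (((uint32_t)last_sect + 1) % units) == 0);
    }

For a 4096-byte physical sector, units == 8, so sector_number must be a
multiple of 8 and each segment must cover whole 4 KiB blocks.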
@@ -478,21 +562,6 @@ typedef struct blkif_request_segment blkif_request_segment_t;
/*
 * Starting ring element for any I/O request.
- *
- * One or more segment blocks can be inserted into the request ring
- * just after a blkif_request_t, allowing requests to operate on
- * up to BLKIF_MAX_SEGMENTS_PER_REQUEST.
- *
- * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_requst.nr_segments
- * to determine the number of contiguous ring entries associated
- * with this request.
- *
- * Note: Due to the way Xen request rings operate, the producer and
- * consumer indices of the ring must be incremented by the
- * BLKIF_SEGS_TO_BLOCKS() value of the associated request.
- * (e.g. a response to a 3 ring entry request must also consume
- * 3 entries in the ring, even though only the first ring entry
- * in the response has any data.)
 */
struct blkif_request {
    uint8_t operation;       /* BLKIF_OP_??? */
@@ -519,6 +588,20 @@ struct blkif_request_discard {
};
typedef struct blkif_request_discard blkif_request_discard_t;
+struct blkif_request_indirect {
+    uint8_t        operation;     /* BLKIF_OP_INDIRECT */
+    uint8_t        indirect_op;   /* BLKIF_OP_{READ/WRITE} */
+    uint16_t       nr_segments;   /* number of segments */
+    uint64_t       id;            /* private guest value, echoed in resp */
+    blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */
+    blkif_vdev_t   handle;        /* same as for read/write requests */
+    grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+#ifdef __i386__
+    uint64_t       pad;           /* Make it 64 byte aligned on i386 */
+#endif
+};
+typedef struct blkif_request_indirect blkif_request_indirect_t;
+
struct blkif_response {
    uint64_t id;              /* copied from request */
    uint8_t  operation;       /* copied from request */
@@ -550,7 +633,7 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
/*
 * Local variables:
 * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
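As a closing usage sketch, this is roughly how a frontend might assemble an
indirect read with the structures above, reusing the INDIRECT_PAGES() helper
sketched earlier. Ring insertion, grant allocation, and the filling of the
indirect pages themselves are omitted; get_indirect_gref() is a hypothetical
allocator, not part of this header:

    static void
    fill_indirect_read(struct blkif_request_indirect *req, blkif_vdev_t handle,
        blkif_sector_t start, uint16_t nr_segs, uint64_t id)
    {
        /* Pages needed for the indirect segment array; must not exceed
         * BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST (8). */
        int i, pages = INDIRECT_PAGES(nr_segs);

        req->operation = BLKIF_OP_INDIRECT;
        req->indirect_op = BLKIF_OP_READ;   /* the actual transfer type */
        req->nr_segments = nr_segs;         /* indirect segments, not pages */
        req->id = id;
        req->sector_number = start;         /* in 512-byte units */
        req->handle = handle;
        for (i = 0; i < pages; i++)
            req->indirect_grefs[i] = get_indirect_gref(i); /* hypothetical */
    }

Note that nr_segments counts data segments, while indirect_grefs holds one
grant per indirect page; conflating the two is a common frontend bug.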