Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/amd64/uma_machdep.c            |    4
-rw-r--r--  sys/contrib/ipfilter/netinet/ip_state.c  |   16
-rw-r--r--  sys/dev/msk/if_mskreg.h                  |    9
-rw-r--r--  sys/dev/puc/puc.c                        |    4
-rw-r--r--  sys/dev/puc/puc_bfe.h                    |    3
-rw-r--r--  sys/dev/puc/puc_cfg.c                    |    1
-rw-r--r--  sys/dev/puc/puc_pccard.c                 |    1
-rw-r--r--  sys/dev/puc/puc_pci.c                    |   55
-rw-r--r--  sys/dev/puc/pucdata.c                    |    1
-rw-r--r--  sys/dev/usb/serial/u3g.c                 |    1
-rw-r--r--  sys/dev/usb/usbdevs                      |    1
-rw-r--r--  sys/dev/vt/vt_core.c                     |    7
-rw-r--r--  sys/dev/watchdog/watchdog.c              |    3
-rw-r--r--  sys/dev/xen/blkfront/blkfront.c          |  254
-rw-r--r--  sys/dev/xen/blkfront/block.h             |   33
-rw-r--r--  sys/geom/geom_dev.c                      |    2
-rw-r--r--  sys/geom/multipath/g_multipath.c         |    4
-rw-r--r--  sys/i386/i386/pmap.c                     |    5
-rw-r--r--  sys/ia64/ia64/uma_machdep.c              |    4
-rw-r--r--  sys/kern/kern_event.c                    |   15
-rw-r--r--  sys/kern/kern_mbuf.c                     |    4
-rw-r--r--  sys/kern/subr_busdma_bufalloc.c          |    6
-rw-r--r--  sys/kern/subr_vmem.c                     |    2
-rw-r--r--  sys/kern/uipc_syscalls.c                 |    1
-rw-r--r--  sys/kern/vfs_cache.c                     |   66
-rw-r--r--  sys/kern/vfs_hash.c                      |   37
-rw-r--r--  sys/kern/vfs_subr.c                      |   21
-rw-r--r--  sys/kern/vfs_vnops.c                     |    5
-rw-r--r--  sys/mips/mips/uma_machdep.c              |    4
-rw-r--r--  sys/net/ieee8023ad_lacp.c                |   93
-rw-r--r--  sys/net/ieee8023ad_lacp.h                |    7
-rw-r--r--  sys/net/if_gif.c                         |   76
-rw-r--r--  sys/net/if_gif.h                         |    5
-rw-r--r--  sys/net/if_lagg.c                        |  479
-rw-r--r--  sys/net/if_lagg.h                        |   48
-rw-r--r--  sys/netinet/ip_fw.h                      |    2
-rw-r--r--  sys/netinet6/in6.c                       |   33
-rw-r--r--  sys/netinet6/in6_ifattach.c              |    6
-rw-r--r--  sys/netinet6/in6_var.h                   |   10
-rw-r--r--  sys/netinet6/nd6.c                       |  129
-rw-r--r--  sys/netinet6/nd6_nbr.c                   |   11
-rw-r--r--  sys/powerpc/aim/mmu_oea64.c              |    3
-rw-r--r--  sys/powerpc/aim/slb.c                    |    2
-rw-r--r--  sys/powerpc/aim/uma_machdep.c            |    4
-rw-r--r--  sys/sparc64/sparc64/vm_machdep.c         |    4
-rw-r--r--  sys/sys/busdma_bufalloc.h                |    7
-rw-r--r--  sys/sys/vnode.h                          |    2
-rw-r--r--  sys/teken/teken.c                        |   26
-rw-r--r--  sys/teken/teken_subr.h                   |   24
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c                |   48
-rw-r--r--  sys/vm/swap_pager.c                      |   55
-rw-r--r--  sys/vm/uma.h                             |    5
-rw-r--r--  sys/vm/uma_core.c                        |   18
-rw-r--r--  sys/vm/uma_int.h                         |    7
-rw-r--r--  sys/vm/vm_map.c                          |   14
-rw-r--r--  sys/vm/vm_pageout.c                      |    7
-rw-r--r--  sys/xen/interface/io/blkif.h             |  239
57 files changed, 1131 insertions(+), 802 deletions(-)
diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c
index c4ca677..72adb27 100644
--- a/sys/amd64/amd64/uma_machdep.c
+++ b/sys/amd64/amd64/uma_machdep.c
@@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
vm_page_t m;
vm_paddr_t pa;
@@ -70,7 +70,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
vm_paddr_t pa;
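
This hunk, and the matching ones below for i386, ia64, mips, powerpc, the busdma buffer allocator, and the UMA core, widen the allocator hook's size argument from int to vm_size_t. A minimal userland sketch of why the width matters, using a stand-in typedef for vm_size_t (assuming an LP64 target, where the type is 64 bits):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the kernel's vm_size_t; 64 bits wide on LP64. */
    typedef uint64_t vm_size_t;

    /* Old hook shape: an int silently truncates requests of 2 GiB or more. */
    static void
    alloc_old(int bytes)
    {
        printf("old hook sees %d bytes\n", bytes);
    }

    /* New hook shape, as in the diff: the full width survives. */
    static void
    alloc_new(vm_size_t bytes)
    {
        printf("new hook sees %llu bytes\n", (unsigned long long)bytes);
    }

    int
    main(void)
    {
        vm_size_t req = 5ULL << 30;     /* a 5 GiB allocation request */

        alloc_old((int)req);            /* truncated: prints 1073741824 */
        alloc_new(req);                 /* prints 5368709120 */
        return (0);
    }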
diff --git a/sys/contrib/ipfilter/netinet/ip_state.c b/sys/contrib/ipfilter/netinet/ip_state.c
index 9c6a244..ad2bf38 100644
--- a/sys/contrib/ipfilter/netinet/ip_state.c
+++ b/sys/contrib/ipfilter/netinet/ip_state.c
@@ -1,4 +1,4 @@
-/* $FreeBSD$ */
+/* $FreeBSD$ */
/*
* Copyright (C) 2012 by Darren Reed.
@@ -1054,7 +1054,7 @@ ipf_state_putent(softc, softs, data)
/* to pointers and adjusts running stats for the hash table as appropriate. */
/* */
/* This function can fail if the filter rule has had a population policy of */
-/* IP addresses used with stateful filteirng assigned to it. */
+/* IP addresses used with stateful filtering assigned to it. */
/* */
/* Locking: it is assumed that some kind of lock on ipf_state is held. */
/* Exits with is_lock initialised and held - *EVEN IF ERROR*. */
@@ -1081,7 +1081,7 @@ ipf_state_insert(softc, is, rev)
}
/*
- * If we could trust is_hv, then the modulous would not be needed,
+ * If we could trust is_hv, then the modulus would not be needed,
* but when running with IPFILTER_SYNC, this stops bad values.
*/
hv = is->is_hv % softs->ipf_state_size;
@@ -1672,6 +1672,10 @@ ipf_state_add(softc, fin, stsave, flags)
SBUMPD(ipf_state_stats, iss_bucket_full);
return 4;
}
+
+ /*
+ * No existing state; create new
+ */
KMALLOC(is, ipstate_t *);
if (is == NULL) {
SBUMPD(ipf_state_stats, iss_nomem);
@@ -1683,7 +1687,7 @@ ipf_state_add(softc, fin, stsave, flags)
is->is_rule = fr;
/*
- * Do not do the modulous here, it is done in ipf_state_insert().
+ * Do not do the modulus here, it is done in ipf_state_insert().
*/
if (fr != NULL) {
ipftq_t *tq;
@@ -1711,7 +1715,7 @@ ipf_state_add(softc, fin, stsave, flags)
/*
* It may seem strange to set is_ref to 2, but if stsave is not NULL
* then a copy of the pointer is being stored somewhere else and in
- * the end, it will expect to be able to do osmething with it.
+ * the end, it will expect to be able to do something with it.
*/
is->is_me = stsave;
if (stsave != NULL) {
@@ -3652,7 +3656,6 @@ ipf_state_del(softc, is, why)
softs->ipf_state_stats.iss_orphan++;
return refs;
}
- MUTEX_EXIT(&is->is_lock);
fr = is->is_rule;
is->is_rule = NULL;
@@ -3664,6 +3667,7 @@ ipf_state_del(softc, is, why)
}
is->is_ref = 0;
+ MUTEX_EXIT(&is->is_lock);
if (is->is_tqehead[0] != NULL) {
if (ipf_deletetimeoutqueue(is->is_tqehead[0]) == 0)
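
The last two hunks move MUTEX_EXIT(&is->is_lock) so that the rule pointer is detached and is_ref is zeroed while the per-state lock is still held. A minimal pthread sketch of that ordering, with hypothetical names rather than ipfilter's API:

    #include <pthread.h>

    struct state {
        pthread_mutex_t lock;
        int             ref;
    };

    /*
     * Mirrors the fixed ordering in ipf_state_del(): the reference count
     * is cleared while the per-state lock is still held, so no other
     * thread can observe a stale nonzero count after the unlock.
     */
    static void
    state_del(struct state *is)
    {
        pthread_mutex_lock(&is->lock);
        /* ... detach rule, timeout queues, etc. ... */
        is->ref = 0;                    /* still inside the critical section */
        pthread_mutex_unlock(&is->lock);
    }

    int
    main(void)
    {
        struct state is = { PTHREAD_MUTEX_INITIALIZER, 1 };

        state_del(&is);
        return (0);
    }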
diff --git a/sys/dev/msk/if_mskreg.h b/sys/dev/msk/if_mskreg.h
index 9c55192..da69f2e 100644
--- a/sys/dev/msk/if_mskreg.h
+++ b/sys/dev/msk/if_mskreg.h
@@ -2175,13 +2175,8 @@
#define MSK_ADDR_LO(x) ((uint64_t) (x) & 0xffffffffUL)
#define MSK_ADDR_HI(x) ((uint64_t) (x) >> 32)
-/*
- * At first I guessed 8 bytes, the size of a single descriptor, would be
- * required alignment constraints. But, it seems that Yukon II have 4096
- * bytes boundary alignment constraints.
- */
-#define MSK_RING_ALIGN 4096
-#define MSK_STAT_ALIGN 4096
+#define MSK_RING_ALIGN 32768
+#define MSK_STAT_ALIGN 32768
/* Rx descriptor data structure */
struct msk_rx_desc {
diff --git a/sys/dev/puc/puc.c b/sys/dev/puc/puc.c
index d7bfb63..2b320ca 100644
--- a/sys/dev/puc/puc.c
+++ b/sys/dev/puc/puc.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/resource.h>
@@ -70,6 +71,8 @@ const char puc_driver_name[] = "puc";
static MALLOC_DEFINE(M_PUC, "PUC", "PUC driver");
+SYSCTL_NODE(_hw, OID_AUTO, puc, CTLFLAG_RD, 0, "puc(9) driver configuration");
+
struct puc_bar *
puc_get_bar(struct puc_softc *sc, int rid)
{
@@ -324,7 +327,6 @@ puc_bfe_attach(device_t dev)
if (bootverbose && sc->sc_ilr != 0)
device_printf(dev, "using interrupt latch register\n");
- sc->sc_irid = 0;
sc->sc_ires = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->sc_irid,
RF_ACTIVE|RF_SHAREABLE);
if (sc->sc_ires != NULL) {
diff --git a/sys/dev/puc/puc_bfe.h b/sys/dev/puc/puc_bfe.h
index 3cf0c53..9f2f3fb 100644
--- a/sys/dev/puc/puc_bfe.h
+++ b/sys/dev/puc/puc_bfe.h
@@ -66,6 +66,7 @@ struct puc_softc {
int sc_fastintr:1;
int sc_leaving:1;
int sc_polled:1;
+ int sc_msi:1;
int sc_ilr;
@@ -94,4 +95,6 @@ int puc_bus_setup_intr(device_t, device_t, struct resource *, int,
driver_filter_t *, driver_intr_t *, void *, void **);
int puc_bus_teardown_intr(device_t, device_t, struct resource *, void *);
+SYSCTL_DECL(_hw_puc);
+
#endif /* _DEV_PUC_BFE_H_ */
diff --git a/sys/dev/puc/puc_cfg.c b/sys/dev/puc/puc_cfg.c
index 7de6ea3..99d9816 100644
--- a/sys/dev/puc/puc_cfg.c
+++ b/sys/dev/puc/puc_cfg.c
@@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
+#include <sys/sysctl.h>
#include <dev/puc/puc_bus.h>
#include <dev/puc/puc_cfg.h>
diff --git a/sys/dev/puc/puc_pccard.c b/sys/dev/puc/puc_pccard.c
index 370c6af..232741e 100644
--- a/sys/dev/puc/puc_pccard.c
+++ b/sys/dev/puc/puc_pccard.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/malloc.h>
+#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/resource.h>
diff --git a/sys/dev/puc/puc_pci.c b/sys/dev/puc/puc_pci.c
index 4ad1e3a..d6d5509 100644
--- a/sys/dev/puc/puc_pci.c
+++ b/sys/dev/puc/puc_pci.c
@@ -67,6 +67,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/malloc.h>
+#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/resource.h>
@@ -78,6 +79,11 @@ __FBSDID("$FreeBSD$");
#include <dev/puc/puc_cfg.h>
#include <dev/puc/puc_bfe.h>
+static int puc_msi_disable;
+TUNABLE_INT("hw.puc.msi_disable", &puc_msi_disable);
+SYSCTL_INT(_hw_puc, OID_AUTO, msi_disable, CTLFLAG_RD | CTLFLAG_TUN,
+ &puc_msi_disable, 0, "Disable use of MSI interrupts by puc(9)");
+
static const struct puc_cfg *
puc_pci_match(device_t dev, const struct puc_cfg *desc)
{
@@ -120,11 +126,56 @@ puc_pci_probe(device_t dev)
return (puc_bfe_probe(dev, desc));
}
+static int
+puc_pci_attach(device_t dev)
+{
+ struct puc_softc *sc;
+ int error, count;
+
+ sc = device_get_softc(dev);
+
+ if (!puc_msi_disable) {
+ count = 1;
+
+ if (pci_alloc_msi(dev, &count) == 0) {
+ sc->sc_msi = 1;
+ sc->sc_irid = 1;
+ }
+ }
+
+ error = puc_bfe_attach(dev);
+
+ if (error != 0 && sc->sc_msi)
+ pci_release_msi(dev);
+
+ return (error);
+}
+
+static int
+puc_pci_detach(device_t dev)
+{
+ struct puc_softc *sc;
+ int error;
+
+ sc = device_get_softc(dev);
+
+ error = puc_bfe_detach(dev);
+
+ if (error != 0)
+ return (error);
+
+ if (sc->sc_msi)
+ error = pci_release_msi(dev);
+
+ return (error);
+}
+
+
static device_method_t puc_pci_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, puc_pci_probe),
- DEVMETHOD(device_attach, puc_bfe_attach),
- DEVMETHOD(device_detach, puc_bfe_detach),
+ DEVMETHOD(device_attach, puc_pci_attach),
+ DEVMETHOD(device_detach, puc_pci_detach),
DEVMETHOD(bus_alloc_resource, puc_bus_alloc_resource),
DEVMETHOD(bus_release_resource, puc_bus_release_resource),
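
puc_pci_attach() now tries to allocate a single MSI vector before the generic bus-front-end attach and releases it again if that attach fails; when MSI is granted, the IRQ resource id is set to 1, since MSI rids start at 1 while legacy INTx uses rid 0. The hw.puc.msi_disable tunable opts out. A compact userland sketch of the same try, fall-back, and undo control flow, with stub functions standing in for the pci(9) calls:

    #include <stdio.h>

    static int msi_disable = 0;     /* analogue of hw.puc.msi_disable */

    /* Stubs standing in for pci_alloc_msi()/pci_release_msi() and the
     * generic puc_bfe_attach(); 0 means success, like the real KPIs. */
    static int  stub_alloc_msi(int *count) { *count = 1; return (0); }
    static void stub_release_msi(void)     { printf("MSI vector released\n"); }
    static int  stub_bfe_attach(void)      { return (-1); /* simulate failure */ }

    int
    main(void)
    {
        int error, count, msi = 0, irq_rid = 0;

        if (!msi_disable) {
            count = 1;
            if (stub_alloc_msi(&count) == 0) {
                msi = 1;
                irq_rid = 1;    /* MSI rids start at 1; INTx uses rid 0 */
            }
        }
        error = stub_bfe_attach();
        if (error != 0 && msi)          /* undo on failure, as in the diff */
            stub_release_msi();
        printf("attach %s, irq rid %d\n", error ? "failed" : "ok", irq_rid);
        return (0);
    }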
diff --git a/sys/dev/puc/pucdata.c b/sys/dev/puc/pucdata.c
index 77953e0..4646fa4 100644
--- a/sys/dev/puc/pucdata.c
+++ b/sys/dev/puc/pucdata.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
+#include <sys/sysctl.h>
#include <machine/resource.h>
#include <machine/bus.h>
diff --git a/sys/dev/usb/serial/u3g.c b/sys/dev/usb/serial/u3g.c
index 1c8f3b5..1fe9b21 100644
--- a/sys/dev/usb/serial/u3g.c
+++ b/sys/dev/usb/serial/u3g.c
@@ -522,6 +522,7 @@ static const STRUCT_USB_HOST_ID u3g_devs[] = {
U3G_DEV(SIERRA, MC5727_2, 0),
U3G_DEV(SIERRA, MC5728, 0),
U3G_DEV(SIERRA, MC7354, 0),
+ U3G_DEV(SIERRA, MC7355, 0),
U3G_DEV(SIERRA, MC8700, 0),
U3G_DEV(SIERRA, MC8755, 0),
U3G_DEV(SIERRA, MC8755_2, 0),
diff --git a/sys/dev/usb/usbdevs b/sys/dev/usb/usbdevs
index 04cfd54..d649402 100644
--- a/sys/dev/usb/usbdevs
+++ b/sys/dev/usb/usbdevs
@@ -4010,6 +4010,7 @@ product SIERRA E6892 0x6892 E6892
product SIERRA E6893 0x6893 E6893
product SIERRA MC8700 0x68A3 MC8700
product SIERRA MC7354 0x68C0 MC7354
+product SIERRA MC7355 0x9041 MC7355
product SIERRA AC313U 0x68aa Sierra Wireless AirCard 313U
product SIERRA TRUINSTALL 0x0fff Aircard Tru Installer
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index b661a0f..cc85e1c 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -121,6 +121,7 @@ const struct terminal_class vt_termclass = {
static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD, 0, "vt(9) parameters");
VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)");
+VT_SYSCTL_INT(enable_bell, 1, "Enable bell");
VT_SYSCTL_INT(debug, 0, "vt(9) debug level");
VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode");
VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend");
@@ -935,6 +936,9 @@ vtterm_bell(struct terminal *tm)
struct vt_window *vw = tm->tm_softc;
struct vt_device *vd = vw->vw_device;
+ if (!vt_enable_bell)
+ return;
+
if (vd->vd_flags & VDF_QUIET_BELL)
return;
@@ -946,6 +950,9 @@ vtterm_beep(struct terminal *tm, u_int param)
{
u_int freq, period;
+ if (!vt_enable_bell)
+ return;
+
if ((param == 0) || ((param & 0xffff) == 0)) {
vtterm_bell(tm);
return;
diff --git a/sys/dev/watchdog/watchdog.c b/sys/dev/watchdog/watchdog.c
index 087d6e8..5c9c6a1 100644
--- a/sys/dev/watchdog/watchdog.c
+++ b/sys/dev/watchdog/watchdog.c
@@ -28,6 +28,8 @@
*
*/
+#include "opt_ddb.h"
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -37,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/kernel.h>
+#include <sys/kdb.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sysctl.h>
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index a71251d..312a077 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -84,6 +84,11 @@ static void xbd_startio(struct xbd_softc *sc);
/*---------------------------- Global Static Data ----------------------------*/
static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
+static int xbd_enable_indirect = 1;
+SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters");
+SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN,
+ &xbd_enable_indirect, 0, "Enable xbd indirect segments");
+
/*---------------------------- Command Processing ----------------------------*/
static void
xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag)
@@ -156,44 +161,14 @@ xbd_free_command(struct xbd_command *cm)
}
static void
-xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+xbd_mksegarray(bus_dma_segment_t *segs, int nsegs,
+ grant_ref_t * gref_head, int otherend_id, int readonly,
+ grant_ref_t * sg_ref, blkif_request_segment_t * sg)
{
- struct xbd_softc *sc;
- struct xbd_command *cm;
- blkif_request_t *ring_req;
- struct blkif_request_segment *sg;
- struct blkif_request_segment *last_block_sg;
- grant_ref_t *sg_ref;
+ struct blkif_request_segment *last_block_sg = sg + nsegs;
vm_paddr_t buffer_ma;
uint64_t fsect, lsect;
int ref;
- int op;
- int block_segs;
-
- cm = arg;
- sc = cm->cm_sc;
-
- if (error) {
- cm->cm_bp->bio_error = EIO;
- biodone(cm->cm_bp);
- xbd_free_command(cm);
- return;
- }
-
- /* Fill out a communications ring structure. */
- ring_req = RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
- sc->xbd_ring.req_prod_pvt++;
- ring_req->id = cm->cm_id;
- ring_req->operation = cm->cm_operation;
- ring_req->sector_number = cm->cm_sector_number;
- ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
- ring_req->nr_segments = nsegs;
- cm->cm_nseg = nsegs;
-
- block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_REQUEST);
- sg = ring_req->seg;
- last_block_sg = sg + block_segs;
- sg_ref = cm->cm_sg_refs;
while (sg < last_block_sg) {
buffer_ma = segs->ds_addr;
@@ -204,7 +179,7 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
"cross a page boundary"));
/* install a grant reference. */
- ref = gnttab_claim_grant_reference(&cm->cm_gref_head);
+ ref = gnttab_claim_grant_reference(gref_head);
/*
* GNTTAB_LIST_END == 0xffffffff, but it is private
@@ -214,9 +189,9 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
gnttab_grant_foreign_access_ref(
ref,
- xenbus_get_otherend_id(sc->xbd_dev),
+ otherend_id,
buffer_ma >> PAGE_SHIFT,
- ring_req->operation == BLKIF_OP_WRITE);
+ readonly);
*sg_ref = ref;
*sg = (struct blkif_request_segment) {
@@ -227,7 +202,66 @@ xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
sg++;
sg_ref++;
segs++;
- nsegs--;
+ }
+}
+
+static void
+xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+ struct xbd_softc *sc;
+ struct xbd_command *cm;
+ int op;
+
+ cm = arg;
+ sc = cm->cm_sc;
+
+ if (error) {
+ cm->cm_bp->bio_error = EIO;
+ biodone(cm->cm_bp);
+ xbd_free_command(cm);
+ return;
+ }
+
+ KASSERT(nsegs <= sc->xbd_max_request_segments,
+ ("Too many segments in a blkfront I/O"));
+
+ if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ blkif_request_t *ring_req;
+
+ /* Fill out a blkif_request_t structure. */
+ ring_req = (blkif_request_t *)
+ RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
+ sc->xbd_ring.req_prod_pvt++;
+ ring_req->id = cm->cm_id;
+ ring_req->operation = cm->cm_operation;
+ ring_req->sector_number = cm->cm_sector_number;
+ ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
+ ring_req->nr_segments = nsegs;
+ cm->cm_nseg = nsegs;
+ xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
+ xenbus_get_otherend_id(sc->xbd_dev),
+ cm->cm_operation == BLKIF_OP_WRITE,
+ cm->cm_sg_refs, ring_req->seg);
+ } else {
+ blkif_request_indirect_t *ring_req;
+
+ /* Fill out a blkif_request_indirect_t structure. */
+ ring_req = (blkif_request_indirect_t *)
+ RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt);
+ sc->xbd_ring.req_prod_pvt++;
+ ring_req->id = cm->cm_id;
+ ring_req->operation = BLKIF_OP_INDIRECT;
+ ring_req->indirect_op = cm->cm_operation;
+ ring_req->sector_number = cm->cm_sector_number;
+ ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk;
+ ring_req->nr_segments = nsegs;
+ cm->cm_nseg = nsegs;
+ xbd_mksegarray(segs, nsegs, &cm->cm_gref_head,
+ xenbus_get_otherend_id(sc->xbd_dev),
+ cm->cm_operation == BLKIF_OP_WRITE,
+ cm->cm_sg_refs, cm->cm_indirectionpages);
+ memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs,
+ sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages);
}
if (cm->cm_operation == BLKIF_OP_READ)
@@ -1010,6 +1044,16 @@ xbd_free(struct xbd_softc *sc)
cm->cm_sg_refs = NULL;
}
+ if (cm->cm_indirectionpages != NULL) {
+ gnttab_end_foreign_access_references(
+ sc->xbd_max_request_indirectpages,
+ &cm->cm_indirectionrefs[0]);
+ contigfree(cm->cm_indirectionpages, PAGE_SIZE *
+ sc->xbd_max_request_indirectpages,
+ M_XENBLOCKFRONT);
+ cm->cm_indirectionpages = NULL;
+ }
+
bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map);
}
free(sc->xbd_shadow, M_XENBLOCKFRONT);
@@ -1034,7 +1078,6 @@ xbd_initialize(struct xbd_softc *sc)
const char *node_path;
uint32_t max_ring_page_order;
int error;
- int i;
if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) {
/* Initialization has already been performed. */
@@ -1047,9 +1090,6 @@ xbd_initialize(struct xbd_softc *sc)
*/
max_ring_page_order = 0;
sc->xbd_ring_pages = 1;
- sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
- sc->xbd_max_request_size =
- XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments);
/*
* Protocol negotiation.
@@ -1105,53 +1145,6 @@ xbd_initialize(struct xbd_softc *sc)
sc->xbd_max_requests = XBD_MAX_REQUESTS;
}
- /* Allocate datastructures based on negotiated values. */
- error = bus_dma_tag_create(
- bus_get_dma_tag(sc->xbd_dev), /* parent */
- 512, PAGE_SIZE, /* algnmnt, boundary */
- BUS_SPACE_MAXADDR, /* lowaddr */
- BUS_SPACE_MAXADDR, /* highaddr */
- NULL, NULL, /* filter, filterarg */
- sc->xbd_max_request_size,
- sc->xbd_max_request_segments,
- PAGE_SIZE, /* maxsegsize */
- BUS_DMA_ALLOCNOW, /* flags */
- busdma_lock_mutex, /* lockfunc */
- &sc->xbd_io_lock, /* lockarg */
- &sc->xbd_io_dmat);
- if (error != 0) {
- xenbus_dev_fatal(sc->xbd_dev, error,
- "Cannot allocate parent DMA tag\n");
- return;
- }
-
- /* Per-transaction data allocation. */
- sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests,
- M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
- if (sc->xbd_shadow == NULL) {
- bus_dma_tag_destroy(sc->xbd_io_dmat);
- xenbus_dev_fatal(sc->xbd_dev, error,
- "Cannot allocate request structures\n");
- return;
- }
-
- for (i = 0; i < sc->xbd_max_requests; i++) {
- struct xbd_command *cm;
-
- cm = &sc->xbd_shadow[i];
- cm->cm_sg_refs = malloc(
- sizeof(grant_ref_t) * sc->xbd_max_request_segments,
- M_XENBLOCKFRONT, M_NOWAIT);
- if (cm->cm_sg_refs == NULL)
- break;
- cm->cm_id = i;
- cm->cm_flags = XBDCF_INITIALIZER;
- cm->cm_sc = sc;
- if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0)
- break;
- xbd_free_command(cm);
- }
-
if (xbd_alloc_ring(sc) != 0)
return;
@@ -1210,6 +1203,7 @@ xbd_connect(struct xbd_softc *sc)
unsigned long sectors, sector_size;
unsigned int binfo;
int err, feature_barrier, feature_flush;
+ int i, j;
if (sc->xbd_state == XBD_STATE_CONNECTED ||
sc->xbd_state == XBD_STATE_SUSPENDED)
@@ -1240,6 +1234,88 @@ xbd_connect(struct xbd_softc *sc)
if (err == 0 && feature_flush != 0)
sc->xbd_flags |= XBDF_FLUSH;
+ err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+ "feature-max-indirect-segments", "%" PRIu32,
+ &sc->xbd_max_request_segments, NULL);
+ if ((err != 0) || (xbd_enable_indirect == 0))
+ sc->xbd_max_request_segments = 0;
+ if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS)
+ sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS;
+ if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS))
+ sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS);
+ sc->xbd_max_request_indirectpages =
+ XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments);
+ if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
+ sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ sc->xbd_max_request_size =
+ XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments);
+
+ /* Allocate datastructures based on negotiated values. */
+ err = bus_dma_tag_create(
+ bus_get_dma_tag(sc->xbd_dev), /* parent */
+ 512, PAGE_SIZE, /* algnmnt, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ sc->xbd_max_request_size,
+ sc->xbd_max_request_segments,
+ PAGE_SIZE, /* maxsegsize */
+ BUS_DMA_ALLOCNOW, /* flags */
+ busdma_lock_mutex, /* lockfunc */
+ &sc->xbd_io_lock, /* lockarg */
+ &sc->xbd_io_dmat);
+ if (err != 0) {
+ xenbus_dev_fatal(sc->xbd_dev, err,
+ "Cannot allocate parent DMA tag\n");
+ return;
+ }
+
+ /* Per-transaction data allocation. */
+ sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests,
+ M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
+ if (sc->xbd_shadow == NULL) {
+ bus_dma_tag_destroy(sc->xbd_io_dmat);
+ xenbus_dev_fatal(sc->xbd_dev, ENOMEM,
+ "Cannot allocate request structures\n");
+ return;
+ }
+
+ for (i = 0; i < sc->xbd_max_requests; i++) {
+ struct xbd_command *cm;
+ void * indirectpages;
+
+ cm = &sc->xbd_shadow[i];
+ cm->cm_sg_refs = malloc(
+ sizeof(grant_ref_t) * sc->xbd_max_request_segments,
+ M_XENBLOCKFRONT, M_NOWAIT);
+ if (cm->cm_sg_refs == NULL)
+ break;
+ cm->cm_id = i;
+ cm->cm_flags = XBDCF_INITIALIZER;
+ cm->cm_sc = sc;
+ if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0)
+ break;
+ if (sc->xbd_max_request_indirectpages > 0) {
+ indirectpages = contigmalloc(
+ PAGE_SIZE * sc->xbd_max_request_indirectpages,
+ M_XENBLOCKFRONT, M_ZERO, 0, ~0, PAGE_SIZE, 0);
+ } else {
+ indirectpages = NULL;
+ }
+ for (j = 0; j < sc->xbd_max_request_indirectpages; j++) {
+ if (gnttab_grant_foreign_access(
+ xenbus_get_otherend_id(sc->xbd_dev),
+ (vtomach(indirectpages) >> PAGE_SHIFT) + j,
+ 1 /* grant read-only access */,
+ &cm->cm_indirectionrefs[j]))
+ break;
+ }
+ if (j < sc->xbd_max_request_indirectpages)
+ break;
+ cm->cm_indirectionpages = indirectpages;
+ xbd_free_command(cm);
+ }
+
if (sc->xbd_disk == NULL) {
device_printf(dev, "%juMB <%s> at %s",
(uintmax_t) sectors / (1048576 / sector_size),
diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h
index 3007118..28c6ff2 100644
--- a/sys/dev/xen/blkfront/block.h
+++ b/sys/dev/xen/blkfront/block.h
@@ -68,28 +68,32 @@
#define XBD_MAX_RING_PAGES 32
/**
- * The maximum number of outstanding requests blocks (request headers plus
- * additional segment blocks) we will allow in a negotiated block-front/back
- * communication channel.
+ * The maximum number of outstanding requests we will allow in a negotiated
+ * block-front/back communication channel.
*/
#define XBD_MAX_REQUESTS \
__CONST_RING_SIZE(blkif, PAGE_SIZE * XBD_MAX_RING_PAGES)
/**
- * The maximum mapped region size per request we will allow in a negotiated
- * block-front/back communication channel.
+ * The maximum number of blkif segments which can be provided per indirect
+ * page in an indirect request.
+ */
+#define XBD_MAX_SEGMENTS_PER_PAGE \
+ (PAGE_SIZE / sizeof(struct blkif_request_segment))
+
+/**
+ * The maximum number of blkif segments which can be provided in an indirect
+ * request.
*/
-#define XBD_MAX_REQUEST_SIZE \
- MIN(MAXPHYS, XBD_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST))
+#define XBD_MAX_INDIRECT_SEGMENTS \
+ (BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST * XBD_MAX_SEGMENTS_PER_PAGE)
/**
- * The maximum number of segments (within a request header and accompanying
- * segment blocks) per request we will allow in a negotiated block-front/back
- * communication channel.
+ * Compute the number of indirect segment pages required for an I/O with the
+ * specified number of indirect segments.
*/
-#define XBD_MAX_SEGMENTS_PER_REQUEST \
- (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
- XBD_SIZE_TO_SEGS(XBD_MAX_REQUEST_SIZE)))
+#define XBD_INDIRECT_SEGS_TO_PAGES(segs) \
+ ((segs + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE)
typedef enum {
XBDCF_Q_MASK = 0xFF,
@@ -121,6 +125,8 @@ struct xbd_command {
blkif_sector_t cm_sector_number;
int cm_status;
xbd_cbcf_t *cm_complete;
+ void *cm_indirectionpages;
+ grant_ref_t cm_indirectionrefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
};
typedef enum {
@@ -175,6 +181,7 @@ struct xbd_softc {
uint32_t xbd_max_requests;
uint32_t xbd_max_request_segments;
uint32_t xbd_max_request_size;
+ uint32_t xbd_max_request_indirectpages;
grant_ref_t xbd_ring_ref[XBD_MAX_RING_PAGES];
blkif_front_ring_t xbd_ring;
xen_intr_handle_t xen_intr_handle;
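
The new macros reduce indirect-segment negotiation to simple arithmetic. A small sketch that evaluates them under the usual assumptions of 4 KiB pages and 8-byte struct blkif_request_segment descriptors (both are assumptions here, not values taken from this diff):

    #include <stdio.h>

    /* Assumptions: 4 KiB pages, 8-byte segment descriptors. */
    #define PAGE_SIZE                   4096
    #define SEG_DESC_SIZE               8

    #define XBD_MAX_SEGMENTS_PER_PAGE   (PAGE_SIZE / SEG_DESC_SIZE)
    #define XBD_INDIRECT_SEGS_TO_PAGES(segs) \
        (((segs) + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE)

    int
    main(void)
    {
        /* 4096 / 8 = 512 descriptors fit in one indirection page... */
        printf("%d segments per page\n", XBD_MAX_SEGMENTS_PER_PAGE);
        /* ...so 512 segments need one page and 513 spill into a second. */
        printf("%d page(s) for 512 segments\n", XBD_INDIRECT_SEGS_TO_PAGES(512));
        printf("%d page(s) for 513 segments\n", XBD_INDIRECT_SEGS_TO_PAGES(513));
        return (0);
    }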
diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c
index b53065d..621a037 100644
--- a/sys/geom/geom_dev.c
+++ b/sys/geom/geom_dev.c
@@ -570,7 +570,7 @@ g_dev_done(struct bio *bp2)
}
mtx_unlock(&sc->sc_mtx);
if (destroy)
- g_post_event(g_dev_destroy, cp, M_WAITOK, NULL);
+ g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL);
biodone(bp);
}
diff --git a/sys/geom/multipath/g_multipath.c b/sys/geom/multipath/g_multipath.c
index 7593cca..0953d18 100644
--- a/sys/geom/multipath/g_multipath.c
+++ b/sys/geom/multipath/g_multipath.c
@@ -369,9 +369,9 @@ g_multipath_done(struct bio *bp)
mtx_lock(&sc->sc_mtx);
(*cnt)--;
if (*cnt == 0 && (cp->index & MP_LOST)) {
- cp->index |= MP_POSTED;
+ if (g_post_event(g_mpd, cp, M_NOWAIT, NULL) == 0)
+ cp->index |= MP_POSTED;
mtx_unlock(&sc->sc_mtx);
- g_post_event(g_mpd, cp, M_WAITOK, NULL);
} else
mtx_unlock(&sc->sc_mtx);
g_std_done(bp);
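
Both GEOM hunks stop posting events with M_WAITOK from an I/O completion path, where sleeping is not allowed, and the multipath one sets MP_POSTED only once g_post_event() reports success, so a failed post can be retried on a later completion. A minimal sketch of that flag-on-success pattern, with hypothetical names:

    #include <stdio.h>

    #define MP_POSTED   0x01

    /* Stand-in for g_post_event(..., M_NOWAIT, ...): it may fail under
     * memory pressure instead of sleeping; 0 means the event was queued. */
    static int
    post_event_nowait(void)
    {
        return (0);
    }

    /* Completion handler: mark the consumer posted only on success, so a
     * failed post is simply retried when the next I/O completes. */
    static void
    io_done(int *index)
    {
        if (post_event_nowait() == 0)
            *index |= MP_POSTED;
    }

    int
    main(void)
    {
        int index = 0;

        io_done(&index);
        printf("posted: %s\n", (index & MP_POSTED) ? "yes" : "no");
        return (0);
    }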
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 540eb65..299ee77 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -352,7 +352,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#if defined(PAE) || defined(PAE_TABLES)
-static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
+static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
+ int wait);
#endif
static void pmap_set_pg(void);
@@ -670,7 +671,7 @@ pmap_page_init(vm_page_t m)
#if defined(PAE) || defined(PAE_TABLES)
static void *
-pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{
/* Inform UMA that this allocator uses kernel_map/object. */
diff --git a/sys/ia64/ia64/uma_machdep.c b/sys/ia64/ia64/uma_machdep.c
index 77791ae..b536820 100644
--- a/sys/ia64/ia64/uma_machdep.c
+++ b/sys/ia64/ia64/uma_machdep.c
@@ -40,7 +40,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
void *va;
vm_page_t m;
@@ -66,7 +66,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 9de8fa7..dae1d54 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1849,7 +1849,7 @@ void
knote(struct knlist *list, long hint, int lockflags)
{
struct kqueue *kq;
- struct knote *kn;
+ struct knote *kn, *tkn;
int error;
if (list == NULL)
@@ -1861,14 +1861,13 @@ knote(struct knlist *list, long hint, int lockflags)
list->kl_lock(list->kl_lockarg);
/*
- * If we unlock the list lock (and set KN_INFLUX), we can eliminate
- * the kqueue scheduling, but this will introduce four
- * lock/unlock's for each knote to test. If we do, continue to use
- * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
- * only safe if you want to remove the current item, which we are
- * not doing.
+ * If we unlock the list lock (and set KN_INFLUX), we can
+ * eliminate the kqueue scheduling, but this will introduce
+ * four lock/unlock's for each knote to test. Also, marker
+ * would be needed to keep iteration position, since filters
+ * or other threads could remove events.
*/
- SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
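
knote() switches to SLIST_FOREACH_SAFE because a filter or another thread may unlink the knote the loop is currently visiting. A self-contained demonstration of safe traversal-with-removal using the <sys/queue.h> macros (the _SAFE variants ship with FreeBSD; elsewhere libbsd provides them):

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct item {
        int               val;
        SLIST_ENTRY(item) link;
    };
    static SLIST_HEAD(, item) head = SLIST_HEAD_INITIALIZER(head);

    int
    main(void)
    {
        struct item *it, *tmp;
        int i;

        for (i = 0; i < 4; i++) {
            it = malloc(sizeof(*it));
            it->val = i;
            SLIST_INSERT_HEAD(&head, it, link);
        }
        /*
         * The _SAFE variant caches the next pointer in 'tmp' before the
         * body runs, so removing and freeing 'it' cannot derail the walk.
         */
        SLIST_FOREACH_SAFE(it, &head, link, tmp) {
            if (it->val & 1) {
                SLIST_REMOVE(&head, it, item, link);
                free(it);
            }
        }
        SLIST_FOREACH(it, &head, link)
            printf("%d\n", it->val);    /* 2, then 0 */
        return (0);
    }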
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
index e7b8016..c232a37 100644
--- a/sys/kern/kern_mbuf.c
+++ b/sys/kern/kern_mbuf.c
@@ -284,7 +284,7 @@ static int mb_zinit_pack(void *, int, int);
static void mb_zfini_pack(void *, int);
static void mb_reclaim(void *);
-static void *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int);
+static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
@@ -389,7 +389,7 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
* pages.
*/
static void *
-mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{
/* Inform UMA that this allocator uses kernel_map/object. */
diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c
index a80a233..b0b1ba8 100644
--- a/sys/kern/subr_busdma_bufalloc.c
+++ b/sys/kern/subr_busdma_bufalloc.c
@@ -147,8 +147,8 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
}
void *
-busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
- int wait)
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
+ uint8_t *pflag, int wait)
{
#ifdef VM_MEMATTR_UNCACHEABLE
@@ -166,7 +166,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
}
void
-busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag)
+busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag)
{
kmem_free(kernel_arena, (vm_offset_t)item, size);
diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c
index 8cc020a..389b7ee 100644
--- a/sys/kern/subr_vmem.c
+++ b/sys/kern/subr_vmem.c
@@ -608,7 +608,7 @@ static struct mtx_padalign vmem_bt_lock;
* we are really out of KVA.
*/
static void *
-vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
vmem_addr_t addr;
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 708457d..fa36849 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -2114,6 +2114,7 @@ sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
goto out;
}
} else if (fp->f_type == DTYPE_SHM) {
+ error = 0;
shmfd = fp->f_data;
obj = shmfd->shm_object;
*obj_size = shmfd->shm_size;
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 17f72a7..48f7550 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -330,23 +330,27 @@ sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
int n_nchash;
int count;
+retry:
n_nchash = nchash + 1; /* nchash is max index, not count */
if (!req->oldptr)
return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
-
- /* Scan hash tables for applicable entries */
- for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
- CACHE_RLOCK();
- count = 0;
- LIST_FOREACH(ncp, ncpp, nc_hash) {
- count++;
- }
+ cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
+ CACHE_RLOCK();
+ if (n_nchash != nchash + 1) {
CACHE_RUNLOCK();
- error = SYSCTL_OUT(req, &count, sizeof(count));
- if (error)
- return (error);
+ free(cntbuf, M_TEMP);
+ goto retry;
}
- return (0);
+ /* Scan hash tables counting entries */
+ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
+ LIST_FOREACH(ncp, ncpp, nc_hash)
+ cntbuf[i]++;
+ CACHE_RUNLOCK();
+ for (error = 0, i = 0; i < n_nchash; i++)
+ if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
+ break;
+ free(cntbuf, M_TEMP);
+ return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
@@ -935,6 +939,44 @@ nchinit(void *dummy __unused)
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
+void
+cache_changesize(int newmaxvnodes)
+{
+ struct nchashhead *new_nchashtbl, *old_nchashtbl;
+ u_long new_nchash, old_nchash;
+ struct namecache *ncp;
+ uint32_t hash;
+ int i;
+
+ new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
+ /* If same hash table size, nothing to do */
+ if (nchash == new_nchash) {
+ free(new_nchashtbl, M_VFSCACHE);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the namecache entries in the table can be removed
+ * because to do so, they have to be removed from the hash table.
+ */
+ CACHE_WLOCK();
+ old_nchashtbl = nchashtbl;
+ old_nchash = nchash;
+ nchashtbl = new_nchashtbl;
+ nchash = new_nchash;
+ for (i = 0; i <= old_nchash; i++) {
+ while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+ hash = fnv_32_buf(nc_get_name(ncp), ncp->nc_nlen,
+ FNV1_32_INIT);
+ hash = fnv_32_buf(&ncp->nc_dvp, sizeof(ncp->nc_dvp),
+ hash);
+ LIST_REMOVE(ncp, nc_hash);
+ LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+ }
+ }
+ CACHE_WUNLOCK();
+ free(old_nchashtbl, M_VFSCACHE);
+}
/*
* Invalidate all entries to a particular vnode.
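
cache_changesize() swaps the new table in under the write lock and then walks every old bucket, recomputing each entry's hash to find its new bucket. A compact userland sketch of the same move-and-rehash loop, with an FNV-32 hash shaped like the kernel's fnv_32_buf():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define FNV1_32_INIT    0x811c9dc5U
    #define FNV_32_PRIME    0x01000193U

    static uint32_t
    fnv_32_buf(const void *buf, size_t len, uint32_t hval)
    {
        const unsigned char *p = buf;

        while (len-- != 0) {
            hval *= FNV_32_PRIME;
            hval ^= *p++;
        }
        return (hval);
    }

    struct entry {
        const char      *name;
        struct entry    *next;
    };

    /* Move every entry to the new table, recomputing its bucket from the
     * key -- the same loop shape as cache_changesize(). */
    static void
    rehash(struct entry **oldtbl, uint32_t oldmask,
        struct entry **newtbl, uint32_t newmask)
    {
        struct entry *e;
        uint32_t h, i;

        for (i = 0; i <= oldmask; i++) {
            while ((e = oldtbl[i]) != NULL) {
                oldtbl[i] = e->next;
                h = fnv_32_buf(e->name, strlen(e->name), FNV1_32_INIT);
                e->next = newtbl[h & newmask];
                newtbl[h & newmask] = e;
            }
        }
    }

    int
    main(void)
    {
        struct entry **oldt = calloc(4, sizeof(*oldt));
        struct entry **newt = calloc(8, sizeof(*newt));
        struct entry a = { "vnode-a", NULL };
        uint32_t h = fnv_32_buf(a.name, strlen(a.name), FNV1_32_INIT);

        oldt[h & 3] = &a;
        rehash(oldt, 3, newt, 7);
        printf("moved: %d\n", newt[h & 7] == &a);   /* 1 */
        free(oldt);
        free(newt);
        return (0);
    }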
diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c
index 0271e49..1398a47 100644
--- a/sys/kern/vfs_hash.c
+++ b/sys/kern/vfs_hash.c
@@ -160,3 +160,40 @@ vfs_hash_rehash(struct vnode *vp, u_int hash)
vp->v_hash = hash;
mtx_unlock(&vfs_hash_mtx);
}
+
+void
+vfs_hash_changesize(int newmaxvnodes)
+{
+ struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl;
+ u_long vfs_hash_newmask, vfs_hash_oldmask;
+ struct vnode *vp;
+ int i;
+
+ vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH,
+ &vfs_hash_newmask);
+ /* If same hash table size, nothing to do */
+ if (vfs_hash_mask == vfs_hash_newmask) {
+ free(vfs_hash_newtbl, M_VFS_HASH);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the vnodes in the table can be recycled because to
+ * do so, they have to be removed from the hash table.
+ */
+ rw_wlock(&vfs_hash_lock);
+ vfs_hash_oldtbl = vfs_hash_tbl;
+ vfs_hash_oldmask = vfs_hash_mask;
+ vfs_hash_tbl = vfs_hash_newtbl;
+ vfs_hash_mask = vfs_hash_newmask;
+ for (i = 0; i <= vfs_hash_oldmask; i++) {
+ while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) {
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(
+ vfs_hash_bucket(vp->v_mount, vp->v_hash),
+ vp, v_hashlist);
+ }
+ }
+ rw_wunlock(&vfs_hash_lock);
+ free(vfs_hash_oldtbl, M_VFS_HASH);
+}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 2c471af..a9e17f1 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -280,8 +280,25 @@ static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
* XXX desiredvnodes is historical cruft and should not exist.
*/
int desiredvnodes;
-SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
- &desiredvnodes, 0, "Maximum number of vnodes");
+
+static int
+sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
+{
+ int error, old_desiredvnodes;
+
+ old_desiredvnodes = desiredvnodes;
+ if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
+ return (error);
+ if (old_desiredvnodes != desiredvnodes) {
+ vfs_hash_changesize(desiredvnodes);
+ cache_changesize(desiredvnodes);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
+ sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
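
The new handler lets the generic integer handler store the value first, then fans out to the two resize routines only when the value actually changed, so a runtime sysctl kern.maxvnodes=N now resizes both the name cache and the vnode hash. A sketch of that detect-and-propagate shape, with printf stubs standing in for the real resize hooks:

    #include <stdio.h>

    static int desiredvnodes = 1000;

    /* Stubs in place of vfs_hash_changesize()/cache_changesize(). */
    static void vnode_hash_resize(int n) { printf("vnode hash -> %d\n", n); }
    static void name_cache_resize(int n) { printf("name cache -> %d\n", n); }

    /*
     * Shape of sysctl_update_desiredvnodes(): let the generic store
     * happen first, then propagate only if the value actually changed.
     */
    static int
    update_desiredvnodes(int newval)
    {
        int old = desiredvnodes;

        desiredvnodes = newval;         /* the sysctl_handle_int() part */
        if (old != desiredvnodes) {
            vnode_hash_resize(desiredvnodes);
            name_cache_resize(desiredvnodes);
        }
        return (0);
    }

    int
    main(void)
    {
        update_desiredvnodes(1000);     /* unchanged: no resize */
        update_desiredvnodes(2000);     /* changed: both tables resized */
        return (0);
    }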
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 2c92445..d4c8693 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -188,7 +188,10 @@ vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
restart:
fmode = *flagp;
- if (fmode & O_CREAT) {
+ if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
+ O_EXCL | O_DIRECTORY))
+ return (EINVAL);
+ else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
ndp->ni_cnd.cn_nameiop = CREATE;
/*
* Set NOCACHE to avoid flushing the cache when
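
The added check rejects the O_CREAT|O_EXCL|O_DIRECTORY combination up front with EINVAL and routes O_CREAT without O_DIRECTORY to the CREATE lookup. A small demo of the all-bits-set mask test it relies on:

    #include <fcntl.h>
    #include <stdio.h>

    /* True only when all three flags are present at once, as in the diff. */
    static int
    invalid_combo(int fmode)
    {
        return ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) ==
            (O_CREAT | O_EXCL | O_DIRECTORY));
    }

    int
    main(void)
    {
        printf("%d\n", invalid_combo(O_CREAT | O_EXCL | O_DIRECTORY)); /* 1 */
        printf("%d\n", invalid_combo(O_CREAT | O_EXCL));               /* 0 */
        printf("%d\n", invalid_combo(O_CREAT | O_DIRECTORY));          /* 0 */
        return (0);
    }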
diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c
index 1c8e6c8..a1f5e5f 100644
--- a/sys/mips/mips/uma_machdep.c
+++ b/sys/mips/mips/uma_machdep.c
@@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
vm_paddr_t pa;
vm_page_t m;
@@ -70,7 +70,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
vm_paddr_t pa;
diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c
index e0fd776..361f592 100644
--- a/sys/net/ieee8023ad_lacp.c
+++ b/sys/net/ieee8023ad_lacp.c
@@ -187,15 +187,15 @@ static const char *lacp_format_portid(const struct lacp_portid *, char *,
static void lacp_dprintf(const struct lacp_port *, const char *, ...)
__attribute__((__format__(__printf__, 2, 3)));
-static int lacp_debug = 0;
+static VNET_DEFINE(int, lacp_debug);
+#define V_lacp_debug VNET(lacp_debug)
SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD, 0, "ieee802.3ad");
-SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_TUN,
- &lacp_debug, 0, "Enable LACP debug logging (1=debug, 2=trace)");
-TUNABLE_INT("net.link.lagg.lacp.debug", &lacp_debug);
+SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RWTUN | CTLFLAG_VNET,
+ &VNET_NAME(lacp_debug), 0, "Enable LACP debug logging (1=debug, 2=trace)");
-#define LACP_DPRINTF(a) if (lacp_debug & 0x01) { lacp_dprintf a ; }
-#define LACP_TRACE(a) if (lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); }
-#define LACP_TPRINTF(a) if (lacp_debug & 0x04) { lacp_dprintf a ; }
+#define LACP_DPRINTF(a) if (V_lacp_debug & 0x01) { lacp_dprintf a ; }
+#define LACP_TRACE(a) if (V_lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); }
+#define LACP_TPRINTF(a) if (V_lacp_debug & 0x04) { lacp_dprintf a ; }
/*
* partner administration variables.
@@ -298,7 +298,7 @@ lacp_pdu_input(struct lacp_port *lp, struct mbuf *m)
goto bad;
}
- if (lacp_debug > 0) {
+ if (V_lacp_debug > 0) {
lacp_dprintf(lp, "lacpdu receive\n");
lacp_dump_lacpdu(du);
}
@@ -383,7 +383,7 @@ lacp_xmit_lacpdu(struct lacp_port *lp)
sizeof(du->ldu_collector));
du->ldu_collector.lci_maxdelay = 0;
- if (lacp_debug > 0) {
+ if (V_lacp_debug > 0) {
lacp_dprintf(lp, "lacpdu transmit\n");
lacp_dump_lacpdu(du);
}
@@ -495,12 +495,14 @@ lacp_tick(void *arg)
if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0)
continue;
+ CURVNET_SET(lp->lp_ifp->if_vnet);
lacp_run_timers(lp);
lacp_select(lp);
lacp_sm_mux(lp);
lacp_sm_tx(lp);
lacp_sm_ptx_tx_schedule(lp);
+ CURVNET_RESTORE();
}
callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc);
}
@@ -517,7 +519,7 @@ lacp_port_create(struct lagg_port *lgp)
int error;
boolean_t active = TRUE; /* XXX should be configurable */
- boolean_t fast = FALSE; /* XXX should be configurable */
+ boolean_t fast = FALSE; /* Configurable via ioctl */
bzero((char *)&sdl, sizeof(sdl));
sdl.sdl_len = sizeof(sdl);
@@ -577,12 +579,13 @@ lacp_port_destroy(struct lagg_port *lgp)
lacp_disable_distributing(lp);
lacp_unselect(lp);
+ LIST_REMOVE(lp, lp_next);
+ LACP_UNLOCK(lsc);
+
/* The address may have already been removed by if_purgemaddrs() */
if (!lgp->lp_detaching)
if_delmulti_ifma(lp->lp_ifma);
- LIST_REMOVE(lp, lp_next);
- LACP_UNLOCK(lsc);
free(lp, M_DEVBUF);
}
@@ -743,58 +746,19 @@ lacp_transit_expire(void *vp)
LACP_LOCK_ASSERT(lsc);
+ CURVNET_SET(lsc->lsc_softc->sc_ifp->if_vnet);
LACP_TRACE(NULL);
+ CURVNET_RESTORE();
lsc->lsc_suppress_distributing = FALSE;
}
-static void
-lacp_attach_sysctl(struct lacp_softc *lsc, struct sysctl_oid *p_oid)
-{
- struct lagg_softc *sc = lsc->lsc_softc;
-
- SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(p_oid), OID_AUTO,
- "lacp_strict_mode",
- CTLFLAG_RW,
- &lsc->lsc_strict_mode,
- lsc->lsc_strict_mode,
- "Enable LACP strict mode");
-}
-
-static void
-lacp_attach_sysctl_debug(struct lacp_softc *lsc, struct sysctl_oid *p_oid)
-{
- struct lagg_softc *sc = lsc->lsc_softc;
- struct sysctl_oid *oid;
-
- /* Create a child of the parent lagg interface */
- oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(p_oid),
- OID_AUTO, "debug", CTLFLAG_RD, NULL, "DEBUG");
-
- SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "rx_test",
- CTLFLAG_RW,
- &lsc->lsc_debug.lsc_rx_test,
- lsc->lsc_debug.lsc_rx_test,
- "Bitmap of if_dunit entries to drop RX frames for");
- SYSCTL_ADD_UINT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "tx_test",
- CTLFLAG_RW,
- &lsc->lsc_debug.lsc_tx_test,
- lsc->lsc_debug.lsc_tx_test,
- "Bitmap of if_dunit entries to drop TX frames for");
-}
-
-int
+void
lacp_attach(struct lagg_softc *sc)
{
struct lacp_softc *lsc;
- struct sysctl_oid *oid;
- lsc = malloc(sizeof(struct lacp_softc),
- M_DEVBUF, M_NOWAIT|M_ZERO);
- if (lsc == NULL)
- return (ENOMEM);
+ lsc = malloc(sizeof(struct lacp_softc), M_DEVBUF, M_WAITOK | M_ZERO);
sc->sc_psc = (caddr_t)lsc;
lsc->lsc_softc = sc;
@@ -806,35 +770,24 @@ lacp_attach(struct lagg_softc *sc)
TAILQ_INIT(&lsc->lsc_aggregators);
LIST_INIT(&lsc->lsc_ports);
- /* Create a child of the parent lagg interface */
- oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->sc_oid),
- OID_AUTO, "lacp", CTLFLAG_RD, NULL, "LACP");
-
- /* Attach sysctl nodes */
- lacp_attach_sysctl(lsc, oid);
- lacp_attach_sysctl_debug(lsc, oid);
-
callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_mtx, 0);
callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_mtx, 0);
/* if the lagg is already up then do the same */
if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING)
lacp_init(sc);
-
- return (0);
}
int
-lacp_detach(struct lagg_softc *sc)
+lacp_detach(void *psc)
{
- struct lacp_softc *lsc = LACP_SOFTC(sc);
+ struct lacp_softc *lsc = (struct lacp_softc *)psc;
KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators),
("aggregators still active"));
KASSERT(lsc->lsc_active_aggregator == NULL,
("aggregator still attached"));
- sc->sc_psc = NULL;
callout_drain(&lsc->lsc_transit_callout);
callout_drain(&lsc->lsc_callout);
@@ -883,7 +836,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
return (NULL);
}
- if (sc->use_flowid &&
+ if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
hash = m->m_pkthdr.flowid >> sc->flowid_shift;
else
@@ -1425,7 +1378,7 @@ lacp_sm_mux(struct lacp_port *lp)
enum lacp_selected selected = lp->lp_selected;
struct lacp_aggregator *la;
- if (lacp_debug > 1)
+ if (V_lacp_debug > 1)
lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, "
"p_sync= 0x%x, p_collecting= 0x%x\n", __func__,
lp->lp_mux_state, selected, p_sync, p_collecting);
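
These hunks move the LACP debug knob into per-VNET storage and bracket the timer callbacks with CURVNET_SET()/CURVNET_RESTORE() so the V_ accessors resolve against the right network stack instance. A userland analogue of the accessor pattern; all names here are simplified stand-ins, not the kernel's vnet(9) implementation:

    #include <stdio.h>

    /* Stand-in for one virtualized network stack instance. */
    struct vnet {
        int lacp_debug;
    };

    static struct vnet *curvnet;    /* the "current" vnet */

    /* Analogue of VNET_DEFINE(int, lacp_debug) plus its V_ accessor. */
    #define V_lacp_debug        (curvnet->lacp_debug)

    /* Simplified CURVNET_SET/RESTORE: save, switch, put back. */
    #define CURVNET_SET(v)      struct vnet *saved_vnet = curvnet; curvnet = (v)
    #define CURVNET_RESTORE()   curvnet = saved_vnet

    static void
    lacp_tick(struct vnet *v)
    {
        CURVNET_SET(v);
        if (V_lacp_debug)
            printf("debug logging enabled in this vnet\n");
        CURVNET_RESTORE();
    }

    int
    main(void)
    {
        struct vnet a = { 1 }, b = { 0 };

        lacp_tick(&a);  /* prints: this instance has the knob set */
        lacp_tick(&b);  /* silent: same code, different instance */
        return (0);
    }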
diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h
index ca5f76e..1573c0b 100644
--- a/sys/net/ieee8023ad_lacp.h
+++ b/sys/net/ieee8023ad_lacp.h
@@ -75,6 +75,7 @@
"\007DEFAULTED" \
"\010EXPIRED"
+#ifdef _KERNEL
/*
* IEEE802.3 slow protocols
*
@@ -250,6 +251,7 @@ struct lacp_softc {
u_int32_t lsc_tx_test;
} lsc_debug;
u_int32_t lsc_strict_mode;
+ boolean_t lsc_fast_timeout; /* if set, fast timeout */
};
#define LACP_TYPE_ACTORINFO 1
@@ -282,8 +284,8 @@ struct lacp_softc {
struct mbuf *lacp_input(struct lagg_port *, struct mbuf *);
struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
-int lacp_attach(struct lagg_softc *);
-int lacp_detach(struct lagg_softc *);
+void lacp_attach(struct lagg_softc *);
+int lacp_detach(void *);
void lacp_init(struct lagg_softc *);
void lacp_stop(struct lagg_softc *);
int lacp_port_create(struct lagg_port *);
@@ -336,3 +338,4 @@ lacp_isdistributing(struct lagg_port *lgp)
#define LACP_LAGIDSTR_MAX \
(1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1)
#define LACP_STATESTR_MAX (255) /* XXX */
+#endif /* _KERNEL */
diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c
index 66669ca..f2a38c1 100644
--- a/sys/net/if_gif.c
+++ b/sys/net/if_gif.c
@@ -419,13 +419,8 @@ gif_transmit(struct ifnet *ifp, struct mbuf *m)
}
eth = mtod(m, struct etherip_header *);
eth->eip_resvh = 0;
- if ((sc->gif_options & GIF_SEND_REVETHIP) != 0) {
- eth->eip_ver = 0;
- eth->eip_resvl = ETHERIP_VERSION;
- } else {
- eth->eip_ver = ETHERIP_VERSION;
- eth->eip_resvl = 0;
- }
+ eth->eip_ver = ETHERIP_VERSION;
+ eth->eip_resvl = 0;
break;
default:
error = EAFNOSUPPORT;
@@ -633,19 +628,10 @@ gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
if (m == NULL)
goto drop;
eip = mtod(m, struct etherip_header *);
- /*
- * GIF_ACCEPT_REVETHIP (enabled by default) intentionally
- * accepts an EtherIP packet with revered version field in
- * the header. This is a knob for backward compatibility
- * with FreeBSD 7.2R or prior.
- */
if (eip->eip_ver != ETHERIP_VERSION) {
- if ((gif_options & GIF_ACCEPT_REVETHIP) == 0 ||
- eip->eip_resvl != ETHERIP_VERSION) {
- /* discard unknown versions */
- m_freem(m);
- goto drop;
- }
+ /* discard unknown versions */
+ m_freem(m);
+ goto drop;
}
m_adj(m, sizeof(struct etherip_header));
@@ -766,50 +752,32 @@ gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
goto bad;
/* validate sa_len */
+ /* check sa_family looks sane for the cmd */
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
goto bad;
- break;
-#endif
-#ifdef INET6
- case AF_INET6:
- if (src->sa_len != sizeof(struct sockaddr_in6))
+ if (cmd != SIOCSIFPHYADDR) {
+ error = EAFNOSUPPORT;
goto bad;
- break;
-#endif
- default:
- error = EAFNOSUPPORT;
- goto bad;
- }
- /* check sa_family looks sane for the cmd */
- error = EAFNOSUPPORT;
- switch (cmd) {
-#ifdef INET
- case SIOCSIFPHYADDR:
- if (src->sa_family == AF_INET)
- break;
- goto bad;
-#endif
-#ifdef INET6
- case SIOCSIFPHYADDR_IN6:
- if (src->sa_family == AF_INET6)
- break;
- goto bad;
-#endif
- }
- error = EADDRNOTAVAIL;
- switch (src->sa_family) {
-#ifdef INET
- case AF_INET:
+ }
if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
- satosin(dst)->sin_addr.s_addr == INADDR_ANY)
+ satosin(dst)->sin_addr.s_addr == INADDR_ANY) {
+ error = EADDRNOTAVAIL;
goto bad;
+ }
break;
#endif
#ifdef INET6
case AF_INET6:
+ if (src->sa_len != sizeof(struct sockaddr_in6))
+ goto bad;
+ if (cmd != SIOCSIFPHYADDR_IN6) {
+ error = EAFNOSUPPORT;
+ goto bad;
+ }
+ error = EADDRNOTAVAIL;
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)
||
IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr))
@@ -825,8 +793,12 @@ gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = sa6_embedscope(satosin6(dst), 0);
if (error != 0)
goto bad;
+ break;
#endif
- };
+ default:
+ error = EAFNOSUPPORT;
+ goto bad;
+ }
error = gif_set_tunnel(ifp, src, dst);
break;
case SIOCDIFPHYADDR:
diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h
index ed143e8..b71c8e8 100644
--- a/sys/net/if_gif.h
+++ b/sys/net/if_gif.h
@@ -126,10 +126,7 @@ int in6_gif_attach(struct gif_softc *);
#define GIFGOPTS _IOWR('i', 150, struct ifreq)
#define GIFSOPTS _IOW('i', 151, struct ifreq)
-#define GIF_ACCEPT_REVETHIP 0x0001
#define GIF_IGNORE_SOURCE 0x0002
-#define GIF_SEND_REVETHIP 0x0010
-#define GIF_OPTMASK (GIF_ACCEPT_REVETHIP|GIF_SEND_REVETHIP| \
- GIF_IGNORE_SOURCE)
+#define GIF_OPTMASK (GIF_IGNORE_SOURCE)
#endif /* _NET_IF_GIF_H_ */
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index ce1365b..2522fb6 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>
+#include <net/vnet.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
@@ -81,13 +82,21 @@ static struct {
{0, NULL}
};
-SLIST_HEAD(__trhead, lagg_softc) lagg_list; /* list of laggs */
-static struct mtx lagg_list_mtx;
+VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
+#define V_lagg_list VNET(lagg_list)
+static VNET_DEFINE(struct mtx, lagg_list_mtx);
+#define V_lagg_list_mtx VNET(lagg_list_mtx)
+#define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \
+ "if_lagg list", NULL, MTX_DEF)
+#define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx)
+#define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx)
+#define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx)
eventhandler_tag lagg_detach_cookie = NULL;
static int lagg_clone_create(struct if_clone *, int, caddr_t);
static void lagg_clone_destroy(struct ifnet *);
-static struct if_clone *lagg_cloner;
+static VNET_DEFINE(struct if_clone *, lagg_cloner);
+#define V_lagg_cloner VNET(lagg_cloner)
static const char laggname[] = "lagg";
static void lagg_lladdr(struct lagg_softc *, uint8_t *);
@@ -123,24 +132,23 @@ static void lagg_media_status(struct ifnet *, struct ifmediareq *);
static struct lagg_port *lagg_link_active(struct lagg_softc *,
struct lagg_port *);
static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *);
-static int lagg_sysctl_active(SYSCTL_HANDLER_ARGS);
/* Simple round robin */
-static int lagg_rr_attach(struct lagg_softc *);
+static void lagg_rr_attach(struct lagg_softc *);
static int lagg_rr_detach(struct lagg_softc *);
static int lagg_rr_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
struct mbuf *);
/* Active failover */
-static int lagg_fail_attach(struct lagg_softc *);
+static void lagg_fail_attach(struct lagg_softc *);
static int lagg_fail_detach(struct lagg_softc *);
static int lagg_fail_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
struct mbuf *);
/* Loadbalancing */
-static int lagg_lb_attach(struct lagg_softc *);
+static void lagg_lb_attach(struct lagg_softc *);
static int lagg_lb_detach(struct lagg_softc *);
static int lagg_lb_port_create(struct lagg_port *);
static void lagg_lb_port_destroy(struct lagg_port *);
@@ -150,7 +158,7 @@ static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
/* 802.3ad LACP */
-static int lagg_lacp_attach(struct lagg_softc *);
+static void lagg_lacp_attach(struct lagg_softc *);
static int lagg_lacp_detach(struct lagg_softc *);
static int lagg_lacp_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
@@ -160,9 +168,9 @@ static void lagg_lacp_lladdr(struct lagg_softc *);
static void lagg_callout(void *);
/* lagg protocol table */
-static const struct {
- int ti_proto;
- int (*ti_attach)(struct lagg_softc *);
+static const struct lagg_proto {
+ lagg_proto ti_proto;
+ void (*ti_attach)(struct lagg_softc *);
} lagg_protos[] = {
{ LAGG_PROTO_ROUNDROBIN, lagg_rr_attach },
{ LAGG_PROTO_FAILOVER, lagg_fail_attach },
@@ -176,31 +184,55 @@ SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
"Link Aggregation");
-static int lagg_failover_rx_all = 0; /* Allow input on any failover links */
-SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW,
- &lagg_failover_rx_all, 0,
+/* Allow input on any failover links */
+static VNET_DEFINE(int, lagg_failover_rx_all);
+#define V_lagg_failover_rx_all VNET(lagg_failover_rx_all)
+SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
+ &VNET_NAME(lagg_failover_rx_all), 0,
"Accept input from any interface in a failover lagg");
-static int def_use_flowid = 1; /* Default value for using flowid */
-TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid);
-SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW,
- &def_use_flowid, 0,
+
+/* Default value for using M_FLOWID */
+static VNET_DEFINE(int, def_use_flowid) = 1;
+#define V_def_use_flowid VNET(def_use_flowid)
+SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
+ &VNET_NAME(def_use_flowid), 0,
"Default setting for using flow id for load sharing");
-static int def_flowid_shift = 16; /* Default value for using flow shift */
-TUNABLE_INT("net.link.lagg.default_flowid_shift", &def_flowid_shift);
-SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RW,
- &def_flowid_shift, 0,
+
+/* Default value for using M_FLOWID */
+static VNET_DEFINE(int, def_flowid_shift) = 16;
+#define V_def_flowid_shift VNET(def_flowid_shift)
+SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
+ &VNET_NAME(def_flowid_shift), 0,
"Default setting for flowid shift for load sharing");
+static void
+vnet_lagg_init(const void *unused __unused)
+{
+
+ LAGG_LIST_LOCK_INIT();
+ SLIST_INIT(&V_lagg_list);
+ V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
+ lagg_clone_destroy, 0);
+}
+VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ vnet_lagg_init, NULL);
+
+static void
+vnet_lagg_uninit(const void *unused __unused)
+{
+
+ if_clone_detach(V_lagg_cloner);
+ LAGG_LIST_LOCK_DESTROY();
+}
+VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ vnet_lagg_uninit, NULL);
+
static int
lagg_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
- mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF);
- SLIST_INIT(&lagg_list);
- lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
- lagg_clone_destroy, 0);
lagg_input_p = lagg_input;
lagg_linkstate_p = lagg_port_state;
lagg_detach_cookie = EVENTHANDLER_REGISTER(
@@ -210,10 +242,8 @@ lagg_modevent(module_t mod, int type, void *data)
case MOD_UNLOAD:
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
lagg_detach_cookie);
- if_clone_detach(lagg_cloner);
lagg_input_p = NULL;
lagg_linkstate_p = NULL;
- mtx_destroy(&lagg_list_mtx);
break;
default:
return (EOPNOTSUPP);
@@ -279,10 +309,8 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct lagg_softc *sc;
struct ifnet *ifp;
- int i, error = 0;
static const u_char eaddr[6]; /* 00:00:00:00:00:00 */
- struct sysctl_oid *oid;
- char num[14]; /* sufficient for 32 bits */
+ int i;
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
@@ -296,29 +324,10 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
sc->sc_ibytes = counter_u64_alloc(M_WAITOK);
sc->sc_obytes = counter_u64_alloc(M_WAITOK);
- sysctl_ctx_init(&sc->ctx);
- snprintf(num, sizeof(num), "%u", unit);
- sc->use_flowid = def_use_flowid;
- sc->flowid_shift = def_flowid_shift;
- sc->sc_oid = oid = SYSCTL_ADD_NODE(&sc->ctx,
- &SYSCTL_NODE_CHILDREN(_net_link, lagg),
- OID_AUTO, num, CTLFLAG_RD, NULL, "");
- SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "use_flowid", CTLFLAG_RW, &sc->use_flowid,
- sc->use_flowid, "Use flow id for load sharing");
- SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "flowid_shift", CTLFLAG_RW, &sc->flowid_shift,
- sc->flowid_shift,
- "Shift flowid bits to prevent multiqueue collisions");
- SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "count", CTLFLAG_RD, &sc->sc_count, sc->sc_count,
- "Total number of ports");
- SYSCTL_ADD_PROC(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "active", CTLTYPE_INT|CTLFLAG_RD, sc, 0, lagg_sysctl_active,
- "I", "Total number of active ports");
- SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
- "flapping", CTLFLAG_RD, &sc->sc_flapping,
- sc->sc_flapping, "Total number of port change events");
+ if (V_def_use_flowid)
+ sc->sc_opts |= LAGG_OPT_USE_FLOWID;
+ sc->flowid_shift = V_def_flowid_shift;
+
/* Hash all layers by default */
sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4;
@@ -326,11 +335,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
sc->sc_proto = lagg_protos[i].ti_proto;
- if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
- if_free(ifp);
- free(sc, M_DEVBUF);
- return (error);
- }
+ lagg_protos[i].ti_attach(sc);
break;
}
}
@@ -375,9 +380,9 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
/* Insert into the global list of laggs */
- mtx_lock(&lagg_list_mtx);
- SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries);
- mtx_unlock(&lagg_list_mtx);
+ LAGG_LIST_LOCK();
+ SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
+ LAGG_LIST_UNLOCK();
callout_reset(&sc->sc_callout, hz, lagg_callout, sc);
@@ -404,10 +409,9 @@ lagg_clone_destroy(struct ifnet *ifp)
/* Unhook the aggregation protocol */
if (sc->sc_detach != NULL)
(*sc->sc_detach)(sc);
+ else
+ LAGG_WUNLOCK(sc);
- LAGG_WUNLOCK(sc);
-
- sysctl_ctx_free(&sc->ctx);
ifmedia_removeall(&sc->sc_media);
ether_ifdetach(ifp);
if_free(ifp);
@@ -421,9 +425,9 @@ lagg_clone_destroy(struct ifnet *ifp)
counter_u64_free(sc->sc_ibytes);
counter_u64_free(sc->sc_obytes);
- mtx_lock(&lagg_list_mtx);
- SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries);
- mtx_unlock(&lagg_list_mtx);
+ LAGG_LIST_LOCK();
+ SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
+ LAGG_LIST_UNLOCK();
taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task);
LAGG_LOCK_DESTROY(sc);
@@ -435,15 +439,28 @@ static void
lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr)
{
struct ifnet *ifp = sc->sc_ifp;
+ struct lagg_port lp;
if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
return;
+ LAGG_WLOCK_ASSERT(sc);
+ /*
+ * Set the link layer address on the lagg interface.
+ * sc_lladdr() notifies the aggregation protocol of the MAC
+ * change. The iflladdr_event handler, which may trigger
+ * gratuitous ARPs for INET, is invoked from a taskqueue.
+ */
bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN);
- /* Let the protocol know the MAC has changed */
if (sc->sc_lladdr != NULL)
(*sc->sc_lladdr)(sc);
- EVENTHANDLER_INVOKE(iflladdr_event, ifp);
+
+ bzero(&lp, sizeof(lp));
+ lp.lp_ifp = sc->sc_ifp;
+ lp.lp_softc = sc;
+
+ lagg_port_lladdr(&lp, lladdr);
}
static void
@@ -491,11 +508,13 @@ lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
struct ifnet *ifp = lp->lp_ifp;
struct lagg_llq *llq;
int pending = 0;
+ int primary;
LAGG_WLOCK_ASSERT(sc);
- if (lp->lp_detaching ||
- memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
+ primary = (sc->sc_primary->lp_ifp == ifp) ? 1 : 0;
+ if (primary == 0 && (lp->lp_detaching ||
+ memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0))
return;
/* Check to make sure its not already queued to be changed */
@@ -514,6 +533,7 @@ lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
/* Update the lladdr even if pending, it may have changed */
llq->llq_ifp = ifp;
+ llq->llq_primary = primary;
bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN);
if (!pending)
@@ -546,14 +566,20 @@ lagg_port_setlladdr(void *arg, int pending)
for (llq = head; llq != NULL; llq = head) {
ifp = llq->llq_ifp;
- /* Set the link layer address */
CURVNET_SET(ifp->if_vnet);
- error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN);
+ if (llq->llq_primary == 0) {
+ /*
+ * Set the link layer address on the laggport interface.
+ * if_setlladdr() triggers gratuitous ARPs for INET.
+ */
+ error = if_setlladdr(ifp, llq->llq_lladdr,
+ ETHER_ADDR_LEN);
+ if (error)
+ printf("%s: setlladdr failed on %s\n", __func__,
+ ifp->if_xname);
+ } else
+ EVENTHANDLER_INVOKE(iflladdr_event, ifp);
CURVNET_RESTORE();
- if (error)
- printf("%s: setlladdr failed on %s\n", __func__,
- ifp->if_xname);
-
head = SLIST_NEXT(llq, llq_entries);
free(llq, M_DEVBUF);
}
@@ -585,34 +611,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
if (ifp->if_type != IFT_ETHER)
return (EPROTONOSUPPORT);
-#ifdef INET6
- /*
- * The member interface should not have inet6 address because
- * two interfaces with a valid link-local scope zone must not be
- * merged in any form. This restriction is needed to
- * prevent violation of link-local scope zone. Attempts to
- * add a member interface which has inet6 addresses triggers
- * removal of all inet6 addresses on the member interface.
- */
- SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
- if (in6ifa_llaonifp(lp->lp_ifp)) {
- in6_ifdetach(lp->lp_ifp);
- if_printf(sc->sc_ifp,
- "IPv6 addresses on %s have been removed "
- "before adding it as a member to prevent "
- "IPv6 address scope violation.\n",
- lp->lp_ifp->if_xname);
- }
- }
- if (in6ifa_llaonifp(ifp)) {
- in6_ifdetach(ifp);
- if_printf(sc->sc_ifp,
- "IPv6 addresses on %s have been removed "
- "before adding it as a member to prevent "
- "IPv6 address scope violation.\n",
- ifp->if_xname);
- }
-#endif
/* Allow the first Ethernet member to define the MTU */
if (SLIST_EMPTY(&sc->sc_ports))
sc->sc_ifp->if_mtu = ifp->if_mtu;
@@ -627,10 +625,10 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
return (ENOMEM);
/* Check if port is a stacked lagg */
- mtx_lock(&lagg_list_mtx);
- SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) {
+ LAGG_LIST_LOCK();
+ SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
if (ifp == sc_ptr->sc_ifp) {
- mtx_unlock(&lagg_list_mtx);
+ LAGG_LIST_UNLOCK();
free(lp, M_DEVBUF);
return (EINVAL);
/* XXX disable stacking for the moment, its untested */
@@ -638,14 +636,14 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
lp->lp_flags |= LAGG_PORT_STACK;
if (lagg_port_checkstacking(sc_ptr) >=
LAGG_MAX_STACKING) {
- mtx_unlock(&lagg_list_mtx);
+ LAGG_LIST_UNLOCK();
free(lp, M_DEVBUF);
return (E2BIG);
}
#endif
}
}
- mtx_unlock(&lagg_list_mtx);
+ LAGG_LIST_UNLOCK();
/* Change the interface type */
lp->lp_iftype = ifp->if_type;
@@ -995,10 +993,12 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
struct lagg_reqall *ra = (struct lagg_reqall *)data;
+ struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
struct ifreq *ifr = (struct ifreq *)data;
struct lagg_port *lp;
+ const struct lagg_proto *proto = NULL;
struct ifnet *tpif;
struct thread *td = curthread;
char *buf, *outbuf;
@@ -1046,50 +1046,153 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = priv_check(td, PRIV_NET_LAGG);
if (error)
break;
- if (ra->ra_proto >= LAGG_PROTO_MAX) {
+ for (proto = lagg_protos; proto->ti_proto != LAGG_PROTO_NONE;
+ proto++) {
+ if (proto->ti_proto == ra->ra_proto) {
+ if (sc->sc_ifflags & IFF_DEBUG)
+ printf("%s: using proto %u\n",
+ sc->sc_ifname, proto->ti_proto);
+ break;
+ }
+ }
+ if (proto->ti_proto == LAGG_PROTO_NONE) {
error = EPROTONOSUPPORT;
break;
}
+ /* sc_proto stays LAGG_PROTO_NONE while the new protocol attaches. */
LAGG_WLOCK(sc);
if (sc->sc_proto != LAGG_PROTO_NONE) {
- /* Reset protocol first in case detach unlocks */
sc->sc_proto = LAGG_PROTO_NONE;
- error = sc->sc_detach(sc);
- sc->sc_detach = NULL;
- sc->sc_start = NULL;
- sc->sc_input = NULL;
- sc->sc_port_create = NULL;
- sc->sc_port_destroy = NULL;
- sc->sc_linkstate = NULL;
- sc->sc_init = NULL;
- sc->sc_stop = NULL;
- sc->sc_lladdr = NULL;
- sc->sc_req = NULL;
- sc->sc_portreq = NULL;
- } else if (sc->sc_input != NULL) {
- /* Still detaching */
- error = EBUSY;
- }
- if (error != 0) {
+ if (sc->sc_detach != NULL)
+ sc->sc_detach(sc);
+ else
+ LAGG_WUNLOCK(sc);
+ } else
LAGG_WUNLOCK(sc);
+ proto->ti_attach(sc);
+ LAGG_WLOCK(sc);
+ sc->sc_proto = proto->ti_proto;
+ LAGG_WUNLOCK(sc);
+ break;
+ case SIOCGLAGGOPTS:
+ ro->ro_opts = sc->sc_opts;
+ if (sc->sc_proto == LAGG_PROTO_LACP) {
+ struct lacp_softc *lsc;
+
+ lsc = (struct lacp_softc *)sc->sc_psc;
+ if (lsc->lsc_debug.lsc_tx_test != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
+ if (lsc->lsc_debug.lsc_rx_test != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
+ if (lsc->lsc_strict_mode != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_STRICT;
+ if (lsc->lsc_fast_timeout != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT;
+
+ ro->ro_active = sc->sc_active;
+ } else {
+ ro->ro_active = 0;
+ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
+ ro->ro_active += LAGG_PORTACTIVE(lp);
+ }
+ ro->ro_flapping = sc->sc_flapping;
+ ro->ro_flowid_shift = sc->flowid_shift;
+ break;
+ case SIOCSLAGGOPTS:
+ error = priv_check(td, PRIV_NET_LAGG);
+ if (error)
+ break;
+ if (ro->ro_opts == 0)
+ break;
+ /*
+ * Set options. LACP options are stored in sc->sc_psc,
+ * not in sc_opts.
+ */
+ int valid, lacp;
+
+ switch (ro->ro_opts) {
+ case LAGG_OPT_USE_FLOWID:
+ case -LAGG_OPT_USE_FLOWID:
+ case LAGG_OPT_FLOWIDSHIFT:
+ valid = 1;
+ lacp = 0;
+ break;
+ case LAGG_OPT_LACP_TXTEST:
+ case -LAGG_OPT_LACP_TXTEST:
+ case LAGG_OPT_LACP_RXTEST:
+ case -LAGG_OPT_LACP_RXTEST:
+ case LAGG_OPT_LACP_STRICT:
+ case -LAGG_OPT_LACP_STRICT:
+ case LAGG_OPT_LACP_TIMEOUT:
+ case -LAGG_OPT_LACP_TIMEOUT:
+ valid = lacp = 1;
+ break;
+ default:
+ valid = lacp = 0;
break;
}
- for (int i = 0; i < (sizeof(lagg_protos) /
- sizeof(lagg_protos[0])); i++) {
- if (lagg_protos[i].ti_proto == ra->ra_proto) {
- if (sc->sc_ifflags & IFF_DEBUG)
- printf("%s: using proto %u\n",
- sc->sc_ifname,
- lagg_protos[i].ti_proto);
- sc->sc_proto = lagg_protos[i].ti_proto;
- if (sc->sc_proto != LAGG_PROTO_NONE)
- error = lagg_protos[i].ti_attach(sc);
- LAGG_WUNLOCK(sc);
- return (error);
+
+ LAGG_WLOCK(sc);
+ if (valid == 0 ||
+ (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
+ /* Invalid combination of options specified. */
+ error = EINVAL;
+ LAGG_WUNLOCK(sc);
+ break; /* Return from SIOCSLAGGOPTS. */
+ }
+ /*
+ * Store new options into sc->sc_opts except for
+ * FLOWIDSHIFT and LACP options.
+ */
+ if (lacp == 0) {
+ if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
+ sc->flowid_shift = ro->ro_flowid_shift;
+ else if (ro->ro_opts > 0)
+ sc->sc_opts |= ro->ro_opts;
+ else
+ sc->sc_opts &= ~ro->ro_opts;
+ } else {
+ struct lacp_softc *lsc;
+ struct lacp_port *lp;
+
+ lsc = (struct lacp_softc *)sc->sc_psc;
+
+ switch (ro->ro_opts) {
+ case LAGG_OPT_LACP_TXTEST:
+ lsc->lsc_debug.lsc_tx_test = 1;
+ break;
+ case -LAGG_OPT_LACP_TXTEST:
+ lsc->lsc_debug.lsc_tx_test = 0;
+ break;
+ case LAGG_OPT_LACP_RXTEST:
+ lsc->lsc_debug.lsc_rx_test = 1;
+ break;
+ case -LAGG_OPT_LACP_RXTEST:
+ lsc->lsc_debug.lsc_rx_test = 0;
+ break;
+ case LAGG_OPT_LACP_STRICT:
+ lsc->lsc_strict_mode = 1;
+ break;
+ case -LAGG_OPT_LACP_STRICT:
+ lsc->lsc_strict_mode = 0;
+ break;
+ case LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state |= LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 1;
+ break;
+ case -LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state &= ~LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 0;
+ break;
}
}
LAGG_WUNLOCK(sc);
- error = EPROTONOSUPPORT;
break;
case SIOCGLAGGFLAGS:
rf->rf_flags = sc->sc_flags;
@@ -1134,6 +1237,26 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = EINVAL;
break;
}
+#ifdef INET6
+ /*
+ * A laggport interface should not have an inet6 address
+ * because two interfaces with a valid link-local
+ * scope zone must not be merged in any form. This
+ * restriction is needed to prevent violation of the
+ * link-local scope zone. An attempt to add a laggport
+ * interface which has inet6 addresses triggers
+ * removal of all inet6 addresses on that member
+ * interface.
+ */
+ if (in6ifa_llaonifp(tpif)) {
+ in6_ifdetach(tpif);
+ if_printf(sc->sc_ifp,
+ "IPv6 addresses on %s have been removed "
+ "before adding it as a member to prevent "
+ "IPv6 address scope violation.\n",
+ tpif->if_xname);
+ }
+#endif
LAGG_WLOCK(sc);
error = lagg_port_create(sc, tpif);
LAGG_WUNLOCK(sc);
@@ -1419,7 +1542,7 @@ lagg_input(struct ifnet *ifp, struct mbuf *m)
ETHER_BPF_MTAP(scifp, m);
- m = (*sc->sc_input)(sc, lp, m);
+ m = (lp->lp_detaching == 0) ? (*sc->sc_input)(sc, lp, m) : NULL;
if (m != NULL) {
counter_u64_add(sc->sc_ipackets, 1);
@@ -1582,27 +1705,6 @@ lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
return (mtod(m, char *) + off);
}
-static int
-lagg_sysctl_active(SYSCTL_HANDLER_ARGS)
-{
- struct lagg_softc *sc = (struct lagg_softc *)arg1;
- struct lagg_port *lp;
- int error;
-
- /* LACP tracks active links automatically, the others do not */
- if (sc->sc_proto != LAGG_PROTO_LACP) {
- sc->sc_active = 0;
- SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
- sc->sc_active += LAGG_PORTACTIVE(lp);
- }
-
- error = sysctl_handle_int(oidp, &sc->sc_active, 0, req);
- if ((error) || (req->newptr == NULL))
- return (error);
-
- return (0);
-}
-
uint32_t
lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key)
{
@@ -1715,18 +1817,16 @@ lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
/*
* Simple round robin aggregation
*/
-
-static int
+static void
lagg_rr_attach(struct lagg_softc *sc)
{
sc->sc_detach = lagg_rr_detach;
sc->sc_start = lagg_rr_start;
sc->sc_input = lagg_rr_input;
+ sc->sc_detach = NULL;
sc->sc_port_create = NULL;
sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
sc->sc_seq = 0;
-
- return (0);
}
static int
@@ -1774,8 +1874,7 @@ lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
/*
* Active failover
*/
-
-static int
+static void
lagg_fail_attach(struct lagg_softc *sc)
{
sc->sc_detach = lagg_fail_detach;
@@ -1783,8 +1882,7 @@ lagg_fail_attach(struct lagg_softc *sc)
sc->sc_input = lagg_fail_input;
sc->sc_port_create = NULL;
sc->sc_port_destroy = NULL;
-
- return (0);
+ sc->sc_detach = NULL;
}
static int
@@ -1814,7 +1912,7 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
struct ifnet *ifp = sc->sc_ifp;
struct lagg_port *tmp_tp;
- if (lp == sc->sc_primary || lagg_failover_rx_all) {
+ if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
m->m_pkthdr.rcvif = ifp;
return (m);
}
@@ -1838,16 +1936,13 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
/*
* Loadbalancing
*/
-
-static int
+static void
lagg_lb_attach(struct lagg_softc *sc)
{
struct lagg_port *lp;
struct lagg_lb *lb;
- if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb),
- M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
- return (ENOMEM);
+ lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO);
sc->sc_detach = lagg_lb_detach;
sc->sc_start = lagg_lb_start;
@@ -1861,14 +1956,13 @@ lagg_lb_attach(struct lagg_softc *sc)
SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
lagg_lb_port_create(lp);
-
- return (0);
}
static int
lagg_lb_detach(struct lagg_softc *sc)
{
struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
+ LAGG_WUNLOCK(sc);
if (lb != NULL)
free(lb, M_DEVBUF);
return (0);
@@ -1917,7 +2011,7 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
struct lagg_port *lp = NULL;
uint32_t p = 0;
- if (sc->use_flowid &&
+ if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
p = m->m_pkthdr.flowid >> sc->flowid_shift;
else
@@ -1952,12 +2046,10 @@ lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
/*
* 802.3ad LACP
*/
-
-static int
+static void
lagg_lacp_attach(struct lagg_softc *sc)
{
struct lagg_port *lp;
- int error;
sc->sc_detach = lagg_lacp_detach;
sc->sc_port_create = lacp_port_create;
@@ -1971,31 +2063,28 @@ lagg_lacp_attach(struct lagg_softc *sc)
sc->sc_req = lacp_req;
sc->sc_portreq = lacp_portreq;
- error = lacp_attach(sc);
- if (error)
- return (error);
+ lacp_attach(sc);
SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
lacp_port_create(lp);
-
- return (error);
}
static int
lagg_lacp_detach(struct lagg_softc *sc)
{
struct lagg_port *lp;
- int error;
+ void *psc;
SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
lacp_port_destroy(lp);
- /* unlocking is safe here */
+ psc = sc->sc_psc;
+ sc->sc_psc = NULL;
LAGG_WUNLOCK(sc);
- error = lacp_detach(sc);
- LAGG_WLOCK(sc);
- return (error);
+ lacp_detach(psc);
+
+ return (0);
}
static void
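
A minimal userland sketch of driving the new options interface (hedged:
lagg_set_opt() and its error handling are hypothetical; SIOCSLAGGOPTS and
struct lagg_reqopts come from the if_lagg.h hunk below). A positive
ro_opts value sets an option and its negation clears it, matching the sign
checks in the SIOCSLAGGOPTS case above:

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/if_lagg.h>
	#include <string.h>
	#include <unistd.h>

	static int
	lagg_set_opt(const char *ifname, int opt)
	{
		struct lagg_reqopts ro;
		int s, error;

		memset(&ro, 0, sizeof(ro));
		strlcpy(ro.ro_ifname, ifname, sizeof(ro.ro_ifname));
		/* e.g. LAGG_OPT_USE_FLOWID, or -LAGG_OPT_USE_FLOWID to clear */
		ro.ro_opts = opt;
		s = socket(AF_LOCAL, SOCK_DGRAM, 0);
		if (s == -1)
			return (-1);
		error = ioctl(s, SIOCSLAGGOPTS, &ro);
		close(s);
		return (error);
	}
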
diff --git a/sys/net/if_lagg.h b/sys/net/if_lagg.h
index ff1ae2f..ac01032 100644
--- a/sys/net/if_lagg.h
+++ b/sys/net/if_lagg.h
@@ -47,17 +47,19 @@
"\05DISTRIBUTING\06DISABLED"
/* Supported lagg PROTOs */
-#define LAGG_PROTO_NONE 0 /* no lagg protocol defined */
-#define LAGG_PROTO_ROUNDROBIN 1 /* simple round robin */
-#define LAGG_PROTO_FAILOVER 2 /* active failover */
-#define LAGG_PROTO_LOADBALANCE 3 /* loadbalance */
-#define LAGG_PROTO_LACP 4 /* 802.3ad lacp */
-#define LAGG_PROTO_ETHERCHANNEL 5 /* Cisco FEC */
-#define LAGG_PROTO_MAX 6
+typedef enum {
+ LAGG_PROTO_NONE = 0, /* no lagg protocol defined */
+ LAGG_PROTO_ROUNDROBIN, /* simple round robin */
+ LAGG_PROTO_FAILOVER, /* active failover */
+ LAGG_PROTO_LOADBALANCE, /* loadbalance */
+ LAGG_PROTO_LACP, /* 802.3ad lacp */
+ LAGG_PROTO_ETHERCHANNEL,/* Cisco FEC */
+ LAGG_PROTO_MAX,
+} lagg_proto;
struct lagg_protos {
const char *lpr_name;
- int lpr_proto;
+ lagg_proto lpr_proto;
};
#define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER
@@ -134,6 +136,31 @@ struct lagg_reqflags {
#define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags)
#define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags)
+struct lagg_reqopts {
+ char ro_ifname[IFNAMSIZ]; /* name of the lagg */
+
+ int ro_opts; /* Option bitmap */
+#define LAGG_OPT_NONE 0x00
+#define LAGG_OPT_USE_FLOWID 0x01 /* use M_FLOWID */
+/* Pseudo flags which are used in ro_opts but not stored into sc_opts. */
+#define LAGG_OPT_FLOWIDSHIFT 0x02 /* Set flowid */
+#define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */
+#define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */
+#define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */
+#define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */
+#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */
+ u_int ro_count; /* number of ports */
+ u_int ro_active; /* active port count */
+ u_int ro_flapping; /* number of flapping events */
+ int ro_flowid_shift; /* shift the flowid */
+};
+
+#define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts)
+#define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts)
+
+#define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \
+ "\006LACP_TXTEST\007LACP_RXTEST"
+
#ifdef _KERNEL
#include <sys/counter.h>
@@ -183,6 +210,7 @@ struct lagg_mc {
struct lagg_llq {
struct ifnet *llq_ifp;
uint8_t llq_lladdr[ETHER_ADDR_LEN];
+ uint8_t llq_primary;
SLIST_ENTRY(lagg_llq) llq_entries;
};
@@ -229,9 +257,7 @@ struct lagg_softc {
eventhandler_tag vlan_attach;
eventhandler_tag vlan_detach;
struct callout sc_callout;
- struct sysctl_ctx_list ctx; /* sysctl variables */
- struct sysctl_oid *sc_oid; /* sysctl tree oid */
- int use_flowid; /* enable use of flowid */
+ u_int sc_opts;
int flowid_shift; /* set flowid shift*/
};
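
The LAGG_OPT_BITS string above uses the kernel's "%b" printf format (the
first byte is the numeric base, then one \<bit>NAME pair per flag), so the
option word can be decoded in one line. A sketch, assuming a struct
lagg_softc pointer sc:

	printf("lagg options: %b\n", sc->sc_opts, LAGG_OPT_BITS);
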
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 3c60274..188057d 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -640,7 +640,7 @@ typedef struct _ipfw_table_xentry {
char iface[IF_NAMESIZE]; /* interface name */
} k;
} ipfw_table_xentry;
-#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */
+#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */
typedef struct _ipfw_table {
u_int32_t size; /* size of entries in bytes */
diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c
index 4c94ba8..231b269 100644
--- a/sys/netinet6/in6.c
+++ b/sys/netinet6/in6.c
@@ -323,8 +323,6 @@ in6_control(struct socket *so, u_long cmd, caddr_t data,
/* FALLTHROUGH */
case OSIOCGIFINFO_IN6:
case SIOCGIFINFO_IN6:
- case SIOCGDRLST_IN6:
- case SIOCGPRLST_IN6:
case SIOCGNBRINFO_IN6:
case SIOCGDEFIFACE_IN6:
return (nd6_ioctl(cmd, data, ifp));
@@ -1254,11 +1252,13 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
* source address.
*/
ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */
- if (hostIsNew && in6if_do_dad(ifp))
- ia->ia6_flags |= IN6_IFF_TENTATIVE;
- /* DAD should be performed after ND6_IFF_IFDISABLED is cleared. */
- if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)
+ /*
+ * DAD should be performed for a new address or for addresses
+ * on an interface with ND6_IFF_IFDISABLED.
+ */
+ if (in6if_do_dad(ifp) &&
+ (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)))
ia->ia6_flags |= IN6_IFF_TENTATIVE;
/*
@@ -1280,13 +1280,8 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
goto cleanup;
}
- /*
- * Perform DAD, if needed.
- * XXX It may be of use, if we can administratively disable DAD.
- */
- if (in6if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) &&
- (ia->ia6_flags & IN6_IFF_TENTATIVE))
- {
+ /* Perform DAD, if the address is TENTATIVE. */
+ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
int mindelay, maxdelay;
delay = 0;
@@ -1619,6 +1614,10 @@ in6_purgeif(struct ifnet *ifp)
* in the future.
* RFC2373 defines interface id to be 64bit, but it allows non-RFC2374
* address encoding scheme. (see figure on page 8)
+ * Notifies other subsystems about address change/arrival:
+ * 1) Notifies the device handler on the first IPv6 address assignment
+ * 2) Handles routing table changes for P2P links and routes
+ * 3) Handles routing table changes for the address's host route
*/
static int
in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
@@ -2389,13 +2388,13 @@ in6if_do_dad(struct ifnet *ifp)
* However, some interfaces can be up before the RUNNING
* status. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
- * We simply skip DAD in such a case as a work around.
- * XXX: we should rather mark "tentative" on such addresses,
- * and do DAD after the interface becomes ready.
+ * This function returns EAGAIN in that case.
+ * The caller should mark "tentative" on the address instead of
+ * performing DAD immediately.
*/
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
- return (0);
+ return (EAGAIN);
return (1);
}
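
Since EAGAIN is nonzero, existing boolean callers of in6if_do_dad() still
treat the "interface not yet running" case as "DAD wanted"; the in6.c hunk
above relies on exactly that by marking the address tentative and letting
nd6_dad_start() defer the probe. A sketch of the caller pattern, under
that assumption:

	/* Mark tentative; nd6_dad_start() defers DAD until the
	 * interface is IFF_UP and IFF_DRV_RUNNING. */
	if (in6if_do_dad(ifp))
		ia->ia6_flags |= IN6_IFF_TENTATIVE;
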
diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c
index 5196101..2381dd3 100644
--- a/sys/netinet6/in6_ifattach.c
+++ b/sys/netinet6/in6_ifattach.c
@@ -595,12 +595,6 @@ in6_ifattach_loopback(struct ifnet *ifp)
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
- /* we don't need to perform DAD on loopback interfaces. */
- ifra.ifra_flags |= IN6_IFF_NODAD;
-
- /* skip registration to the prefix list. XXX should be temporary. */
- ifra.ifra_flags |= IN6_IFF_NOPFX;
-
/*
* We are sure that this is a newly assigned address, so we can set
* NULL to the 3rd arg.
diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h
index e0f337f..30dd1e8 100644
--- a/sys/netinet6/in6_var.h
+++ b/sys/netinet6/in6_var.h
@@ -447,11 +447,6 @@ struct in6_rrenumreq {
#define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq)
-#define SIOCGDRLST_IN6 _IOWR('i', 74, struct in6_drlist)
-#ifdef _KERNEL
-/* XXX: SIOCGPRLST_IN6 is exposed in KAME but in6_oprlist is not. */
-#define SIOCGPRLST_IN6 _IOWR('i', 75, struct in6_oprlist)
-#endif
#ifdef _KERNEL
#define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq)
#endif
@@ -499,14 +494,11 @@ struct in6_rrenumreq {
#define IN6_IFF_DETACHED 0x08 /* may be detached from the link */
#define IN6_IFF_DEPRECATED 0x10 /* deprecated address */
#define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address
- * (used only at first SIOC* call)
+ * (obsolete)
*/
#define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */
#define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */
#define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */
-#define IN6_IFF_NOPFX 0x8000 /* skip kernel prefix management.
- * XXX: this should be temporary.
- */
/* do not input/output */
#define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)
diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c
index 94add9a..d2aa188 100644
--- a/sys/netinet6/nd6.c
+++ b/sys/netinet6/nd6.c
@@ -1246,99 +1246,14 @@ nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
- struct in6_drlist *drl = (struct in6_drlist *)data;
- struct in6_oprlist *oprl = (struct in6_oprlist *)data;
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
- struct nd_defrouter *dr;
- struct nd_prefix *pr;
- int i = 0, error = 0;
+ int error = 0;
if (ifp->if_afdata[AF_INET6] == NULL)
return (EPFNOSUPPORT);
switch (cmd) {
- case SIOCGDRLST_IN6:
- /*
- * obsolete API, use sysctl under net.inet6.icmp6
- */
- bzero(drl, sizeof(*drl));
- TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
- if (i >= DRLSTSIZ)
- break;
- drl->defrouter[i].rtaddr = dr->rtaddr;
- in6_clearscope(&drl->defrouter[i].rtaddr);
-
- drl->defrouter[i].flags = dr->flags;
- drl->defrouter[i].rtlifetime = dr->rtlifetime;
- drl->defrouter[i].expire = dr->expire +
- (time_second - time_uptime);
- drl->defrouter[i].if_index = dr->ifp->if_index;
- i++;
- }
- break;
- case SIOCGPRLST_IN6:
- /*
- * obsolete API, use sysctl under net.inet6.icmp6
- *
- * XXX the structure in6_prlist was changed in backward-
- * incompatible manner. in6_oprlist is used for SIOCGPRLST_IN6,
- * in6_prlist is used for nd6_sysctl() - fill_prlist().
- */
- /*
- * XXX meaning of fields, especialy "raflags", is very
- * differnet between RA prefix list and RR/static prefix list.
- * how about separating ioctls into two?
- */
- bzero(oprl, sizeof(*oprl));
- LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
- struct nd_pfxrouter *pfr;
- int j;
-
- if (i >= PRLSTSIZ)
- break;
- oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr;
- oprl->prefix[i].raflags = pr->ndpr_raf;
- oprl->prefix[i].prefixlen = pr->ndpr_plen;
- oprl->prefix[i].vltime = pr->ndpr_vltime;
- oprl->prefix[i].pltime = pr->ndpr_pltime;
- oprl->prefix[i].if_index = pr->ndpr_ifp->if_index;
- if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
- oprl->prefix[i].expire = 0;
- else {
- time_t maxexpire;
-
- /* XXX: we assume time_t is signed. */
- maxexpire = (-1) &
- ~((time_t)1 <<
- ((sizeof(maxexpire) * 8) - 1));
- if (pr->ndpr_vltime <
- maxexpire - pr->ndpr_lastupdate) {
- oprl->prefix[i].expire =
- pr->ndpr_lastupdate +
- pr->ndpr_vltime +
- (time_second - time_uptime);
- } else
- oprl->prefix[i].expire = maxexpire;
- }
-
- j = 0;
- LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
- if (j < DRLSTSIZ) {
-#define RTRADDR oprl->prefix[i].advrtr[j]
- RTRADDR = pfr->router->rtaddr;
- in6_clearscope(&RTRADDR);
-#undef RTRADDR
- }
- j++;
- }
- oprl->prefix[i].advrtrs = j;
- oprl->prefix[i].origin = PR_ORIG_RA;
-
- i++;
- }
-
- break;
case OSIOCGIFINFO_IN6:
#define ND ndi->ndi
/* XXX: old ndp(8) assumes a positive value for linkmtu. */
@@ -1398,22 +1313,19 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
* do not clear ND6_IFF_IFDISABLED.
* See RFC 4862, Section 5.4.5.
*/
- int duplicated_linklocal = 0;
-
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
- IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
- duplicated_linklocal = 1;
+ IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
- }
}
IF_ADDR_RUNLOCK(ifp);
- if (duplicated_linklocal) {
+ if (ifa != NULL) {
+ /* LLA is duplicated. */
ND.flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "Cannot enable an interface"
" with a link-local address marked"
@@ -1429,14 +1341,18 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
/* Mark all IPv6 address as tentative. */
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (ifa->ifa_addr->sa_family != AF_INET6)
- continue;
- ia = (struct in6_ifaddr *)ifa;
- ia->ia6_flags |= IN6_IFF_TENTATIVE;
+ if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) {
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead,
+ ifa_link) {
+ if (ifa->ifa_addr->sa_family !=
+ AF_INET6)
+ continue;
+ ia = (struct in6_ifaddr *)ifa;
+ ia->ia6_flags |= IN6_IFF_TENTATIVE;
+ }
+ IF_ADDR_RUNLOCK(ifp);
}
- IF_ADDR_RUNLOCK(ifp);
}
if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
@@ -1454,20 +1370,19 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
* address is assigned, and IFF_UP, try to
* assign one.
*/
- int haslinklocal = 0;
-
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (ifa->ifa_addr->sa_family != AF_INET6)
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead,
+ ifa_link) {
+ if (ifa->ifa_addr->sa_family !=
+ AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
- if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
- haslinklocal = 1;
+ if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
- }
}
IF_ADDR_RUNLOCK(ifp);
- if (!haslinklocal)
+ if (ifa == NULL)
+ /* No LLA is configured. */
in6_ifattach(ifp, NULL);
}
}
diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c
index 2272cd0..8f287dc 100644
--- a/sys/netinet6/nd6_nbr.c
+++ b/sys/netinet6/nd6_nbr.c
@@ -576,7 +576,7 @@ nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6,
/*
* Add a Nonce option (RFC 3971) to detect looped back NS messages.
* This behavior is documented as Enhanced Duplicate Address
- * Detection in draft-ietf-6man-enhanced-dad-13.
+ * Detection in RFC 7527.
* net.inet6.ip6.dad_enhanced=0 disables this.
*/
if (V_dad_enhanced != 0 && nonce != NULL) {
@@ -1308,11 +1308,16 @@ nd6_dad_start(struct ifaddr *ifa, int delay)
}
if (ifa->ifa_ifp == NULL)
panic("nd6_dad_start: ifa->ifa_ifp == NULL");
- if (!(ifa->ifa_ifp->if_flags & IFF_UP)) {
+ if (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) {
+ ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
return;
}
- if (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED)
+ if (!(ifa->ifa_ifp->if_flags & IFF_UP) ||
+ !(ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
+ (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED)) {
+ ia->ia6_flags |= IN6_IFF_TENTATIVE;
return;
+ }
if ((dp = nd6_dad_find(ifa, NULL)) != NULL) {
/* DAD already in progress */
nd6_dad_rele(dp);
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index ef66064..6a6a10f 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -1432,7 +1432,8 @@ retry:
static mmu_t installed_mmu;
static void *
-moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
+ int wait)
{
/*
* This entire routine is a horrible hack to avoid bothering kmem
diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c
index 9d60b2b..89cfabf 100644
--- a/sys/powerpc/aim/slb.c
+++ b/sys/powerpc/aim/slb.c
@@ -473,7 +473,7 @@ slb_insert_user(pmap_t pm, struct slb *slb)
}
static void *
-slb_uma_real_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
static vm_offset_t realmax = 0;
void *va;
diff --git a/sys/powerpc/aim/uma_machdep.c b/sys/powerpc/aim/uma_machdep.c
index 255826e..33674e5 100644
--- a/sys/powerpc/aim/uma_machdep.c
+++ b/sys/powerpc/aim/uma_machdep.c
@@ -50,7 +50,7 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0,
"UMA MD pages in use");
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
void *va;
vm_page_t m;
@@ -82,7 +82,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c
index 779b953..60e4a2f 100644
--- a/sys/sparc64/sparc64/vm_machdep.c
+++ b/sys/sparc64/sparc64/vm_machdep.c
@@ -502,7 +502,7 @@ swi_vm(void *v)
}
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
vm_paddr_t pa;
vm_page_t m;
@@ -540,7 +540,7 @@ uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
diff --git a/sys/sys/busdma_bufalloc.h b/sys/sys/busdma_bufalloc.h
index f5ec32f..dfcb4b8 100644
--- a/sys/sys/busdma_bufalloc.h
+++ b/sys/sys/busdma_bufalloc.h
@@ -110,9 +110,10 @@ struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba,
* routines support pmap_page_set_memattr() and the VM_MEMATTR_UNCACHEABLE flag
* you can probably use these when you need uncacheable buffers.
*/
-void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size,
- u_int8_t *pflag, int wait);
-void busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag);
+void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
+ uint8_t *pflag, int wait);
+void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size,
+ uint8_t pflag);
#endif /* _MACHINE_BUSDMA_BUFALLOC_H_ */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index fe4c9ea..eb691b8 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -600,6 +600,7 @@ struct vnode;
typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
/* cache_* may belong in namei.h. */
+void cache_changesize(int newhashsize);
#define cache_enter(dvp, vp, cnp) \
cache_enter_time(dvp, vp, cnp, NULL, NULL)
void cache_enter_time(struct vnode *dvp, struct vnode *vp,
@@ -836,6 +837,7 @@ int fifo_printinfo(struct vnode *);
/* vfs_hash.c */
typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
+void vfs_hash_changesize(int newhashsize);
int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
u_int vfs_hash_index(struct vnode *vp);
int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
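
These two hooks appear intended for resizing the name cache and the vfs
hash table when the vnode limit changes at runtime, presumably from the
kern.maxvnodes sysctl handler. A hypothetical userland sketch of
triggering such a resize:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <err.h>

	int newmax = 500000;	/* hypothetical new vnode limit */

	if (sysctlbyname("kern.maxvnodes", NULL, NULL,
	    &newmax, sizeof(newmax)) == -1)
		err(1, "kern.maxvnodes");
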
diff --git a/sys/teken/teken.c b/sys/teken/teken.c
index 3002a88..8834390 100644
--- a/sys/teken/teken.c
+++ b/sys/teken/teken.c
@@ -29,12 +29,14 @@
#include <sys/cdefs.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/param.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/systm.h>
#define teken_assert(x) MPASS(x)
#else /* !(__FreeBSD__ && _KERNEL) */
#include <sys/types.h>
#include <assert.h>
+#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
@@ -405,18 +407,24 @@ teken_state_numbers(teken_t *t, teken_char_t c)
teken_assert(t->t_curnum < T_NUMSIZE);
if (c >= '0' && c <= '9') {
- /*
- * Don't do math with the default value of 1 when a
- * custom number is inserted.
- */
if (t->t_stateflags & TS_FIRSTDIGIT) {
+ /* First digit. */
t->t_stateflags &= ~TS_FIRSTDIGIT;
- t->t_nums[t->t_curnum] = 0;
- } else {
- t->t_nums[t->t_curnum] *= 10;
+ t->t_nums[t->t_curnum] = c - '0';
+ } else if (t->t_nums[t->t_curnum] < UINT_MAX / 100) {
+ /*
+ * There is no need to continue parsing input
+ * once the value exceeds the size of the
+ * terminal. It would only allow for integer
+ * overflows when performing arithmetic on the
+ * cursor position.
+ *
+ * Ignore any further digits if the value is
+ * already UINT_MAX / 100.
+ */
+ t->t_nums[t->t_curnum] =
+ t->t_nums[t->t_curnum] * 10 + c - '0';
}
-
- t->t_nums[t->t_curnum] += c - '0';
return (1);
} else if (c == ';') {
if (t->t_stateflags & TS_FIRSTDIGIT)
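
The rewritten digit accumulation caps parsed CSI parameters before the
multiply-and-add step so the arithmetic can never overflow an unsigned
int. The same pattern in isolation (a standalone sketch, not teken code):

	#include <limits.h>

	/* Accumulate one decimal digit, ignoring further digits once
	 * the value has reached UINT_MAX / 100. */
	static unsigned int
	add_digit(unsigned int num, char c)
	{

		if (num < UINT_MAX / 100)
			num = num * 10 + (c - '0');
		return (num);
	}
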
diff --git a/sys/teken/teken_subr.h b/sys/teken/teken_subr.h
index 2eee627..d458f2a 100644
--- a/sys/teken/teken_subr.h
+++ b/sys/teken/teken_subr.h
@@ -324,13 +324,13 @@ static void
teken_subr_cursor_position(teken_t *t, unsigned int row, unsigned int col)
{
- t->t_cursor.tp_row = t->t_originreg.ts_begin + row - 1;
- if (t->t_cursor.tp_row >= t->t_originreg.ts_end)
- t->t_cursor.tp_row = t->t_originreg.ts_end - 1;
+ row = row - 1 + t->t_originreg.ts_begin;
+ t->t_cursor.tp_row = row < t->t_originreg.ts_end ?
+ row : t->t_originreg.ts_end - 1;
- t->t_cursor.tp_col = col - 1;
- if (t->t_cursor.tp_col >= t->t_winsize.tp_col)
- t->t_cursor.tp_col = t->t_winsize.tp_col - 1;
+ col--;
+ t->t_cursor.tp_col = col < t->t_winsize.tp_col ?
+ col : t->t_winsize.tp_col - 1;
t->t_stateflags &= ~TS_WRAPPED;
teken_funcs_cursor(t);
@@ -583,9 +583,9 @@ static void
teken_subr_horizontal_position_absolute(teken_t *t, unsigned int col)
{
- t->t_cursor.tp_col = col - 1;
- if (t->t_cursor.tp_col >= t->t_winsize.tp_col)
- t->t_cursor.tp_col = t->t_winsize.tp_col - 1;
+ col--;
+ t->t_cursor.tp_col = col < t->t_winsize.tp_col ?
+ col : t->t_winsize.tp_col - 1;
t->t_stateflags &= ~TS_WRAPPED;
teken_funcs_cursor(t);
@@ -1297,9 +1297,9 @@ static void
teken_subr_vertical_position_absolute(teken_t *t, unsigned int row)
{
- t->t_cursor.tp_row = t->t_originreg.ts_begin + row - 1;
- if (t->t_cursor.tp_row >= t->t_originreg.ts_end)
- t->t_cursor.tp_row = t->t_originreg.ts_end - 1;
+ row = row - 1 + t->t_originreg.ts_begin;
+ t->t_cursor.tp_row = row < t->t_originreg.ts_end ?
+ row : t->t_originreg.ts_end - 1;
t->t_stateflags &= ~TS_WRAPPED;
teken_funcs_cursor(t);
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 5ac391e..4e2c9ea 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -923,8 +923,7 @@ static int journal_unsuspend(struct ufsmount *ump);
static void softdep_prelink(struct vnode *, struct vnode *);
static void add_to_journal(struct worklist *);
static void remove_from_journal(struct worklist *);
-static bool softdep_excess_inodes(struct ufsmount *);
-static bool softdep_excess_dirrem(struct ufsmount *);
+static bool softdep_excess_items(struct ufsmount *, int);
static void softdep_process_journal(struct mount *, struct worklist *, int);
static struct jremref *newjremref(struct dirrem *, struct inode *,
struct inode *ip, off_t, nlink_t);
@@ -2212,7 +2211,7 @@ inodedep_lookup(mp, inum, flags, inodedeppp)
* responsible for more than our share of that usage and
* we are not in a rush, request some inodedep cleanup.
*/
- if (softdep_excess_inodes(ump))
+ if (softdep_excess_items(ump, D_INODEDEP))
schedule_cleanup(mp);
else
FREE_LOCK(ump);
@@ -2307,7 +2306,12 @@ newblk_lookup(mp, newblkno, flags, newblkpp)
return (1);
if ((flags & DEPALLOC) == 0)
return (0);
- FREE_LOCK(ump);
+ if (softdep_excess_items(ump, D_NEWBLK) ||
+ softdep_excess_items(ump, D_ALLOCDIRECT) ||
+ softdep_excess_items(ump, D_ALLOCINDIR))
+ schedule_cleanup(mp);
+ else
+ FREE_LOCK(ump);
newblk = malloc(sizeof(union allblk), M_NEWBLK,
M_SOFTDEP_FLAGS | M_ZERO);
workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
@@ -2406,7 +2410,11 @@ softdep_initialize()
{
TAILQ_INIT(&softdepmounts);
+#ifdef __LP64__
max_softdeps = desiredvnodes * 4;
+#else
+ max_softdeps = desiredvnodes * 2;
+#endif
/* initialise bioops hack */
bioops.io_start = softdep_disk_io_initiation;
@@ -9106,7 +9114,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
* the number of freefile and freeblks structures.
*/
ACQUIRE_LOCK(ip->i_ump);
- if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump))
+ if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM))
schedule_cleanup(ITOV(dp)->v_mount);
else
FREE_LOCK(ip->i_ump);
@@ -13244,20 +13252,12 @@ retry:
}
static bool
-softdep_excess_inodes(struct ufsmount *ump)
-{
-
- return (dep_current[D_INODEDEP] > max_softdeps &&
- ump->softdep_curdeps[D_INODEDEP] > max_softdeps /
- stat_flush_threads);
-}
-
-static bool
-softdep_excess_dirrem(struct ufsmount *ump)
+softdep_excess_items(struct ufsmount *ump, int item)
{
- return (dep_current[D_DIRREM] > max_softdeps / 2 &&
- ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) /
+ KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
+ return (dep_current[item] > max_softdeps &&
+ ump->softdep_curdeps[item] > max_softdeps /
stat_flush_threads);
}
@@ -13313,15 +13313,25 @@ softdep_ast_cleanup_proc(void)
for (;;) {
req = false;
ACQUIRE_LOCK(ump);
- if (softdep_excess_inodes(ump)) {
+ if (softdep_excess_items(ump, D_INODEDEP)) {
req = true;
request_cleanup(mp, FLUSH_INODES);
}
- if (softdep_excess_dirrem(ump)) {
+ if (softdep_excess_items(ump, D_DIRREM)) {
req = true;
request_cleanup(mp, FLUSH_BLOCKS);
}
FREE_LOCK(ump);
+ if (softdep_excess_items(ump, D_NEWBLK) ||
+ softdep_excess_items(ump, D_ALLOCDIRECT) ||
+ softdep_excess_items(ump, D_ALLOCINDIR)) {
+ error = vn_start_write(NULL, &mp, V_WAIT);
+ if (error == 0) {
+ req = true;
+ VFS_SYNC(mp, MNT_WAIT);
+ vn_finished_write(mp);
+ }
+ }
if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
break;
}
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index c09dbc2..4deebd9 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -2354,8 +2354,8 @@ swapoff_one(struct swdevt *sp, struct ucred *cred)
swap_pager_swapoff(sp);
sp->sw_close(curthread, sp);
- sp->sw_id = NULL;
mtx_lock(&sw_dev_mtx);
+ sp->sw_id = NULL;
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
if (nswapdev == 0) {
@@ -2541,13 +2541,39 @@ swapgeom_close_ev(void *arg, int flags)
g_destroy_consumer(cp);
}
+/*
+ * Add a reference to the g_consumer for an inflight transaction.
+ */
+static void
+swapgeom_acquire(struct g_consumer *cp)
+{
+
+ mtx_assert(&sw_dev_mtx, MA_OWNED);
+ cp->index++;
+}
+
+/*
+ * Remove a reference from the g_consumer. Post a close event if
+ * all references go away.
+ */
+static void
+swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
+{
+
+ mtx_assert(&sw_dev_mtx, MA_OWNED);
+ cp->index--;
+ if (cp->index == 0) {
+ if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
+ sp->sw_id = NULL;
+ }
+}
+
static void
swapgeom_done(struct bio *bp2)
{
struct swdevt *sp;
struct buf *bp;
struct g_consumer *cp;
- int destroy;
bp = bp2->bio_caller2;
cp = bp2->bio_from;
@@ -2557,16 +2583,11 @@ swapgeom_done(struct bio *bp2)
bp->b_resid = bp->b_bcount - bp2->bio_completed;
bp->b_error = bp2->bio_error;
bufdone(bp);
+ sp = bp2->bio_caller1;
mtx_lock(&sw_dev_mtx);
- destroy = ((--cp->index) == 0 && cp->private);
- if (destroy) {
- sp = bp2->bio_caller1;
- sp->sw_id = NULL;
- }
+ swapgeom_release(cp, sp);
mtx_unlock(&sw_dev_mtx);
g_destroy_bio(bp2);
- if (destroy)
- g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
}
static void
@@ -2584,13 +2605,16 @@ swapgeom_strategy(struct buf *bp, struct swdevt *sp)
bufdone(bp);
return;
}
- cp->index++;
+ swapgeom_acquire(cp);
mtx_unlock(&sw_dev_mtx);
if (bp->b_iocmd == BIO_WRITE)
bio = g_new_bio();
else
bio = g_alloc_bio();
if (bio == NULL) {
+ mtx_lock(&sw_dev_mtx);
+ swapgeom_release(cp, sp);
+ mtx_unlock(&sw_dev_mtx);
bp->b_error = ENOMEM;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
@@ -2630,7 +2654,12 @@ swapgeom_orphan(struct g_consumer *cp)
break;
}
}
- cp->private = (void *)(uintptr_t)1;
+ /*
+ * Drop the reference we were created with. Do it directly since we're in a
+ * special context where we don't have to queue the call to
+ * swapgeom_close_ev().
+ */
+ cp->index--;
destroy = ((sp != NULL) && (cp->index == 0));
if (destroy)
sp->sw_id = NULL;
@@ -2691,8 +2720,8 @@ swapongeom_ev(void *arg, int flags)
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap");
cp = g_new_consumer(gp);
- cp->index = 0; /* Number of active I/Os. */
- cp->private = NULL; /* Orphanization flag */
+ cp->index = 1; /* Number of active I/Os, plus one for being active. */
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
/*
* XXX: Everytime you think you can improve the margin for
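
The acquire/release pair above turns cp->index into an I/O reference count
protected by sw_dev_mtx: every bio takes a reference before being issued
and drops it on completion (or on the ENOMEM path), and the final release
posts swapgeom_close_ev(). A sketch of the invariant, using the names from
the hunks above:

	mtx_lock(&sw_dev_mtx);
	swapgeom_acquire(cp);		/* before handing the bio to GEOM */
	mtx_unlock(&sw_dev_mtx);
	/* ... I/O in flight ... */
	mtx_lock(&sw_dev_mtx);
	swapgeom_release(cp, sp);	/* completion or error path; the
					 * last release schedules close */
	mtx_unlock(&sw_dev_mtx);
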
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index ed69e19..d3e0658 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -382,7 +382,8 @@ uma_zfree(uma_zone_t zone, void *item)
* A pointer to the allocated memory or NULL on failure.
*/
-typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
+ int wait);
/*
* Backend page free routines
@@ -395,7 +396,7 @@ typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait);
* Returns:
* None
*/
-typedef void (*uma_free)(void *item, int size, uint8_t pflag);
+typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
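
Any custom backend allocator installed with uma_zone_set_allocf() and
uma_zone_set_freef() must now match the vm_size_t signatures. A hedged
sketch modeled on the stock page_alloc()/page_free() pair (the mydrv_*
names are hypothetical):

	static void *
	mydrv_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
	{

		*pflag = UMA_SLAB_KMEM;
		return ((void *)kmem_malloc(kmem_arena, bytes, wait));
	}

	static void
	mydrv_free(void *mem, vm_size_t size, uint8_t flags)
	{

		kmem_free(kmem_arena, (vm_offset_t)mem, size);
	}

	uma_zone_set_allocf(zone, mydrv_alloc);
	uma_zone_set_freef(zone, mydrv_free);
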
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index d0df901..ee0b207 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -229,10 +229,10 @@ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
/* Prototypes.. */
-static void *noobj_alloc(uma_zone_t, int, uint8_t *, int);
-static void *page_alloc(uma_zone_t, int, uint8_t *, int);
-static void *startup_alloc(uma_zone_t, int, uint8_t *, int);
-static void page_free(void *, int, uint8_t);
+static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1038,7 +1038,7 @@ out:
* the VM is ready.
*/
static void *
-startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
uma_keg_t keg;
uma_slab_t tmps;
@@ -1098,7 +1098,7 @@ startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
* NULL if M_NOWAIT is set.
*/
static void *
-page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
void *p; /* Returned page */
@@ -1120,7 +1120,7 @@ page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
* NULL if M_NOWAIT is set.
*/
static void *
-noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{
TAILQ_HEAD(, vm_page) alloctail;
u_long npages;
@@ -1183,7 +1183,7 @@ noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
* Nothing
*/
static void
-page_free(void *mem, int size, uint8_t flags)
+page_free(void *mem, vm_size_t size, uint8_t flags)
{
struct vmem *vmem;
@@ -3269,7 +3269,7 @@ uma_zone_exhausted_nolock(uma_zone_t zone)
}
void *
-uma_large_malloc(int size, int wait)
+uma_large_malloc(vm_size_t size, int wait)
{
void *mem;
uma_slab_t slab;
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 1ffc7d5..ad2a405 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -341,7 +341,7 @@ zone_first_keg(uma_zone_t zone)
#ifdef _KERNEL
/* Internal prototypes */
static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
-void *uma_large_malloc(int size, int wait);
+void *uma_large_malloc(vm_size_t size, int wait);
void uma_large_free(uma_slab_t slab);
/* Lock Macros */
@@ -424,8 +424,9 @@ vsetslab(vm_offset_t va, uma_slab_t slab)
* if they can provide more efficient allocation functions. This is useful
* for using direct mapped addresses.
*/
-void *uma_small_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait);
-void uma_small_free(void *mem, int size, uint8_t flags);
+void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
+ int wait);
+void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
#endif /* _KERNEL */
#endif /* VM_UMA_INT_H */
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 93db8d1..46e3089 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -3989,12 +3989,10 @@ RetryLookup:;
vm_map_unlock_read(map);
return (KERN_PROTECTION_FAILURE);
}
- if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
- (entry->eflags & MAP_ENTRY_COW) &&
- (fault_type & VM_PROT_WRITE)) {
- vm_map_unlock_read(map);
- return (KERN_PROTECTION_FAILURE);
- }
+ KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
+ (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
+ (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
+ ("entry %p flags %x", entry, entry->eflags));
if ((fault_typea & VM_PROT_COPY) != 0 &&
(entry->max_protection & VM_PROT_WRITE) == 0 &&
(entry->eflags & MAP_ENTRY_COW) == 0) {
@@ -4148,10 +4146,6 @@ vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
if ((fault_type & prot) != fault_type)
return (KERN_PROTECTION_FAILURE);
- if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
- (entry->eflags & MAP_ENTRY_COW) &&
- (fault_type & VM_PROT_WRITE))
- return (KERN_PROTECTION_FAILURE);
/*
* If this page is not pageable, we have to get it for all possible
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 998cd37..6a56fd7 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -93,6 +93,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
+#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
@@ -170,7 +171,7 @@ static int vm_pageout_update_period;
static int defer_swap_pageouts;
static int disable_swap_pageouts;
static int lowmem_period = 10;
-static int lowmem_ticks;
+static time_t lowmem_uptime;
#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
@@ -932,7 +933,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
* some. We rate limit to avoid thrashing.
*/
if (vmd == &vm_dom[0] && pass > 0 &&
- (ticks - lowmem_ticks) / hz >= lowmem_period) {
+ (time_uptime - lowmem_uptime) >= lowmem_period) {
/*
* Decrease registered cache sizes.
*/
@@ -943,7 +944,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
* drained above.
*/
uma_reclaim();
- lowmem_ticks = ticks;
+ lowmem_uptime = time_uptime;
}
/*
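
The switch from the hz-based "ticks" counter to time_uptime removes a wrap
hazard: ticks is a signed int that rolls over after roughly 24.8 days at
hz=1000, while time_uptime is a monotonic count of seconds since boot. The
rate-limit pattern in isolation (a sketch, not the exact kernel code):

	static time_t last;

	if (time_uptime - last >= lowmem_period) {
		/* ... run the expensive low-memory reclaim ... */
		last = time_uptime;
	}
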
diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h
index b725776..9c73ae7 100644
--- a/sys/xen/interface/io/blkif.h
+++ b/sys/xen/interface/io/blkif.h
@@ -97,6 +97,28 @@
*
* The type of the backing device/object.
*
+ *
+ * direct-io-safe
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * The underlying storage is not affected by the direct IO memory
+ * lifetime bug. See:
+ * http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html
+ *
+ * Therefore this option gives the backend permission to use
+ * O_DIRECT, notwithstanding that bug.
+ *
+ * That is, if this option is enabled, use of O_DIRECT is safe,
+ * in circumstances where we would normally have avoided it as a
+ * workaround for that bug. This option is not relevant for all
+ * backends, and not necessarily supported even by those for
+ * which it is relevant. A backend which knows that it is not
+ * affected by the bug can ignore this option.
+ *
+ * This option doesn't require a backend to use O_DIRECT, so it
+ * should not be used to try to control the caching behaviour.
+ *
*--------------------------------- Features ---------------------------------
*
* feature-barrier
@@ -126,6 +148,34 @@
* of this type may still be returned at any time with the
* BLKIF_RSP_EOPNOTSUPP result code.
*
+ * feature-persistent
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ * Notes: 7
+ *
+ * A value of "1" indicates that the backend can keep the grants used
+ * by the frontend driver mapped, so the same set of grants should be
+ * used in all transactions. The maximum number of grants the backend
+ * can map persistently depends on the implementation, but ideally it
+ * should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this
+ * feature the backend doesn't need to unmap each grant, preventing
+ * costly TLB flushes. The backend driver should only map grants
+ * persistently if the frontend supports it. If a backend driver chooses
+ * to use the persistent protocol when the frontend doesn't support it,
+ * it will probably hit the maximum number of persistently mapped grants
+ * (due to the fact that the frontend won't be reusing the same grants),
+ * and fall back to non-persistent mode. Backend implementations may
+ * shrink or expand the number of persistently mapped grants without
+ * notifying the frontend depending on memory constraints (this might
+ * cause a performance degradation).
+ *
+ * If a backend driver wants to limit the maximum number of persistently
+ * mapped grants to a value less than RING_SIZE *
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST, an LRU strategy should be used to
+ * discard the grants that are less commonly used. Using an LRU in the
+ * backend driver paired with a LIFO queue in the frontend will
+ * allow us to have better performance in this scenario.
+ *
*----------------------- Request Transport Parameters ------------------------
*
* max-ring-page-order
@@ -145,33 +195,17 @@
* The maximum supported size of the request ring buffer in units of
* machine pages. The value must be a power of 2.
*
- * max-requests <uint32_t>
- * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
- * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
- *
- * The maximum number of concurrent, logical requests supported by
- * the backend.
- *
- * Note: A logical request may span multiple ring entries.
- *
- * max-request-segments
- * Values: <uint8_t>
- * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
- * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST
- *
- * The maximum value of blkif_request.nr_segments supported by
- * the backend.
- *
- * max-request-size
- * Values: <uint32_t>
- * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
- * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE
+ *------------------------- Backend Device Properties -------------------------
*
- * The maximum amount of data, in bytes, that can be referenced by a
- * request type that accesses frontend memory (currently BLKIF_OP_READ,
- * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ * discard-enable
+ * Values: 0/1 (boolean)
+ * Default Value: 1
*
- *------------------------- Backend Device Properties -------------------------
+ * This optional property, set by the toolstack, requests that the
+ * backend offer, or not offer, discard to the frontend. If the
+ * property is missing the backend should offer discard if the
+ * backing storage actually supports it.
*
* discard-alignment
* Values: <uint32_t>
@@ -192,6 +226,7 @@
* discard-secure
* Values: 0/1 (boolean)
* Default Value: 0
+ * Notes: 10
*
* A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
* requests with the BLKIF_DISCARD_SECURE flag set.
@@ -206,13 +241,17 @@
* sector-size
* Values: <uint32_t>
*
- * The size, in bytes, of the individually addressible data blocks
- * on the backend device.
+ * The logical sector size, in bytes, of the backend device.
+ *
+ * physical-sector-size
+ * Values: <uint32_t>
+ *
+ * The physical sector size, in bytes, of the backend device.
*
* sectors
* Values: <uint64_t>
*
- * The size of the backend device, expressed in units of its native
+ * The size of the backend device, expressed in units of its logical
* sector size ("sector-size").
*
*****************************************************************************
@@ -269,32 +308,26 @@
* The size of the frontend allocated request ring buffer in units of
* machine pages. The value must be a power of 2.
*
- * max-requests
- * Values: <uint32_t>
- * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
- * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
- *
- * The maximum number of concurrent, logical requests that will be
- * issued by the frontend.
- *
- * Note: A logical request may span multiple ring entries.
- *
- * max-request-segments
- * Values: <uint8_t>
- * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
- * Maximum Value: MIN(255, backend/max-request-segments)
- *
- * The maximum value the frontend will set in the
- * blkif_request.nr_segments field.
- *
- * max-request-size
- * Values: <uint32_t>
- * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
- * Maximum Value: max-request-segments * PAGE_SIZE
- *
- * The maximum amount of data, in bytes, that can be referenced by
- * a request type that accesses frontend memory (currently BLKIF_OP_READ,
- * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ * feature-persistent
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ * Notes: 7, 8, 9
+ *
+ * A value of "1" indicates that the frontend will reuse the same grants
+ * for all transactions, allowing the backend to map them with write
+ * access (even when it should be read-only). If the frontend hits the
+ * maximum number of allowed persistently mapped grants, it can fall
+ * back to non-persistent mode. This will cause a performance degradation,
+ * since the backend driver will still try to map those grants
+ * persistently. Since the persistent grants protocol is compatible with
+ * the previous protocol, a frontend driver can choose to work in
+ * persistent mode even when the backend doesn't support it.
+ *
+ * It is recommended that the frontend driver store the persistently
+ * mapped grants in a LIFO queue, so that a subset of all persistently
+ * mapped grants is commonly reused. This is done in case the backend driver
+ * decides to limit the maximum number of persistently mapped grants
+ * to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
*
*------------------------- Virtual Device Properties -------------------------
*
@@ -315,17 +348,23 @@
* -----
* (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
* PV drivers.
- * (2) Multi-page ring buffer scheme first used in some Red Hat distributions
+ * (2) Multi-page ring buffer scheme first used in some RedHat distributions
* including a distribution deployed on certain nodes of the Amazon
* EC2 cluster.
* (3) Support for multi-page ring buffers was implemented independently,
- * in slightly different forms, by both Citrix and Red Hat/Amazon.
+ * in slightly different forms, by both Citrix and RedHat/Amazon.
* For full interoperability, block front and backends should publish
* identical ring parameters, adjusted for unit differences, to the
* XenStore nodes used in both schemes.
- * (4) Devices that support discard functionality may internally allocate
- * space (discardable extents) in units that are larger than the
- * exported logical block size.
+ * (4) Devices that support discard functionality may internally allocate space
+ *     (discardable extents) in units that are larger than the exported logical
+ *     block size. If the backing device has such discardable extents the
+ *     backend should provide both discard-granularity and discard-alignment.
+ *     Providing just one of the two may be considered an error by the frontend.
+ *     Backends supporting discard should include discard-granularity and
+ *     discard-alignment even if they support discarding individual sectors.
+ *     Frontends should assume discard-alignment == 0 and discard-granularity
+ *     == sector size if these keys are missing (see the sketch below).
* (5) The discard-alignment parameter allows a physical device to be
* partitioned into virtual devices that do not necessarily begin or
* end on a discardable extent boundary.
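To make the defaulting rule in note (4) concrete, a frontend probing the
backend's XenStore directory might apply it as below. xs_read_u32() is a
hypothetical helper that returns 0 on success; it stands in for whatever
XenStore accessor the frontend actually uses:

        uint32_t discard_granularity, discard_alignment;

        /* Missing keys fall back to the defaults from note (4). */
        if (xs_read_u32(backend_path, "discard-granularity",
            &discard_granularity) != 0)
                discard_granularity = sector_size;  /* one logical sector */
        if (xs_read_u32(backend_path, "discard-alignment",
            &discard_alignment) != 0)
                discard_alignment = 0;              /* no offset */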
@@ -333,6 +372,19 @@
* 'ring-ref' is used to communicate the grant reference for this
* page to the backend. When using a multi-page ring, the 'ring-ref'
* node is not created. Instead 'ring-ref0' - 'ring-refN' are used.
+ * (7) When using persistent grants, data has to be copied from/to the page
+ *     where the grant is currently mapped. The overhead of this copy,
+ *     however, does not outweigh the speed improvement gained by not having
+ *     to unmap the grants.
+ * (8) The frontend driver has to allow the backend driver to map all grants
+ * with write access, even when they should be mapped read-only, since
+ * further requests may reuse these grants and require write permissions.
+ * (9) The Linux implementation doesn't impose a limit on the maximum number
+ *     of grants that can be persistently mapped in the frontend driver, but
+ *     due to the frontend driver implementation it should never exceed
+ *     RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.
+ *(10) The discard-secure property may be present and will be set to 1 if the
+ * backing device supports secure discard.
*/
/*
@@ -457,16 +509,48 @@
#define BLKIF_OP_DISCARD 5
/*
- * Maximum scatter/gather segments per request (header + segment blocks).
+ * Recognized if "feature-max-indirect-segments" is present in the backend
+ * xenbus info. The "feature-max-indirect-segments" node contains the maximum
+ * number of segments allowed by the backend per request. If the node is
+ * present, the frontend might use blkif_request_indirect structs in order to
+ * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
+ * maximum number of indirect segments is fixed by the backend, but the
+ * frontend can issue requests with any number of indirect segments as long
+ * as it does not exceed the number advertised by the backend. The
+ * indirect_grefs field
+ * in blkif_request_indirect should be filled by the frontend with the
+ * grant references of the pages that are holding the indirect segments.
+ * These pages are filled with an array of blkif_request_segment entries that
+ * hold the information about the segments. The number of indirect pages to
+ * use is
+ * determined by the number of segments an indirect request contains. Every
+ * indirect page can contain a maximum of
+ * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
+ * calculate the number of indirect pages to use we have to do
+ * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
+ *
+ * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
+ * create the "feature-max-indirect-segments" node!
+ */
+#define BLKIF_OP_INDIRECT 6
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
/*
+ * Maximum number of indirect pages to use per request.
+ */
+#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
+
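The ceil() expression above reduces to the usual integer round-up idiom. A
minimal sketch, assuming struct blkif_request_segment is already visible at
this point; the SEGS_PER_INDIRECT_PAGE name is hypothetical:

/* Segments that fit in one indirect page. */
#define SEGS_PER_INDIRECT_PAGE \
        (PAGE_SIZE / sizeof(struct blkif_request_segment))

static inline unsigned int
blkif_indirect_pages(unsigned int nr_segments)
{
        /* ceil(nr_segments / SEGS_PER_INDIRECT_PAGE), no floating point */
        return ((nr_segments + SEGS_PER_INDIRECT_PAGE - 1) /
            SEGS_PER_INDIRECT_PAGE);
}

With an 8-byte segment and 4 KiB pages this works out to 512 segments per
indirect page, so the 8-page cap above allows at most 4096 segments per
request.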
+/*
* NB. first_sect and last_sect in blkif_request_segment, as well as
* sector_number in blkif_request, are always expressed in 512-byte units.
* However they must be properly aligned to the real sector size of the
- * physical disk, which is reported in the "sector-size" node in the backend
- * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ * physical disk, which is reported in the "physical-sector-size" node in
+ * the backend xenbus info. Also the xenbus "sectors" node is expressed in
+ * 512-byte units.
*/
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
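As a small sketch of the unit rule above: a request's byte offset is
converted to 512-byte units, then checked against the disk's real sector
size. The helper and the phys_sector_size parameter (taken from the
"physical-sector-size" node) are hypothetical:

static int
blkif_check_alignment(uint64_t byte_offset, uint32_t phys_sector_size)
{
        blkif_sector_t sector_number = byte_offset / 512;

        /* e.g. a 4K-native disk needs sector_number in multiples of 8 */
        if ((sector_number % (phys_sector_size / 512)) != 0)
                return (EINVAL);    /* misaligned request */
        return (0);
}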
@@ -478,21 +562,6 @@ typedef struct blkif_request_segment blkif_request_segment_t;
/*
* Starting ring element for any I/O request.
- *
- * One or more segment blocks can be inserted into the request ring
- * just after a blkif_request_t, allowing requests to operate on
- * up to BLKIF_MAX_SEGMENTS_PER_REQUEST.
- *
- * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_requst.nr_segments
- * to determine the number of contiguous ring entries associated
- * with this request.
- *
- * Note: Due to the way Xen request rings operate, the producer and
- * consumer indices of the ring must be incremented by the
- * BLKIF_SEGS_TO_BLOCKS() value of the associated request.
- * (e.g. a response to a 3 ring entry request must also consume
- * 3 entries in the ring, even though only the first ring entry
- * in the response has any data.)
*/
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
@@ -519,6 +588,20 @@ struct blkif_request_discard {
};
typedef struct blkif_request_discard blkif_request_discard_t;
+struct blkif_request_indirect {
+ uint8_t operation; /* BLKIF_OP_INDIRECT */
+ uint8_t indirect_op; /* BLKIF_OP_{READ/WRITE} */
+ uint16_t nr_segments; /* number of segments */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ blkif_vdev_t handle; /* same as for read/write requests */
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+#ifdef __i386__
+ uint64_t pad; /* Make it 64 byte aligned on i386 */
+#endif
+};
+typedef struct blkif_request_indirect blkif_request_indirect_t;
+
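For illustration, a frontend might populate an indirect read as sketched
below. The helper name is hypothetical, the caller supplies the grant
references of pages already filled with blkif_request_segment arrays, and
nr_pages follows the ceil() rule described earlier:

static void
blkif_fill_indirect_read(struct blkif_request_indirect *req, uint64_t id,
    blkif_vdev_t handle, blkif_sector_t sector, uint16_t nr_segments,
    const grant_ref_t *page_grefs, unsigned int nr_pages)
{
        unsigned int i;

        req->operation = BLKIF_OP_INDIRECT;
        req->indirect_op = BLKIF_OP_READ;   /* the real operation */
        req->nr_segments = nr_segments;
        req->id = id;
        req->sector_number = sector;        /* in 512-byte units */
        req->handle = handle;
        /* One grant per indirect page actually used. */
        for (i = 0; i < nr_pages; i++)
                req->indirect_grefs[i] = page_grefs[i];
}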
struct blkif_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
@@ -550,7 +633,7 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
/*
* Local variables:
* mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil