author    Renato Botelho <renato@netgate.com>    2016-02-24 07:51:32 -0300
committer Renato Botelho <renato@netgate.com>    2016-02-24 07:51:32 -0300
commit    b15d3cfa0625b6816b5b55df864fbda78dc2add8 (patch)
tree      c384e7235e9894678587ee5782698526bdcc340e /sys
parent    7c17fc70241a215de420457e10a510834441b90f (diff)
parent    008df39fd8f9ba2311709c852fa30e39bf891bcf (diff)
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys')
-rw-r--r--  sys/compat/linprocfs/linprocfs.c              |   2
-rw-r--r--  sys/compat/linsysfs/linsysfs.c                |   2
-rw-r--r--  sys/dev/hyperv/netvsc/hv_net_vsc.c            |   2
-rw-r--r--  sys/dev/hyperv/netvsc/hv_net_vsc.h            |  26
-rw-r--r--  sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c | 663
-rw-r--r--  sys/dev/hyperv/netvsc/hv_rndis.h              |   1
-rw-r--r--  sys/dev/hyperv/netvsc/hv_rndis_filter.c       |  18
-rw-r--r--  sys/dev/hyperv/netvsc/hv_rndis_filter.h       |   1
-rw-r--r--  sys/dev/hyperv/vmbus/hv_channel_mgmt.c        |  21
-rw-r--r--  sys/dev/hyperv/vmbus/hv_connection.c          |  38
-rw-r--r--  sys/dev/hyperv/vmbus/hv_vmbus_priv.h          |  11
-rw-r--r--  sys/fs/tmpfs/tmpfs_vnops.c                    |   5
-rw-r--r--  sys/kern/kern_jail.c                          |  16
-rw-r--r--  sys/kern/vfs_subr.c                           | 322
-rw-r--r--  sys/sys/jail.h                                |   4
15 files changed, 698 insertions(+), 434 deletions(-)
diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c
index 6e591e9..9142c93 100644
--- a/sys/compat/linprocfs/linprocfs.c
+++ b/sys/compat/linprocfs/linprocfs.c
@@ -1514,7 +1514,7 @@ linprocfs_uninit(PFS_INIT_ARGS)
return (0);
}
-PSEUDOFS(linprocfs, 1, 0);
+PSEUDOFS(linprocfs, 1, PR_ALLOW_MOUNT_LINPROCFS);
#if defined(__amd64__)
MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1);
#else
diff --git a/sys/compat/linsysfs/linsysfs.c b/sys/compat/linsysfs/linsysfs.c
index 8b5f9b5..4f57526 100644
--- a/sys/compat/linsysfs/linsysfs.c
+++ b/sys/compat/linsysfs/linsysfs.c
@@ -274,7 +274,7 @@ linsysfs_uninit(PFS_INIT_ARGS)
return (0);
}
-PSEUDOFS(linsysfs, 1, 0);
+PSEUDOFS(linsysfs, 1, PR_ALLOW_MOUNT_LINSYSFS);
#if defined(__amd64__)
MODULE_DEPEND(linsysfs, linux_common, 1, 1, 1);
#else
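
The third argument to PSEUDOFS() in both hunks above is the jail permission bit checked when a jailed process tries to mount the filesystem. A minimal sketch of how such a bit is typically consumed via prison_allow(9); the wrapper function is hypothetical and the snippet assumes a kernel compilation context:

    #include <sys/param.h>
    #include <sys/errno.h>
    #include <sys/jail.h>
    #include <sys/proc.h>

    /* Hypothetical mount-time permission check. */
    static int
    example_mount_check(struct thread *td)
    {
            if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_LINPROCFS))
                    return (EPERM); /* jail lacks allow.mount.linprocfs */
            return (0);
    }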
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c
index a44c30d..64e7578 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.c
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c
@@ -1027,4 +1027,6 @@ hv_nv_on_channel_callback(void *context)
if (bufferlen > NETVSC_PACKET_SIZE)
free(buffer, M_NETVSC);
+
+ hv_rf_channel_rollup(net_dev);
}
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h
index 4e63b94..e684cc5 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.h
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h
@@ -38,12 +38,16 @@
#ifndef __HV_NET_VSC_H__
#define __HV_NET_VSC_H__
-#include <sys/types.h>
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/queue.h>
#include <sys/sx.h>
+#include <machine/bus.h>
+#include <sys/bus.h>
+#include <sys/bus_dma.h>
+
#include <netinet/in.h>
#include <netinet/tcp_lro.h>
@@ -984,6 +988,9 @@ typedef struct {
hv_bool_uint8_t link_state;
} netvsc_device_info;
+struct hn_txdesc;
+SLIST_HEAD(hn_txdesc_list, hn_txdesc);
+
/*
* Device-specific softc structure
*/
@@ -1002,6 +1009,18 @@ typedef struct hn_softc {
struct hv_device *hn_dev_obj;
netvsc_dev *net_dev;
+ int hn_txdesc_cnt;
+ struct hn_txdesc *hn_txdesc;
+ bus_dma_tag_t hn_tx_data_dtag;
+ bus_dma_tag_t hn_tx_rndis_dtag;
+ int hn_tx_chimney_size;
+ int hn_tx_chimney_max;
+
+ struct mtx hn_txlist_spin;
+ struct hn_txdesc_list hn_txlist;
+ int hn_txdesc_avail;
+ int hn_txeof;
+
struct lro_ctrl hn_lro;
int hn_lro_hiwat;
@@ -1013,6 +1032,11 @@ typedef struct hn_softc {
u_long hn_csum_trusted;
u_long hn_lro_tried;
u_long hn_small_pkts;
+ u_long hn_no_txdescs;
+ u_long hn_send_failed;
+ u_long hn_txdma_failed;
+ u_long hn_tx_collapsed;
+ u_long hn_tx_chimney;
} hn_softc_t;
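
The forward declaration plus SLIST_HEAD() above lets the header define the free-list type before struct hn_txdesc itself is laid out, because the list head only stores a pointer. The pattern in isolation, with hypothetical names:

    #include <sys/queue.h>

    struct node;                    /* an incomplete type is sufficient */
    SLIST_HEAD(node_list, node);    /* head holds only: struct node *slh_first */

    struct node {
            SLIST_ENTRY(node) link; /* embedded next pointer */
            int id;
    };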
diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
index f8ebd38..b3360ea 100644
--- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
@@ -129,6 +129,41 @@ __FBSDID("$FreeBSD$");
#define HV_NV_SC_PTR_OFFSET_IN_BUF 0
#define HV_NV_PACKET_OFFSET_IN_BUF 16
+/* YYY should get it from the underlying channel */
+#define HN_TX_DESC_CNT 512
+
+#define HN_RNDIS_MSG_LEN \
+ (sizeof(rndis_msg) + \
+ RNDIS_VLAN_PPI_SIZE + \
+ RNDIS_TSO_PPI_SIZE + \
+ RNDIS_CSUM_PPI_SIZE)
+#define HN_RNDIS_MSG_BOUNDARY PAGE_SIZE
+#define HN_RNDIS_MSG_ALIGN CACHE_LINE_SIZE
+
+#define HN_TX_DATA_BOUNDARY PAGE_SIZE
+#define HN_TX_DATA_MAXSIZE IP_MAXPACKET
+#define HN_TX_DATA_SEGSIZE PAGE_SIZE
+#define HN_TX_DATA_SEGCNT_MAX \
+ (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)
+
+struct hn_txdesc {
+ SLIST_ENTRY(hn_txdesc) link;
+ struct mbuf *m;
+ struct hn_softc *sc;
+ int refs;
+ uint32_t flags; /* HN_TXD_FLAG_ */
+ netvsc_packet netvsc_pkt; /* XXX to be removed */
+
+ bus_dmamap_t data_dmap;
+
+ bus_addr_t rndis_msg_paddr;
+ rndis_msg *rndis_msg;
+ bus_dmamap_t rndis_msg_dmap;
+};
+
+#define HN_TXD_FLAG_ONLIST 0x1
+#define HN_TXD_FLAG_DMAMAP 0x2
+
/*
* A unified flag for all outbound check sum flags is useful,
* and it helps avoiding unnecessary check sum calculation in
@@ -174,6 +209,16 @@ int hv_promisc_mode = 0; /* normal mode by default */
static int hn_trust_hosttcp = 0;
TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);
+#if __FreeBSD_version >= 1100045
+/* Limit TSO burst size */
+static int hn_tso_maxlen = 0;
+TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen);
+#endif
+
+/* Limit chimney send size */
+static int hn_tx_chimney_size = 0;
+TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size);
+
/*
* Forward declarations
*/
@@ -181,14 +226,17 @@ static void hn_stop(hn_softc_t *sc);
static void hn_ifinit_locked(hn_softc_t *sc);
static void hn_ifinit(void *xsc);
static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
-static int hn_start_locked(struct ifnet *ifp);
+static void hn_start_locked(struct ifnet *ifp);
static void hn_start(struct ifnet *ifp);
static int hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
#ifdef HN_LRO_HIWAT
static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
#endif
+static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_check_iplen(const struct mbuf *, int);
+static int hn_create_tx_ring(struct hn_softc *sc);
+static void hn_destroy_tx_ring(struct hn_softc *sc);
static __inline void
hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
@@ -318,10 +366,13 @@ netvsc_attach(device_t dev)
netvsc_device_info device_info;
hn_softc_t *sc;
int unit = device_get_unit(dev);
- struct ifnet *ifp;
+ struct ifnet *ifp = NULL;
struct sysctl_oid_list *child;
struct sysctl_ctx_list *ctx;
- int ret;
+ int error;
+#if __FreeBSD_version >= 1100045
+ int tso_maxlen;
+#endif
sc = device_get_softc(dev);
if (sc == NULL) {
@@ -334,6 +385,10 @@ netvsc_attach(device_t dev)
sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
sc->hn_trust_hosttcp = hn_trust_hosttcp;
+ error = hn_create_tx_ring(sc);
+ if (error)
+ goto failed;
+
NV_LOCK_INIT(sc, "NetVSCLock");
sc->hn_dev_obj = device_ctx;
@@ -381,12 +436,10 @@ netvsc_attach(device_t dev)
else
ifp->if_hwassist = CSUM_TCP | CSUM_TSO;
- ret = hv_rf_on_device_add(device_ctx, &device_info);
- if (ret != 0) {
- if_free(ifp);
+ error = hv_rf_on_device_add(device_ctx, &device_info);
+ if (error)
+ goto failed;
- return (ret);
- }
if (device_info.link_state == 0) {
sc->hn_carrier = 1;
}
@@ -400,8 +453,30 @@ netvsc_attach(device_t dev)
#endif
#endif /* INET || INET6 */
+#if __FreeBSD_version >= 1100045
+ tso_maxlen = hn_tso_maxlen;
+ if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
+ tso_maxlen = IP_MAXPACKET;
+
+ ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
+ ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
+ ifp->if_hw_tsomax = tso_maxlen -
+ (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+#endif
+
ether_ifattach(ifp, device_info.mac_addr);
+#if __FreeBSD_version >= 1100045
+ if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
+ ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
+#endif
+
+ sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
+ sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
+ if (hn_tx_chimney_size > 0 &&
+ hn_tx_chimney_size < sc->hn_tx_chimney_max)
+ sc->hn_tx_chimney_size = hn_tx_chimney_size;
+
ctx = device_get_sysctl_ctx(dev);
child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
@@ -429,6 +504,26 @@ netvsc_attach(device_t dev)
"# of TCP segements that we trust host's csum verification");
SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts",
CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs",
+ CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed",
+ CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed",
+ CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed",
+ CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney",
+ CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
+ CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
+ CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
+ CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
+ "Chimney send packet size upper boundary");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
+ CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
+ "I", "Chimney send packet size limit");
if (unit == 0) {
struct sysctl_ctx_list *dc_ctx;
@@ -446,9 +541,21 @@ netvsc_attach(device_t dev)
CTLFLAG_RD, &hn_trust_hosttcp, 0,
"Trust tcp segement verification on host side, "
"when csum info is missing (global setting)");
+ SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size",
+ CTLFLAG_RD, &hn_tx_chimney_size, 0,
+ "Chimney send packet size limit");
+#if __FreeBSD_version >= 1100045
+ SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen",
+ CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit");
+#endif
}
return (0);
+failed:
+ hn_destroy_tx_ring(sc);
+ if (ifp != NULL)
+ if_free(ifp);
+ return (error);
}
/*
@@ -480,6 +587,7 @@ netvsc_detach(device_t dev)
#if defined(INET) || defined(INET6)
tcp_lro_free(&sc->hn_lro);
#endif
+ hn_destroy_tx_ring(sc);
return (0);
}
@@ -493,6 +601,112 @@ netvsc_shutdown(device_t dev)
return (0);
}
+static __inline int
+hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
+ struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
+{
+ struct mbuf *m = *m_head;
+ int error;
+
+ error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap,
+ m, segs, nsegs, BUS_DMA_NOWAIT);
+ if (error == EFBIG) {
+ struct mbuf *m_new;
+
+ m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
+ if (m_new == NULL)
+ return ENOBUFS;
+ else
+ *m_head = m = m_new;
+ sc->hn_tx_collapsed++;
+
+ error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag,
+ txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
+ }
+ if (!error) {
+ bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap,
+ BUS_DMASYNC_PREWRITE);
+ txd->flags |= HN_TXD_FLAG_DMAMAP;
+ }
+ return error;
+}
+
+static __inline void
+hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd)
+{
+
+ if (txd->flags & HN_TXD_FLAG_DMAMAP) {
+ bus_dmamap_sync(sc->hn_tx_data_dtag,
+ txd->data_dmap, BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(sc->hn_tx_data_dtag,
+ txd->data_dmap);
+ txd->flags &= ~HN_TXD_FLAG_DMAMAP;
+ }
+}
+
+static __inline int
+hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
+{
+
+ KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
+ ("put an onlist txd %#x", txd->flags));
+
+ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
+ if (atomic_fetchadd_int(&txd->refs, -1) != 1)
+ return 0;
+
+ hn_txdesc_dmamap_unload(sc, txd);
+ if (txd->m != NULL) {
+ m_freem(txd->m);
+ txd->m = NULL;
+ }
+
+ txd->flags |= HN_TXD_FLAG_ONLIST;
+
+ mtx_lock_spin(&sc->hn_txlist_spin);
+ KASSERT(sc->hn_txdesc_avail >= 0 &&
+ sc->hn_txdesc_avail < sc->hn_txdesc_cnt,
+ ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail));
+ sc->hn_txdesc_avail++;
+ SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
+ mtx_unlock_spin(&sc->hn_txlist_spin);
+
+ return 1;
+}
+
+static __inline struct hn_txdesc *
+hn_txdesc_get(struct hn_softc *sc)
+{
+ struct hn_txdesc *txd;
+
+ mtx_lock_spin(&sc->hn_txlist_spin);
+ txd = SLIST_FIRST(&sc->hn_txlist);
+ if (txd != NULL) {
+ KASSERT(sc->hn_txdesc_avail > 0,
+ ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail));
+ sc->hn_txdesc_avail--;
+ SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
+ }
+ mtx_unlock_spin(&sc->hn_txlist_spin);
+
+ if (txd != NULL) {
+ KASSERT(txd->m == NULL && txd->refs == 0 &&
+ (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
+ txd->flags &= ~HN_TXD_FLAG_ONLIST;
+ txd->refs = 1;
+ }
+ return txd;
+}
+
+static __inline void
+hn_txdesc_hold(struct hn_txdesc *txd)
+{
+
+ /* 0->1 transition will never work */
+ KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
+ atomic_add_int(&txd->refs, 1);
+}
+
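
hn_txdesc_get(), hn_txdesc_hold() and hn_txdesc_put() together form a small reference-count protocol: get returns a descriptor holding one reference, hold pins it across a call that may complete asynchronously, and whichever put drops the last reference recycles the descriptor onto the free list. Condensed from the send path wired up further below (error handling omitted):

    txd = hn_txdesc_get(sc);                /* refs = 1 */
    /* ... build the RNDIS message and page buffers ... */
    hn_txdesc_hold(txd);                    /* refs = 2; survive an early completion */
    error = hv_nv_on_send(device_ctx, packet);
    if (!error)
            ETHER_BPF_MTAP(ifp, m_head);    /* txd->m is still valid here */
    hn_txdesc_put(sc, txd);                 /* drop our ref; completion drops the other */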
/*
* Send completion processing
*
@@ -503,34 +717,46 @@ netvsc_shutdown(device_t dev)
void
netvsc_xmit_completion(void *context)
{
- netvsc_packet *packet = (netvsc_packet *)context;
- struct mbuf *mb;
- uint8_t *buf;
+ netvsc_packet *packet = context;
+ struct hn_txdesc *txd;
+ struct hn_softc *sc;
- mb = (struct mbuf *)(uintptr_t)packet->compl.send.send_completion_tid;
- buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF;
+ txd = (struct hn_txdesc *)(uintptr_t)
+ packet->compl.send.send_completion_tid;
- free(buf, M_NETVSC);
+ sc = txd->sc;
+ sc->hn_txeof = 1;
+ hn_txdesc_put(sc, txd);
+}
- if (mb != NULL) {
- m_freem(mb);
- }
+void
+netvsc_channel_rollup(struct hv_device *device_ctx)
+{
+ struct hn_softc *sc = device_get_softc(device_ctx->device);
+ struct ifnet *ifp;
+
+ if (!sc->hn_txeof)
+ return;
+
+ sc->hn_txeof = 0;
+ ifp = sc->hn_ifp;
+ NV_LOCK(sc);
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ hn_start_locked(ifp);
+ NV_UNLOCK(sc);
}
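
netvsc_channel_rollup() pairs with the descriptor-shortage branch that hn_start_locked() gains below, forming the conventional IFF_DRV_OACTIVE flow-control handshake. Condensed from the surrounding hunks:

    /* transmit side: ran out of TX descriptors */
    if ((txd = hn_txdesc_get(sc)) == NULL) {
            sc->hn_no_txdescs++;
            IF_PREPEND(&ifp->if_snd, m_head);       /* requeue the packet */
            ifp->if_drv_flags |= IFF_DRV_OACTIVE;   /* stall the queue */
    }

    /* completion side: descriptors were recycled, restart TX */
    ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
    hn_start_locked(ifp);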
/*
* Start a transmit of one or more packets
*/
-static int
+static void
hn_start_locked(struct ifnet *ifp)
{
hn_softc_t *sc = ifp->if_softc;
struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
netvsc_dev *net_dev = sc->net_dev;
- device_t dev = device_ctx->device;
- uint8_t *buf;
netvsc_packet *packet;
struct mbuf *m_head, *m;
- struct mbuf *mc_head = NULL;
struct ether_vlan_header *eh;
rndis_msg *rndis_mesg;
rndis_packet *rndis_pkt;
@@ -539,84 +765,40 @@ hn_start_locked(struct ifnet *ifp)
rndis_tcp_ip_csum_info *csum_info;
rndis_tcp_tso_info *tso_info;
int ether_len;
- int i;
- int num_frags;
- int len;
- int retries = 0;
- int ret = 0;
uint32_t rndis_msg_size = 0;
uint32_t trans_proto_type;
uint32_t send_buf_section_idx =
NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
- while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) {
- IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head);
- if (m_head == NULL) {
- break;
- }
-
- len = 0;
- num_frags = 0;
-
- /* Walk the mbuf list computing total length and num frags */
- for (m = m_head; m != NULL; m = m->m_next) {
- if (m->m_len != 0) {
- num_frags++;
- len += m->m_len;
- }
- }
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING)
+ return;
- /*
- * Reserve the number of pages requested. Currently,
- * one page is reserved for the message in the RNDIS
- * filter packet
- */
- num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
+ while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
+ int error, nsegs, i, send_failed = 0;
+ struct hn_txdesc *txd;
- /* If exceeds # page_buffers in netvsc_packet */
- if (num_frags > NETVSC_PACKET_MAXPAGE) {
- device_printf(dev, "exceed max page buffers,%d,%d\n",
- num_frags, NETVSC_PACKET_MAXPAGE);
- m_freem(m_head);
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
- return (EINVAL);
- }
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+ if (m_head == NULL)
+ break;
- /*
- * Allocate a buffer with space for a netvsc packet plus a
- * number of reserved areas. First comes a (currently 16
- * bytes, currently unused) reserved data area. Second is
- * the netvsc_packet. Third is an area reserved for an
- * rndis_filter_packet struct. Fourth (optional) is a
- * rndis_per_packet_info struct.
- * Changed malloc to M_NOWAIT to avoid sleep under spin lock.
- * No longer reserving extra space for page buffers, as they
- * are already part of the netvsc_packet.
- */
- buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF +
- sizeof(netvsc_packet) +
- sizeof(rndis_msg) +
- RNDIS_VLAN_PPI_SIZE +
- RNDIS_TSO_PPI_SIZE +
- RNDIS_CSUM_PPI_SIZE,
- M_NETVSC, M_ZERO | M_NOWAIT);
- if (buf == NULL) {
- device_printf(dev, "hn:malloc packet failed\n");
- m_freem(m_head);
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
- return (ENOMEM);
+ txd = hn_txdesc_get(sc);
+ if (txd == NULL) {
+ sc->hn_no_txdescs++;
+ IF_PREPEND(&ifp->if_snd, m_head);
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
}
- packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF);
- *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF;
+ packet = &txd->netvsc_pkt;
+ /* XXX not necessary */
+ memset(packet, 0, sizeof(*packet));
packet->is_data_pkt = TRUE;
- /* Set up the rndis header */
- packet->page_buf_count = num_frags;
-
/* Initialize it from the mbuf */
- packet->tot_data_buf_len = len;
+ packet->tot_data_buf_len = m_head->m_pkthdr.len;
/*
* extension points to the area reserved for the
@@ -624,8 +806,9 @@ hn_start_locked(struct ifnet *ifp)
* the netvsc_packet (and rppi struct, if present;
* length is updated later).
*/
- packet->rndis_mesg = packet + 1;
- rndis_mesg = (rndis_msg *)packet->rndis_mesg;
+ rndis_mesg = txd->rndis_msg;
+ /* XXX not necessary */
+ memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;
rndis_pkt = &rndis_mesg->msg.packet;
@@ -644,8 +827,6 @@ hn_start_locked(struct ifnet *ifp)
* set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag
* into the frame.
*/
- packet->vlan_tci = m_head->m_pkthdr.ether_vtag;
-
rndis_msg_size += RNDIS_VLAN_PPI_SIZE;
rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
@@ -656,7 +837,7 @@ hn_start_locked(struct ifnet *ifp)
rppi->per_packet_info_offset);
/* FreeBSD does not support CFI or priority */
rppi_vlan_info->u1.s1.vlan_id =
- packet->vlan_tci & 0xfff;
+ m_head->m_pkthdr.ether_vtag & 0xfff;
}
/* Only check the flags for outbound and ignore the ones for inbound */
@@ -758,7 +939,7 @@ pre_send:
packet->tot_data_buf_len = rndis_mesg->msg_len;
/* send packet with send buffer */
- if (packet->tot_data_buf_len < net_dev->send_section_size) {
+ if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) {
send_buf_section_idx =
hv_nv_get_next_send_section(net_dev);
if (send_buf_section_idx !=
@@ -783,33 +964,49 @@ pre_send:
packet->send_buf_section_size =
packet->tot_data_buf_len;
packet->page_buf_count = 0;
+ sc->hn_tx_chimney++;
goto do_send;
}
}
+ error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs);
+ if (error) {
+ int freed;
+
+ /*
+ * This mbuf is not linked w/ the txd yet, so free
+ * it now.
+ */
+ m_freem(m_head);
+ freed = hn_txdesc_put(sc, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon txdma error"));
+
+ sc->hn_txdma_failed++;
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ continue;
+ }
+
+ packet->page_buf_count = nsegs +
+ HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
+
/* send packet with page buffer */
- packet->page_buffers[0].pfn =
- atop(hv_get_phys_addr(rndis_mesg));
+ packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
packet->page_buffers[0].offset =
- (unsigned long)rndis_mesg & PAGE_MASK;
+ txd->rndis_msg_paddr & PAGE_MASK;
packet->page_buffers[0].length = rndis_msg_size;
/*
* Fill the page buffers with mbuf info starting at index
* HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
*/
- i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
- for (m = m_head; m != NULL; m = m->m_next) {
- if (m->m_len) {
- vm_offset_t paddr =
- vtophys(mtod(m, vm_offset_t));
- packet->page_buffers[i].pfn =
- paddr >> PAGE_SHIFT;
- packet->page_buffers[i].offset =
- paddr & (PAGE_SIZE - 1);
- packet->page_buffers[i].length = m->m_len;
- i++;
- }
+ for (i = 0; i < nsegs; ++i) {
+ hv_vmbus_page_buffer *pb = &packet->page_buffers[
+ i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];
+
+ pb->pfn = atop(segs[i].ds_addr);
+ pb->offset = segs[i].ds_addr & PAGE_MASK;
+ pb->length = segs[i].ds_len;
}
packet->send_buf_section_idx =
@@ -817,63 +1014,65 @@ pre_send:
packet->send_buf_section_size = 0;
do_send:
+ txd->m = m_head;
- /*
- * If bpf, copy the mbuf chain. This is less expensive than
- * it appears; the mbuf clusters are not copied, only their
- * reference counts are incremented.
- * Needed to avoid a race condition where the completion
- * callback is invoked, freeing the mbuf chain, before the
- * bpf_mtap code has a chance to run.
- */
- if (ifp->if_bpf) {
- mc_head = m_copypacket(m_head, M_DONTWAIT);
- }
-retry_send:
/* Set the completion routine */
packet->compl.send.on_send_completion = netvsc_xmit_completion;
packet->compl.send.send_completion_context = packet;
- packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)m_head;
-
- /* Removed critical_enter(), does not appear necessary */
- ret = hv_nv_on_send(device_ctx, packet);
- if (ret == 0) {
- ifp->if_opackets++;
- /* if bpf && mc_head, call bpf_mtap code */
- if (mc_head) {
- ETHER_BPF_MTAP(ifp, mc_head);
- }
- } else {
- retries++;
- if (retries < 4) {
- goto retry_send;
- }
+ packet->compl.send.send_completion_tid =
+ (uint64_t)(uintptr_t)txd;
- IF_PREPEND(&ifp->if_snd, m_head);
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+again:
+ /*
+ * Make sure that txd is not freed before ETHER_BPF_MTAP.
+ */
+ hn_txdesc_hold(txd);
+ error = hv_nv_on_send(device_ctx, packet);
+ if (!error) {
+ ETHER_BPF_MTAP(ifp, m_head);
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ }
+ hn_txdesc_put(sc, txd);
+
+ if (__predict_false(error)) {
+ int freed;
/*
- * Null the mbuf pointer so the completion function
- * does not free the mbuf chain. We just pushed the
- * mbuf chain back on the if_snd queue.
+ * This should "really rarely" happen.
+ *
+ * XXX Too many RX to be acked or too many sideband
+ * commands to run? Ask netvsc_channel_rollup()
+ * to kick start later.
*/
- packet->compl.send.send_completion_tid = 0;
+ sc->hn_txeof = 1;
+ if (!send_failed) {
+ sc->hn_send_failed++;
+ send_failed = 1;
+ /*
+ * Try sending again after set hn_txeof;
+ * in case that we missed the last
+ * netvsc_channel_rollup().
+ */
+ goto again;
+ }
+ if_printf(ifp, "send failed\n");
/*
- * Release the resources since we will not get any
- * send completion
+ * This mbuf will be prepended, don't free it
+ * in hn_txdesc_put(); only unload it from the
+ * DMA map in hn_txdesc_put(), if it was loaded.
*/
- netvsc_xmit_completion(packet);
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
- }
+ txd->m = NULL;
+ freed = hn_txdesc_put(sc, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon send error"));
- /* if bpf && mc_head, free the mbuf chain copy */
- if (mc_head) {
- m_freem(mc_head);
+ sc->hn_send_failed++;
+ IF_PREPEND(&ifp->if_snd, m_head);
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
}
}
-
- return (ret);
}
/*
@@ -1222,6 +1421,9 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
break;
}
+ sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
+ if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max)
+ sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
hn_ifinit_locked(sc);
NV_LOCK(sc);
@@ -1479,6 +1681,25 @@ hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
#endif /* HN_LRO_HIWAT */
static int
+hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int chimney_size, error;
+
+ chimney_size = sc->hn_tx_chimney_size;
+ error = sysctl_handle_int(oidp, &chimney_size, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
+ return EINVAL;
+
+ if (sc->hn_tx_chimney_size != chimney_size)
+ sc->hn_tx_chimney_size = chimney_size;
+ return 0;
+}
+
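
From userland the handler above is reached through sysctl(3). A minimal sketch, assuming the first attached instance so that the node is named dev.hn.0:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            int size, newsize = 4096;
            size_t len = sizeof(size);

            /* read the current chimney send size limit */
            if (sysctlbyname("dev.hn.0.tx_chimney_size", &size, &len, NULL, 0) == 0)
                    printf("tx_chimney_size: %d\n", size);

            /* lower it; the handler rejects values <= 0 or above tx_chimney_max */
            sysctlbyname("dev.hn.0.tx_chimney_size", NULL, NULL,
                &newsize, sizeof(newsize));
            return (0);
    }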
+static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
const struct ip *ip;
@@ -1553,6 +1774,150 @@ hn_check_iplen(const struct mbuf *m, int hoff)
return ip->ip_p;
}
+static void
+hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
+{
+ bus_addr_t *paddr = arg;
+
+ if (error)
+ return;
+
+ KASSERT(nseg == 1, ("too many segments %d!", nseg));
+ *paddr = segs->ds_addr;
+}
+
+static int
+hn_create_tx_ring(struct hn_softc *sc)
+{
+ bus_dma_tag_t parent_dtag;
+ int error, i;
+
+ sc->hn_txdesc_cnt = HN_TX_DESC_CNT;
+ sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt,
+ M_NETVSC, M_WAITOK | M_ZERO);
+ SLIST_INIT(&sc->hn_txlist);
+ mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
+
+ parent_dtag = bus_get_dma_tag(sc->hn_dev);
+
+ /* DMA tag for RNDIS messages. */
+ error = bus_dma_tag_create(parent_dtag, /* parent */
+ HN_RNDIS_MSG_ALIGN, /* alignment */
+ HN_RNDIS_MSG_BOUNDARY, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ HN_RNDIS_MSG_LEN, /* maxsize */
+ 1, /* nsegments */
+ HN_RNDIS_MSG_LEN, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &sc->hn_tx_rndis_dtag);
+ if (error) {
+ device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
+ return error;
+ }
+
+ /* DMA tag for data. */
+ error = bus_dma_tag_create(parent_dtag, /* parent */
+ 1, /* alignment */
+ HN_TX_DATA_BOUNDARY, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ HN_TX_DATA_MAXSIZE, /* maxsize */
+ HN_TX_DATA_SEGCNT_MAX, /* nsegments */
+ HN_TX_DATA_SEGSIZE, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &sc->hn_tx_data_dtag);
+ if (error) {
+ device_printf(sc->hn_dev, "failed to create data dmatag\n");
+ return error;
+ }
+
+ for (i = 0; i < sc->hn_txdesc_cnt; ++i) {
+ struct hn_txdesc *txd = &sc->hn_txdesc[i];
+
+ txd->sc = sc;
+
+ /*
+ * Allocate and load RNDIS messages.
+ */
+ error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag,
+ (void **)&txd->rndis_msg,
+ BUS_DMA_WAITOK | BUS_DMA_COHERENT,
+ &txd->rndis_msg_dmap);
+ if (error) {
+ device_printf(sc->hn_dev,
+ "failed to allocate rndis_msg, %d\n", i);
+ return error;
+ }
+
+ error = bus_dmamap_load(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg_dmap,
+ txd->rndis_msg, HN_RNDIS_MSG_LEN,
+ hn_dma_map_paddr, &txd->rndis_msg_paddr,
+ BUS_DMA_NOWAIT);
+ if (error) {
+ device_printf(sc->hn_dev,
+ "failed to load rndis_msg, %d\n", i);
+ bus_dmamem_free(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg, txd->rndis_msg_dmap);
+ return error;
+ }
+
+ /* DMA map for TX data. */
+ error = bus_dmamap_create(sc->hn_tx_data_dtag, 0,
+ &txd->data_dmap);
+ if (error) {
+ device_printf(sc->hn_dev,
+ "failed to allocate tx data dmamap\n");
+ bus_dmamap_unload(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg_dmap);
+ bus_dmamem_free(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg, txd->rndis_msg_dmap);
+ return error;
+ }
+
+ /* All set, put it to list */
+ txd->flags |= HN_TXD_FLAG_ONLIST;
+ SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
+ }
+ sc->hn_txdesc_avail = sc->hn_txdesc_cnt;
+
+ return 0;
+}
+
+static void
+hn_destroy_tx_ring(struct hn_softc *sc)
+{
+ struct hn_txdesc *txd;
+
+ while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) {
+ KASSERT(txd->m == NULL, ("still has mbuf installed"));
+ KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+ ("still dma mapped"));
+ SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
+
+ bus_dmamap_unload(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg_dmap);
+ bus_dmamem_free(sc->hn_tx_rndis_dtag,
+ txd->rndis_msg, txd->rndis_msg_dmap);
+
+ bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap);
+ }
+
+ if (sc->hn_tx_data_dtag != NULL)
+ bus_dma_tag_destroy(sc->hn_tx_data_dtag);
+ if (sc->hn_tx_rndis_dtag != NULL)
+ bus_dma_tag_destroy(sc->hn_tx_rndis_dtag);
+ free(sc->hn_txdesc, M_NETVSC);
+ mtx_destroy(&sc->hn_txlist_spin);
+}
+
static device_method_t netvsc_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, netvsc_probe),
diff --git a/sys/dev/hyperv/netvsc/hv_rndis.h b/sys/dev/hyperv/netvsc/hv_rndis.h
index fd032de..cd46ecc 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis.h
+++ b/sys/dev/hyperv/netvsc/hv_rndis.h
@@ -1050,6 +1050,7 @@ int netvsc_recv(struct hv_device *device_ctx,
netvsc_packet *packet,
rndis_tcp_ip_csum_info *csum_info);
void netvsc_recv_rollup(struct hv_device *device_ctx);
+void netvsc_channel_rollup(struct hv_device *device_ctx);
void* hv_set_rppi_data(rndis_msg *rndis_mesg,
uint32_t rppi_size,
diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
index 3e95024..dfd0b47 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c
+++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
@@ -974,3 +974,21 @@ hv_rf_receive_rollup(netvsc_dev *net_dev)
rndis_dev = (rndis_device *)net_dev->extension;
netvsc_recv_rollup(rndis_dev->net_dev->dev);
}
+
+void
+hv_rf_channel_rollup(netvsc_dev *net_dev)
+{
+ rndis_device *rndis_dev;
+
+ rndis_dev = (rndis_device *)net_dev->extension;
+
+ /*
+ * This could be called pretty early, so we need
+ * to make sure everything has been setup.
+ */
+ if (rndis_dev == NULL ||
+ rndis_dev->net_dev == NULL ||
+ rndis_dev->net_dev->dev == NULL)
+ return;
+ netvsc_channel_rollup(rndis_dev->net_dev->dev);
+}
diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/dev/hyperv/netvsc/hv_rndis_filter.h
index 2f3ebd8..9d7a38d 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis_filter.h
+++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.h
@@ -99,6 +99,7 @@ typedef struct rndis_device_ {
int hv_rf_on_receive(netvsc_dev *net_dev,
struct hv_device *device, netvsc_packet *pkt);
void hv_rf_receive_rollup(netvsc_dev *net_dev);
+void hv_rf_channel_rollup(netvsc_dev *net_dev);
int hv_rf_on_device_add(struct hv_device *device, void *additl_info);
int hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel);
int hv_rf_on_open(struct hv_device *device);
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index c7f3538..4ccb647 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -274,14 +274,16 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
boolean_t f_new;
hv_vmbus_channel* channel;
int ret;
+ uint32_t relid;
f_new = TRUE;
channel = NULL;
-
+ relid = new_channel->offer_msg.child_rel_id;
/*
* Make sure this is a new offer
*/
mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ hv_vmbus_g_connection.channels[relid] = new_channel;
TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor,
list_entry)
@@ -325,16 +327,18 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
mtx_unlock(&channel->sc_lock);
/* Insert new channel into channel_anchor. */
- printf("Storvsc get multi-channel offer, rel=%u.\n",
- new_channel->offer_msg.child_rel_id);
+ printf("VMBUS get multi-channel offer, rel=%u,sub=%u\n",
+ new_channel->offer_msg.child_rel_id,
+ new_channel->offer_msg.offer.sub_channel_index);
mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
new_channel, list_entry);
mtx_unlock(&hv_vmbus_g_connection.channel_lock);
if(bootverbose)
- printf("VMBUS: new multi-channel offer <%p>.\n",
- new_channel);
+ printf("VMBUS: new multi-channel offer <%p>, "
+ "its primary channel is <%p>.\n",
+ new_channel, new_channel->primary_channel);
/*XXX add it to percpu_list */
@@ -524,11 +528,14 @@ vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr)
rescind = (hv_vmbus_channel_rescind_offer*) hdr;
- channel = hv_vmbus_get_channel_from_rel_id(rescind->child_rel_id);
+ channel = hv_vmbus_g_connection.channels[rescind->child_rel_id];
if (channel == NULL)
return;
hv_vmbus_child_device_unregister(channel->device);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ hv_vmbus_g_connection.channels[rescind->child_rel_id] = NULL;
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
}
/**
@@ -782,6 +789,8 @@ hv_vmbus_release_unattached_channels(void)
hv_vmbus_child_device_unregister(channel->device);
hv_vmbus_free_vmbus_channel(channel);
}
+ bzero(hv_vmbus_g_connection.channels,
+ sizeof(hv_vmbus_channel*) * HV_CHANNEL_MAX_COUNT);
mtx_unlock(&hv_vmbus_g_connection.channel_lock);
}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index 7496288..a9e3561 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -232,6 +232,9 @@ hv_vmbus_connect(void) {
goto cleanup;
}
+ hv_vmbus_g_connection.channels = malloc(sizeof(hv_vmbus_channel*) *
+ HV_CHANNEL_MAX_COUNT,
+ M_DEVBUF, M_WAITOK | M_ZERO);
/*
* Find the highest vmbus version number we can support.
*/
@@ -295,6 +298,7 @@ hv_vmbus_connect(void) {
free(msg_info, M_DEVBUF);
}
+ free(hv_vmbus_g_connection.channels, M_DEVBUF);
return (ret);
}
@@ -325,6 +329,7 @@ hv_vmbus_disconnect(void) {
hv_work_queue_close(hv_vmbus_g_connection.work_queue);
sema_destroy(&hv_vmbus_g_connection.control_sema);
+ free(hv_vmbus_g_connection.channels, M_DEVBUF);
hv_vmbus_g_connection.connect_state = HV_DISCONNECTED;
free(msg, M_DEVBUF);
@@ -333,35 +338,6 @@ hv_vmbus_disconnect(void) {
}
/**
- * Get the channel object given its child relative id (ie channel id)
- */
-hv_vmbus_channel*
-hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
-
- hv_vmbus_channel* channel;
- hv_vmbus_channel* foundChannel = NULL;
-
- /*
- * TODO:
- * Consider optimization where relids are stored in a fixed size array
- * and channels are accessed without the need to take this lock or search
- * the list.
- */
- mtx_lock(&hv_vmbus_g_connection.channel_lock);
- TAILQ_FOREACH(channel,
- &hv_vmbus_g_connection.channel_anchor, list_entry) {
-
- if (channel->offer_msg.child_rel_id == rel_id) {
- foundChannel = channel;
- break;
- }
- }
- mtx_unlock(&hv_vmbus_g_connection.channel_lock);
-
- return (foundChannel);
-}
-
-/**
* Process a channel event notification
*/
static void
@@ -377,7 +353,7 @@ VmbusProcessChannelEvent(uint32_t relid)
* the channel callback to process the event
*/
- channel = hv_vmbus_get_channel_from_rel_id(relid);
+ channel = hv_vmbus_g_connection.channels[relid];
if (channel == NULL) {
return;
@@ -473,7 +449,7 @@ hv_vmbus_on_events(void *arg)
if (recv_interrupt_page != NULL) {
for (dword = 0; dword < maxdword; dword++) {
if (recv_interrupt_page[dword]) {
- for (bit = 0; bit < 32; bit++) {
+ for (bit = 0; bit < HV_CHANNEL_DWORD_LEN; bit++) {
if (synch_test_and_clear_bit(bit,
(uint32_t *) &recv_interrupt_page[dword])) {
rel_id = (dword << 5) + bit;
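
The relid recovered by the scan loop is simply dword * 32 + bit, which is what allows the channels[] table introduced above to be sized from the event-flags page. A small userland check of that arithmetic, assuming 32-bit event-flag words as in the header change below:

    #include <assert.h>
    #include <stdint.h>

    #define HV_EVENT_FLAGS_DWORD_COUNT  (256 / sizeof(uint32_t))   /* 64 */
    #define HV_CHANNEL_DWORD_LEN        32
    #define HV_CHANNEL_MAX_COUNT \
            (HV_EVENT_FLAGS_DWORD_COUNT * HV_CHANNEL_DWORD_LEN)

    int
    main(void)
    {
            unsigned dword = 3, bit = 7;

            assert(HV_CHANNEL_MAX_COUNT == 2048);
            assert(((dword << 5) + bit) ==
                dword * HV_CHANNEL_DWORD_LEN + bit);    /* << 5 is * 32 */
            return (0);
    }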
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 74fe824..13a35c4 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -58,6 +58,12 @@ typedef uint16_t hv_vmbus_status;
#define HV_EVENT_FLAGS_BYTE_COUNT (256)
#define HV_EVENT_FLAGS_DWORD_COUNT (256 / sizeof(uint32_t))
+/**
+ * max channel count <== event_flags_dword_count * bit_of_dword
+ */
+#define HV_CHANNEL_DWORD_LEN (32)
+#define HV_CHANNEL_MAX_COUNT \
+ ((HV_EVENT_FLAGS_DWORD_COUNT) * HV_CHANNEL_DWORD_LEN)
/*
* MessageId: HV_STATUS_INSUFFICIENT_BUFFERS
* MessageText:
@@ -355,6 +361,10 @@ typedef struct {
TAILQ_HEAD(, hv_vmbus_channel) channel_anchor;
struct mtx channel_lock;
+ /**
+ * channel table for fast lookup through id.
+ */
+ hv_vmbus_channel **channels;
hv_vmbus_handle work_queue;
struct sema control_sema;
} hv_vmbus_connection;
@@ -699,7 +709,6 @@ int hv_vmbus_child_device_register(
struct hv_device *child_dev);
int hv_vmbus_child_device_unregister(
struct hv_device *child_dev);
-hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id);
/**
* Connection interfaces
diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c
index 885f84c..f01c8be 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -1187,8 +1187,11 @@ tmpfs_readdir(struct vop_readdir_args *v)
if (error == EJUSTRETURN)
error = (uio->uio_resid != startresid) ? 0 : EINVAL;
- if (error != 0 && cookies != NULL)
+ if (error != 0 && cookies != NULL && ncookies != NULL) {
free(*cookies, M_TEMP);
+ *cookies = NULL;
+ *ncookies = 0;
+ }
if (eofflag != NULL)
*eofflag =
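
Besides checking ncookies before freeing, the tmpfs fix clears the caller-visible pointer and count, so a subsequent free by the caller cannot touch the stale array. The defensive idiom in isolation, as a hypothetical helper:

    #include <sys/types.h>
    #include <stdlib.h>

    static void
    drop_cookies(u_long **cookies, int *ncookies)
    {
            if (cookies != NULL && *cookies != NULL) {
                    free(*cookies);
                    *cookies = NULL;    /* a later free(*cookies) is now a no-op */
            }
            if (ncookies != NULL)
                    *ncookies = 0;
    }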
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 42c53c0..0d52c7b 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -208,6 +208,8 @@ static char *pr_allow_names[] = {
"allow.mount.procfs",
"allow.mount.tmpfs",
"allow.mount.fdescfs",
+ "allow.mount.linprocfs",
+ "allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
@@ -225,6 +227,8 @@ static char *pr_allow_nonames[] = {
"allow.mount.noprocfs",
"allow.mount.notmpfs",
"allow.mount.nofdescfs",
+ "allow.mount.nolinprocfs",
+ "allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
@@ -4315,6 +4319,14 @@ SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
"Processes in jail can mount the procfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the linprocfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the linsysfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
@@ -4481,6 +4493,10 @@ SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the nullfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the procfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the linprocfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the linsysfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the tmpfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a721c5a..aa81313 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -145,51 +145,24 @@ int vttoif_tab[10] = {
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
/*
- * "Free" vnode target. Free vnodes are rarely completely free, but are
- * just ones that are cheap to recycle. Usually they are for files which
- * have been stat'd but not read; these usually have inode and namecache
- * data attached to them. This target is the preferred minimum size of a
- * sub-cache consisting mostly of such files. The system balances the size
- * of this sub-cache with its complement to try to prevent either from
- * thrashing while the other is relatively inactive. The targets express
- * a preference for the best balance.
- *
- * "Above" this target there are 2 further targets (watermarks) related
- * to recyling of free vnodes. In the best-operating case, the cache is
- * exactly full, the free list has size between vlowat and vhiwat above the
- * free target, and recycling from it and normal use maintains this state.
- * Sometimes the free list is below vlowat or even empty, but this state
- * is even better for immediate use provided the cache is not full.
- * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
- * ones) to reach one of these states. The watermarks are currently hard-
- * coded as 4% and 9% of the available space higher. These and the default
- * of 25% for wantfreevnodes are too large if the memory size is large.
- * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
- * whenever vnlru_proc() becomes active.
+ * Free vnode target. Free vnodes may simply be files which have been stat'd
+ * but not read. This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
*/
static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
- &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
- &freevnodes, 0, "Number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+ "Number of vnodes in the free list");
-/*
- * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
- * the sysctl remains to provide ABI compatibility. The new code frees
- * namecache sources as the last chance to satisfy the highest watermark,
- * instead of selecting the source vnodes randomly. This provides good
- * enough behaviour to keep vn_fullpath() working in most situations.
- * The filesystem layout with deep trees, where the depricated knob was
- * required, is thus handled automatically.
- */
static int vlru_allow_cache_src;
SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
- &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
+ &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
- "Number of vnodes recycled to meet vnode cache targets");
+ "Number of vnodes recycled to avoid exceding kern.maxvnodes");
/*
* Various variables used for debugging the new implementation of
@@ -299,13 +272,14 @@ static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
syncer_state;
-/* Target for maximum number of vnodes. */
+/*
+ * Number of vnodes we want to exist at any one time. This is mostly used
+ * to size hash tables in vnode-related code. It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.)
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
int desiredvnodes;
-static int gapvnodes; /* gap between wanted and desired */
-static int vhiwat; /* enough extras after expansion */
-static int vlowat; /* minimal extras before expansion */
-static int vstir; /* nonzero to stir non-free vnodes */
-static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
return (error);
if (old_desiredvnodes != desiredvnodes) {
- wantfreevnodes = desiredvnodes / 4;
- /* XXX locking seems to be incomplete. */
vfs_hash_changesize(desiredvnodes);
cache_changesize(desiredvnodes);
}
@@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
- sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
+ sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
- &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
+ &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
- * grows, the ratio of the memory size in KB to to vnodes approaches 64:1.
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
*/
#ifndef MAXVNODES_MAX
-#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
+#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
#endif
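
With the common 4 KiB page size the rewritten cap evaluates to the same 8M vnodes as the old 512 * 1024 * 1024 / 64 expression; the change restates the limit in pages rather than altering it. A quick check, assuming PAGE_SIZE == 4096:

    #include <assert.h>

    #define PAGE_SIZE       4096    /* assumed for this worked example */
    #define MAXVNODES_MAX   (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))

    int
    main(void)
    {
            /* 1 GiB / 4096 / 16 = 16384; 512 * 16384 = 8388608 */
            assert(MAXVNODES_MAX == 8388608);
            assert(512 * 1024 * 1024 / 64 == 8388608);  /* old formula */
            return (0);
    }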
/*
@@ -433,16 +405,15 @@ vntblinit(void *dummy __unused)
/*
* Desiredvnodes is a function of the physical memory size and the
* kernel's heap size. Generally speaking, it scales with the
- * physical memory size. The ratio of desiredvnodes to the physical
- * memory size is 1:16 until desiredvnodes exceeds 98,304.
- * Thereafter, the
- * marginal ratio of desiredvnodes to the physical memory size is
- * 1:64. However, desiredvnodes is limited by the kernel's heap
+ * physical memory size. The ratio of desiredvnodes to physical pages
+ * is one to four until desiredvnodes exceeds 98,304. Thereafter, the
+ * marginal ratio of desiredvnodes to physical pages is one to
+ * sixteen. However, desiredvnodes is limited by the kernel's heap
* size. The memory required by desiredvnodes vnodes and vm objects
- * must not exceed 1/7th of the kernel's heap size.
+ * may not exceed one seventh of the kernel's heap size.
*/
- physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
- 3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
+ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+ cnt.v_page_count) / 16;
virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
sizeof(struct vnode)));
desiredvnodes = min(physvnodes, virtvnodes);
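
Plugging in concrete numbers: with 16 GiB of RAM (about 4M 4 KiB pages) the reverted formula gives roughly maxproc + 336k vnodes before the kernel-heap clamp applies. A small sketch with an assumed maxproc:

    #include <stdio.h>

    static long
    lmin(long a, long b)
    {
            return (a < b ? a : b);
    }

    int
    main(void)
    {
            long maxproc = 10000;                   /* assumed */
            long v_page_count = 4L * 1024 * 1024;   /* 16 GiB / 4 KiB */
            long physvnodes;

            physvnodes = maxproc + v_page_count / 16 +
                3 * lmin(98304L * 4, v_page_count) / 16;
            printf("physvnodes = %ld\n", physvnodes);   /* 345872 here */
            return (0);
    }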
@@ -831,41 +802,35 @@ vattr_null(struct vattr *vap)
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*/
static int
-vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
+vlrureclaim(struct mount *mp)
{
struct vnode *vp;
- int count, done, target;
+ int done;
+ int trigger;
+ int usevnodes;
+ int count;
+ /*
+ * Calculate the trigger point, don't allow user
+ * screwups to blow us up. This prevents us from
+ * recycling vnodes with lots of resident pages. We
+ * aren't trying to free memory, we are trying to
+ * free vnodes.
+ */
+ usevnodes = desiredvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ trigger = cnt.v_page_count * 2 / usevnodes;
done = 0;
vn_start_write(NULL, &mp, V_WAIT);
MNT_ILOCK(mp);
- count = mp->mnt_nvnodelistsize;
- target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
- target = target / 10 + 1;
- while (count != 0 && done < target) {
+ count = mp->mnt_nvnodelistsize / 10 + 1;
+ while (count != 0) {
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
if (vp == NULL)
break;
- /*
- * XXX LRU is completely broken for non-free vnodes. First
- * by calling here in mountpoint order, then by moving
- * unselected vnodes to the end here, and most grossly by
- * removing the vlruvp() function that was supposed to
- * maintain the order. (This function was born broken
- * since syncer problems prevented it doing anything.) The
- * order is closer to LRC (C = Created).
- *
- * LRU reclaiming of vnodes seems to have last worked in
- * FreeBSD-3 where LRU wasn't mentioned under any spelling.
- * Then there was no hold count, and inactive vnodes were
- * simply put on the free list in LRU order. The separate
- * lists also break LRU. We prefer to reclaim from the
- * free list for technical reasons. This tends to thrash
- * the free list to keep very unrecently used held vnodes.
- * The problem is mitigated by keeping the free list large.
- */
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
--count;
@@ -874,12 +839,10 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
/*
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
- * Also skip free vnodes. We are trying to make space
- * to expand the free list, not reduce it.
*/
if (vp->v_usecount ||
- (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
- ((vp->v_iflag & VI_FREE) != 0) ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
(vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VI_UNLOCK(vp);
@@ -905,8 +868,8 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
* vnode lock before our VOP_LOCK() call fails.
*/
if (vp->v_usecount ||
- (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
- (vp->v_iflag & VI_FREE) != 0 ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
(vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -939,7 +902,7 @@ relock_mnt:
}
/*
- * Attempt to reduce the free list by the requested amount.
+ * Attempt to keep the free list at wantfreevnodes length.
*/
static void
vnlru_free(int count)
@@ -996,24 +959,6 @@ vnlru_free(int count)
mtx_lock(&vnode_free_list_mtx);
}
}
-
-/* XXX some names and initialization are bad for limits and watermarks. */
-static int
-vspace(void)
-{
- int space;
-
- gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
- vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
- vlowat = vhiwat / 2;
- if (numvnodes > desiredvnodes)
- return (0);
- space = desiredvnodes - numvnodes;
- if (freevnodes > wantfreevnodes)
- space += freevnodes - wantfreevnodes;
- return (space);
-}
-
/*
* Attempt to recycle vnodes in a context that is always safe to block.
* Calling vlrurecycle() from the bowels of filesystem code has some
@@ -1026,36 +971,18 @@ static void
vnlru_proc(void)
{
struct mount *mp, *nmp;
- unsigned long ofreevnodes, onumvnodes;
- int done, force, reclaim_nc_src, trigger, usevnodes;
+ int done;
+ struct proc *p = vnlruproc;
- EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
SHUTDOWN_PRI_FIRST);
- force = 0;
for (;;) {
- kproc_suspend_check(vnlruproc);
+ kproc_suspend_check(p);
mtx_lock(&vnode_free_list_mtx);
- /*
- * If numvnodes is too large (due to desiredvnodes being
- * adjusted using its sysctl, or emergency growth), first
- * try to reduce it by discarding from the free list.
- */
- if (numvnodes > desiredvnodes && freevnodes > 0)
- vnlru_free(ulmin(numvnodes - desiredvnodes,
- freevnodes));
- /*
- * Sleep if the vnode cache is in a good state. This is
- * when it is not over-full and has space for about a 4%
- * or 9% expansion (by growing its size or inexcessively
- * reducing its free list). Otherwise, try to reclaim
- * space for a 10% expansion.
- */
- if (vstir && force == 0) {
- force = 1;
- vstir = 0;
- }
- if (vspace() >= vlowat && force == 0) {
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ if (numvnodes <= desiredvnodes * 9 / 10) {
vnlruproc_sig = 0;
wakeup(&vnlruproc_sig);
msleep(vnlruproc, &vnode_free_list_mtx,
@@ -1064,66 +991,30 @@ vnlru_proc(void)
}
mtx_unlock(&vnode_free_list_mtx);
done = 0;
- ofreevnodes = freevnodes;
- onumvnodes = numvnodes;
- /*
- * Calculate parameters for recycling. These are the same
- * throughout the loop to give some semblance of fairness.
- * The trigger point is to avoid recycling vnodes with lots
- * of resident pages. We aren't trying to free memory; we
- * are trying to recycle or at least free vnodes.
- */
- if (numvnodes <= desiredvnodes)
- usevnodes = numvnodes - freevnodes;
- else
- usevnodes = numvnodes;
- if (usevnodes <= 0)
- usevnodes = 1;
- /*
- * The trigger value is is chosen to give a conservatively
- * large value to ensure that it alone doesn't prevent
- * making progress. The value can easily be so large that
- * it is effectively infinite in some congested and
- * misconfigured cases, and this is necessary. Normally
- * it is about 8 to 100 (pages), which is quite large.
- */
- trigger = cnt.v_page_count * 2 / usevnodes;
- if (force < 2)
- trigger = vsmalltrigger;
- reclaim_nc_src = force >= 3;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
- done += vlrureclaim(mp, reclaim_nc_src, trigger);
+ done += vlrureclaim(mp);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
- if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
- uma_reclaim();
if (done == 0) {
- if (force == 0 || force == 1) {
- force = 2;
- continue;
- }
- if (force == 2) {
- force = 3;
- continue;
- }
- force = 0;
+#if 0
+ /* These messages are temporary debugging aids */
+ if (vnlru_nowhere < 5)
+ printf("vnlru process getting nowhere..\n");
+ else if (vnlru_nowhere == 5)
+ printf("vnlru process messages stopped.\n");
+#endif
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
kern_yield(PRI_USER);
- /*
- * After becoming active to expand above low water, keep
- * active until above high water.
- */
- force = vspace() < vhiwat;
}
}
@@ -1197,31 +1088,22 @@ vtryrecycle(struct vnode *vp)
return (0);
}
-static void
-vcheckspace(void)
-{
-
- if (vspace() < vlowat && vnlruproc_sig == 0) {
- vnlruproc_sig = 1;
- wakeup(vnlruproc);
- }
-}
-
/*
- * Wait if necessary for space for a new vnode.
+ * Wait for available vnodes.
*/
static int
getnewvnode_wait(int suspended)
{
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
- if (numvnodes >= desiredvnodes) {
+ if (numvnodes > desiredvnodes) {
if (suspended) {
/*
- * The file system is being suspended. We cannot
- * risk a deadlock here, so allow allocation of
- * another vnode even if this would give too many.
+ * File system is being suspended, we cannot risk a
+ * deadlock here, so allocate new vnode anyway.
*/
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
return (0);
}
if (vnlruproc_sig == 0) {
@@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended)
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
"vlruwk", hz);
}
- /* Post-adjust like the pre-adjust in getnewvnode(). */
- if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
- vnlru_free(1);
- return (numvnodes >= desiredvnodes ? ENFILE : 0);
+ return (numvnodes > desiredvnodes ? ENFILE : 0);
}
-/*
- * This hack is fragile, and probably not needed any more now that the
- * watermark handling works.
- */
void
getnewvnode_reserve(u_int count)
{
struct thread *td;
- /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
- /* XXX no longer so quick, but this part is not racy. */
- mtx_lock(&vnode_free_list_mtx);
- if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
- vnlru_free(ulmin(numvnodes + count - desiredvnodes,
- freevnodes - wantfreevnodes));
- mtx_unlock(&vnode_free_list_mtx);
-
td = curthread;
/* First try to be quick and racy. */
if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
td->td_vp_reserv += count;
- vcheckspace(); /* XXX no longer so quick, but more racy */
return;
} else
atomic_subtract_long(&numvnodes, count);
@@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count)
atomic_add_long(&numvnodes, 1);
}
}
- vcheckspace();
mtx_unlock(&vnode_free_list_mtx);
}
-/*
- * This hack is fragile, especially if desiredvnodes or wantvnodes are
- * misconfgured or changed significantly. Reducing desiredvnodes below
- * the reserved amount should cause bizarre behaviour like reducing it
- * below the number of active vnodes -- the system will try to reduce
- * numvnodes to match, but should fail, so the subtraction below should
- * not overflow.
- */
void
getnewvnode_drop_reserve(void)
{
@@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
struct vnode *vp;
struct thread *td;
struct lock_object *lo;
- static int cyclecount;
int error;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
goto alloc;
}
mtx_lock(&vnode_free_list_mtx);
- if (numvnodes < desiredvnodes)
- cyclecount = 0;
- else if (cyclecount++ >= freevnodes) {
- cyclecount = 0;
- vstir = 1;
- }
/*
- * Grow the vnode cache if it will not be above its target max
- * after growing. Otherwise, if the free list is nonempty, try
- * to reclaim 1 item from it before growing the cache (possibly
- * above its target max if the reclamation failed or is delayed).
- * Otherwise, wait for some space. In all cases, schedule
- * vnlru_proc() if we are getting short of space. The watermarks
- * should be chosen so that we never wait or even reclaim from
- * the free list to below its target minimum.
+ * Lend our context to reclaim vnodes if they've exceeded the max.
*/
- if (numvnodes + 1 <= desiredvnodes)
- ;
- else if (freevnodes > 0)
+ if (freevnodes > wantfreevnodes)
vnlru_free(1);
- else {
- error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
- MNTK_SUSPEND));
+ error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+ MNTK_SUSPEND));
#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
- if (error != 0) {
- mtx_unlock(&vnode_free_list_mtx);
- return (error);
- }
-#endif
+ if (error != 0) {
+ mtx_unlock(&vnode_free_list_mtx);
+ return (error);
}
- vcheckspace();
+#endif
atomic_add_long(&numvnodes, 1);
mtx_unlock(&vnode_free_list_mtx);
alloc:
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index cfe71d8..63f5ab9 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -232,7 +232,9 @@ struct prison_racct {
#define PR_ALLOW_MOUNT_PROCFS 0x0400
#define PR_ALLOW_MOUNT_TMPFS 0x0800
#define PR_ALLOW_MOUNT_FDESCFS 0x1000
-#define PR_ALLOW_ALL 0x1fff
+#define PR_ALLOW_MOUNT_LINPROCFS 0x2000
+#define PR_ALLOW_MOUNT_LINSYSFS 0x4000
+#define PR_ALLOW_ALL 0x7fff
/*
* OSD methods
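
PR_ALLOW_ALL widens from 0x1fff to 0x7fff exactly because the two new mount bits occupy 0x2000 and 0x4000; a one-line consistency check:

    #include <assert.h>

    #define PR_ALLOW_MOUNT_LINPROCFS    0x2000
    #define PR_ALLOW_MOUNT_LINSYSFS     0x4000

    int
    main(void)
    {
            assert((0x1fff | PR_ALLOW_MOUNT_LINPROCFS |
                PR_ALLOW_MOUNT_LINSYSFS) == 0x7fff);
            return (0);
    }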