diff options
-rw-r--r-- | contrib/libarchive/libarchive/archive_read.c | 7 | ||||
-rw-r--r-- | share/mk/bsd.dep.mk | 8 | ||||
-rw-r--r-- | sys/compat/linprocfs/linprocfs.c | 2 | ||||
-rw-r--r-- | sys/compat/linsysfs/linsysfs.c | 2 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_net_vsc.c | 2 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_net_vsc.h | 26 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c | 663 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_rndis.h | 1 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_rndis_filter.c | 18 | ||||
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_rndis_filter.h | 1 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_channel_mgmt.c | 21 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_connection.c | 38 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_vmbus_priv.h | 11 | ||||
-rw-r--r-- | sys/fs/tmpfs/tmpfs_vnops.c | 5 | ||||
-rw-r--r-- | sys/kern/kern_jail.c | 16 | ||||
-rw-r--r-- | sys/kern/vfs_subr.c | 322 | ||||
-rw-r--r-- | sys/sys/jail.h | 4 | ||||
-rw-r--r-- | usr.sbin/jail/jail.8 | 20 | ||||
-rw-r--r-- | usr.sbin/rtsold/rtsold.c | 5 |
19 files changed, 725 insertions, 447 deletions
diff --git a/contrib/libarchive/libarchive/archive_read.c b/contrib/libarchive/libarchive/archive_read.c index 9513729..1d4c9bb 100644 --- a/contrib/libarchive/libarchive/archive_read.c +++ b/contrib/libarchive/libarchive/archive_read.c @@ -545,13 +545,13 @@ archive_read_open1(struct archive *_a) static int choose_filters(struct archive_read *a) { - int number_bidders, i, bid, best_bid; + int number_bidders, i, bid, best_bid, n; struct archive_read_filter_bidder *bidder, *best_bidder; struct archive_read_filter *filter; ssize_t avail; int r; - for (;;) { + for (n = 0; n < 25; ++n) { number_bidders = sizeof(a->bidders) / sizeof(a->bidders[0]); best_bid = 0; @@ -597,6 +597,9 @@ choose_filters(struct archive_read *a) return (ARCHIVE_FATAL); } } + archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, + "Input requires too many filters for decoding"); + return (ARCHIVE_FATAL); } /* diff --git a/share/mk/bsd.dep.mk b/share/mk/bsd.dep.mk index 5dc79a8..1555e55 100644 --- a/share/mk/bsd.dep.mk +++ b/share/mk/bsd.dep.mk @@ -164,14 +164,6 @@ depend: beforedepend ${DEPENDFILE} afterdepend # This could be simpler with bmake :tW but needs to support fmake for MFC. _CFLAGS_INCLUDES= ${CFLAGS:Q:S/\\ /,/g:C/-include,/-include%/g:C/,/ /g:M-include*:C/%/ /g} _CXXFLAGS_INCLUDES= ${CXXFLAGS:Q:S/\\ /,/g:C/-include,/-include%/g:C/,/ /g:M-include*:C/%/ /g} -# XXX: Temporary hack to workaround .depend files not tracking -include -_hdrincludes=${_CFLAGS_INCLUDES:M*.h} ${_CXXFLAGS_INCLUDES:M*.h} -.for _hdr in ${_hdrincludes:O:u} -.if exists(${_hdr}) -${OBJS} ${POBJS} ${SOBJS}: ${_hdr} -.endif -.endfor -.undef _hdrincludes # Different types of sources are compiled with slightly different flags. # Split up the sources, and filter out headers and non-applicable flags. diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 6e591e9..9142c93 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -1514,7 +1514,7 @@ linprocfs_uninit(PFS_INIT_ARGS) return (0); } -PSEUDOFS(linprocfs, 1, 0); +PSEUDOFS(linprocfs, 1, PR_ALLOW_MOUNT_LINPROCFS); #if defined(__amd64__) MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1); #else diff --git a/sys/compat/linsysfs/linsysfs.c b/sys/compat/linsysfs/linsysfs.c index 8b5f9b5..4f57526 100644 --- a/sys/compat/linsysfs/linsysfs.c +++ b/sys/compat/linsysfs/linsysfs.c @@ -274,7 +274,7 @@ linsysfs_uninit(PFS_INIT_ARGS) return (0); } -PSEUDOFS(linsysfs, 1, 0); +PSEUDOFS(linsysfs, 1, PR_ALLOW_MOUNT_LINSYSFS); #if defined(__amd64__) MODULE_DEPEND(linsysfs, linux_common, 1, 1, 1); #else diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c index a44c30d..64e7578 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.c +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c @@ -1027,4 +1027,6 @@ hv_nv_on_channel_callback(void *context) if (bufferlen > NETVSC_PACKET_SIZE) free(buffer, M_NETVSC); + + hv_rf_channel_rollup(net_dev); } diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h index 4e63b94..e684cc5 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -38,12 +38,16 @@ #ifndef __HV_NET_VSC_H__ #define __HV_NET_VSC_H__ -#include <sys/types.h> #include <sys/param.h> #include <sys/lock.h> #include <sys/malloc.h> +#include <sys/queue.h> #include <sys/sx.h> +#include <machine/bus.h> +#include <sys/bus.h> +#include <sys/bus_dma.h> + #include <netinet/in.h> #include <netinet/tcp_lro.h> @@ -984,6 +988,9 @@ typedef struct { hv_bool_uint8_t link_state; } netvsc_device_info; +struct hn_txdesc; +SLIST_HEAD(hn_txdesc_list, hn_txdesc); + /* * Device-specific softc structure */ @@ -1002,6 +1009,18 @@ typedef struct hn_softc { struct hv_device *hn_dev_obj; netvsc_dev *net_dev; + int hn_txdesc_cnt; + struct hn_txdesc *hn_txdesc; + bus_dma_tag_t hn_tx_data_dtag; + bus_dma_tag_t hn_tx_rndis_dtag; + int hn_tx_chimney_size; + int hn_tx_chimney_max; + + struct mtx hn_txlist_spin; + struct hn_txdesc_list hn_txlist; + int hn_txdesc_avail; + int hn_txeof; + struct lro_ctrl hn_lro; int hn_lro_hiwat; @@ -1013,6 +1032,11 @@ typedef struct hn_softc { u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; + u_long hn_no_txdescs; + u_long hn_send_failed; + u_long hn_txdma_failed; + u_long hn_tx_collapsed; + u_long hn_tx_chimney; } hn_softc_t; diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index f8ebd38..b3360ea 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -129,6 +129,41 @@ __FBSDID("$FreeBSD$"); #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 +/* YYY should get it from the underlying channel */ +#define HN_TX_DESC_CNT 512 + +#define HN_RNDIS_MSG_LEN \ + (sizeof(rndis_msg) + \ + RNDIS_VLAN_PPI_SIZE + \ + RNDIS_TSO_PPI_SIZE + \ + RNDIS_CSUM_PPI_SIZE) +#define HN_RNDIS_MSG_BOUNDARY PAGE_SIZE +#define HN_RNDIS_MSG_ALIGN CACHE_LINE_SIZE + +#define HN_TX_DATA_BOUNDARY PAGE_SIZE +#define HN_TX_DATA_MAXSIZE IP_MAXPACKET +#define HN_TX_DATA_SEGSIZE PAGE_SIZE +#define HN_TX_DATA_SEGCNT_MAX \ + (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS) + +struct hn_txdesc { + SLIST_ENTRY(hn_txdesc) link; + struct mbuf *m; + struct hn_softc *sc; + int refs; + uint32_t flags; /* HN_TXD_FLAG_ */ + netvsc_packet netvsc_pkt; /* XXX to be removed */ + + bus_dmamap_t data_dmap; + + bus_addr_t rndis_msg_paddr; + rndis_msg *rndis_msg; + bus_dmamap_t rndis_msg_dmap; +}; + +#define HN_TXD_FLAG_ONLIST 0x1 +#define HN_TXD_FLAG_DMAMAP 0x2 + /* * A unified flag for all outbound check sum flags is useful, * and it helps avoiding unnecessary check sum calculation in @@ -174,6 +209,16 @@ int hv_promisc_mode = 0; /* normal mode by default */ static int hn_trust_hosttcp = 0; TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp); +#if __FreeBSD_version >= 1100045 +/* Limit TSO burst size */ +static int hn_tso_maxlen = 0; +TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen); +#endif + +/* Limit chimney send size */ +static int hn_tx_chimney_size = 0; +TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size); + /* * Forward declarations */ @@ -181,14 +226,17 @@ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); -static int hn_start_locked(struct ifnet *ifp); +static void hn_start_locked(struct ifnet *ifp); static void hn_start(struct ifnet *ifp); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); #ifdef HN_LRO_HIWAT static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); #endif +static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); +static int hn_create_tx_ring(struct hn_softc *sc); +static void hn_destroy_tx_ring(struct hn_softc *sc); static __inline void hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) @@ -318,10 +366,13 @@ netvsc_attach(device_t dev) netvsc_device_info device_info; hn_softc_t *sc; int unit = device_get_unit(dev); - struct ifnet *ifp; + struct ifnet *ifp = NULL; struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; - int ret; + int error; +#if __FreeBSD_version >= 1100045 + int tso_maxlen; +#endif sc = device_get_softc(dev); if (sc == NULL) { @@ -334,6 +385,10 @@ netvsc_attach(device_t dev) sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; sc->hn_trust_hosttcp = hn_trust_hosttcp; + error = hn_create_tx_ring(sc); + if (error) + goto failed; + NV_LOCK_INIT(sc, "NetVSCLock"); sc->hn_dev_obj = device_ctx; @@ -381,12 +436,10 @@ netvsc_attach(device_t dev) else ifp->if_hwassist = CSUM_TCP | CSUM_TSO; - ret = hv_rf_on_device_add(device_ctx, &device_info); - if (ret != 0) { - if_free(ifp); + error = hv_rf_on_device_add(device_ctx, &device_info); + if (error) + goto failed; - return (ret); - } if (device_info.link_state == 0) { sc->hn_carrier = 1; } @@ -400,8 +453,30 @@ netvsc_attach(device_t dev) #endif #endif /* INET || INET6 */ +#if __FreeBSD_version >= 1100045 + tso_maxlen = hn_tso_maxlen; + if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) + tso_maxlen = IP_MAXPACKET; + + ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; + ifp->if_hw_tsomaxsegsize = PAGE_SIZE; + ifp->if_hw_tsomax = tso_maxlen - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); +#endif + ether_ifattach(ifp, device_info.mac_addr); +#if __FreeBSD_version >= 1100045 + if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax, + ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); +#endif + + sc->hn_tx_chimney_max = sc->net_dev->send_section_size; + sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; + if (hn_tx_chimney_size > 0 && + hn_tx_chimney_size < sc->hn_tx_chimney_max) + sc->hn_tx_chimney_size = hn_tx_chimney_size; + ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); @@ -429,6 +504,26 @@ netvsc_attach(device_t dev) "# of TCP segements that we trust host's csum verification"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts", CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs", + CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed", + CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed", + CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed", + CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney", + CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", + CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", + CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", + CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, + "Chimney send packet size upper boundary"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, + "I", "Chimney send packet size limit"); if (unit == 0) { struct sysctl_ctx_list *dc_ctx; @@ -446,9 +541,21 @@ netvsc_attach(device_t dev) CTLFLAG_RD, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); + SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size", + CTLFLAG_RD, &hn_tx_chimney_size, 0, + "Chimney send packet size limit"); +#if __FreeBSD_version >= 1100045 + SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen", + CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit"); +#endif } return (0); +failed: + hn_destroy_tx_ring(sc); + if (ifp != NULL) + if_free(ifp); + return (error); } /* @@ -480,6 +587,7 @@ netvsc_detach(device_t dev) #if defined(INET) || defined(INET6) tcp_lro_free(&sc->hn_lro); #endif + hn_destroy_tx_ring(sc); return (0); } @@ -493,6 +601,112 @@ netvsc_shutdown(device_t dev) return (0); } +static __inline int +hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, + struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) +{ + struct mbuf *m = *m_head; + int error; + + error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap, + m, segs, nsegs, BUS_DMA_NOWAIT); + if (error == EFBIG) { + struct mbuf *m_new; + + m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); + if (m_new == NULL) + return ENOBUFS; + else + *m_head = m = m_new; + sc->hn_tx_collapsed++; + + error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, + txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); + } + if (!error) { + bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap, + BUS_DMASYNC_PREWRITE); + txd->flags |= HN_TXD_FLAG_DMAMAP; + } + return error; +} + +static __inline void +hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd) +{ + + if (txd->flags & HN_TXD_FLAG_DMAMAP) { + bus_dmamap_sync(sc->hn_tx_data_dtag, + txd->data_dmap, BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sc->hn_tx_data_dtag, + txd->data_dmap); + txd->flags &= ~HN_TXD_FLAG_DMAMAP; + } +} + +static __inline int +hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) +{ + + KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, + ("put an onlist txd %#x", txd->flags)); + + KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); + if (atomic_fetchadd_int(&txd->refs, -1) != 1) + return 0; + + hn_txdesc_dmamap_unload(sc, txd); + if (txd->m != NULL) { + m_freem(txd->m); + txd->m = NULL; + } + + txd->flags |= HN_TXD_FLAG_ONLIST; + + mtx_lock_spin(&sc->hn_txlist_spin); + KASSERT(sc->hn_txdesc_avail >= 0 && + sc->hn_txdesc_avail < sc->hn_txdesc_cnt, + ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail)); + sc->hn_txdesc_avail++; + SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); + mtx_unlock_spin(&sc->hn_txlist_spin); + + return 1; +} + +static __inline struct hn_txdesc * +hn_txdesc_get(struct hn_softc *sc) +{ + struct hn_txdesc *txd; + + mtx_lock_spin(&sc->hn_txlist_spin); + txd = SLIST_FIRST(&sc->hn_txlist); + if (txd != NULL) { + KASSERT(sc->hn_txdesc_avail > 0, + ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail)); + sc->hn_txdesc_avail--; + SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + } + mtx_unlock_spin(&sc->hn_txlist_spin); + + if (txd != NULL) { + KASSERT(txd->m == NULL && txd->refs == 0 && + (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd")); + txd->flags &= ~HN_TXD_FLAG_ONLIST; + txd->refs = 1; + } + return txd; +} + +static __inline void +hn_txdesc_hold(struct hn_txdesc *txd) +{ + + /* 0->1 transition will never work */ + KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs)); + atomic_add_int(&txd->refs, 1); +} + /* * Send completion processing * @@ -503,34 +717,46 @@ netvsc_shutdown(device_t dev) void netvsc_xmit_completion(void *context) { - netvsc_packet *packet = (netvsc_packet *)context; - struct mbuf *mb; - uint8_t *buf; + netvsc_packet *packet = context; + struct hn_txdesc *txd; + struct hn_softc *sc; - mb = (struct mbuf *)(uintptr_t)packet->compl.send.send_completion_tid; - buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF; + txd = (struct hn_txdesc *)(uintptr_t) + packet->compl.send.send_completion_tid; - free(buf, M_NETVSC); + sc = txd->sc; + sc->hn_txeof = 1; + hn_txdesc_put(sc, txd); +} - if (mb != NULL) { - m_freem(mb); - } +void +netvsc_channel_rollup(struct hv_device *device_ctx) +{ + struct hn_softc *sc = device_get_softc(device_ctx->device); + struct ifnet *ifp; + + if (!sc->hn_txeof) + return; + + sc->hn_txeof = 0; + ifp = sc->hn_ifp; + NV_LOCK(sc); + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + hn_start_locked(ifp); + NV_UNLOCK(sc); } /* * Start a transmit of one or more packets */ -static int +static void hn_start_locked(struct ifnet *ifp) { hn_softc_t *sc = ifp->if_softc; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); netvsc_dev *net_dev = sc->net_dev; - device_t dev = device_ctx->device; - uint8_t *buf; netvsc_packet *packet; struct mbuf *m_head, *m; - struct mbuf *mc_head = NULL; struct ether_vlan_header *eh; rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; @@ -539,84 +765,40 @@ hn_start_locked(struct ifnet *ifp) rndis_tcp_ip_csum_info *csum_info; rndis_tcp_tso_info *tso_info; int ether_len; - int i; - int num_frags; - int len; - int retries = 0; - int ret = 0; uint32_t rndis_msg_size = 0; uint32_t trans_proto_type; uint32_t send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) { - IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head); - if (m_head == NULL) { - break; - } - - len = 0; - num_frags = 0; - - /* Walk the mbuf list computing total length and num frags */ - for (m = m_head; m != NULL; m = m->m_next) { - if (m->m_len != 0) { - num_frags++; - len += m->m_len; - } - } + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return; - /* - * Reserve the number of pages requested. Currently, - * one page is reserved for the message in the RNDIS - * filter packet - */ - num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; + int error, nsegs, i, send_failed = 0; + struct hn_txdesc *txd; - /* If exceeds # page_buffers in netvsc_packet */ - if (num_frags > NETVSC_PACKET_MAXPAGE) { - device_printf(dev, "exceed max page buffers,%d,%d\n", - num_frags, NETVSC_PACKET_MAXPAGE); - m_freem(m_head); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (EINVAL); - } + IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; - /* - * Allocate a buffer with space for a netvsc packet plus a - * number of reserved areas. First comes a (currently 16 - * bytes, currently unused) reserved data area. Second is - * the netvsc_packet. Third is an area reserved for an - * rndis_filter_packet struct. Fourth (optional) is a - * rndis_per_packet_info struct. - * Changed malloc to M_NOWAIT to avoid sleep under spin lock. - * No longer reserving extra space for page buffers, as they - * are already part of the netvsc_packet. - */ - buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF + - sizeof(netvsc_packet) + - sizeof(rndis_msg) + - RNDIS_VLAN_PPI_SIZE + - RNDIS_TSO_PPI_SIZE + - RNDIS_CSUM_PPI_SIZE, - M_NETVSC, M_ZERO | M_NOWAIT); - if (buf == NULL) { - device_printf(dev, "hn:malloc packet failed\n"); - m_freem(m_head); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOMEM); + txd = hn_txdesc_get(sc); + if (txd == NULL) { + sc->hn_no_txdescs++; + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; } - packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF); - *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF; + packet = &txd->netvsc_pkt; + /* XXX not necessary */ + memset(packet, 0, sizeof(*packet)); packet->is_data_pkt = TRUE; - /* Set up the rndis header */ - packet->page_buf_count = num_frags; - /* Initialize it from the mbuf */ - packet->tot_data_buf_len = len; + packet->tot_data_buf_len = m_head->m_pkthdr.len; /* * extension points to the area reserved for the @@ -624,8 +806,9 @@ hn_start_locked(struct ifnet *ifp) * the netvsc_packet (and rppi struct, if present; * length is updated later). */ - packet->rndis_mesg = packet + 1; - rndis_mesg = (rndis_msg *)packet->rndis_mesg; + rndis_mesg = txd->rndis_msg; + /* XXX not necessary */ + memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; rndis_pkt = &rndis_mesg->msg.packet; @@ -644,8 +827,6 @@ hn_start_locked(struct ifnet *ifp) * set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag * into the frame. */ - packet->vlan_tci = m_head->m_pkthdr.ether_vtag; - rndis_msg_size += RNDIS_VLAN_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, @@ -656,7 +837,7 @@ hn_start_locked(struct ifnet *ifp) rppi->per_packet_info_offset); /* FreeBSD does not support CFI or priority */ rppi_vlan_info->u1.s1.vlan_id = - packet->vlan_tci & 0xfff; + m_head->m_pkthdr.ether_vtag & 0xfff; } /* Only check the flags for outbound and ignore the ones for inbound */ @@ -758,7 +939,7 @@ pre_send: packet->tot_data_buf_len = rndis_mesg->msg_len; /* send packet with send buffer */ - if (packet->tot_data_buf_len < net_dev->send_section_size) { + if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) { send_buf_section_idx = hv_nv_get_next_send_section(net_dev); if (send_buf_section_idx != @@ -783,33 +964,49 @@ pre_send: packet->send_buf_section_size = packet->tot_data_buf_len; packet->page_buf_count = 0; + sc->hn_tx_chimney++; goto do_send; } } + error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs); + if (error) { + int freed; + + /* + * This mbuf is not linked w/ the txd yet, so free + * it now. + */ + m_freem(m_head); + freed = hn_txdesc_put(sc, txd); + KASSERT(freed != 0, + ("fail to free txd upon txdma error")); + + sc->hn_txdma_failed++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + continue; + } + + packet->page_buf_count = nsegs + + HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + /* send packet with page buffer */ - packet->page_buffers[0].pfn = - atop(hv_get_phys_addr(rndis_mesg)); + packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr); packet->page_buffers[0].offset = - (unsigned long)rndis_mesg & PAGE_MASK; + txd->rndis_msg_paddr & PAGE_MASK; packet->page_buffers[0].length = rndis_msg_size; /* * Fill the page buffers with mbuf info starting at index * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. */ - i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS; - for (m = m_head; m != NULL; m = m->m_next) { - if (m->m_len) { - vm_offset_t paddr = - vtophys(mtod(m, vm_offset_t)); - packet->page_buffers[i].pfn = - paddr >> PAGE_SHIFT; - packet->page_buffers[i].offset = - paddr & (PAGE_SIZE - 1); - packet->page_buffers[i].length = m->m_len; - i++; - } + for (i = 0; i < nsegs; ++i) { + hv_vmbus_page_buffer *pb = &packet->page_buffers[ + i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; + + pb->pfn = atop(segs[i].ds_addr); + pb->offset = segs[i].ds_addr & PAGE_MASK; + pb->length = segs[i].ds_len; } packet->send_buf_section_idx = @@ -817,63 +1014,65 @@ pre_send: packet->send_buf_section_size = 0; do_send: + txd->m = m_head; - /* - * If bpf, copy the mbuf chain. This is less expensive than - * it appears; the mbuf clusters are not copied, only their - * reference counts are incremented. - * Needed to avoid a race condition where the completion - * callback is invoked, freeing the mbuf chain, before the - * bpf_mtap code has a chance to run. - */ - if (ifp->if_bpf) { - mc_head = m_copypacket(m_head, M_DONTWAIT); - } -retry_send: /* Set the completion routine */ packet->compl.send.on_send_completion = netvsc_xmit_completion; packet->compl.send.send_completion_context = packet; - packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)m_head; - - /* Removed critical_enter(), does not appear necessary */ - ret = hv_nv_on_send(device_ctx, packet); - if (ret == 0) { - ifp->if_opackets++; - /* if bpf && mc_head, call bpf_mtap code */ - if (mc_head) { - ETHER_BPF_MTAP(ifp, mc_head); - } - } else { - retries++; - if (retries < 4) { - goto retry_send; - } + packet->compl.send.send_completion_tid = + (uint64_t)(uintptr_t)txd; - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; +again: + /* + * Make sure that txd is not freed before ETHER_BPF_MTAP. + */ + hn_txdesc_hold(txd); + error = hv_nv_on_send(device_ctx, packet); + if (!error) { + ETHER_BPF_MTAP(ifp, m_head); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + } + hn_txdesc_put(sc, txd); + + if (__predict_false(error)) { + int freed; /* - * Null the mbuf pointer so the completion function - * does not free the mbuf chain. We just pushed the - * mbuf chain back on the if_snd queue. + * This should "really rarely" happen. + * + * XXX Too many RX to be acked or too many sideband + * commands to run? Ask netvsc_channel_rollup() + * to kick start later. */ - packet->compl.send.send_completion_tid = 0; + sc->hn_txeof = 1; + if (!send_failed) { + sc->hn_send_failed++; + send_failed = 1; + /* + * Try sending again after set hn_txeof; + * in case that we missed the last + * netvsc_channel_rollup(). + */ + goto again; + } + if_printf(ifp, "send failed\n"); /* - * Release the resources since we will not get any - * send completion + * This mbuf will be prepended, don't free it + * in hn_txdesc_put(); only unload it from the + * DMA map in hn_txdesc_put(), if it was loaded. */ - netvsc_xmit_completion(packet); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - } + txd->m = NULL; + freed = hn_txdesc_put(sc, txd); + KASSERT(freed != 0, + ("fail to free txd upon send error")); - /* if bpf && mc_head, free the mbuf chain copy */ - if (mc_head) { - m_freem(mc_head); + sc->hn_send_failed++; + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; } } - - return (ret); } /* @@ -1222,6 +1421,9 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } + sc->hn_tx_chimney_max = sc->net_dev->send_section_size; + if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max) + sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; hn_ifinit_locked(sc); NV_LOCK(sc); @@ -1479,6 +1681,25 @@ hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) #endif /* HN_LRO_HIWAT */ static int +hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int chimney_size, error; + + chimney_size = sc->hn_tx_chimney_size; + error = sysctl_handle_int(oidp, &chimney_size, 0, req); + if (error || req->newptr == NULL) + return error; + + if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0) + return EINVAL; + + if (sc->hn_tx_chimney_size != chimney_size) + sc->hn_tx_chimney_size = chimney_size; + return 0; +} + +static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; @@ -1553,6 +1774,150 @@ hn_check_iplen(const struct mbuf *m, int hoff) return ip->ip_p; } +static void +hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) +{ + bus_addr_t *paddr = arg; + + if (error) + return; + + KASSERT(nseg == 1, ("too many segments %d!", nseg)); + *paddr = segs->ds_addr; +} + +static int +hn_create_tx_ring(struct hn_softc *sc) +{ + bus_dma_tag_t parent_dtag; + int error, i; + + sc->hn_txdesc_cnt = HN_TX_DESC_CNT; + sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt, + M_NETVSC, M_WAITOK | M_ZERO); + SLIST_INIT(&sc->hn_txlist); + mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); + + parent_dtag = bus_get_dma_tag(sc->hn_dev); + + /* DMA tag for RNDIS messages. */ + error = bus_dma_tag_create(parent_dtag, /* parent */ + HN_RNDIS_MSG_ALIGN, /* alignment */ + HN_RNDIS_MSG_BOUNDARY, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + HN_RNDIS_MSG_LEN, /* maxsize */ + 1, /* nsegments */ + HN_RNDIS_MSG_LEN, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &sc->hn_tx_rndis_dtag); + if (error) { + device_printf(sc->hn_dev, "failed to create rndis dmatag\n"); + return error; + } + + /* DMA tag for data. */ + error = bus_dma_tag_create(parent_dtag, /* parent */ + 1, /* alignment */ + HN_TX_DATA_BOUNDARY, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + HN_TX_DATA_MAXSIZE, /* maxsize */ + HN_TX_DATA_SEGCNT_MAX, /* nsegments */ + HN_TX_DATA_SEGSIZE, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &sc->hn_tx_data_dtag); + if (error) { + device_printf(sc->hn_dev, "failed to create data dmatag\n"); + return error; + } + + for (i = 0; i < sc->hn_txdesc_cnt; ++i) { + struct hn_txdesc *txd = &sc->hn_txdesc[i]; + + txd->sc = sc; + + /* + * Allocate and load RNDIS messages. + */ + error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag, + (void **)&txd->rndis_msg, + BUS_DMA_WAITOK | BUS_DMA_COHERENT, + &txd->rndis_msg_dmap); + if (error) { + device_printf(sc->hn_dev, + "failed to allocate rndis_msg, %d\n", i); + return error; + } + + error = bus_dmamap_load(sc->hn_tx_rndis_dtag, + txd->rndis_msg_dmap, + txd->rndis_msg, HN_RNDIS_MSG_LEN, + hn_dma_map_paddr, &txd->rndis_msg_paddr, + BUS_DMA_NOWAIT); + if (error) { + device_printf(sc->hn_dev, + "failed to load rndis_msg, %d\n", i); + bus_dmamem_free(sc->hn_tx_rndis_dtag, + txd->rndis_msg, txd->rndis_msg_dmap); + return error; + } + + /* DMA map for TX data. */ + error = bus_dmamap_create(sc->hn_tx_data_dtag, 0, + &txd->data_dmap); + if (error) { + device_printf(sc->hn_dev, + "failed to allocate tx data dmamap\n"); + bus_dmamap_unload(sc->hn_tx_rndis_dtag, + txd->rndis_msg_dmap); + bus_dmamem_free(sc->hn_tx_rndis_dtag, + txd->rndis_msg, txd->rndis_msg_dmap); + return error; + } + + /* All set, put it to list */ + txd->flags |= HN_TXD_FLAG_ONLIST; + SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); + } + sc->hn_txdesc_avail = sc->hn_txdesc_cnt; + + return 0; +} + +static void +hn_destroy_tx_ring(struct hn_softc *sc) +{ + struct hn_txdesc *txd; + + while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) { + KASSERT(txd->m == NULL, ("still has mbuf installed")); + KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, + ("still dma mapped")); + SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + + bus_dmamap_unload(sc->hn_tx_rndis_dtag, + txd->rndis_msg_dmap); + bus_dmamem_free(sc->hn_tx_rndis_dtag, + txd->rndis_msg, txd->rndis_msg_dmap); + + bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap); + } + + if (sc->hn_tx_data_dtag != NULL) + bus_dma_tag_destroy(sc->hn_tx_data_dtag); + if (sc->hn_tx_rndis_dtag != NULL) + bus_dma_tag_destroy(sc->hn_tx_rndis_dtag); + free(sc->hn_txdesc, M_NETVSC); + mtx_destroy(&sc->hn_txlist_spin); +} + static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), diff --git a/sys/dev/hyperv/netvsc/hv_rndis.h b/sys/dev/hyperv/netvsc/hv_rndis.h index fd032de..cd46ecc 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis.h +++ b/sys/dev/hyperv/netvsc/hv_rndis.h @@ -1050,6 +1050,7 @@ int netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, rndis_tcp_ip_csum_info *csum_info); void netvsc_recv_rollup(struct hv_device *device_ctx); +void netvsc_channel_rollup(struct hv_device *device_ctx); void* hv_set_rppi_data(rndis_msg *rndis_mesg, uint32_t rppi_size, diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 3e95024..dfd0b47 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -974,3 +974,21 @@ hv_rf_receive_rollup(netvsc_dev *net_dev) rndis_dev = (rndis_device *)net_dev->extension; netvsc_recv_rollup(rndis_dev->net_dev->dev); } + +void +hv_rf_channel_rollup(netvsc_dev *net_dev) +{ + rndis_device *rndis_dev; + + rndis_dev = (rndis_device *)net_dev->extension; + + /* + * This could be called pretty early, so we need + * to make sure everything has been setup. + */ + if (rndis_dev == NULL || + rndis_dev->net_dev == NULL || + rndis_dev->net_dev->dev == NULL) + return; + netvsc_channel_rollup(rndis_dev->net_dev->dev); +} diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/dev/hyperv/netvsc/hv_rndis_filter.h index 2f3ebd8..9d7a38d 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.h +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.h @@ -99,6 +99,7 @@ typedef struct rndis_device_ { int hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, netvsc_packet *pkt); void hv_rf_receive_rollup(netvsc_dev *net_dev); +void hv_rf_channel_rollup(netvsc_dev *net_dev); int hv_rf_on_device_add(struct hv_device *device, void *additl_info); int hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel); int hv_rf_on_open(struct hv_device *device); diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index c7f3538..4ccb647 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -274,14 +274,16 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) boolean_t f_new; hv_vmbus_channel* channel; int ret; + uint32_t relid; f_new = TRUE; channel = NULL; - + relid = new_channel->offer_msg.child_rel_id; /* * Make sure this is a new offer */ mtx_lock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_g_connection.channels[relid] = new_channel; TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) @@ -325,16 +327,18 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) mtx_unlock(&channel->sc_lock); /* Insert new channel into channel_anchor. */ - printf("Storvsc get multi-channel offer, rel=%u.\n", - new_channel->offer_msg.child_rel_id); + printf("VMBUS get multi-channel offer, rel=%u,sub=%u\n", + new_channel->offer_msg.child_rel_id, + new_channel->offer_msg.offer.sub_channel_index); mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, new_channel, list_entry); mtx_unlock(&hv_vmbus_g_connection.channel_lock); if(bootverbose) - printf("VMBUS: new multi-channel offer <%p>.\n", - new_channel); + printf("VMBUS: new multi-channel offer <%p>, " + "its primary channel is <%p>.\n", + new_channel, new_channel->primary_channel); /*XXX add it to percpu_list */ @@ -524,11 +528,14 @@ vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr) rescind = (hv_vmbus_channel_rescind_offer*) hdr; - channel = hv_vmbus_get_channel_from_rel_id(rescind->child_rel_id); + channel = hv_vmbus_g_connection.channels[rescind->child_rel_id]; if (channel == NULL) return; hv_vmbus_child_device_unregister(channel->device); + mtx_lock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_g_connection.channels[rescind->child_rel_id] = NULL; + mtx_unlock(&hv_vmbus_g_connection.channel_lock); } /** @@ -782,6 +789,8 @@ hv_vmbus_release_unattached_channels(void) hv_vmbus_child_device_unregister(channel->device); hv_vmbus_free_vmbus_channel(channel); } + bzero(hv_vmbus_g_connection.channels, + sizeof(hv_vmbus_channel*) * HV_CHANNEL_MAX_COUNT); mtx_unlock(&hv_vmbus_g_connection.channel_lock); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index 7496288..a9e3561 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -232,6 +232,9 @@ hv_vmbus_connect(void) { goto cleanup; } + hv_vmbus_g_connection.channels = malloc(sizeof(hv_vmbus_channel*) * + HV_CHANNEL_MAX_COUNT, + M_DEVBUF, M_WAITOK | M_ZERO); /* * Find the highest vmbus version number we can support. */ @@ -295,6 +298,7 @@ hv_vmbus_connect(void) { free(msg_info, M_DEVBUF); } + free(hv_vmbus_g_connection.channels, M_DEVBUF); return (ret); } @@ -325,6 +329,7 @@ hv_vmbus_disconnect(void) { hv_work_queue_close(hv_vmbus_g_connection.work_queue); sema_destroy(&hv_vmbus_g_connection.control_sema); + free(hv_vmbus_g_connection.channels, M_DEVBUF); hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; free(msg, M_DEVBUF); @@ -333,35 +338,6 @@ hv_vmbus_disconnect(void) { } /** - * Get the channel object given its child relative id (ie channel id) - */ -hv_vmbus_channel* -hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { - - hv_vmbus_channel* channel; - hv_vmbus_channel* foundChannel = NULL; - - /* - * TODO: - * Consider optimization where relids are stored in a fixed size array - * and channels are accessed without the need to take this lock or search - * the list. - */ - mtx_lock(&hv_vmbus_g_connection.channel_lock); - TAILQ_FOREACH(channel, - &hv_vmbus_g_connection.channel_anchor, list_entry) { - - if (channel->offer_msg.child_rel_id == rel_id) { - foundChannel = channel; - break; - } - } - mtx_unlock(&hv_vmbus_g_connection.channel_lock); - - return (foundChannel); -} - -/** * Process a channel event notification */ static void @@ -377,7 +353,7 @@ VmbusProcessChannelEvent(uint32_t relid) * the channel callback to process the event */ - channel = hv_vmbus_get_channel_from_rel_id(relid); + channel = hv_vmbus_g_connection.channels[relid]; if (channel == NULL) { return; @@ -473,7 +449,7 @@ hv_vmbus_on_events(void *arg) if (recv_interrupt_page != NULL) { for (dword = 0; dword < maxdword; dword++) { if (recv_interrupt_page[dword]) { - for (bit = 0; bit < 32; bit++) { + for (bit = 0; bit < HV_CHANNEL_DWORD_LEN; bit++) { if (synch_test_and_clear_bit(bit, (uint32_t *) &recv_interrupt_page[dword])) { rel_id = (dword << 5) + bit; diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 74fe824..13a35c4 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -58,6 +58,12 @@ typedef uint16_t hv_vmbus_status; #define HV_EVENT_FLAGS_BYTE_COUNT (256) #define HV_EVENT_FLAGS_DWORD_COUNT (256 / sizeof(uint32_t)) +/** + * max channel count <== event_flags_dword_count * bit_of_dword + */ +#define HV_CHANNEL_DWORD_LEN (32) +#define HV_CHANNEL_MAX_COUNT \ + ((HV_EVENT_FLAGS_DWORD_COUNT) * HV_CHANNEL_DWORD_LEN) /* * MessageId: HV_STATUS_INSUFFICIENT_BUFFERS * MessageText: @@ -355,6 +361,10 @@ typedef struct { TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; struct mtx channel_lock; + /** + * channel table for fast lookup through id. + */ + hv_vmbus_channel **channels; hv_vmbus_handle work_queue; struct sema control_sema; } hv_vmbus_connection; @@ -699,7 +709,6 @@ int hv_vmbus_child_device_register( struct hv_device *child_dev); int hv_vmbus_child_device_unregister( struct hv_device *child_dev); -hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id); /** * Connection interfaces diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 885f84c..f01c8be 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -1187,8 +1187,11 @@ tmpfs_readdir(struct vop_readdir_args *v) if (error == EJUSTRETURN) error = (uio->uio_resid != startresid) ? 0 : EINVAL; - if (error != 0 && cookies != NULL) + if (error != 0 && cookies != NULL && ncookies != NULL) { free(*cookies, M_TEMP); + *cookies = NULL; + *ncookies = 0; + } if (eofflag != NULL) *eofflag = diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 42c53c0..0d52c7b 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -208,6 +208,8 @@ static char *pr_allow_names[] = { "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", + "allow.mount.linprocfs", + "allow.mount.linsysfs", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); @@ -225,6 +227,8 @@ static char *pr_allow_nonames[] = { "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", + "allow.mount.nolinprocfs", + "allow.mount.nolinsysfs", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); @@ -4315,6 +4319,14 @@ SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system"); +SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", + "Processes in jail can mount the linprocfs file system"); +SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", + "Processes in jail can mount the linsysfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", @@ -4481,6 +4493,10 @@ SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); +SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may mount the linprocfs file system"); +SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a721c5a..aa81313 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -145,51 +145,24 @@ int vttoif_tab[10] = { static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* - * "Free" vnode target. Free vnodes are rarely completely free, but are - * just ones that are cheap to recycle. Usually they are for files which - * have been stat'd but not read; these usually have inode and namecache - * data attached to them. This target is the preferred minimum size of a - * sub-cache consisting mostly of such files. The system balances the size - * of this sub-cache with its complement to try to prevent either from - * thrashing while the other is relatively inactive. The targets express - * a preference for the best balance. - * - * "Above" this target there are 2 further targets (watermarks) related - * to recyling of free vnodes. In the best-operating case, the cache is - * exactly full, the free list has size between vlowat and vhiwat above the - * free target, and recycling from it and normal use maintains this state. - * Sometimes the free list is below vlowat or even empty, but this state - * is even better for immediate use provided the cache is not full. - * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free - * ones) to reach one of these states. The watermarks are currently hard- - * coded as 4% and 9% of the available space higher. These and the default - * of 25% for wantfreevnodes are too large if the memory size is large. - * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim - * whenever vnlru_proc() becomes active. + * Free vnode target. Free vnodes may simply be files which have been stat'd + * but not read. This is somewhat common, and a small cache of such files + * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, - &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); +SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +/* Number of vnodes in the free list. */ static u_long freevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, - &freevnodes, 0, "Number of \"free\" vnodes"); +SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, + "Number of vnodes in the free list"); -/* - * The vfs.vlru_allow_cache_src sysctl variable is no longer used but - * the sysctl remains to provide ABI compatibility. The new code frees - * namecache sources as the last chance to satisfy the highest watermark, - * instead of selecting the source vnodes randomly. This provides good - * enough behaviour to keep vn_fullpath() working in most situations. - * The filesystem layout with deep trees, where the depricated knob was - * required, is thus handled automatically. - */ static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, - &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)"); + &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, - "Number of vnodes recycled to meet vnode cache targets"); + "Number of vnodes recycled to avoid exceding kern.maxvnodes"); /* * Various variables used for debugging the new implementation of @@ -299,13 +272,14 @@ static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; -/* Target for maximum number of vnodes. */ +/* + * Number of vnodes we want to exist at any one time. This is mostly used + * to size hash tables in vnode-related code. It is normally not used in + * getnewvnode(), as wantfreevnodes is normally nonzero.) + * + * XXX desiredvnodes is historical cruft and should not exist. + */ int desiredvnodes; -static int gapvnodes; /* gap between wanted and desired */ -static int vhiwat; /* enough extras after expansion */ -static int vlowat; /* minimal extras before expansion */ -static int vstir; /* nonzero to stir non-free vnodes */ -static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static int sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) @@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) return (error); if (old_desiredvnodes != desiredvnodes) { - wantfreevnodes = desiredvnodes / 4; - /* XXX locking seems to be incomplete. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); } @@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, - sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); + sysctl_update_desiredvnodes, "I", "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, - &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); + &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); @@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size - * grows, the ratio of the memory size in KB to to vnodes approaches 64:1. + * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX -#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ +#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif /* @@ -433,16 +405,15 @@ vntblinit(void *dummy __unused) /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the - * physical memory size. The ratio of desiredvnodes to the physical - * memory size is 1:16 until desiredvnodes exceeds 98,304. - * Thereafter, the - * marginal ratio of desiredvnodes to the physical memory size is - * 1:64. However, desiredvnodes is limited by the kernel's heap + * physical memory size. The ratio of desiredvnodes to physical pages + * is one to four until desiredvnodes exceeds 98,304. Thereafter, the + * marginal ratio of desiredvnodes to physical pages is one to + * sixteen. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects - * must not exceed 1/7th of the kernel's heap size. + * may not exceed one seventh of the kernel's heap size. */ - physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 + - 3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64; + physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, + cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); @@ -831,41 +802,35 @@ vattr_null(struct vattr *vap) * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int -vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) +vlrureclaim(struct mount *mp) { struct vnode *vp; - int count, done, target; + int done; + int trigger; + int usevnodes; + int count; + /* + * Calculate the trigger point, don't allow user + * screwups to blow us up. This prevents us from + * recycling vnodes with lots of resident pages. We + * aren't trying to free memory, we are trying to + * free vnodes. + */ + usevnodes = desiredvnodes; + if (usevnodes <= 0) + usevnodes = 1; + trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); - count = mp->mnt_nvnodelistsize; - target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); - target = target / 10 + 1; - while (count != 0 && done < target) { + count = mp->mnt_nvnodelistsize / 10 + 1; + while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; - /* - * XXX LRU is completely broken for non-free vnodes. First - * by calling here in mountpoint order, then by moving - * unselected vnodes to the end here, and most grossly by - * removing the vlruvp() function that was supposed to - * maintain the order. (This function was born broken - * since syncer problems prevented it doing anything.) The - * order is closer to LRC (C = Created). - * - * LRU reclaiming of vnodes seems to have last worked in - * FreeBSD-3 where LRU wasn't mentioned under any spelling. - * Then there was no hold count, and inactive vnodes were - * simply put on the free list in LRU order. The separate - * lists also break LRU. We prefer to reclaim from the - * free list for technical reasons. This tends to thrash - * the free list to keep very unrecently used held vnodes. - * The problem is mitigated by keeping the free list large. - */ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; @@ -874,12 +839,10 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. - * Also skip free vnodes. We are trying to make space - * to expand the free list, not reduce it. */ if (vp->v_usecount || - (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || - ((vp->v_iflag & VI_FREE) != 0) || + (!vlru_allow_cache_src && + !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); @@ -905,8 +868,8 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || - (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || - (vp->v_iflag & VI_FREE) != 0 || + (!vlru_allow_cache_src && + !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); @@ -939,7 +902,7 @@ relock_mnt: } /* - * Attempt to reduce the free list by the requested amount. + * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) @@ -996,24 +959,6 @@ vnlru_free(int count) mtx_lock(&vnode_free_list_mtx); } } - -/* XXX some names and initialization are bad for limits and watermarks. */ -static int -vspace(void) -{ - int space; - - gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); - vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ - vlowat = vhiwat / 2; - if (numvnodes > desiredvnodes) - return (0); - space = desiredvnodes - numvnodes; - if (freevnodes > wantfreevnodes) - space += freevnodes - wantfreevnodes; - return (space); -} - /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some @@ -1026,36 +971,18 @@ static void vnlru_proc(void) { struct mount *mp, *nmp; - unsigned long ofreevnodes, onumvnodes; - int done, force, reclaim_nc_src, trigger, usevnodes; + int done; + struct proc *p = vnlruproc; - EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); - force = 0; for (;;) { - kproc_suspend_check(vnlruproc); + kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); - /* - * If numvnodes is too large (due to desiredvnodes being - * adjusted using its sysctl, or emergency growth), first - * try to reduce it by discarding from the free list. - */ - if (numvnodes > desiredvnodes && freevnodes > 0) - vnlru_free(ulmin(numvnodes - desiredvnodes, - freevnodes)); - /* - * Sleep if the vnode cache is in a good state. This is - * when it is not over-full and has space for about a 4% - * or 9% expansion (by growing its size or inexcessively - * reducing its free list). Otherwise, try to reclaim - * space for a 10% expansion. - */ - if (vstir && force == 0) { - force = 1; - vstir = 0; - } - if (vspace() >= vlowat && force == 0) { + if (freevnodes > wantfreevnodes) + vnlru_free(freevnodes - wantfreevnodes); + if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, @@ -1064,66 +991,30 @@ vnlru_proc(void) } mtx_unlock(&vnode_free_list_mtx); done = 0; - ofreevnodes = freevnodes; - onumvnodes = numvnodes; - /* - * Calculate parameters for recycling. These are the same - * throughout the loop to give some semblance of fairness. - * The trigger point is to avoid recycling vnodes with lots - * of resident pages. We aren't trying to free memory; we - * are trying to recycle or at least free vnodes. - */ - if (numvnodes <= desiredvnodes) - usevnodes = numvnodes - freevnodes; - else - usevnodes = numvnodes; - if (usevnodes <= 0) - usevnodes = 1; - /* - * The trigger value is is chosen to give a conservatively - * large value to ensure that it alone doesn't prevent - * making progress. The value can easily be so large that - * it is effectively infinite in some congested and - * misconfigured cases, and this is necessary. Normally - * it is about 8 to 100 (pages), which is quite large. - */ - trigger = cnt.v_page_count * 2 / usevnodes; - if (force < 2) - trigger = vsmalltrigger; - reclaim_nc_src = force >= 3; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } - done += vlrureclaim(mp, reclaim_nc_src, trigger); + done += vlrureclaim(mp); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); - if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) - uma_reclaim(); if (done == 0) { - if (force == 0 || force == 1) { - force = 2; - continue; - } - if (force == 2) { - force = 3; - continue; - } - force = 0; +#if 0 + /* These messages are temporary debugging aids */ + if (vnlru_nowhere < 5) + printf("vnlru process getting nowhere..\n"); + else if (vnlru_nowhere == 5) + printf("vnlru process messages stopped.\n"); +#endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); - /* - * After becoming active to expand above low water, keep - * active until above high water. - */ - force = vspace() < vhiwat; } } @@ -1197,31 +1088,22 @@ vtryrecycle(struct vnode *vp) return (0); } -static void -vcheckspace(void) -{ - - if (vspace() < vlowat && vnlruproc_sig == 0) { - vnlruproc_sig = 1; - wakeup(vnlruproc); - } -} - /* - * Wait if necessary for space for a new vnode. + * Wait for available vnodes. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); - if (numvnodes >= desiredvnodes) { + if (numvnodes > desiredvnodes) { if (suspended) { /* - * The file system is being suspended. We cannot - * risk a deadlock here, so allow allocation of - * another vnode even if this would give too many. + * File system is beeing suspended, we cannot risk a + * deadlock here, so allocate new vnode anyway. */ + if (freevnodes > wantfreevnodes) + vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { @@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended) msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } - /* Post-adjust like the pre-adjust in getnewvnode(). */ - if (numvnodes + 1 > desiredvnodes && freevnodes > 1) - vnlru_free(1); - return (numvnodes >= desiredvnodes ? ENFILE : 0); + return (numvnodes > desiredvnodes ? ENFILE : 0); } -/* - * This hack is fragile, and probably not needed any more now that the - * watermark handling works. - */ void getnewvnode_reserve(u_int count) { struct thread *td; - /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ - /* XXX no longer so quick, but this part is not racy. */ - mtx_lock(&vnode_free_list_mtx); - if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) - vnlru_free(ulmin(numvnodes + count - desiredvnodes, - freevnodes - wantfreevnodes)); - mtx_unlock(&vnode_free_list_mtx); - td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; - vcheckspace(); /* XXX no longer so quick, but more racy */ return; } else atomic_subtract_long(&numvnodes, count); @@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count) atomic_add_long(&numvnodes, 1); } } - vcheckspace(); mtx_unlock(&vnode_free_list_mtx); } -/* - * This hack is fragile, especially if desiredvnodes or wantvnodes are - * misconfgured or changed significantly. Reducing desiredvnodes below - * the reserved amount should cause bizarre behaviour like reducing it - * below the number of active vnodes -- the system will try to reduce - * numvnodes to match, but should fail, so the subtraction below should - * not overflow. - */ void getnewvnode_drop_reserve(void) { @@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode *vp; struct thread *td; struct lock_object *lo; - static int cyclecount; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); @@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, goto alloc; } mtx_lock(&vnode_free_list_mtx); - if (numvnodes < desiredvnodes) - cyclecount = 0; - else if (cyclecount++ >= freevnodes) { - cyclecount = 0; - vstir = 1; - } /* - * Grow the vnode cache if it will not be above its target max - * after growing. Otherwise, if the free list is nonempty, try - * to reclaim 1 item from it before growing the cache (possibly - * above its target max if the reclamation failed or is delayed). - * Otherwise, wait for some space. In all cases, schedule - * vnlru_proc() if we are getting short of space. The watermarks - * should be chosen so that we never wait or even reclaim from - * the free list to below its target minimum. + * Lend our context to reclaim vnodes if they've exceeded the max. */ - if (numvnodes + 1 <= desiredvnodes) - ; - else if (freevnodes > 0) + if (freevnodes > wantfreevnodes) vnlru_free(1); - else { - error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & - MNTK_SUSPEND)); + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ - if (error != 0) { - mtx_unlock(&vnode_free_list_mtx); - return (error); - } -#endif + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); } - vcheckspace(); +#endif atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: diff --git a/sys/sys/jail.h b/sys/sys/jail.h index cfe71d8..63f5ab9 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -232,7 +232,9 @@ struct prison_racct { #define PR_ALLOW_MOUNT_PROCFS 0x0400 #define PR_ALLOW_MOUNT_TMPFS 0x0800 #define PR_ALLOW_MOUNT_FDESCFS 0x1000 -#define PR_ALLOW_ALL 0x1fff +#define PR_ALLOW_MOUNT_LINPROCFS 0x2000 +#define PR_ALLOW_MOUNT_LINSYSFS 0x4000 +#define PR_ALLOW_ALL 0x7fff /* * OSD methods diff --git a/usr.sbin/jail/jail.8 b/usr.sbin/jail/jail.8 index 773c703..0831fcd 100644 --- a/usr.sbin/jail/jail.8 +++ b/usr.sbin/jail/jail.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd February 25, 2015 +.Dd July 6, 2015 .Dt JAIL 8 .Os .Sh NAME @@ -563,6 +563,22 @@ This permission is effective only together with and only when .Va enforce_statfs is set to a value lower than 2. +.It Va allow.mount.linprocfs +privileged users inside the jail will be able to mount and unmount the +linprocfs file system. +This permission is effective only together with +.Va allow.mount +and only when +.Va enforce_statfs +is set to a value lower than 2. +.It Va allow.mount.linsysfs +privileged users inside the jail will be able to mount and unmount the +linsysfs file system. +This permission is effective only together with +.Va allow.mount +and only when +.Va enforce_statfs +is set to a value lower than 2. .It Va allow.mount.tmpfs privileged users inside the jail will be able to mount and unmount the tmpfs file system. @@ -1210,6 +1226,8 @@ environment of the first jail. .Xr fdescfs 5 , .Xr jail.conf 5 , .Xr procfs 5 , +.Xr linprocfs 5 , +.Xr linsysfs 5 , .Xr rc.conf 5 , .Xr sysctl.conf 5 , .Xr chroot 8 , diff --git a/usr.sbin/rtsold/rtsold.c b/usr.sbin/rtsold/rtsold.c index 7710537..238a1e1 100644 --- a/usr.sbin/rtsold/rtsold.c +++ b/usr.sbin/rtsold/rtsold.c @@ -609,7 +609,7 @@ rtsol_check_timer(void) struct timespec now, rtsol_timer; struct ifinfo *ifi; struct rainfo *rai; - struct ra_opt *rao; + struct ra_opt *rao, *raotmp; int flags; clock_gettime(CLOCK_MONOTONIC_FAST, &now); @@ -704,7 +704,8 @@ rtsol_check_timer(void) int expire = 0; TAILQ_FOREACH(rai, &ifi->ifi_rainfo, rai_next) { - TAILQ_FOREACH(rao, &rai->rai_ra_opt, rao_next) { + TAILQ_FOREACH_SAFE(rao, &rai->rai_ra_opt, + rao_next, raotmp) { warnmsg(LOG_DEBUG, __func__, "RA expiration timer: " "type=%d, msg=%s, expire=%s", |