Diffstat (limited to 'sys')
66 files changed, 7552 insertions, 1944 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 855f7bc..8136745 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -6480,7 +6480,7 @@ static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
-	vm_paddr_t pa_start, pa_end;
+	vm_paddr_t pa_start, pa_end, pa_end1;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
@@ -6660,9 +6660,12 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 			tmpva += PAGE_SIZE;
 		}
 	}
-	if (error == 0 && pa_start != pa_end)
-		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
-		    pa_end - pa_start, mode);
+	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
+		pa_end1 = MIN(pa_end, dmaplimit);
+		if (pa_start != pa_end1)
+			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+			    pa_end1 - pa_start, mode);
+	}
 
 	/*
 	 * Flush CPU caches if required to make sure any data isn't cached that
diff --git a/sys/cddl/boot/zfs/lz4.c b/sys/cddl/boot/zfs/lz4.c
index c29f861..b12122c 100644
--- a/sys/cddl/boot/zfs/lz4.c
+++ b/sys/cddl/boot/zfs/lz4.c
@@ -34,6 +34,8 @@
  * $FreeBSD$
  */
 
+#include <arpa/inet.h>
+
 static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
     int isize, int maxOutputSize);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 226233e..f6d19fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -139,7 +139,6 @@
 #include <zfs_fletcher.h>
 #include <sys/sdt.h>
 
-#include <vm/vm_pageout.h>
 #include <machine/vmparam.h>
 
 #ifdef illumos
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 04e1342..2a15cdf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -75,7 +75,6 @@
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <vm/vm_param.h>
-#include <vm/vm_pageout.h>
 
 /*
  * Programming rules.
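The pmap.c hunk above fixes pmap_change_attr_locked() for physical ranges that extend past the direct map: pages at or above dmaplimit have no DMAP alias, so the recursive call may only cover the clamped part of [pa_start, pa_end). A minimal sketch of that clamping idea as a standalone helper (the helper name and shape are invented here; the commit open-codes the same check with MIN()):

    /*
     * Clamp a physical range to the part covered by the amd64 direct map.
     * Returns false when nothing in [start, end) lies below dmaplimit,
     * i.e. when there is no DMAP alias to operate on.
     */
    static __inline bool
    dmap_clamp(vm_paddr_t start, vm_paddr_t end, vm_paddr_t *end1)
    {

        if (start >= dmaplimit)
                return (false);
        *end1 = MIN(end, dmaplimit);
        return (start != *end1);
    }

With such a helper the recursion would read "if (error == 0 && dmap_clamp(pa_start, pa_end, &pa_end1)) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode);", which is what the hunk spells out inline.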
diff --git a/sys/conf/files b/sys/conf/files index 8d0453a..e8c8a3a 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3596,8 +3596,12 @@ netipx/spx_usrreq.c optional ipx netnatm/natm.c optional natm netnatm/natm_pcb.c optional natm netnatm/natm_proto.c optional natm +netpfil/ipfw/dn_aqm_codel.c optional inet dummynet +netpfil/ipfw/dn_aqm_pie.c optional inet dummynet netpfil/ipfw/dn_heap.c optional inet dummynet netpfil/ipfw/dn_sched_fifo.c optional inet dummynet +netpfil/ipfw/dn_sched_fq_codel.c optional inet dummynet +netpfil/ipfw/dn_sched_fq_pie.c optional inet dummynet netpfil/ipfw/dn_sched_prio.c optional inet dummynet netpfil/ipfw/dn_sched_qfq.c optional inet dummynet netpfil/ipfw/dn_sched_rr.c optional inet dummynet diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index f96b4f3..533b957 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -271,7 +271,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv +dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv +dev/hyperv/utilities/hv_shutdown.c optional hyperv +dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hv_channel.c optional hyperv dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index f79ed58..89b87e3 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -247,7 +247,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv +dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv +dev/hyperv/utilities/hv_shutdown.c optional hyperv +dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hv_channel.c optional hyperv dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c index b8660a4..def33c3 100644 --- a/sys/dev/bge/if_bge.c +++ b/sys/dev/bge/if_bge.c @@ -170,6 +170,7 @@ static const struct bge_type { { BCOM_VENDORID, BCOM_DEVICEID_BCM5715 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5715S }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5717 }, + { BCOM_VENDORID, BCOM_DEVICEID_BCM5717C }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5718 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5719 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5720 }, @@ -310,6 +311,7 @@ static const struct bge_revision { { BGE_CHIPID_BCM5715_A3, "BCM5715 A3" }, { BGE_CHIPID_BCM5717_A0, "BCM5717 A0" }, { BGE_CHIPID_BCM5717_B0, "BCM5717 B0" }, + { BGE_CHIPID_BCM5717_C0, "BCM5717 C0" }, { BGE_CHIPID_BCM5719_A0, "BCM5719 A0" }, { BGE_CHIPID_BCM5720_A0, "BCM5720 A0" }, { BGE_CHIPID_BCM5755_A0, "BCM5755 A0" }, @@ -2689,6 +2691,10 @@ bge_chipid(device_t dev) * registers. */ switch (pci_get_device(dev)) { + case BCOM_DEVICEID_BCM5717C: + /* 5717 C0 seems to belong to 5720 line. 
*/ + id = BGE_CHIPID_BCM5720_A0; + break; case BCOM_DEVICEID_BCM5717: case BCOM_DEVICEID_BCM5718: case BCOM_DEVICEID_BCM5719: diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h index 37b0459..0cf9ca1 100644 --- a/sys/dev/bge/if_bgereg.h +++ b/sys/dev/bge/if_bgereg.h @@ -329,6 +329,7 @@ #define BGE_CHIPID_BCM57780_A1 0x57780001 #define BGE_CHIPID_BCM5717_A0 0x05717000 #define BGE_CHIPID_BCM5717_B0 0x05717100 +#define BGE_CHIPID_BCM5717_C0 0x05717200 #define BGE_CHIPID_BCM5719_A0 0x05719000 #define BGE_CHIPID_BCM5720_A0 0x05720000 #define BGE_CHIPID_BCM5762_A0 0x05762000 @@ -2452,6 +2453,7 @@ struct bge_status_block { #define BCOM_DEVICEID_BCM5715 0x1678 #define BCOM_DEVICEID_BCM5715S 0x1679 #define BCOM_DEVICEID_BCM5717 0x1655 +#define BCOM_DEVICEID_BCM5717C 0x1665 #define BCOM_DEVICEID_BCM5718 0x1656 #define BCOM_DEVICEID_BCM5719 0x1657 #define BCOM_DEVICEID_BCM5720_PP 0x1658 /* Not released to public. */ diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h index 1a45b7b..f45543b 100644 --- a/sys/dev/hyperv/include/hyperv.h +++ b/sys/dev/hyperv/include/hyperv.h @@ -755,6 +755,8 @@ typedef struct hv_vmbus_channel { struct mtx inbound_lock; + struct taskqueue * rxq; + struct task channel_task; hv_vmbus_pfn_channel_callback on_channel_callback; void* channel_callback_context; @@ -906,30 +908,6 @@ int hv_vmbus_channel_teardown_gpdal( struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); -/* - * Work abstraction defines - */ -typedef struct hv_work_queue { - struct taskqueue* queue; - struct proc* proc; - struct sema* work_sema; -} hv_work_queue; - -typedef struct hv_work_item { - struct task work; - void (*callback)(void *); - void* context; - hv_work_queue* wq; -} hv_work_item; - -struct hv_work_queue* hv_work_queue_create(char* name); - -void hv_work_queue_close(struct hv_work_queue* wq); - -int hv_queue_work_item( - hv_work_queue* wq, - void (*callback)(void *), - void* context); /** * @brief Get physical address from virtual */ @@ -941,35 +919,5 @@ hv_get_phys_addr(void *virt) return (ret); } - -/** - * KVP related structures - * - */ -typedef struct hv_vmbus_service { - hv_guid guid; /* Hyper-V GUID */ - char *name; /* name of service */ - boolean_t enabled; /* service enabled */ - hv_work_queue *work_queue; /* background work queue */ - - /* - * function to initialize service - */ - int (*init)(struct hv_vmbus_service *); - - /* - * function to process Hyper-V messages - */ - void (*callback)(void *); -} hv_vmbus_service; - -extern uint8_t* receive_buffer[]; -extern hv_vmbus_service service_table[]; extern uint32_t hv_vmbus_protocal_version; - -void hv_kvp_callback(void *context); -int hv_kvp_init(hv_vmbus_service *serv); -void hv_kvp_deinit(void); - #endif /* __HYPERV_H__ */ - diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c index 64e7578..9a89b62 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.c +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c @@ -73,10 +73,7 @@ hv_nv_alloc_net_device(struct hv_device *device) netvsc_dev *net_dev; hn_softc_t *sc = device_get_softc(device->device); - net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_NOWAIT | M_ZERO); - if (net_dev == NULL) { - return (NULL); - } + net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_WAITOK | M_ZERO); net_dev->dev = device; net_dev->destroy = FALSE; @@ -135,15 +132,15 @@ hv_nv_get_next_send_section(netvsc_dev *net_dev) int i; for (i = 0; i < bitsmap_words; i++) { - idx = ffs(~bitsmap[i]); + idx = ffsl(~bitsmap[i]); if (0 == 
idx) continue; idx--; - if (i * BITS_PER_LONG + idx >= net_dev->send_section_count) - return (ret); + KASSERT(i * BITS_PER_LONG + idx < net_dev->send_section_count, + ("invalid i %d and idx %lu", i, idx)); - if (synch_test_and_set_bit(idx, &bitsmap[i])) + if (atomic_testandset_long(&bitsmap[i], idx)) continue; ret = i * BITS_PER_LONG + idx; @@ -223,11 +220,7 @@ hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device) init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; net_dev->rx_sections = malloc(net_dev->rx_section_count * - sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_NOWAIT); - if (net_dev->rx_sections == NULL) { - ret = EINVAL; - goto cleanup; - } + sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_WAITOK); memcpy(net_dev->rx_sections, init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); @@ -325,11 +318,7 @@ hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device) BITS_PER_LONG); net_dev->send_section_bitsmap = malloc(net_dev->bitsmap_words * sizeof(long), M_NETVSC, - M_NOWAIT | M_ZERO); - if (NULL == net_dev->send_section_bitsmap) { - ret = ENOMEM; - goto cleanup; - } + M_WAITOK | M_ZERO); goto exit; @@ -788,8 +777,27 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, if (NULL != net_vsc_pkt) { if (net_vsc_pkt->send_buf_section_idx != NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - synch_change_bit(net_vsc_pkt->send_buf_section_idx, - net_dev->send_section_bitsmap); + u_long mask; + int idx; + + idx = net_vsc_pkt->send_buf_section_idx / + BITS_PER_LONG; + KASSERT(idx < net_dev->bitsmap_words, + ("invalid section index %u", + net_vsc_pkt->send_buf_section_idx)); + mask = 1UL << + (net_vsc_pkt->send_buf_section_idx % + BITS_PER_LONG); + + KASSERT(net_dev->send_section_bitsmap[idx] & + mask, + ("index bitmap 0x%lx, section index %u, " + "bitmap idx %d, bitmask 0x%lx", + net_dev->send_section_bitsmap[idx], + net_vsc_pkt->send_buf_section_idx, + idx, mask)); + atomic_clear_long( + &net_dev->send_section_bitsmap[idx], mask); } /* Notify the layer above us */ diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h index e684cc5..95dee17 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -39,9 +39,11 @@ #define __HV_NET_VSC_H__ #include <sys/param.h> +#include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/queue.h> +#include <sys/taskqueue.h> #include <sys/sx.h> #include <machine/bus.h> @@ -56,6 +58,8 @@ #include <dev/hyperv/include/hyperv.h> +#define HN_USE_TXDESC_BUFRING + MALLOC_DECLARE(M_NETVSC); #define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF) @@ -988,8 +992,67 @@ typedef struct { hv_bool_uint8_t link_state; } netvsc_device_info; +#ifndef HN_USE_TXDESC_BUFRING struct hn_txdesc; SLIST_HEAD(hn_txdesc_list, hn_txdesc); +#else +struct buf_ring; +#endif + +struct hn_rx_ring { + struct lro_ctrl hn_lro; + + /* Trust csum verification on host side */ + int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ + + u_long hn_csum_ip; + u_long hn_csum_tcp; + u_long hn_csum_udp; + u_long hn_csum_trusted; + u_long hn_lro_tried; + u_long hn_small_pkts; +} __aligned(CACHE_LINE_SIZE); + +#define HN_TRUST_HCSUM_IP 0x0001 +#define HN_TRUST_HCSUM_TCP 0x0002 +#define HN_TRUST_HCSUM_UDP 0x0004 + +struct hn_tx_ring { +#ifndef HN_USE_TXDESC_BUFRING + struct mtx hn_txlist_spin; + struct hn_txdesc_list hn_txlist; +#else + struct buf_ring *hn_txdesc_br; +#endif + int hn_txdesc_cnt; + int hn_txdesc_avail; + int hn_has_txeof; + + int hn_sched_tx; + void 
(*hn_txeof)(struct hn_tx_ring *); + struct taskqueue *hn_tx_taskq; + struct task hn_tx_task; + struct task hn_txeof_task; + + struct mtx hn_tx_lock; + struct hn_softc *hn_sc; + + int hn_direct_tx_size; + int hn_tx_chimney_size; + bus_dma_tag_t hn_tx_data_dtag; + uint64_t hn_csum_assist; + + u_long hn_no_txdescs; + u_long hn_send_failed; + u_long hn_txdma_failed; + u_long hn_tx_collapsed; + u_long hn_tx_chimney; + + /* Rarely used stuffs */ + struct hn_txdesc *hn_txdesc; + bus_dma_tag_t hn_tx_rndis_dtag; + struct sysctl_oid *hn_tx_sysctl_tree; +} __aligned(CACHE_LINE_SIZE); /* * Device-specific softc structure @@ -1009,44 +1072,22 @@ typedef struct hn_softc { struct hv_device *hn_dev_obj; netvsc_dev *net_dev; - int hn_txdesc_cnt; - struct hn_txdesc *hn_txdesc; - bus_dma_tag_t hn_tx_data_dtag; - bus_dma_tag_t hn_tx_rndis_dtag; - int hn_tx_chimney_size; - int hn_tx_chimney_max; + int hn_rx_ring_cnt; + struct hn_rx_ring *hn_rx_ring; - struct mtx hn_txlist_spin; - struct hn_txdesc_list hn_txlist; - int hn_txdesc_avail; - int hn_txeof; - - struct lro_ctrl hn_lro; - int hn_lro_hiwat; - - /* Trust tcp segments verification on host side */ - int hn_trust_hosttcp; - - u_long hn_csum_ip; - u_long hn_csum_tcp; - u_long hn_csum_trusted; - u_long hn_lro_tried; - u_long hn_small_pkts; - u_long hn_no_txdescs; - u_long hn_send_failed; - u_long hn_txdma_failed; - u_long hn_tx_collapsed; - u_long hn_tx_chimney; + int hn_tx_ring_cnt; + struct hn_tx_ring *hn_tx_ring; + int hn_tx_chimney_max; + struct taskqueue *hn_tx_taskq; + struct sysctl_oid *hn_tx_sysctl_tree; } hn_softc_t; - /* * Externs */ extern int hv_promisc_mode; void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status); -void netvsc_xmit_completion(void *context); void hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid, uint32_t status); netvsc_dev *hv_nv_on_device_add(struct hv_device *device, diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index b3360ea..0f4425e 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -66,10 +66,12 @@ __FBSDID("$FreeBSD$"); #include <sys/module.h> #include <sys/kernel.h> #include <sys/socket.h> +#include <sys/proc.h> #include <sys/queue.h> #include <sys/lock.h> #include <sys/sx.h> #include <sys/sysctl.h> +#include <sys/buf_ring.h> #include <net/if.h> #include <net/if_arp.h> @@ -132,6 +134,8 @@ __FBSDID("$FreeBSD$"); /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 +#define HN_LROENT_CNT_DEF 128 + #define HN_RNDIS_MSG_LEN \ (sizeof(rndis_msg) + \ RNDIS_VLAN_PPI_SIZE + \ @@ -146,10 +150,14 @@ __FBSDID("$FreeBSD$"); #define HN_TX_DATA_SEGCNT_MAX \ (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS) +#define HN_DIRECT_TX_SIZE_DEF 128 + struct hn_txdesc { +#ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; +#endif struct mbuf *m; - struct hn_softc *sc; + struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ netvsc_packet netvsc_pkt; /* XXX to be removed */ @@ -165,23 +173,18 @@ struct hn_txdesc { #define HN_TXD_FLAG_DMAMAP 0x2 /* - * A unified flag for all outbound check sum flags is useful, - * and it helps avoiding unnecessary check sum calculation in - * network forwarding scenario. + * Only enable UDP checksum offloading when it is on 2012R2 or + * later. UDP checksum offloading doesn't work on earlier + * Windows releases. 
*/ -#define HV_CSUM_FOR_OUTBOUND \ - (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO| \ - CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP| \ - CSUM_IP6_TSO|CSUM_IP6_ISCSI) - -/* XXX move to netinet/tcp_lro.h */ -#define HN_LRO_HIWAT_MAX 65535 -#define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX +#define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP) +#define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) + +#define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ -#define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu) -#define HN_LRO_HIWAT_ISVALID(sc, hiwat) \ - ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) || \ - (hiwat) <= HN_LRO_HIWAT_MAX) +#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) + +#define HN_LRO_ACKCNT_DEF 1 /* * Be aware that this sleepable mutex will exhibit WITNESS errors when @@ -205,19 +208,71 @@ struct hn_txdesc { int hv_promisc_mode = 0; /* normal mode by default */ +SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface"); + /* Trust tcp segements verification on host side. */ -static int hn_trust_hosttcp = 0; -TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp); +static int hn_trust_hosttcp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, + &hn_trust_hosttcp, 0, + "Trust tcp segement verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust udp datagrams verification on host side. */ +static int hn_trust_hostudp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, + &hn_trust_hostudp, 0, + "Trust udp datagram verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust ip packets verification on host side. */ +static int hn_trust_hostip = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, + &hn_trust_hostip, 0, + "Trust ip packet verification on host side, " + "when csum info is missing (global setting)"); #if __FreeBSD_version >= 1100045 /* Limit TSO burst size */ static int hn_tso_maxlen = 0; -TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen); +SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, + &hn_tso_maxlen, 0, "TSO burst limit"); #endif /* Limit chimney send size */ static int hn_tx_chimney_size = 0; -TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size); +SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, + &hn_tx_chimney_size, 0, "Chimney send packet size limit"); + +/* Limit the size of packet for direct transmission */ +static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, + &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); + +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 +static int hn_lro_entry_count = HN_LROENT_CNT_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, + &hn_lro_entry_count, 0, "LRO entry count"); +#endif +#endif + +static int hn_share_tx_taskq = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, + &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); + +static struct taskqueue *hn_tx_taskq; + +#ifndef HN_USE_TXDESC_BUFRING +static int hn_use_txdesc_bufring = 0; +#else +static int hn_use_txdesc_bufring = 1; +#endif +SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, + &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); + +static int hn_bind_tx_taskq = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, + &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); /* * Forward declarations 
@@ -226,82 +281,37 @@ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); -static void hn_start_locked(struct ifnet *ifp); +static int hn_start_locked(struct hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); +static void hn_start_txeof(struct hn_tx_ring *); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); -#ifdef HN_LRO_HIWAT -static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); +#if __FreeBSD_version >= 1100099 +static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif +static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); -static int hn_check_iplen(const struct mbuf *, int); -static int hn_create_tx_ring(struct hn_softc *sc); -static void hn_destroy_tx_ring(struct hn_softc *sc); - -static __inline void -hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) -{ - sc->hn_lro_hiwat = hiwat; -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif -} - -/* - * NetVsc get message transport protocol type - */ -static uint32_t get_transport_proto_type(struct mbuf *m_head) -{ - uint32_t ret_val = TRANSPORT_TYPE_NOT_IP; - uint16_t ether_type = 0; - int ether_len = 0; - struct ether_vlan_header *eh; -#ifdef INET - struct ip *iph; -#endif -#ifdef INET6 - struct ip6_hdr *ip6; -#endif - - eh = mtod(m_head, struct ether_vlan_header*); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { - ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - ether_type = eh->evl_proto; - } else { - ether_len = ETHER_HDR_LEN; - ether_type = eh->evl_encap_proto; - } - - switch (ntohs(ether_type)) { -#ifdef INET6 - case ETHERTYPE_IPV6: - ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len); - - if (IPPROTO_TCP == ip6->ip6_nxt) { - ret_val = TRANSPORT_TYPE_IPV6_TCP; - } else if (IPPROTO_UDP == ip6->ip6_nxt) { - ret_val = TRANSPORT_TYPE_IPV6_UDP; - } - break; +#if __FreeBSD_version < 1100095 +static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); +#else +static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif -#ifdef INET - case ETHERTYPE_IP: - iph = (struct ip *)(m_head->m_data + ether_len); - - if (IPPROTO_TCP == iph->ip_p) { - ret_val = TRANSPORT_TYPE_IPV4_TCP; - } else if (IPPROTO_UDP == iph->ip_p) { - ret_val = TRANSPORT_TYPE_IPV4_UDP; - } - break; -#endif - default: - ret_val = TRANSPORT_TYPE_NOT_IP; - break; - } - - return (ret_val); -} +static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_check_iplen(const struct mbuf *, int); +static int hn_create_tx_ring(struct hn_softc *, int); +static void hn_destroy_tx_ring(struct hn_tx_ring *); +static int hn_create_tx_data(struct hn_softc *); +static void hn_destroy_tx_data(struct hn_softc *); +static void hn_start_taskfunc(void *, int); +static void hn_start_txeof_taskfunc(void *, int); +static void hn_stop_tx_tasks(struct hn_softc *); +static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); +static void hn_create_rx_data(struct hn_softc *sc); +static void hn_destroy_rx_data(struct hn_softc *sc); +static void hn_set_tx_chimney_size(struct hn_softc *, int); static int hn_ifmedia_upd(struct ifnet *ifp __unused) @@ -353,6 +363,19 @@ netvsc_probe(device_t dev) return (ENXIO); } 
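The hv_net_vsc.c hunks above drop the synch_test_and_set_bit()/synch_change_bit() compatibility wrappers and manage the chimney send-buffer section bitmap with ffsl() plus the atomic(9) primitives, with KASSERTs guarding against out-of-range and double-freed sections. A small self-contained sketch of that allocate/release pattern (the helper names and the SKETCH_BITS_PER_LONG macro are invented for illustration; the driver uses BITS_PER_LONG and net_dev->send_section_bitsmap):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <machine/atomic.h>

    #define SKETCH_BITS_PER_LONG    (sizeof(u_long) * NBBY)

    /* Claim the first free section, lock-free; -1 when all are busy. */
    static int
    section_alloc(volatile u_long *bitmap, int words)
    {
        int i, idx;

        for (i = 0; i < words; i++) {
                idx = ffsl(~bitmap[i]);  /* first clear bit, 1-based */
                if (idx == 0)
                        continue;        /* word fully allocated */
                idx--;
                /* Lost the race for this bit; try the next word. */
                if (atomic_testandset_long(&bitmap[i], idx))
                        continue;
                return (i * SKETCH_BITS_PER_LONG + idx);
        }
        return (-1);
    }

    /* Release a previously claimed section. */
    static void
    section_free(volatile u_long *bitmap, int section)
    {

        atomic_clear_long(&bitmap[section / SKETCH_BITS_PER_LONG],
            1UL << (section % SKETCH_BITS_PER_LONG));
    }

Because a failed test-and-set simply moves on, allocation never spins or sleeps; the completion path (hv_nv_on_send_completion() above) clears the bit with atomic_clear_long() after asserting that it is still set.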
+static void +hn_cpuset_setthread_task(void *xmask, int pending __unused) +{ + cpuset_t *mask = xmask; + int error; + + error = cpuset_setthread(curthread->td_tid, mask); + if (error) { + panic("curthread=%ju: can't pin; error=%d", + (uintmax_t)curthread->td_tid, error); + } +} + /* * Standard attach entry point. * @@ -367,8 +390,6 @@ netvsc_attach(device_t dev) hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp = NULL; - struct sysctl_oid_list *child; - struct sysctl_ctx_list *ctx; int error; #if __FreeBSD_version >= 1100045 int tso_maxlen; @@ -382,13 +403,28 @@ netvsc_attach(device_t dev) bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; - sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; - sc->hn_trust_hosttcp = hn_trust_hosttcp; - - error = hn_create_tx_ring(sc); - if (error) - goto failed; + if (hn_tx_taskq == NULL) { + sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &sc->hn_tx_taskq); + taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", + device_get_nameunit(dev)); + if (hn_bind_tx_taskq >= 0) { + int cpu = hn_bind_tx_taskq; + struct task cpuset_task; + cpuset_t cpu_set; + + if (cpu > mp_ncpus - 1) + cpu = mp_ncpus - 1; + CPU_SETOF(cpu, &cpu_set); + TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, + &cpu_set); + taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task); + taskqueue_drain(sc->hn_tx_taskq, &cpuset_task); + } + } else { + sc->hn_tx_taskq = hn_tx_taskq; + } NV_LOCK_INIT(sc, "NetVSCLock"); sc->hn_dev_obj = device_ctx; @@ -396,6 +432,12 @@ netvsc_attach(device_t dev) ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; + error = hn_create_tx_data(sc); + if (error) + goto failed; + + hn_create_rx_data(sc); + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_dunit = unit; ifp->if_dname = NETVSC_DEVNAME; @@ -426,15 +468,7 @@ netvsc_attach(device_t dev) ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; - /* - * Only enable UDP checksum offloading when it is on 2012R2 or - * later. UDP checksum offloading doesn't work on earlier - * Windows releases. 
- */ - if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) - ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; - else - ifp->if_hwassist = CSUM_TCP | CSUM_TSO; + ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO; error = hv_rf_on_device_add(device_ctx, &device_info); if (error) @@ -444,15 +478,6 @@ netvsc_attach(device_t dev) sc->hn_carrier = 1; } -#if defined(INET) || defined(INET6) - tcp_lro_init(&sc->hn_lro); - /* Driver private LRO settings */ - sc->hn_lro.ifp = ifp; -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif -#endif /* INET || INET6 */ - #if __FreeBSD_version >= 1100045 tso_maxlen = hn_tso_maxlen; if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) @@ -472,87 +497,14 @@ netvsc_attach(device_t dev) #endif sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; + hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_tx_chimney_max) - sc->hn_tx_chimney_size = hn_tx_chimney_size; - - ctx = device_get_sysctl_ctx(dev); - child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); - - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued", - CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed", - CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried", - CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries"); -#ifdef HN_LRO_HIWAT - SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl, - "I", "LRO high watermark"); -#endif - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp", - CTLFLAG_RW, &sc->hn_trust_hosttcp, 0, - "Trust tcp segement verification on host side, " - "when csum info is missing"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip", - CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp", - CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted", - CTLFLAG_RW, &sc->hn_csum_trusted, - "# of TCP segements that we trust host's csum verification"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts", - CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs", - CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed", - CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed", - CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed", - CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney", - CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", - CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", - CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", - CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, - "Chimney send packet size upper boundary"); - SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, - "I", "Chimney send packet size limit"); - - if (unit == 0) { - struct sysctl_ctx_list *dc_ctx; - struct sysctl_oid_list *dc_child; - devclass_t dc; - - /* - * Add 
sysctl nodes for devclass - */ - dc = device_get_devclass(dev); - dc_ctx = devclass_get_sysctl_ctx(dc); - dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc)); - - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp", - CTLFLAG_RD, &hn_trust_hosttcp, 0, - "Trust tcp segement verification on host side, " - "when csum info is missing (global setting)"); - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size", - CTLFLAG_RD, &hn_tx_chimney_size, 0, - "Chimney send packet size limit"); -#if __FreeBSD_version >= 1100045 - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen", - CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit"); -#endif - } + hn_set_tx_chimney_size(sc, hn_tx_chimney_size); return (0); failed: - hn_destroy_tx_ring(sc); + hn_destroy_tx_data(sc); if (ifp != NULL) if_free(ifp); return (error); @@ -583,11 +535,14 @@ netvsc_detach(device_t dev) hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); + hn_stop_tx_tasks(sc); + ifmedia_removeall(&sc->hn_media); -#if defined(INET) || defined(INET6) - tcp_lro_free(&sc->hn_lro); -#endif - hn_destroy_tx_ring(sc); + hn_destroy_rx_data(sc); + hn_destroy_tx_data(sc); + + if (sc->hn_tx_taskq != hn_tx_taskq) + taskqueue_free(sc->hn_tx_taskq); return (0); } @@ -602,13 +557,13 @@ netvsc_shutdown(device_t dev) } static __inline int -hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, +hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; - error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap, + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; @@ -618,13 +573,13 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, return ENOBUFS; else *m_head = m = m_new; - sc->hn_tx_collapsed++; + txr->hn_tx_collapsed++; - error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { - bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap, + bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } @@ -632,20 +587,20 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, } static __inline void -hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd) +hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd) { if (txd->flags & HN_TXD_FLAG_DMAMAP) { - bus_dmamap_sync(sc->hn_tx_data_dtag, + bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->hn_tx_data_dtag, + bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } } static __inline int -hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) +hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, @@ -655,7 +610,7 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; - hn_txdesc_dmamap_unload(sc, txd); + hn_txdesc_dmamap_unload(txr, txd); if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; @@ -663,33 +618,45 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) txd->flags |= HN_TXD_FLAG_ONLIST; - mtx_lock_spin(&sc->hn_txlist_spin); - KASSERT(sc->hn_txdesc_avail >= 0 && - sc->hn_txdesc_avail < sc->hn_txdesc_cnt, - ("txdesc_put: 
invalid txd avail %d", sc->hn_txdesc_avail)); - sc->hn_txdesc_avail++; - SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); - mtx_unlock_spin(&sc->hn_txlist_spin); +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + KASSERT(txr->hn_txdesc_avail >= 0 && + txr->hn_txdesc_avail < txr->hn_txdesc_cnt, + ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail++; + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + atomic_add_int(&txr->hn_txdesc_avail, 1); + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif return 1; } static __inline struct hn_txdesc * -hn_txdesc_get(struct hn_softc *sc) +hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; - mtx_lock_spin(&sc->hn_txlist_spin); - txd = SLIST_FIRST(&sc->hn_txlist); +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { - KASSERT(sc->hn_txdesc_avail > 0, - ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail)); - sc->hn_txdesc_avail--; - SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + KASSERT(txr->hn_txdesc_avail > 0, + ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail--; + SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } - mtx_unlock_spin(&sc->hn_txlist_spin); + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); +#endif if (txd != NULL) { +#ifdef HN_USE_TXDESC_BUFRING + atomic_subtract_int(&txr->hn_txdesc_avail, 1); +#endif KASSERT(txd->m == NULL && txd->refs == 0 && (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; @@ -707,213 +674,133 @@ hn_txdesc_hold(struct hn_txdesc *txd) atomic_add_int(&txd->refs, 1); } -/* - * Send completion processing - * - * Note: It looks like offset 0 of buf is reserved to hold the softc - * pointer. The sc pointer is not currently needed in this function, and - * it is not presently populated by the TX function. - */ -void -netvsc_xmit_completion(void *context) +static void +hn_tx_done(void *xpkt) { - netvsc_packet *packet = context; + netvsc_packet *packet = xpkt; struct hn_txdesc *txd; - struct hn_softc *sc; + struct hn_tx_ring *txr; txd = (struct hn_txdesc *)(uintptr_t) packet->compl.send.send_completion_tid; - sc = txd->sc; - sc->hn_txeof = 1; - hn_txdesc_put(sc, txd); + txr = txd->txr; + txr->hn_has_txeof = 1; + hn_txdesc_put(txr, txd); } void netvsc_channel_rollup(struct hv_device *device_ctx) { struct hn_softc *sc = device_get_softc(device_ctx->device); - struct ifnet *ifp; + struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */ +#if defined(INET) || defined(INET6) + struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ + struct lro_ctrl *lro = &rxr->hn_lro; + struct lro_entry *queued; + + while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); + } +#endif - if (!sc->hn_txeof) + if (!txr->hn_has_txeof) return; - sc->hn_txeof = 0; - ifp = sc->hn_ifp; - NV_LOCK(sc); - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - hn_start_locked(ifp); - NV_UNLOCK(sc); + txr->hn_has_txeof = 0; + txr->hn_txeof(txr); } /* - * Start a transmit of one or more packets + * NOTE: + * If this function fails, then both txd and m_head0 will be freed. 
*/ -static void -hn_start_locked(struct ifnet *ifp) +static int +hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { - hn_softc_t *sc = ifp->if_softc; - struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); - netvsc_dev *net_dev = sc->net_dev; + bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; + int error, nsegs, i; + struct mbuf *m_head = *m_head0; netvsc_packet *packet; - struct mbuf *m_head, *m; - struct ether_vlan_header *eh; rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; rndis_per_packet_info *rppi; - ndis_8021q_info *rppi_vlan_info; - rndis_tcp_ip_csum_info *csum_info; - rndis_tcp_tso_info *tso_info; - int ether_len; - uint32_t rndis_msg_size = 0; - uint32_t trans_proto_type; - uint32_t send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING) - return; - - while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { - bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; - int error, nsegs, i, send_failed = 0; - struct hn_txdesc *txd; + uint32_t rndis_msg_size; - IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); - if (m_head == NULL) - break; + packet = &txd->netvsc_pkt; + packet->is_data_pkt = TRUE; + packet->tot_data_buf_len = m_head->m_pkthdr.len; - txd = hn_txdesc_get(sc); - if (txd == NULL) { - sc->hn_no_txdescs++; - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - break; - } + /* + * extension points to the area reserved for the + * rndis_filter_packet, which is placed just after + * the netvsc_packet (and rppi struct, if present; + * length is updated later). + */ + rndis_mesg = txd->rndis_msg; + /* XXX not necessary */ + memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); + rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; - packet = &txd->netvsc_pkt; - /* XXX not necessary */ - memset(packet, 0, sizeof(*packet)); + rndis_pkt = &rndis_mesg->msg.packet; + rndis_pkt->data_offset = sizeof(rndis_packet); + rndis_pkt->data_length = packet->tot_data_buf_len; + rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); - packet->is_data_pkt = TRUE; + rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); - /* Initialize it from the mbuf */ - packet->tot_data_buf_len = m_head->m_pkthdr.len; + if (m_head->m_flags & M_VLANTAG) { + ndis_8021q_info *rppi_vlan_info; - /* - * extension points to the area reserved for the - * rndis_filter_packet, which is placed just after - * the netvsc_packet (and rppi struct, if present; - * length is updated later). - */ - rndis_mesg = txd->rndis_msg; - /* XXX not necessary */ - memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); - rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; + rndis_msg_size += RNDIS_VLAN_PPI_SIZE; + rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, + ieee_8021q_info); - rndis_pkt = &rndis_mesg->msg.packet; - rndis_pkt->data_offset = sizeof(rndis_packet); - rndis_pkt->data_length = packet->tot_data_buf_len; - rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); + rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi + + rppi->per_packet_info_offset); + rppi_vlan_info->u1.s1.vlan_id = + m_head->m_pkthdr.ether_vtag & 0xfff; + } - rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + rndis_tcp_tso_info *tso_info; + struct ether_vlan_header *eh; + int ether_len; /* - * If the Hyper-V infrastructure needs to embed a VLAN tag, - * initialize netvsc_packet and rppi struct values as needed. 
+ * XXX need m_pullup and use mtodo */ - if (m_head->m_flags & M_VLANTAG) { - /* - * set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag - * into the frame. - */ - rndis_msg_size += RNDIS_VLAN_PPI_SIZE; - - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, - ieee_8021q_info); - - /* VLAN info immediately follows rppi struct */ - rppi_vlan_info = (ndis_8021q_info *)((char*)rppi + - rppi->per_packet_info_offset); - /* FreeBSD does not support CFI or priority */ - rppi_vlan_info->u1.s1.vlan_id = - m_head->m_pkthdr.ether_vtag & 0xfff; - } - - /* Only check the flags for outbound and ignore the ones for inbound */ - if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) { - goto pre_send; - } - eh = mtod(m_head, struct ether_vlan_header*); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - } else { + else ether_len = ETHER_HDR_LEN; - } - - trans_proto_type = get_transport_proto_type(m_head); - if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) { - goto pre_send; - } - - /* - * TSO packet needless to setup the send side checksum - * offload. - */ - if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { - goto do_tso; - } - /* setup checksum offload */ - rndis_msg_size += RNDIS_CSUM_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, - tcpip_chksum_info); - csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi + - rppi->per_packet_info_offset); - - if (trans_proto_type & (TYPE_IPV4 << 16)) { - csum_info->xmit.is_ipv4 = 1; - } else { - csum_info->xmit.is_ipv6 = 1; - } - - if (trans_proto_type & TYPE_TCP) { - csum_info->xmit.tcp_csum = 1; - csum_info->xmit.tcp_header_offset = 0; - } else if (trans_proto_type & TYPE_UDP) { - csum_info->xmit.udp_csum = 1; - } - - goto pre_send; - -do_tso: - /* setup TCP segmentation offload */ rndis_msg_size += RNDIS_TSO_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE, tcp_large_send_info); - - tso_info = (rndis_tcp_tso_info *)((char *)rppi + + + tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi + rppi->per_packet_info_offset); tso_info->lso_v2_xmit.type = RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; - + #ifdef INET - if (trans_proto_type & (TYPE_IPV4 << 16)) { + if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); - + tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip->ip_len = 0; ip->ip_sum = 0; - + th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htons(IPPROTO_TCP)); + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) @@ -921,8 +808,8 @@ do_tso: #endif #ifdef INET6 { - struct ip6_hdr *ip6 = - (struct ip6_hdr *)(m_head->m_data + ether_len); + struct ip6_hdr *ip6 = (struct ip6_hdr *) + (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); tso_info->lso_v2_xmit.ip_version = @@ -933,146 +820,233 @@ do_tso: #endif tso_info->lso_v2_xmit.tcp_header_offset = 0; tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz; + } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { + rndis_tcp_ip_csum_info *csum_info; -pre_send: - rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; - packet->tot_data_buf_len = rndis_mesg->msg_len; - - /* send packet with send buffer */ - if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) { - send_buf_section_idx 
= - hv_nv_get_next_send_section(net_dev); - if (send_buf_section_idx != - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - char *dest = ((char *)net_dev->send_buf + - send_buf_section_idx * - net_dev->send_section_size); - - memcpy(dest, rndis_mesg, rndis_msg_size); - dest += rndis_msg_size; - for (m = m_head; m != NULL; m = m->m_next) { - if (m->m_len) { - memcpy(dest, - (void *)mtod(m, vm_offset_t), - m->m_len); - dest += m->m_len; - } - } + rndis_msg_size += RNDIS_CSUM_PPI_SIZE; + rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, + tcpip_chksum_info); + csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi + + rppi->per_packet_info_offset); - packet->send_buf_section_idx = - send_buf_section_idx; - packet->send_buf_section_size = - packet->tot_data_buf_len; - packet->page_buf_count = 0; - sc->hn_tx_chimney++; - goto do_send; - } - } + csum_info->xmit.is_ipv4 = 1; + if (m_head->m_pkthdr.csum_flags & CSUM_IP) + csum_info->xmit.ip_header_csum = 1; - error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs); - if (error) { - int freed; + if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { + csum_info->xmit.tcp_csum = 1; + csum_info->xmit.tcp_header_offset = 0; + } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { + csum_info->xmit.udp_csum = 1; + } + } - /* - * This mbuf is not linked w/ the txd yet, so free - * it now. - */ - m_freem(m_head); - freed = hn_txdesc_put(sc, txd); - KASSERT(freed != 0, - ("fail to free txd upon txdma error")); + rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; + packet->tot_data_buf_len = rndis_mesg->msg_len; - sc->hn_txdma_failed++; - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - continue; + /* + * Chimney send, if the packet could fit into one chimney buffer. + */ + if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) { + netvsc_dev *net_dev = txr->hn_sc->net_dev; + uint32_t send_buf_section_idx; + + send_buf_section_idx = + hv_nv_get_next_send_section(net_dev); + if (send_buf_section_idx != + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { + uint8_t *dest = ((uint8_t *)net_dev->send_buf + + (send_buf_section_idx * + net_dev->send_section_size)); + + memcpy(dest, rndis_mesg, rndis_msg_size); + dest += rndis_msg_size; + m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); + + packet->send_buf_section_idx = send_buf_section_idx; + packet->send_buf_section_size = + packet->tot_data_buf_len; + packet->page_buf_count = 0; + txr->hn_tx_chimney++; + goto done; } + } - packet->page_buf_count = nsegs + - HV_RF_NUM_TX_RESERVED_PAGE_BUFS; - - /* send packet with page buffer */ - packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr); - packet->page_buffers[0].offset = - txd->rndis_msg_paddr & PAGE_MASK; - packet->page_buffers[0].length = rndis_msg_size; + error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); + if (error) { + int freed; /* - * Fill the page buffers with mbuf info starting at index - * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. + * This mbuf is not linked w/ the txd yet, so free it now. 
*/ - for (i = 0; i < nsegs; ++i) { - hv_vmbus_page_buffer *pb = &packet->page_buffers[ - i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; + m_freem(m_head); + *m_head0 = NULL; - pb->pfn = atop(segs[i].ds_addr); - pb->offset = segs[i].ds_addr & PAGE_MASK; - pb->length = segs[i].ds_len; - } + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon txdma error")); + + txr->hn_txdma_failed++; + if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); + return error; + } + *m_head0 = m_head; + + packet->page_buf_count = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + + /* send packet with page buffer */ + packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr); + packet->page_buffers[0].offset = txd->rndis_msg_paddr & PAGE_MASK; + packet->page_buffers[0].length = rndis_msg_size; - packet->send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - packet->send_buf_section_size = 0; + /* + * Fill the page buffers with mbuf info starting at index + * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. + */ + for (i = 0; i < nsegs; ++i) { + hv_vmbus_page_buffer *pb = &packet->page_buffers[ + i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; -do_send: - txd->m = m_head; + pb->pfn = atop(segs[i].ds_addr); + pb->offset = segs[i].ds_addr & PAGE_MASK; + pb->length = segs[i].ds_len; + } + + packet->send_buf_section_idx = + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; + packet->send_buf_section_size = 0; +done: + txd->m = m_head; + + /* Set the completion routine */ + packet->compl.send.on_send_completion = hn_tx_done; + packet->compl.send.send_completion_context = packet; + packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd; + + return 0; +} - /* Set the completion routine */ - packet->compl.send.on_send_completion = netvsc_xmit_completion; - packet->compl.send.send_completion_context = packet; - packet->compl.send.send_completion_tid = - (uint64_t)(uintptr_t)txd; +/* + * NOTE: + * If this function fails, then txd will be freed, but the mbuf + * associated w/ the txd will _not_ be freed. + */ +static int +hn_send_pkt(struct ifnet *ifp, struct hv_device *device_ctx, + struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + int error, send_failed = 0; again: + /* + * Make sure that txd is not freed before ETHER_BPF_MTAP. + */ + hn_txdesc_hold(txd); + error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt); + if (!error) { + ETHER_BPF_MTAP(ifp, txd->m); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + } + hn_txdesc_put(txr, txd); + + if (__predict_false(error)) { + int freed; + /* - * Make sure that txd is not freed before ETHER_BPF_MTAP. + * This should "really rarely" happen. + * + * XXX Too many RX to be acked or too many sideband + * commands to run? Ask netvsc_channel_rollup() + * to kick start later. */ - hn_txdesc_hold(txd); - error = hv_nv_on_send(device_ctx, packet); - if (!error) { - ETHER_BPF_MTAP(ifp, m_head); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + txr->hn_has_txeof = 1; + if (!send_failed) { + txr->hn_send_failed++; + send_failed = 1; + /* + * Try sending again after set hn_has_txeof; + * in case that we missed the last + * netvsc_channel_rollup(). + */ + goto again; } - hn_txdesc_put(sc, txd); + if_printf(ifp, "send failed\n"); - if (__predict_false(error)) { - int freed; + /* + * Caller will perform further processing on the + * associated mbuf, so don't free it in hn_txdesc_put(); + * only unload it from the DMA map in hn_txdesc_put(), + * if it was loaded. 
+ */ + txd->m = NULL; + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon send error")); - /* - * This should "really rarely" happen. - * - * XXX Too many RX to be acked or too many sideband - * commands to run? Ask netvsc_channel_rollup() - * to kick start later. - */ - sc->hn_txeof = 1; - if (!send_failed) { - sc->hn_send_failed++; - send_failed = 1; - /* - * Try sending again after set hn_txeof; - * in case that we missed the last - * netvsc_channel_rollup(). - */ - goto again; - } - if_printf(ifp, "send failed\n"); + txr->hn_send_failed++; + } + return error; +} + +/* + * Start a transmit of one or more packets + */ +static int +hn_start_locked(struct hn_tx_ring *txr, int len) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + mtx_assert(&txr->hn_tx_lock, MA_OWNED); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return 0; + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + struct hn_txdesc *txd; + struct mbuf *m_head; + int error; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + if (len > 0 && m_head->m_pkthdr.len > len) { /* - * This mbuf will be prepended, don't free it - * in hn_txdesc_put(); only unload it from the - * DMA map in hn_txdesc_put(), if it was loaded. + * This sending could be time consuming; let callers + * dispatch this packet sending (and sending of any + * following up packets) to tx taskqueue. */ - txd->m = NULL; - freed = hn_txdesc_put(sc, txd); - KASSERT(freed != 0, - ("fail to free txd upon send error")); - - sc->hn_send_failed++; - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + return 1; + } + + txd = hn_txdesc_get(txr); + if (txd == NULL) { + txr->hn_no_txdescs++; + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + break; + } + + error = hn_encap(txr, txd, &m_head); + if (error) { + /* Both txd and m_head are freed */ + continue; + } + + error = hn_send_pkt(ifp, device_ctx, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } + return 0; } /* @@ -1162,11 +1136,11 @@ int netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, rndis_tcp_ip_csum_info *csum_info) { - hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device); + struct hn_softc *sc = device_get_softc(device_ctx->device); + struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ struct mbuf *m_new; struct ifnet *ifp; - device_t dev = device_ctx->device; - int size, do_lro = 0; + int size, do_lro = 0, do_csum = 1; if (sc == NULL) { return (0); /* TODO: KYS how can this be! */ @@ -1192,7 +1166,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, memcpy(mtod(m_new, void *), packet->data, packet->tot_data_buf_len); m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; - sc->hn_small_pkts++; + rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. 
For packets 2K or less, @@ -1208,7 +1182,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { - device_printf(dev, "alloc mbuf failed.\n"); + if_printf(ifp, "alloc mbuf failed.\n"); return (0); } @@ -1216,21 +1190,28 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, } m_new->m_pkthdr.rcvif = ifp; + if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) + do_csum = 0; + /* receive side checksum offload */ - if (NULL != csum_info) { + if (csum_info != NULL) { /* IP csum offload */ - if (csum_info->receive.ip_csum_succeeded) { + if (csum_info->receive.ip_csum_succeeded && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); - sc->hn_csum_ip++; + rxr->hn_csum_ip++; } - /* TCP csum offload */ - if (csum_info->receive.tcp_csum_succeeded) { + /* TCP/UDP csum offload */ + if ((csum_info->receive.tcp_csum_succeeded || + csum_info->receive.udp_csum_succeeded) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; - sc->hn_csum_tcp++; + if (csum_info->receive.tcp_csum_succeeded) + rxr->hn_csum_tcp++; + else + rxr->hn_csum_udp++; } if (csum_info->receive.ip_csum_succeeded && @@ -1261,8 +1242,10 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { - if (sc->hn_trust_hosttcp) { - sc->hn_csum_trusted++; + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_TCP)) { + rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); @@ -1270,6 +1253,21 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, } /* Rely on SW csum verification though... */ do_lro = 1; + } else if (pr == IPPROTO_UDP) { + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_UDP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + } + } else if (pr != IPPROTO_DONE && do_csum && + (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } @@ -1289,10 +1287,10 @@ skip: if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) - struct lro_ctrl *lro = &sc->hn_lro; + struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { - sc->hn_lro_tried++; + rxr->hn_lro_tried++; if (tcp_lro_rx(lro, m_new, 0) == 0) { /* DONE! */ return 0; @@ -1308,18 +1306,8 @@ skip: } void -netvsc_recv_rollup(struct hv_device *device_ctx) +netvsc_recv_rollup(struct hv_device *device_ctx __unused) { -#if defined(INET) || defined(INET6) - hn_softc_t *sc = device_get_softc(device_ctx->device); - struct lro_ctrl *lro = &sc->hn_lro; - struct lro_entry *queued; - - while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { - SLIST_REMOVE_HEAD(&lro->lro_active, next); - tcp_lro_flush(lro, queued); - } -#endif } /* @@ -1377,12 +1365,23 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; + +#if __FreeBSD_version >= 1100099 /* - * Make sure that LRO high watermark is still valid, - * after MTU change (the 2*MTU limit). + * Make sure that LRO aggregation length limit is still + * valid, after the MTU change. 
*/ - if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat)) - hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp)); + NV_LOCK(sc); + if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < + HN_LRO_LENLIM_MIN(ifp)) { + int i; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + sc->hn_rx_ring[i].hn_lro.lro_length_lim = + HN_LRO_LENLIM_MIN(ifp); + } + } + NV_UNLOCK(sc); +#endif do { NV_LOCK(sc); @@ -1422,8 +1421,10 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max) - sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; + if (sc->hn_tx_ring[0].hn_tx_chimney_size > + sc->hn_tx_chimney_max) + hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); + hn_ifinit_locked(sc); NV_LOCK(sc); @@ -1483,47 +1484,43 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = 0; break; case SIOCSIFCAP: + NV_LOCK(sc); + mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { - if (IFCAP_TXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_TXCSUM; - ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP); + ifp->if_capenable ^= IFCAP_TXCSUM; + if (ifp->if_capenable & IFCAP_TXCSUM) { + ifp->if_hwassist |= + sc->hn_tx_ring[0].hn_csum_assist; } else { - ifp->if_capenable |= IFCAP_TXCSUM; - /* - * Only enable UDP checksum offloading on - * Windows Server 2012R2 or later releases. - */ - if (hv_vmbus_protocal_version >= - HV_VMBUS_VERSION_WIN8_1) { - ifp->if_hwassist |= - (CSUM_TCP | CSUM_UDP); - } else { - ifp->if_hwassist |= CSUM_TCP; - } + ifp->if_hwassist &= + ~sc->hn_tx_ring[0].hn_csum_assist; } } - if (mask & IFCAP_RXCSUM) { - if (IFCAP_RXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_RXCSUM; - } else { - ifp->if_capenable |= IFCAP_RXCSUM; - } - } + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= IFCAP_RXCSUM; + if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; - ifp->if_hwassist ^= CSUM_IP_TSO; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= CSUM_IP_TSO; + else + ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; - ifp->if_hwassist ^= CSUM_IP6_TSO; + if (ifp->if_capenable & IFCAP_TSO6) + ifp->if_hwassist |= CSUM_IP6_TSO; + else + ifp->if_hwassist &= ~CSUM_IP6_TSO; } + NV_UNLOCK(sc); error = 0; break; case SIOCADDMULTI: @@ -1566,7 +1563,8 @@ hn_stop(hn_softc_t *sc) if (bootverbose) printf(" Closing Device ...\n"); - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + atomic_clear_int(&ifp->if_drv_flags, + (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); if_link_state_change(ifp, LINK_STATE_DOWN); sc->hn_initdone = 0; @@ -1579,16 +1577,56 @@ hn_stop(hn_softc_t *sc) static void hn_start(struct ifnet *ifp) { - hn_softc_t *sc; + struct hn_softc *sc = ifp->if_softc; + struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; - sc = ifp->if_softc; - NV_LOCK(sc); - if (sc->temp_unusable) { - NV_UNLOCK(sc); - return; + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (!sched) + return; + } +do_sched: + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); +} + +static void +hn_start_txeof(struct hn_tx_ring *txr) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + 
atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (sched) { + taskqueue_enqueue(txr->hn_tx_taskq, + &txr->hn_tx_task); + } + } else { +do_sched: + /* + * Release the OACTIVE earlier, with the hope, that + * others could catch up. The task will clear the + * flag again with the hn_tx_lock to avoid possible + * races. + */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } - hn_start_locked(ifp); - NV_UNLOCK(sc); } /* @@ -1615,8 +1653,8 @@ hn_ifinit_locked(hn_softc_t *sc) } else { sc->hn_initdone = 1; } - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); if_link_state_change(ifp, LINK_STATE_UP); } @@ -1659,26 +1697,90 @@ hn_watchdog(struct ifnet *ifp) } #endif -#ifdef HN_LRO_HIWAT +#if __FreeBSD_version >= 1100099 + +static int +hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + unsigned int lenlim; + int error, i; + + lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; + error = sysctl_handle_int(oidp, &lenlim, 0, req); + if (error || req->newptr == NULL) + return error; + + if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || + lenlim > TCP_LRO_LENGTH_MAX) + return EINVAL; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; + NV_UNLOCK(sc); + return 0; +} + static int -hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) +hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; - int hiwat, error; + int ackcnt, error, i; - hiwat = sc->hn_lro_hiwat; - error = sysctl_handle_int(oidp, &hiwat, 0, req); + /* + * lro_ackcnt_lim is append count limit, + * +1 to turn it into aggregation limit. + */ + ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; + error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; - if (!HN_LRO_HIWAT_ISVALID(sc, hiwat)) + if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; - if (sc->hn_lro_hiwat != hiwat) - hn_set_lro_hiwat(sc, hiwat); + /* + * Convert aggregation limit back to append + * count limit. 
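The hn_lro_ackcnt_sysctl handler in this hunk exposes the LRO ACK limit as the number of ACKs aggregated per LRO entry, while tcp_lro stores it as an append count, so the handler adds 1 when reporting the value and subtracts 1 before storing it. A minimal standalone sketch of that conversion follows; TCP_LRO_ACKCNT_MAX is replaced here by an assumed placeholder value for illustration only.

#include <errno.h>
#include <stdio.h>

#define TCP_LRO_ACKCNT_MAX	65535	/* assumed placeholder, not the kernel's definition */

/* append count kept by tcp_lro -> user-visible aggregation limit */
static int
ackcnt_to_user(int lro_ackcnt_lim)
{
	return (lro_ackcnt_lim + 1);
}

/* user-visible aggregation limit -> append count, same bounds as the handler */
static int
ackcnt_from_user(int aggr_lim, int *lro_ackcnt_lim)
{
	if (aggr_lim < 2 || aggr_lim > TCP_LRO_ACKCNT_MAX + 1)
		return (EINVAL);
	*lro_ackcnt_lim = aggr_lim - 1;
	return (0);
}

int
main(void)
{
	int lim;

	printf("stored 1 -> user sees %d\n", ackcnt_to_user(1));
	if (ackcnt_from_user(8, &lim) == 0)
		printf("user writes 8 -> stored %d\n", lim);
	return (0);
}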
+ */ + --ackcnt; + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; + NV_UNLOCK(sc); + return 0; +} + +#endif + +static int +hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int hcsum = arg2; + int on, error, i; + + on = 0; + if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) + on = 1; + + error = sysctl_handle_int(oidp, &on, 0, req); + if (error || req->newptr == NULL) + return error; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (on) + rxr->hn_trust_hcsum |= hcsum; + else + rxr->hn_trust_hcsum &= ~hcsum; + } + NV_UNLOCK(sc); return 0; } -#endif /* HN_LRO_HIWAT */ static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) @@ -1686,7 +1788,7 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) struct hn_softc *sc = arg1; int chimney_size, error; - chimney_size = sc->hn_tx_chimney_size; + chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size; error = sysctl_handle_int(oidp, &chimney_size, 0, req); if (error || req->newptr == NULL) return error; @@ -1694,8 +1796,138 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0) return EINVAL; - if (sc->hn_tx_chimney_size != chimney_size) - sc->hn_tx_chimney_size = chimney_size; + hn_set_tx_chimney_size(sc, chimney_size); + return 0; +} + +#if __FreeBSD_version < 1100095 +static int +hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((int *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((int *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} +#else +static int +hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((uint64_t *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +#endif + +static int +hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((u_long *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. 
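The hn_rx_stat_*_sysctl and hn_tx_stat_ulong_sysctl handlers in this hunk receive a byte offset into the per-ring structure (registered later with __offsetof), so one handler can sum an arbitrary counter across all rings and reset it when the sysctl is written. A small userland sketch of that offset-based aggregation, using invented structure and field names:

#include <stddef.h>
#include <stdio.h>

struct demo_ring {
	unsigned long csum_ip;
	unsigned long csum_tcp;
};

/* Sum one counter, selected by its byte offset, over every ring and clear it. */
static unsigned long
sum_and_clear(struct demo_ring *rings, int cnt, size_t ofs)
{
	unsigned long total = 0;

	for (int i = 0; i < cnt; ++i) {
		unsigned long *p = (unsigned long *)((char *)&rings[i] + ofs);

		total += *p;
		*p = 0;		/* the driver only does this when the sysctl is written */
	}
	return (total);
}

int
main(void)
{
	struct demo_ring r[2] = { { 3, 5 }, { 7, 11 } };

	printf("csum_tcp total: %lu\n",
	    sum_and_clear(r, 2, offsetof(struct demo_ring, csum_tcp)));
	return (0);
}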
*/ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((u_long *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_tx_ring *txr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + stat += *((u_long *)((uint8_t *)txr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((u_long *)((uint8_t *)txr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error, conf; + struct hn_tx_ring *txr; + + txr = &sc->hn_tx_ring[0]; + conf = *((int *)((uint8_t *)txr + ofs)); + + error = sysctl_handle_int(oidp, &conf, 0, req); + if (error || req->newptr == NULL) + return error; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((int *)((uint8_t *)txr + ofs)) = conf; + } + NV_UNLOCK(sc); + return 0; } @@ -1786,17 +2018,191 @@ hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) *paddr = segs->ds_addr; } +static void +hn_create_rx_data(struct hn_softc *sc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev = sc->hn_dev; +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + int lroent_cnt; +#endif +#endif + int i; + + sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, + M_NETVSC, M_WAITOK | M_ZERO); + +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + lroent_cnt = hn_lro_entry_count; + if (lroent_cnt < TCP_LRO_ENTRIES) + lroent_cnt = TCP_LRO_ENTRIES; + device_printf(dev, "LRO: entry count %d\n", lroent_cnt); +#endif +#endif /* INET || INET6 */ + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (hn_trust_hosttcp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; + if (hn_trust_hostudp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; + if (hn_trust_hostip) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; + + /* + * Initialize LRO. 
+ */ +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 0); +#else + tcp_lro_init(&rxr->hn_lro); + rxr->hn_lro.ifp = sc->hn_ifp; +#endif +#if __FreeBSD_version >= 1100099 + rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; + rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; +#endif +#endif /* INET || INET6 */ + } + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", + CTLTYPE_U64 | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro.lro_queued), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO queued"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", + CTLTYPE_U64 | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO flushed"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro_tried), + hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); +#if __FreeBSD_version >= 1100099 + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", + CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU", + "Max # of data bytes to be aggregated by LRO"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I", + "Max # of ACKs to be aggregated by LRO"); +#endif + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP, + hn_trust_hcsum_sysctl, "I", + "Trust tcp segement verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP, + hn_trust_hcsum_sysctl, "I", + "Trust udp datagram verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP, + hn_trust_hcsum_sysctl, "I", + "Trust ip packet verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_ip), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_tcp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_udp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_trusted), + hn_rx_stat_ulong_sysctl, "LU", + "# of packets that we trust host's csum verification"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_small_pkts), + hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); +} + +static void +hn_destroy_rx_data(struct hn_softc *sc) +{ +#if defined(INET) || defined(INET6) + int i; +#endif + + if (sc->hn_rx_ring_cnt == 0) + return; + +#if defined(INET) || defined(INET6) + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + tcp_lro_free(&sc->hn_rx_ring[i].hn_lro); +#endif + free(sc->hn_rx_ring, M_NETVSC); + sc->hn_rx_ring = NULL; + + 
sc->hn_rx_ring_cnt = 0; +} + static int -hn_create_tx_ring(struct hn_softc *sc) +hn_create_tx_ring(struct hn_softc *sc, int id) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; bus_dma_tag_t parent_dtag; int error, i; - sc->hn_txdesc_cnt = HN_TX_DESC_CNT; - sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt, + txr->hn_sc = sc; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); +#endif + mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); + + txr->hn_txdesc_cnt = HN_TX_DESC_CNT; + txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK | M_ZERO); - SLIST_INIT(&sc->hn_txlist); - mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INIT(&txr->hn_txlist); +#else + txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC, + M_WAITOK, &txr->hn_tx_lock); +#endif + + txr->hn_tx_taskq = sc->hn_tx_taskq; + TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); + + txr->hn_direct_tx_size = hn_direct_tx_size; + if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) + txr->hn_csum_assist = HN_CSUM_ASSIST; + else + txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8; + + /* + * Always schedule transmission instead of trying to do direct + * transmission. This one gives the best performance so far. + */ + txr->hn_sched_tx = 1; + + txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */ parent_dtag = bus_get_dma_tag(sc->hn_dev); @@ -1813,7 +2219,7 @@ hn_create_tx_ring(struct hn_softc *sc) 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ - &sc->hn_tx_rndis_dtag); + &txr->hn_tx_rndis_dtag); if (error) { device_printf(sc->hn_dev, "failed to create rndis dmatag\n"); return error; @@ -1832,21 +2238,21 @@ hn_create_tx_ring(struct hn_softc *sc) 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ - &sc->hn_tx_data_dtag); + &txr->hn_tx_data_dtag); if (error) { device_printf(sc->hn_dev, "failed to create data dmatag\n"); return error; } - for (i = 0; i < sc->hn_txdesc_cnt; ++i) { - struct hn_txdesc *txd = &sc->hn_txdesc[i]; + for (i = 0; i < txr->hn_txdesc_cnt; ++i) { + struct hn_txdesc *txd = &txr->hn_txdesc[i]; - txd->sc = sc; + txd->txr = txr; /* * Allocate and load RNDIS messages. */ - error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag, + error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_msg, BUS_DMA_WAITOK | BUS_DMA_COHERENT, &txd->rndis_msg_dmap); @@ -1856,7 +2262,7 @@ hn_create_tx_ring(struct hn_softc *sc) return error; } - error = bus_dmamap_load(sc->hn_tx_rndis_dtag, + error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap, txd->rndis_msg, HN_RNDIS_MSG_LEN, hn_dma_map_paddr, &txd->rndis_msg_paddr, @@ -1864,59 +2270,277 @@ hn_create_tx_ring(struct hn_softc *sc) if (error) { device_printf(sc->hn_dev, "failed to load rndis_msg, %d\n", i); - bus_dmamem_free(sc->hn_tx_rndis_dtag, + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* DMA map for TX data. 
*/ - error = bus_dmamap_create(sc->hn_tx_data_dtag, 0, + error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(sc->hn_dev, "failed to allocate tx data dmamap\n"); - bus_dmamap_unload(sc->hn_tx_rndis_dtag, + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); - bus_dmamem_free(sc->hn_tx_rndis_dtag, + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; - SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); +#else + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif + } + txr->hn_txdesc_avail = txr->hn_txdesc_cnt; + + if (sc->hn_tx_sysctl_tree != NULL) { + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + char name[16]; + + /* + * Create per TX ring sysctl tree: + * dev.hn.UNIT.tx.RINGID + */ + ctx = device_get_sysctl_ctx(sc->hn_dev); + child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); + + snprintf(name, sizeof(name), "%d", id); + txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + name, CTLFLAG_RD, 0, ""); + + if (txr->hn_tx_sysctl_tree != NULL) { + child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", + CTLFLAG_RD, &txr->hn_txdesc_avail, 0, + "# of available TX descs"); + } } - sc->hn_txdesc_avail = sc->hn_txdesc_cnt; return 0; } static void -hn_destroy_tx_ring(struct hn_softc *sc) +hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) +{ + struct hn_tx_ring *txr = txd->txr; + + KASSERT(txd->m == NULL, ("still has mbuf installed")); + KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); + + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, + txd->rndis_msg_dmap); + bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); +} + +static void +hn_destroy_tx_ring(struct hn_tx_ring *txr) { struct hn_txdesc *txd; - while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) { - KASSERT(txd->m == NULL, ("still has mbuf installed")); - KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, - ("still dma mapped")); - SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + if (txr->hn_txdesc == NULL) + return; + +#ifndef HN_USE_TXDESC_BUFRING + while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { + SLIST_REMOVE_HEAD(&txr->hn_txlist, link); + hn_txdesc_dmamap_destroy(txd); + } +#else + while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) + hn_txdesc_dmamap_destroy(txd); +#endif + + if (txr->hn_tx_data_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_data_dtag); + if (txr->hn_tx_rndis_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); + +#ifdef HN_USE_TXDESC_BUFRING + buf_ring_free(txr->hn_txdesc_br, M_NETVSC); +#endif + + free(txr->hn_txdesc, M_NETVSC); + txr->hn_txdesc = NULL; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_destroy(&txr->hn_txlist_spin); +#endif + mtx_destroy(&txr->hn_tx_lock); +} + +static int +hn_create_tx_data(struct hn_softc *sc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + int i; + + sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, + M_NETVSC, M_WAITOK | M_ZERO); + + ctx = device_get_sysctl_ctx(sc->hn_dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); + + /* Create dev.hn.UNIT.tx sysctl tree */ + sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", + CTLFLAG_RD, 0, ""); + + for (i = 0; i < 
sc->hn_tx_ring_cnt; ++i) { + int error; + + error = hn_create_tx_ring(sc, i); + if (error) + return error; + } + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_no_txdescs), + hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_send_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_txdma_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_tx_collapsed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_tx_chimney), + hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", + CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, + "# of total TX descs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", + CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, + "Chimney send packet size upper boundary"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, + "I", "Chimney send packet size limit"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", + CTLTYPE_INT | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_direct_tx_size), + hn_tx_conf_int_sysctl, "I", + "Size of the packet for direct transmission"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", + CTLTYPE_INT | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_sched_tx), + hn_tx_conf_int_sysctl, "I", + "Always schedule transmission " + "instead of doing direct transmission"); + + return 0; +} + +static void +hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size) +{ + int i; - bus_dmamap_unload(sc->hn_tx_rndis_dtag, - txd->rndis_msg_dmap); - bus_dmamem_free(sc->hn_tx_rndis_dtag, - txd->rndis_msg, txd->rndis_msg_dmap); + NV_LOCK(sc); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size; + NV_UNLOCK(sc); +} + +static void +hn_destroy_tx_data(struct hn_softc *sc) +{ + int i; + + if (sc->hn_tx_ring_cnt == 0) + return; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + hn_destroy_tx_ring(&sc->hn_tx_ring[i]); + + free(sc->hn_tx_ring, M_NETVSC); + sc->hn_tx_ring = NULL; + + sc->hn_tx_ring_cnt = 0; +} + +static void +hn_start_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_start_txeof_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_stop_tx_tasks(struct hn_softc *sc) +{ + int i; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} - bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap); +static void +hn_tx_taskq_create(void *arg __unused) +{ + if 
(!hn_share_tx_taskq) + return; + + hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &hn_tx_taskq); + taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); + if (hn_bind_tx_taskq >= 0) { + int cpu = hn_bind_tx_taskq; + struct task cpuset_task; + cpuset_t cpu_set; + + if (cpu > mp_ncpus - 1) + cpu = mp_ncpus - 1; + CPU_SETOF(cpu, &cpu_set); + TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set); + taskqueue_enqueue(hn_tx_taskq, &cpuset_task); + taskqueue_drain(hn_tx_taskq, &cpuset_task); } +} +SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST, + hn_tx_taskq_create, NULL); - if (sc->hn_tx_data_dtag != NULL) - bus_dma_tag_destroy(sc->hn_tx_data_dtag); - if (sc->hn_tx_rndis_dtag != NULL) - bus_dma_tag_destroy(sc->hn_tx_rndis_dtag); - free(sc->hn_txdesc, M_NETVSC); - mtx_destroy(&sc->hn_txlist_spin); +static void +hn_tx_taskq_destroy(void *arg __unused) +{ + if (hn_tx_taskq != NULL) + taskqueue_free(hn_tx_taskq); } +SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST, + hn_tx_taskq_destroy, NULL); static device_method_t netvsc_methods[] = { /* Device interface */ diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 29d8c8f..31ddbc0 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -136,12 +136,9 @@ hv_get_rndis_device(void) { rndis_device *device; - device = malloc(sizeof(rndis_device), M_NETVSC, M_NOWAIT | M_ZERO); - if (device == NULL) { - return (NULL); - } + device = malloc(sizeof(rndis_device), M_NETVSC, M_WAITOK | M_ZERO); - mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_DEF); /* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */ STAILQ_INIT(&device->myrequest_list); @@ -172,10 +169,7 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, rndis_msg *rndis_mesg; rndis_set_request *set; - request = malloc(sizeof(rndis_request), M_NETVSC, M_NOWAIT | M_ZERO); - if (request == NULL) { - return (NULL); - } + request = malloc(sizeof(rndis_request), M_NETVSC, M_WAITOK | M_ZERO); sema_init(&request->wait_sema, 0, "rndis sema"); @@ -194,9 +188,9 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, set->request_id += 1; /* Add to the request list */ - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry); - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); return (request); } @@ -207,14 +201,14 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, static inline void hv_put_rndis_request(rndis_device *device, rndis_request *request) { - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); /* Fixme: Has O(n) performance */ /* * XXXKYS: Use Doubly linked lists. 
*/ STAILQ_REMOVE(&device->myrequest_list, request, rndis_request_, mylist_entry); - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); sema_destroy(&request->wait_sema); free(request, M_NETVSC); @@ -271,7 +265,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response) rndis_request *next_request; boolean_t found = FALSE; - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); request = STAILQ_FIRST(&device->myrequest_list); while (request != NULL) { /* @@ -286,7 +280,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response) next_request = STAILQ_NEXT(request, mylist_entry); request = next_request; } - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); if (found) { if (response->msg_len <= sizeof(rndis_msg)) { diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index a780f9e..27fb3fd 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -856,8 +856,8 @@ hv_storvsc_rescan_target(struct storvsc_softc *sc) if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { - printf("unable to create path for rescan, pathid: %d," - "targetid: %d\n", pathid, targetid); + printf("unable to create path for rescan, pathid: %u," + "targetid: %u\n", pathid, targetid); xpt_free_ccb(ccb); return; } @@ -1561,13 +1561,12 @@ static void storvsc_destroy_bounce_buffer(struct sglist *sgl) { struct hv_sgl_node *sgl_node = NULL; - - sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); - LIST_REMOVE(sgl_node, link); - if (NULL == sgl_node) { + if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) { printf("storvsc error: not enough in use sgl\n"); return; } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_REMOVE(sgl_node, link); sgl_node->sgl_data = sgl; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } @@ -1593,12 +1592,12 @@ storvsc_create_bounce_buffer(uint16_t seg_count, int write) struct hv_sgl_node *sgl_node = NULL; /* get struct sglist from free_sgl_list */ - sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); - LIST_REMOVE(sgl_node, link); - if (NULL == sgl_node) { + if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { printf("storvsc error: not enough free sgl\n"); return NULL; } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); bounce_sgl = sgl_node->sgl_data; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c new file mode 100644 index 0000000..c1b6da5 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_heartbeat.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
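The storvsc changes above move the empty-list check ahead of LIST_FIRST()/LIST_REMOVE(); the previous code removed the head node before verifying that it existed. A runnable sketch of the corrected ordering, assuming the 4.4BSD list macros from <sys/queue.h> and using invented names:

#include <sys/queue.h>
#include <stdio.h>

struct node {
	LIST_ENTRY(node) link;
	int id;
};
LIST_HEAD(node_list, node);

/* Detach the first node, checking for an empty list first as the fix does. */
static struct node *
take_first(struct node_list *head)
{
	struct node *n;

	if (LIST_EMPTY(head)) {
		fprintf(stderr, "list unexpectedly empty\n");
		return (NULL);
	}
	n = LIST_FIRST(head);
	LIST_REMOVE(n, link);
	return (n);
}

int
main(void)
{
	struct node_list head = LIST_HEAD_INITIALIZER(head);
	struct node a = { .id = 42 };
	struct node *n;

	LIST_INSERT_HEAD(&head, &a, link);
	n = take_first(&head);
	printf("%d\n", n != NULL ? n->id : -1);			/* 42 */
	printf("%d\n", take_first(&head) != NULL ? 0 : -1);	/* -1, list is now empty */
	return (0);
}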
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +/* Heartbeat Service */ +static hv_guid service_guid = { .data = + {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, + 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} }; + +/** + * Process heartbeat message + */ +static void +hv_heartbeat_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel; + uint32_t recvlen; + uint64_t requestid; + int ret; + + struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; + struct hv_vmbus_icmsg_hdr* icmsghdrp; + hv_util_sc *softc; + + softc = (hv_util_sc*)context; + buf = softc->receive_buffer;; + channel = softc->hv_dev->channel; + + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen, + &requestid); + + if ((ret == 0) && recvlen > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + heartbeat_msg = + (struct hv_vmbus_heartbeat_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + heartbeat_msg->seq_num += 1; + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + +static int +hv_heartbeat_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Heartbeat Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int +hv_heartbeat_attach(device_t dev) +{ + hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); + + softc->callback = hv_heartbeat_cb; + + return hv_util_attach(dev); +} + +static device_method_t heartbeat_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_heartbeat_probe), + DEVMETHOD(device_attach, hv_heartbeat_attach), + DEVMETHOD(device_detach, hv_util_detach), + { 0, 0 } +}; + +static driver_t heartbeat_driver = { "hvheartbeat", heartbeat_methods, sizeof(hv_util_sc)}; + +static devclass_t heartbeat_devclass; + +DRIVER_MODULE(hv_heartbeat, vmbus, heartbeat_driver, heartbeat_devclass, NULL, NULL); +MODULE_VERSION(hv_heartbeat, 1); +MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index 58d565c4..8517918 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include <dev/hyperv/include/hyperv.h> #include 
<dev/hyperv/netvsc/hv_net_vsc.h> +#include "hv_util.h" #include "unicode.h" #include "hv_kvp.h" @@ -74,8 +75,6 @@ __FBSDID("$FreeBSD$"); /* hv_kvp debug control */ static int hv_kvp_log = 0; -SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0, - "hv_kvp log"); #define hv_kvp_log_error(...) do { \ if (hv_kvp_log > 0) \ @@ -87,6 +86,10 @@ SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0, log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ } while (0) +static hv_guid service_guid = { .data = + {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, + 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6} }; + /* character device prototypes */ static d_open_t hv_kvp_dev_open; static d_close_t hv_kvp_dev_close; @@ -94,12 +97,6 @@ static d_read_t hv_kvp_dev_daemon_read; static d_write_t hv_kvp_dev_daemon_write; static d_poll_t hv_kvp_dev_daemon_poll; -/* hv_kvp prototypes */ -static int hv_kvp_req_in_progress(void); -static void hv_kvp_transaction_init(uint32_t, hv_vmbus_channel *, uint64_t, uint8_t *); -static void hv_kvp_send_msg_to_daemon(void); -static void hv_kvp_process_request(void *context); - /* hv_kvp character device structure */ static struct cdevsw hv_kvp_cdevsw = { @@ -111,70 +108,67 @@ static struct cdevsw hv_kvp_cdevsw = .d_poll = hv_kvp_dev_daemon_poll, .d_name = "hv_kvp_dev", }; -static struct cdev *hv_kvp_dev; -static struct hv_kvp_msg *hv_kvp_dev_buf; -struct proc *daemon_task; -static struct selinfo hv_kvp_selinfo; /* * Global state to track and synchronize multiple * KVP transaction requests from the host. */ -static struct { - - /* Pre-allocated work item for queue */ - hv_work_item work_item; +typedef struct hv_kvp_sc { + struct hv_util_sc util_sc; - /* Unless specified the pending mutex should be + /* Unless specified the pending mutex should be * used to alter the values of the following paramters: * 1. req_in_progress * 2. req_timed_out - * 3. pending_reqs. */ - struct mtx pending_mutex; - + struct mtx pending_mutex; + + struct task task; + /* To track if transaction is active or not */ - boolean_t req_in_progress; + boolean_t req_in_progress; /* Tracks if daemon did not reply back in time */ - boolean_t req_timed_out; + boolean_t req_timed_out; /* Tracks if daemon is serving a request currently */ boolean_t daemon_busy; - /* Count of KVP requests from Hyper-V. 
*/ - uint64_t pending_reqs; - - - /* Length of host message */ - uint32_t host_msg_len; - /* Pointer to channel */ - hv_vmbus_channel *channelp; + /* Length of host message */ + uint32_t host_msg_len; /* Host message id */ - uint64_t host_msg_id; - + uint64_t host_msg_id; + /* Current kvp message from the host */ - struct hv_kvp_msg *host_kvp_msg; - + struct hv_kvp_msg *host_kvp_msg; + /* Current kvp message for daemon */ - struct hv_kvp_msg daemon_kvp_msg; - + struct hv_kvp_msg daemon_kvp_msg; + /* Rcv buffer for communicating with the host*/ - uint8_t *rcv_buf; - + uint8_t *rcv_buf; + /* Device semaphore to control communication */ - struct sema dev_sema; - + struct sema dev_sema; + /* Indicates if daemon registered with driver */ - boolean_t register_done; - + boolean_t register_done; + /* Character device status */ - boolean_t dev_accessed; -} kvp_globals; + boolean_t dev_accessed; + + struct cdev *hv_kvp_dev; + + struct proc *daemon_task; -/* global vars */ -MALLOC_DECLARE(M_HV_KVP_DEV_BUF); -MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev module"); + struct selinfo hv_kvp_selinfo; +} hv_kvp_sc; + +/* hv_kvp prototypes */ +static int hv_kvp_req_in_progress(hv_kvp_sc *sc); +static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *); +static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc); +static void hv_kvp_process_request(void *context, int pending); /* * hv_kvp low level functions @@ -184,10 +178,10 @@ MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev modu * Check if kvp transaction is in progres */ static int -hv_kvp_req_in_progress(void) +hv_kvp_req_in_progress(hv_kvp_sc *sc) { - return (kvp_globals.req_in_progress); + return (sc->req_in_progress); } @@ -195,18 +189,17 @@ hv_kvp_req_in_progress(void) * This routine is called whenever a message is received from the host */ static void -hv_kvp_transaction_init(uint32_t rcv_len, hv_vmbus_channel *rcv_channel, +hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, uint64_t request_id, uint8_t *rcv_buf) { - + /* Store all the relevant message details in the global structure */ /* Do not need to use mutex for req_in_progress here */ - kvp_globals.req_in_progress = true; - kvp_globals.host_msg_len = rcv_len; - kvp_globals.channelp = rcv_channel; - kvp_globals.host_msg_id = request_id; - kvp_globals.rcv_buf = rcv_buf; - kvp_globals.host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ + sc->req_in_progress = true; + sc->host_msg_len = rcv_len; + sc->host_msg_id = request_id; + sc->rcv_buf = rcv_buf; + sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ sizeof(struct hv_vmbus_pipe_hdr) + sizeof(struct hv_vmbus_icmsg_hdr)]; } @@ -258,12 +251,12 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, * Convert ip related info in umsg from utf8 to utf16 and store in hmsg */ static int -hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, +hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, struct hv_kvp_ip_msg *host_ip_msg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; - + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.ip_addr, @@ -294,7 +287,7 @@ hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, strlen((char *)umsg->body.kvp_ip_val.adapter_id), UNUSED_FLAG, &err_adap); - + host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled; host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family; @@ -389,7 +382,7 @@ 
hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_subnet); - + utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, @@ -411,16 +404,13 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, * Ensure utf16_utf8 takes care of the additional string terminating char!! */ static void -hv_kvp_convert_hostmsg_to_usermsg(void) +hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg) { int utf_err = 0; uint32_t value_type; - struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *) - kvp_globals.host_kvp_msg; - - struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg; - struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg; + struct hv_kvp_ip_msg *host_ip_msg; + host_ip_msg = (struct hv_kvp_ip_msg*)hmsg; memset(umsg, 0, sizeof(struct hv_kvp_msg)); umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation; @@ -525,14 +515,12 @@ hv_kvp_convert_hostmsg_to_usermsg(void) * Prepare a host kvp msg based on user kvp msg (utf8 to utf16) */ static int -hv_kvp_convert_usermsg_to_hostmsg(void) +hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg) { int hkey_len = 0, hvalue_len = 0, utf_err = 0; struct hv_kvp_exchg_msg_value *host_exchg_data; char *key_name, *value; - struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg; - struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg; struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg; switch (hmsg->kvp_hdr.operation) { @@ -564,7 +552,7 @@ hv_kvp_convert_usermsg_to_hostmsg(void) if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); - + return (KVP_SUCCESS); case HV_KVP_OP_GET: @@ -580,9 +568,9 @@ hv_kvp_convert_usermsg_to_hostmsg(void) /* Use values by string */ host_exchg_data->value_type = HV_REG_SZ; - if ((hkey_len < 0) || (hvalue_len < 0)) + if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); - + return (KVP_SUCCESS); default: @@ -595,22 +583,22 @@ hv_kvp_convert_usermsg_to_hostmsg(void) * Send the response back to the host. 
*/ static void -hv_kvp_respond_host(int error) +hv_kvp_respond_host(hv_kvp_sc *sc, int error) { struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp; hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *) - &kvp_globals.rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; if (error) error = HV_KVP_E_FAIL; hv_icmsg_hdrp->status = error; hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE; - - error = hv_vmbus_channel_send_packet(kvp_globals.channelp, - kvp_globals.rcv_buf, - kvp_globals.host_msg_len, kvp_globals.host_msg_id, + + error = hv_vmbus_channel_send_packet(sc->util_sc.hv_dev->channel, + sc->rcv_buf, + sc->host_msg_len, sc->host_msg_id, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); if (error) @@ -624,16 +612,19 @@ hv_kvp_respond_host(int error) * and the host */ static void -hv_kvp_send_msg_to_daemon(void) +hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc) { + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; + /* Prepare kvp_msg to be sent to user */ - hv_kvp_convert_hostmsg_to_usermsg(); + hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg); /* Send the msg to user via function deamon_read - setting sema */ - sema_post(&kvp_globals.dev_sema); + sema_post(&sc->dev_sema); /* We should wake up the daemon, in case it's doing poll() */ - selwakeup(&hv_kvp_selinfo); + selwakeup(&sc->hv_kvp_selinfo); } @@ -642,98 +633,83 @@ hv_kvp_send_msg_to_daemon(void) * and interact with daemon */ static void -hv_kvp_process_request(void *context) +hv_kvp_process_request(void *context, int pending) { uint8_t *kvp_buf; - hv_vmbus_channel *channel = context; + hv_vmbus_channel *channel; uint32_t recvlen = 0; uint64_t requestid; struct hv_vmbus_icmsg_hdr *icmsghdrp; int ret = 0; - uint64_t pending_cnt = 1; - + hv_kvp_sc *sc; + hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__); - kvp_buf = receive_buffer[HV_KVP]; + + sc = (hv_kvp_sc*)context; + kvp_buf = sc->util_sc.receive_buffer;; + channel = sc->util_sc.hv_dev->channel; + ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - /* - * We start counting only after the daemon registers - * and therefore there could be requests pending in - * the VMBus that are not reflected in pending_cnt. - * Therefore we continue reading as long as either of - * the below conditions is true. - */ + while ((ret == 0) && (recvlen > 0)) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); + hv_kvp_respond_host(sc, ret); + + /* + * It is ok to not acquire the mutex before setting + * req_in_progress here because negotiation is the + * first thing that happens and hence there is no + * chance of a race condition. 
+ */ + + sc->req_in_progress = false; + hv_kvp_log_info("%s :version negotiated\n", __func__); + + } else { + if (!sc->daemon_busy) { + + hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); + mtx_lock(&sc->pending_mutex); + sc->req_timed_out = false; + sc->daemon_busy = true; + mtx_unlock(&sc->pending_mutex); - while ((pending_cnt>0) || ((ret == 0) && (recvlen > 0))) { - - if ((ret == 0) && (recvlen>0)) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - hv_kvp_transaction_init(recvlen, channel, requestid, kvp_buf); - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); - hv_kvp_respond_host(ret); - - /* - * It is ok to not acquire the mutex before setting - * req_in_progress here because negotiation is the - * first thing that happens and hence there is no - * chance of a race condition. - */ - - kvp_globals.req_in_progress = false; - hv_kvp_log_info("%s :version negotiated\n", __func__); - - } else { - if (!kvp_globals.daemon_busy) { - - hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); - mtx_lock(&kvp_globals.pending_mutex); - kvp_globals.req_timed_out = false; - kvp_globals.daemon_busy = true; - mtx_unlock(&kvp_globals.pending_mutex); - - hv_kvp_send_msg_to_daemon(); - hv_kvp_log_info("%s: waiting for daemon\n", __func__); - } - - /* Wait 5 seconds for daemon to respond back */ - tsleep(&kvp_globals, 0, "kvpworkitem", 5 * hz); - hv_kvp_log_info("%s: came out of wait\n", __func__); + hv_kvp_send_msg_to_daemon(sc); + hv_kvp_log_info("%s: waiting for daemon\n", __func__); } + + /* Wait 5 seconds for daemon to respond back */ + tsleep(sc, 0, "kvpworkitem", 5 * hz); + hv_kvp_log_info("%s: came out of wait\n", __func__); } - mtx_lock(&kvp_globals.pending_mutex); - + mtx_lock(&sc->pending_mutex); + /* Notice that once req_timed_out is set to true * it will remain true until the next request is * sent to the daemon. The response from daemon - * is forwarded to host only when this flag is - * false. + * is forwarded to host only when this flag is + * false. */ - kvp_globals.req_timed_out = true; + sc->req_timed_out = true; /* * Cancel request if so need be. 
*/ - if (hv_kvp_req_in_progress()) { + if (hv_kvp_req_in_progress(sc)) { hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__); - hv_kvp_respond_host(HV_KVP_E_FAIL); - kvp_globals.req_in_progress = false; - } - - /* - * Decrement pending request count and - */ - if (kvp_globals.pending_reqs>0) { - kvp_globals.pending_reqs = kvp_globals.pending_reqs - 1; + hv_kvp_respond_host(sc, HV_KVP_E_FAIL); + sc->req_in_progress = false; } - pending_cnt = kvp_globals.pending_reqs; - - mtx_unlock(&kvp_globals.pending_mutex); + + mtx_unlock(&sc->pending_mutex); /* * Try reading next buffer @@ -741,109 +717,43 @@ hv_kvp_process_request(void *context) recvlen = 0; ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", - __func__, context, (unsigned long long)pending_cnt, ret, recvlen); - } + hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", + __func__, context, ret, recvlen); + } } /* * Callback routine that gets called whenever there is a message from host */ -void +static void hv_kvp_callback(void *context) { - uint64_t pending_cnt = 0; - - if (kvp_globals.register_done == false) { - - kvp_globals.channelp = context; - } else { - - mtx_lock(&kvp_globals.pending_mutex); - kvp_globals.pending_reqs = kvp_globals.pending_reqs + 1; - pending_cnt = kvp_globals.pending_reqs; - mtx_unlock(&kvp_globals.pending_mutex); - if (pending_cnt == 1) { - hv_kvp_log_info("%s: Queuing work item\n", __func__); - hv_queue_work_item( - service_table[HV_KVP].work_queue, - hv_kvp_process_request, - context - ); - } - } -} - - -/* - * This function is called by the hv_kvp_init - - * creates character device hv_kvp_dev - * allocates memory to hv_kvp_dev_buf - * - */ -static int -hv_kvp_dev_init(void) -{ - int error = 0; - - /* initialize semaphore */ - sema_init(&kvp_globals.dev_sema, 0, "hv_kvp device semaphore"); - /* create character device */ - error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, - &hv_kvp_dev, - &hv_kvp_cdevsw, - 0, - UID_ROOT, - GID_WHEEL, - 0640, - "hv_kvp_dev"); - - if (error != 0) - return (error); - + hv_kvp_sc *sc = (hv_kvp_sc*)context; /* - * Malloc with M_WAITOK flag will never fail. - */ - hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_HV_KVP_DEV_BUF, M_WAITOK | - M_ZERO); - - return (0); -} - - -/* - * This function is called by the hv_kvp_deinit - - * destroy character device - */ -static void -hv_kvp_dev_destroy(void) -{ - - if (daemon_task != NULL) { - PROC_LOCK(daemon_task); - kern_psignal(daemon_task, SIGKILL); - PROC_UNLOCK(daemon_task); + The first request from host will not be handled until daemon is registered. + when callback is triggered without a registered daemon, callback just return. + When a new daemon gets regsitered, this callbcak is trigged from _write op. 
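The KVP task in this hunk wakes the user-level daemon and then waits at most five seconds (tsleep(sc, 0, "kvpworkitem", 5 * hz)); if the daemon misses that window, req_timed_out is set under pending_mutex so the write path drops the late reply instead of answering the host twice. A rough userland analogue of that timed handshake, assuming POSIX threads in place of the kernel sleep/wakeup primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool answered, timed_out;

/* Driver side: wait up to 5 seconds, then mark the request as timed out. */
static void
wait_for_daemon(void)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 5;

	pthread_mutex_lock(&lock);
	while (!answered && pthread_cond_timedwait(&cv, &lock, &deadline) == 0)
		;
	if (!answered)
		timed_out = true;	/* late replies are ignored from now on */
	pthread_mutex_unlock(&lock);
}

/* Daemon side: only deliver the answer if the request has not timed out. */
static void *
daemon_reply(void *arg)
{
	(void)arg;
	sleep(1);			/* the daemon takes a moment to respond */
	pthread_mutex_lock(&lock);
	if (!timed_out) {		/* mirrors the !sc->req_timed_out check */
		answered = true;
		pthread_cond_signal(&cv);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, daemon_reply, NULL);
	wait_for_daemon();
	pthread_join(t, NULL);
	printf("%s\n", answered ? "reply accepted" : "timed out, reply dropped");
	return (0);
}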
+ */ + if (sc->register_done) { + hv_kvp_log_info("%s: Queuing work item\n", __func__); + taskqueue_enqueue(taskqueue_thread, &sc->task); } - - destroy_dev(hv_kvp_dev); - free(hv_kvp_dev_buf, M_HV_KVP_DEV_BUF); - return; } - static int hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { - + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__); - if (kvp_globals.dev_accessed) + if (sc->dev_accessed) return (-EBUSY); - - daemon_task = curproc; - kvp_globals.dev_accessed = true; - kvp_globals.daemon_busy = false; + + sc->daemon_task = curproc; + sc->dev_accessed = true; + sc->daemon_busy = false; return (0); } @@ -852,10 +762,11 @@ static int hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, struct thread *td __unused) { + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__); - kvp_globals.dev_accessed = false; - kvp_globals.register_done = false; + sc->dev_accessed = false; + sc->register_done = false; return (0); } @@ -865,18 +776,21 @@ hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __un * acts as a send to daemon */ static int -hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __unused) +hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; /* Check hv_kvp daemon registration status*/ - if (!kvp_globals.register_done) + if (!sc->register_done) return (KVP_ERROR); - sema_wait(&kvp_globals.dev_sema); + sema_wait(&sc->dev_sema); - memcpy(hv_kvp_dev_buf, &kvp_globals.daemon_kvp_msg, sizeof(struct hv_kvp_msg)); + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); + memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg)); amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 
0 : BUFFERSIZE + 1 - uio->uio_offset); @@ -884,6 +798,7 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __ if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0) hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__); + free(hv_kvp_dev_buf, M_TEMP); return (error); } @@ -893,29 +808,30 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __ * acts as a recieve from daemon */ static int -hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag __unused) +hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; uio->uio_offset = 0; + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); amt = MIN(uio->uio_resid, BUFFERSIZE); error = uiomove(hv_kvp_dev_buf, amt, uio); - if (error != 0) + if (error != 0) { + free(hv_kvp_dev_buf, M_TEMP); return (error); + } + memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); - memcpy(&kvp_globals.daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); - - if (kvp_globals.register_done == false) { - if (kvp_globals.daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { - - kvp_globals.register_done = true; - if (kvp_globals.channelp) { - - hv_kvp_callback(kvp_globals.channelp); - } + free(hv_kvp_dev_buf, M_TEMP); + if (sc->register_done == false) { + if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { + sc->register_done = true; + hv_kvp_callback(dev->si_drv1); } else { hv_kvp_log_info("%s, KVP Registration Failed\n", __func__); @@ -923,18 +839,20 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _ } } else { - mtx_lock(&kvp_globals.pending_mutex); + mtx_lock(&sc->pending_mutex); - if(!kvp_globals.req_timed_out) { + if(!sc->req_timed_out) { + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; - hv_kvp_convert_usermsg_to_hostmsg(); - hv_kvp_respond_host(KVP_SUCCESS); - wakeup(&kvp_globals); - kvp_globals.req_in_progress = false; + hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg); + hv_kvp_respond_host(sc, KVP_SUCCESS); + wakeup(sc); + sc->req_in_progress = false; } - kvp_globals.daemon_busy = false; - mtx_unlock(&kvp_globals.pending_mutex); + sc->daemon_busy = false; + mtx_unlock(&sc->pending_mutex); } return (error); @@ -946,66 +864,106 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _ * for daemon to read. */ static int -hv_kvp_dev_daemon_poll(struct cdev *dev __unused, int events, struct thread *td) +hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) { int revents = 0; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; - mtx_lock(&kvp_globals.pending_mutex); + mtx_lock(&sc->pending_mutex); /* * We check global flag daemon_busy for the data availiability for * userland to read. Deamon_busy is set to true before driver has data * for daemon to read. It is set to false after daemon sends * then response back to driver. */ - if (kvp_globals.daemon_busy == true) + if (sc->daemon_busy == true) revents = POLLIN; else - selrecord(td, &hv_kvp_selinfo); + selrecord(td, &sc->hv_kvp_selinfo); - mtx_unlock(&kvp_globals.pending_mutex); + mtx_unlock(&sc->pending_mutex); return (revents); } - -/* - * hv_kvp initialization function - * called from hv_util service. 
- * - */ -int -hv_kvp_init(hv_vmbus_service *srv) +static int +hv_kvp_probe(device_t dev) { - int error = 0; - hv_work_queue *work_queue = NULL; - - memset(&kvp_globals, 0, sizeof(kvp_globals)); - - work_queue = hv_work_queue_create("KVP Service"); - if (work_queue == NULL) { - hv_kvp_log_info("%s: Work queue alloc failed\n", __func__); - error = ENOMEM; - hv_kvp_log_error("%s: ENOMEM\n", __func__); - goto Finish; + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V KVP Service"); + return BUS_PROBE_DEFAULT; } - srv->work_queue = work_queue; - error = hv_kvp_dev_init(); - mtx_init(&kvp_globals.pending_mutex, "hv-kvp pending mutex", - NULL, MTX_DEF); - kvp_globals.pending_reqs = 0; + return ENXIO; +} + +static int +hv_kvp_attach(device_t dev) +{ + int error; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); + sc->util_sc.callback = hv_kvp_callback; + sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); + mtx_init(&sc->pending_mutex, "hv-kvp pending mutex", + NULL, MTX_DEF); -Finish: - return (error); -} + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log", + CTLFLAG_RW, &hv_kvp_log, 0, "Hyperv KVP service log level"); -void -hv_kvp_deinit(void) + TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc); + + /* create character device */ + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, + &sc->hv_kvp_dev, + &hv_kvp_cdevsw, + 0, + UID_ROOT, + GID_WHEEL, + 0640, + "hv_kvp_dev"); + + if (error != 0) + return (error); + sc->hv_kvp_dev->si_drv1 = sc; + + return hv_util_attach(dev); +} + +static int +hv_kvp_detach(device_t dev) { - hv_kvp_dev_destroy(); - mtx_destroy(&kvp_globals.pending_mutex); + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); - return; + if (sc->daemon_task != NULL) { + PROC_LOCK(sc->daemon_task); + kern_psignal(sc->daemon_task, SIGKILL); + PROC_UNLOCK(sc->daemon_task); + } + + destroy_dev(sc->hv_kvp_dev); + return hv_util_detach(dev); } + +static device_method_t kvp_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_kvp_probe), + DEVMETHOD(device_attach, hv_kvp_attach), + DEVMETHOD(device_detach, hv_kvp_detach), + { 0, 0 } +}; + +static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)}; + +static devclass_t kvp_devclass; + +DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL); +MODULE_VERSION(hv_kvp, 1); +MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h index b67373fa..b62149e 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.h +++ b/sys/dev/hyperv/utilities/hv_kvp.h @@ -238,17 +238,4 @@ struct hv_kvp_ip_msg { struct hv_kvp_ipaddr_value kvp_ip_val; } __attribute__((packed)); - -#define HV_SHUT_DOWN 0 -#define HV_TIME_SYNCH 1 -#define HV_HEART_BEAT 2 -#define HV_KVP 3 -#define HV_MAX_UTIL_SERVICES 4 - -#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ -#define HV_ICTIMESYNCFLAG_PROBE 0 -#define HV_ICTIMESYNCFLAG_SYNC 1 -#define HV_ICTIMESYNCFLAG_SAMPLE 2 -#define HV_NANO_SEC_PER_SEC 1000000000 - #endif /* _KVP_H */ diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c new file mode 100644 index 0000000..20bc65e --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_shutdown.c @@ -0,0 +1,151 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * A common driver for all hyper-V util services. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +static hv_guid service_guid = { .data = + {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49, + 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB} }; + +/** + * Shutdown + */ +static void +hv_shutdown_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel; + uint8_t execute_shutdown = 0; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recv_len; + uint64_t request_id; + int ret; + hv_vmbus_shutdown_msg_data* shutdown_msg; + hv_util_sc *softc; + + softc = (hv_util_sc*)context; + buf = softc->receive_buffer;; + channel = softc->hv_dev->channel; + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, + &recv_len, &request_id); + + if ((ret == 0) && recv_len > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + shutdown_msg = + (struct hv_vmbus_shutdown_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + switch (shutdown_msg->flags) { + case 0: + case 1: + icmsghdrp->status = HV_S_OK; + execute_shutdown = 1; + if(bootverbose) + printf("Shutdown request received -" + " graceful shutdown initiated\n"); + break; + default: + icmsghdrp->status = HV_E_FAIL; + execute_shutdown = 0; + printf("Shutdown request received -" + " Invalid request\n"); + break; + } + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, + recv_len, request_id, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } + + if (execute_shutdown) + shutdown_nice(RB_POWEROFF); +} + +static int +hv_shutdown_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Shutdown Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int 
+hv_shutdown_attach(device_t dev) +{ + hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); + + softc->callback = hv_shutdown_cb; + + return hv_util_attach(dev); +} + +static device_method_t shutdown_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_shutdown_probe), + DEVMETHOD(device_attach, hv_shutdown_attach), + DEVMETHOD(device_detach, hv_util_detach), + { 0, 0 } +}; + +static driver_t shutdown_driver = { "hvshutdown", shutdown_methods, sizeof(hv_util_sc)}; + +static devclass_t shutdown_devclass; + +DRIVER_MODULE(hv_shutdown, vmbus, shutdown_driver, shutdown_devclass, NULL, NULL); +MODULE_VERSION(hv_shutdown, 1); +MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c new file mode 100644 index 0000000..d1ea904 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_timesync.c @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * A common driver for all hyper-V util services. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ +#define HV_ICTIMESYNCFLAG_PROBE 0 +#define HV_ICTIMESYNCFLAG_SYNC 1 +#define HV_ICTIMESYNCFLAG_SAMPLE 2 +#define HV_NANO_SEC_PER_SEC 1000000000 + +/* Time Sync data */ +typedef struct { + uint64_t data; +} time_sync_data; + + /* Time Synch Service */ +static hv_guid service_guid = {.data = + {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, + 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }; + +struct hv_ictimesync_data { + uint64_t parenttime; + uint64_t childtime; + uint64_t roundtriptime; + uint8_t flags; +} __packed; + +typedef struct hv_timesync_sc { + hv_util_sc util_sc; + struct task task; + time_sync_data time_msg; +} hv_timesync_sc; + +/** + * Set host time based on time sync message from host + */ +static void +hv_set_host_time(void *context, int pending) +{ + hv_timesync_sc *softc = (hv_timesync_sc*)context; + uint64_t hosttime = softc->time_msg.data; + struct timespec guest_ts, host_ts; + uint64_t host_tns; + int64_t diff; + int error; + + host_tns = (hosttime - HV_WLTIMEDELTA) * 100; + host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); + host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); + + nanotime(&guest_ts); + + diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec; + + /* + * If host differs by 5 seconds then make the guest catch up + */ + if (diff > 5 || diff < -5) { + error = kern_clock_settime(curthread, CLOCK_REALTIME, + &host_ts); + } +} + +/** + * @brief Synchronize time with host after reboot, restore, etc. + * + * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. + * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time + * message after the timesync channel is opened. Since the hv_utils module is + * loaded after hv_vmbus, the first message is usually missed. The other + * thing is, systime is automatically set to emulated hardware clock which may + * not be UTC time or in the same time zone. So, to override these effects, we + * use the first 50 time samples for initial system time setting. 
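For reference, a minimal standalone sketch of the conversion hv_set_host_time() performs above, assuming only the HV_WLTIMEDELTA and HV_NANO_SEC_PER_SEC constants defined in this file: the host reports its clock as 100 ns ticks since 1601-01-01, and the helper rebases that onto the Unix epoch as a struct timespec. This is illustrative and not part of the patch.

/*
 * Illustrative only; mirrors the arithmetic in hv_set_host_time().
 */
#include <stdint.h>
#include <time.h>

#define HV_WLTIMEDELTA      116444736000000000L /* 1601 -> 1970, in 100 ns units */
#define HV_NANO_SEC_PER_SEC 1000000000

static struct timespec
hv_time_to_timespec(uint64_t hosttime)
{
        struct timespec ts;
        uint64_t host_tns;

        /* 100 ns ticks since 1601 -> nanoseconds since the Unix epoch */
        host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
        ts.tv_sec = (time_t)(host_tns / HV_NANO_SEC_PER_SEC);
        ts.tv_nsec = (long)(host_tns % HV_NANO_SEC_PER_SEC);
        return (ts);
}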
+ */ +static inline +void hv_adj_guesttime(hv_timesync_sc *sc, uint64_t hosttime, uint8_t flags) +{ + sc->time_msg.data = hosttime; + + if (((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) || + ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0)) { + taskqueue_enqueue(taskqueue_thread, &sc->task); + } +} + +/** + * Time Sync Channel message handler + */ +static void +hv_timesync_cb(void *context) +{ + hv_vmbus_channel* channel; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recvlen; + uint64_t requestId; + int ret; + uint8_t* time_buf; + struct hv_ictimesync_data* timedatap; + hv_timesync_sc *softc; + + softc = (hv_timesync_sc*)context; + channel = softc->util_sc.hv_dev->channel; + time_buf = softc->util_sc.receive_buffer; + + ret = hv_vmbus_channel_recv_packet(channel, time_buf, + PAGE_SIZE, &recvlen, &requestId); + + if ((ret == 0) && recvlen > 0) { + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, time_buf); + } else { + timedatap = (struct hv_ictimesync_data *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + hv_adj_guesttime(softc, timedatap->parenttime, timedatap->flags); + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION + | HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, time_buf, + recvlen, requestId, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + +static int +hv_timesync_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Time Synch Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int +hv_timesync_attach(device_t dev) +{ + hv_timesync_sc *softc = device_get_softc(dev); + + softc->util_sc.callback = hv_timesync_cb; + TASK_INIT(&softc->task, 1, hv_set_host_time, softc); + + return hv_util_attach(dev); +} + +static int +hv_timesync_detach(device_t dev) +{ + hv_timesync_sc *softc = device_get_softc(dev); + taskqueue_drain(taskqueue_thread, &softc->task); + + return hv_util_detach(dev); +} + +static device_method_t timesync_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_timesync_probe), + DEVMETHOD(device_attach, hv_timesync_attach), + DEVMETHOD(device_detach, hv_timesync_detach), + { 0, 0 } +}; + +static driver_t timesync_driver = { "hvtimesync", timesync_methods, sizeof(hv_timesync_sc)}; + +static devclass_t timesync_devclass; + +DRIVER_MODULE(hv_timesync, vmbus, timesync_driver, timesync_devclass, NULL, NULL); +MODULE_VERSION(hv_timesync, 1); +MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index dc4b1e2..7d19b3f 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ b/sys/dev/hyperv/utilities/hv_util.c @@ -40,85 +40,9 @@ #include <sys/syscallsubr.h> #include <dev/hyperv/include/hyperv.h> -#include "hv_kvp.h" +#include "hv_util.h" -/* Time Sync data */ -typedef struct { - uint64_t data; -} time_sync_data; - -static void hv_shutdown_cb(void *context); -static void hv_heartbeat_cb(void *context); -static void hv_timesync_cb(void *context); - -static int hv_timesync_init(hv_vmbus_service *serv); - -/* - * Note: GUID codes below are predefined by the host hypervisor - * (Hyper-V and Azure)interface and required for correct operation. 
- */ -hv_vmbus_service service_table[] = { - /* Shutdown Service */ - { .guid.data = {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49, - 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB}, - .name = "Hyper-V Shutdown Service\n", - .enabled = TRUE, - .callback = hv_shutdown_cb, - }, - - /* Time Synch Service */ - { .guid.data = {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, - 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf}, - .name = "Hyper-V Time Synch Service\n", - .enabled = TRUE, - .init = hv_timesync_init, - .callback = hv_timesync_cb, - }, - - /* Heartbeat Service */ - { .guid.data = {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, - 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d}, - .name = "Hyper-V Heartbeat Service\n", - .enabled = TRUE, - .callback = hv_heartbeat_cb, - }, - - /* KVP (Key Value Pair) Service */ - { .guid.data = {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, - 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6}, - .name = "Hyper-V KVP Service\n", - .enabled = TRUE, - .init = hv_kvp_init, - .callback = hv_kvp_callback, - }, -}; - -/* - * Receive buffer pointers. There is one buffer per utility service. The - * buffer is allocated during attach(). - */ -uint8_t *receive_buffer[HV_MAX_UTIL_SERVICES]; - -static boolean_t destroyed_kvp = FALSE; - -struct hv_ictimesync_data { - uint64_t parenttime; - uint64_t childtime; - uint64_t roundtriptime; - uint8_t flags; -} __packed; - -static int -hv_timesync_init(hv_vmbus_service *serv) -{ - - serv->work_queue = hv_work_queue_create("Time Sync"); - if (serv->work_queue == NULL) - return (ENOMEM); - return (0); -} - -static void +void hv_negotiate_version( struct hv_vmbus_icmsg_hdr* icmsghdrp, struct hv_vmbus_icmsg_negotiate* negop, @@ -147,267 +71,19 @@ hv_negotiate_version( negop->icmsg_vercnt = 1; } - -/** - * Set host time based on time sync message from host - */ -static void -hv_set_host_time(void *context) -{ - time_sync_data* time_msg = (time_sync_data*) context; - uint64_t hosttime = time_msg->data; - struct timespec guest_ts, host_ts; - uint64_t host_tns; - int64_t diff; - int error; - - host_tns = (hosttime - HV_WLTIMEDELTA) * 100; - host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); - host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); - - nanotime(&guest_ts); - - diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec; - - /* - * If host differs by 5 seconds then make the guest catch up - */ - if (diff > 5 || diff < -5) { - error = kern_clock_settime(curthread, CLOCK_REALTIME, - &host_ts); - } - - /* - * Free the hosttime that was allocated in hv_adj_guesttime() - */ - free(time_msg, M_DEVBUF); -} - -/** - * @brief Synchronize time with host after reboot, restore, etc. - * - * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. - * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time - * message after the timesync channel is opened. Since the hv_utils module is - * loaded after hv_vmbus, the first message is usually missed. The other - * thing is, systime is automatically set to emulated hardware clock which may - * not be UTC time or in the same time zone. So, to override these effects, we - * use the first 50 time samples for initial system time setting. 
- */ -static inline -void hv_adj_guesttime(uint64_t hosttime, uint8_t flags) -{ - time_sync_data* time_msg; - - time_msg = malloc(sizeof(time_sync_data), M_DEVBUF, M_NOWAIT); - - if (time_msg == NULL) - return; - - time_msg->data = hosttime; - - if ((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) { - hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, - hv_set_host_time, time_msg); - } else if ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0) { - hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, - hv_set_host_time, time_msg); - } else { - free(time_msg, M_DEVBUF); - } -} - -/** - * Time Sync Channel message handler - */ -static void -hv_timesync_cb(void *context) -{ - hv_vmbus_channel* channel = context; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recvlen; - uint64_t requestId; - int ret; - uint8_t* time_buf; - struct hv_ictimesync_data* timedatap; - - time_buf = receive_buffer[HV_TIME_SYNCH]; - - ret = hv_vmbus_channel_recv_packet(channel, time_buf, - PAGE_SIZE, &recvlen, &requestId); - - if ((ret == 0) && recvlen > 0) { - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, time_buf); - } else { - timedatap = (struct hv_ictimesync_data *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - hv_adj_guesttime(timedatap->parenttime, timedatap->flags); - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION - | HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, time_buf, - recvlen, requestId, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } -} - -/** - * Shutdown - */ -static void -hv_shutdown_cb(void *context) -{ - uint8_t* buf; - hv_vmbus_channel* channel = context; - uint8_t execute_shutdown = 0; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recv_len; - uint64_t request_id; - int ret; - hv_vmbus_shutdown_msg_data* shutdown_msg; - - buf = receive_buffer[HV_SHUT_DOWN]; - - ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, - &recv_len, &request_id); - - if ((ret == 0) && recv_len > 0) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - shutdown_msg = - (struct hv_vmbus_shutdown_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - switch (shutdown_msg->flags) { - case 0: - case 1: - icmsghdrp->status = HV_S_OK; - execute_shutdown = 1; - if(bootverbose) - printf("Shutdown request received -" - " graceful shutdown initiated\n"); - break; - default: - icmsghdrp->status = HV_E_FAIL; - execute_shutdown = 0; - printf("Shutdown request received -" - " Invalid request\n"); - break; - } - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, buf, - recv_len, request_id, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } - - if (execute_shutdown) - shutdown_nice(RB_POWEROFF); -} - -/** - * Process heartbeat message - */ -static void -hv_heartbeat_cb(void *context) -{ - uint8_t* buf; - hv_vmbus_channel* channel = context; - uint32_t recvlen; - uint64_t requestid; - int ret; - - struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; - struct hv_vmbus_icmsg_hdr* icmsghdrp; - - buf = receive_buffer[HV_HEART_BEAT]; - - ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen, - &requestid); - - if ((ret == 0) && recvlen > 0) { - - icmsghdrp = 
(struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - heartbeat_msg = - (struct hv_vmbus_heartbeat_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - heartbeat_msg->seq_num += 1; - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } -} - - -static int -hv_util_probe(device_t dev) -{ - int i; - int rtn_value = ENXIO; - - for (i = 0; i < HV_MAX_UTIL_SERVICES; i++) { - const char *p = vmbus_get_type(dev); - if (service_table[i].enabled && !memcmp(p, &service_table[i].guid, sizeof(hv_guid))) { - device_set_softc(dev, (void *) (&service_table[i])); - rtn_value = BUS_PROBE_DEFAULT; - } - } - - return rtn_value; -} - -static int +int hv_util_attach(device_t dev) { - struct hv_device* hv_dev; - struct hv_vmbus_service* service; - int ret; - size_t receive_buffer_offset; + struct hv_device* hv_dev; + struct hv_util_sc* softc; + int ret; hv_dev = vmbus_get_devctx(dev); - service = device_get_softc(dev); - receive_buffer_offset = service - &service_table[0]; - device_printf(dev, "Hyper-V Service attaching: %s\n", service->name); - receive_buffer[receive_buffer_offset] = + softc = device_get_softc(dev); + softc->hv_dev = hv_dev; + softc->receive_buffer = malloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); - if (service->init != NULL) { - ret = service->init(service); - if (ret) { - ret = ENODEV; - goto error0; - } - } - /* * These services are not performance critical and do not need * batched reading. Furthermore, some services such as KVP can @@ -418,83 +94,30 @@ hv_util_attach(device_t dev) hv_set_channel_read_state(hv_dev->channel, FALSE); ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, - 4 * PAGE_SIZE, NULL, 0, - service->callback, hv_dev->channel); + 4 * PAGE_SIZE, NULL, 0, + softc->callback, softc); if (ret) - goto error0; + goto error0; return (0); - error0: - - free(receive_buffer[receive_buffer_offset], M_DEVBUF); - receive_buffer[receive_buffer_offset] = NULL; - +error0: + free(softc->receive_buffer, M_DEVBUF); return (ret); } -static int +int hv_util_detach(device_t dev) { - struct hv_device* hv_dev; - struct hv_vmbus_service* service; - size_t receive_buffer_offset; - - if (!destroyed_kvp) { - hv_kvp_deinit(); - destroyed_kvp = TRUE; - } + struct hv_device* hv_dev; + struct hv_util_sc* softc; hv_dev = vmbus_get_devctx(dev); hv_vmbus_channel_close(hv_dev->channel); - service = device_get_softc(dev); - receive_buffer_offset = service - &service_table[0]; + softc = device_get_softc(dev); - if (service->work_queue != NULL) - hv_work_queue_close(service->work_queue); - - free(receive_buffer[receive_buffer_offset], M_DEVBUF); - receive_buffer[receive_buffer_offset] = NULL; + free(softc->receive_buffer, M_DEVBUF); return (0); } - -static void -hv_util_init(void) -{ -} - -static int -hv_util_modevent(module_t mod, int event, void *arg) -{ - switch (event) { - case MOD_LOAD: - break; - case MOD_UNLOAD: - break; - default: - break; - } - return (0); -} - -static device_method_t util_methods[] = { - /* Device interface */ - DEVMETHOD(device_probe, hv_util_probe), - DEVMETHOD(device_attach, hv_util_attach), - DEVMETHOD(device_detach, hv_util_detach), - DEVMETHOD(device_shutdown, bus_generic_shutdown), - { 0, 0 } } -; - -static driver_t util_driver = { "hyperv-utils", 
util_methods, 0 }; - -static devclass_t util_devclass; - -DRIVER_MODULE(hv_utils, vmbus, util_driver, util_devclass, hv_util_modevent, 0); -MODULE_VERSION(hv_utils, 1); -MODULE_DEPEND(hv_utils, vmbus, 1, 1, 1); - -SYSINIT(hv_util_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, - hv_util_init, NULL); diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h new file mode 100644 index 0000000..708dca8 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_util.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HVUTIL_H_ +#define _HVUTIL_H_ + +/** + * hv_util related structures + * + */ +typedef struct hv_util_sc { + /* + * function to process Hyper-V messages + */ + void (*callback)(void *); + + struct hv_device* hv_dev; + uint8_t *receive_buffer; +} hv_util_sc; + +void hv_negotiate_version( + struct hv_vmbus_icmsg_hdr* icmsghdrp, + struct hv_vmbus_icmsg_negotiate* negop, + uint8_t* buf); + +int hv_util_attach(device_t dev); +int hv_util_detach(device_t dev); +#endif diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index 7037768..bb777cc 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -52,6 +52,7 @@ static int vmbus_channel_create_gpadl_header( uint32_t* message_count); static void vmbus_channel_set_event(hv_vmbus_channel* channel); +static void VmbusProcessChannelEvent(void* channel, int pending); /** * @brief Trigger an event notification on the specified channel @@ -68,9 +69,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) + ((channel->offer_msg.child_rel_id >> 5)))); monitor_page = (hv_vmbus_monitor_page *) - hv_vmbus_g_connection.monitor_pages; - - monitor_page++; /* Get the child to parent monitor page */ + hv_vmbus_g_connection.monitor_page_2; synch_set_bit(channel->monitor_bit, (uint32_t *)&monitor_page-> @@ -115,6 +114,9 @@ hv_vmbus_channel_open( new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; + new_channel->rxq = hv_vmbus_g_context.hv_event_queue[new_channel->target_cpu]; + TASK_INIT(&new_channel->channel_task, 0, VmbusProcessChannelEvent, new_channel); + /* Allocate the ring buffer */ out = contigmalloc((send_ring_buffer_size + recv_ring_buffer_size), M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); @@ -518,6 +520,7 @@ static void hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; + struct taskqueue *rxq = channel->rxq; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; @@ -525,6 +528,11 @@ hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) channel->sc_creation_callback = NULL; /* + * set rxq to NULL to avoid more requests be scheduled + */ + channel->rxq = NULL; + taskqueue_drain(rxq, &channel->channel_task); + /* * Grab the lock to prevent race condition when a packet received * and unloading driver is in the process. 
*/ @@ -666,11 +674,11 @@ hv_vmbus_channel_send_packet_pagebuffer( { int ret = 0; - int i = 0; boolean_t need_sig; uint32_t packet_len; + uint32_t page_buflen; uint32_t packetLen_aligned; - hv_vmbus_sg_buffer_list buffer_list[3]; + hv_vmbus_sg_buffer_list buffer_list[4]; hv_vmbus_channel_packet_page_buffer desc; uint32_t descSize; uint64_t alignedData = 0; @@ -682,36 +690,33 @@ hv_vmbus_channel_send_packet_pagebuffer( * Adjust the size down since hv_vmbus_channel_packet_page_buffer * is the largest size we support */ - descSize = sizeof(hv_vmbus_channel_packet_page_buffer) - - ((HV_MAX_PAGE_BUFFER_COUNT - page_count) * - sizeof(hv_vmbus_page_buffer)); - packet_len = descSize + buffer_len; + descSize = __offsetof(hv_vmbus_channel_packet_page_buffer, range); + page_buflen = sizeof(hv_vmbus_page_buffer) * page_count; + packet_len = descSize + page_buflen + buffer_len; packetLen_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); /* Setup the descriptor */ desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT; desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.data_offset8 = descSize >> 3; /* in 8-bytes granularity */ + /* in 8-bytes granularity */ + desc.data_offset8 = (descSize + page_buflen) >> 3; desc.length8 = (uint16_t) (packetLen_aligned >> 3); desc.transaction_id = request_id; desc.range_count = page_count; - for (i = 0; i < page_count; i++) { - desc.range[i].length = page_buffers[i].length; - desc.range[i].offset = page_buffers[i].offset; - desc.range[i].pfn = page_buffers[i].pfn; - } - buffer_list[0].data = &desc; buffer_list[0].length = descSize; - buffer_list[1].data = buffer; - buffer_list[1].length = buffer_len; + buffer_list[1].data = page_buffers; + buffer_list[1].length = page_buflen; - buffer_list[2].data = &alignedData; - buffer_list[2].length = packetLen_aligned - packet_len; + buffer_list[2].data = buffer; + buffer_list[2].length = buffer_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + buffer_list[3].data = &alignedData; + buffer_list[3].length = packetLen_aligned - packet_len; + + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 4, &need_sig); /* TODO: We should determine if this is optional */ @@ -880,3 +885,67 @@ hv_vmbus_channel_recv_packet_raw( return (0); } + + +/** + * Process a channel event notification + */ +static void +VmbusProcessChannelEvent(void* context, int pending) +{ + void* arg; + uint32_t bytes_to_read; + hv_vmbus_channel* channel = (hv_vmbus_channel*)context; + boolean_t is_batched_reading; + + /** + * Find the channel based on this relid and invokes + * the channel callback to process the event + */ + + if (channel == NULL) { + return; + } + /** + * To deal with the race condition where we might + * receive a packet while the relevant driver is + * being unloaded, dispatch the callback while + * holding the channel lock. The unloading driver + * will acquire the same channel lock to set the + * callback to NULL. This closes the window. + */ + + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. + * -- whu + */ + // mtx_lock(&channel->inbound_lock); + if (channel->on_channel_callback != NULL) { + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. 
Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); + } + // mtx_unlock(&channel->inbound_lock); +} diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index 4ccb647..ab6e8ad 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -39,8 +39,10 @@ __FBSDID("$FreeBSD$"); */ static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr); +static void vmbus_channel_on_offer_internal(void* context); static void vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr); +static void vmbus_channel_on_offer_rescind_internal(void* context); static void vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); @@ -52,41 +54,46 @@ static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); hv_vmbus_channel_msg_table_entry g_channel_message_table[HV_CHANNEL_MESSAGE_COUNT] = { { HV_CHANNEL_MESSAGE_INVALID, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_OFFER_CHANNEL, - 0, vmbus_channel_on_offer }, + vmbus_channel_on_offer }, { HV_CHANNEL_MESSAGE_RESCIND_CHANNEL_OFFER, - 0, vmbus_channel_on_offer_rescind }, + vmbus_channel_on_offer_rescind }, { HV_CHANNEL_MESSAGE_REQUEST_OFFERS, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_ALL_OFFERS_DELIVERED, - 1, vmbus_channel_on_offers_delivered }, + vmbus_channel_on_offers_delivered }, { HV_CHANNEL_MESSAGE_OPEN_CHANNEL, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_OPEN_CHANNEL_RESULT, - 1, vmbus_channel_on_open_result }, + vmbus_channel_on_open_result }, { HV_CHANNEL_MESSAGE_CLOSE_CHANNEL, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGEL_GPADL_HEADER, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_BODY, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_CREATED, - 1, vmbus_channel_on_gpadl_created }, + vmbus_channel_on_gpadl_created }, { HV_CHANNEL_MESSAGE_GPADL_TEARDOWN, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_TORNDOWN, - 1, vmbus_channel_on_gpadl_torndown }, + vmbus_channel_on_gpadl_torndown }, { HV_CHANNEL_MESSAGE_REL_ID_RELEASED, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_INITIATED_CONTACT, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_VERSION_RESPONSE, - 1, vmbus_channel_on_version_response }, + vmbus_channel_on_version_response }, { HV_CHANNEL_MESSAGE_UNLOAD, - 0, NULL } + NULL } }; +typedef struct hv_work_item { + struct task work; + void (*callback)(void *); + void* context; +} hv_work_item; /** * Implementation of the work abstraction. @@ -96,120 +103,30 @@ work_item_callback(void *work, int pending) { struct hv_work_item *w = (struct hv_work_item *)work; - /* - * Serialize work execution. 
- */ - if (w->wq->work_sema != NULL) { - sema_wait(w->wq->work_sema); - } - w->callback(w->context); - if (w->wq->work_sema != NULL) { - sema_post(w->wq->work_sema); - } - free(w, M_DEVBUF); } -struct hv_work_queue* -hv_work_queue_create(char* name) -{ - static unsigned int qid = 0; - char qname[64]; - int pri; - struct hv_work_queue* wq; - - wq = malloc(sizeof(struct hv_work_queue), M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(wq != NULL, ("Error VMBUS: Failed to allocate work_queue\n")); - if (wq == NULL) - return (NULL); - - /* - * We use work abstraction to handle messages - * coming from the host and these are typically offers. - * Some FreeBsd drivers appear to have a concurrency issue - * where probe/attach needs to be serialized. We ensure that - * by having only one thread process work elements in a - * specific queue by serializing work execution. - * - */ - if (strcmp(name, "vmbusQ") == 0) { - pri = PI_DISK; - } else { /* control */ - pri = PI_NET; - /* - * Initialize semaphore for this queue by pointing - * to the globale semaphore used for synchronizing all - * control messages. - */ - wq->work_sema = &hv_vmbus_g_connection.control_sema; - } - - sprintf(qname, "hv_%s_%u", name, qid); - - /* - * Fixme: FreeBSD 8.2 has a different prototype for - * taskqueue_create(), and for certain other taskqueue functions. - * We need to research the implications of these changes. - * Fixme: Not sure when the changes were introduced. - */ - wq->queue = taskqueue_create(qname, M_NOWAIT, taskqueue_thread_enqueue, - &wq->queue - #if __FreeBSD_version < 800000 - , &wq->proc - #endif - ); - - if (wq->queue == NULL) { - free(wq, M_DEVBUF); - return (NULL); - } - - if (taskqueue_start_threads(&wq->queue, 1, pri, "%s taskq", qname)) { - taskqueue_free(wq->queue); - free(wq, M_DEVBUF); - return (NULL); - } - - qid++; - - return (wq); -} - -void -hv_work_queue_close(struct hv_work_queue *wq) -{ - /* - * KYS: Need to drain the taskqueue - * before we close the hv_work_queue. 
- */ - /*KYS: taskqueue_drain(wq->tq, ); */ - taskqueue_free(wq->queue); - free(wq, M_DEVBUF); -} - /** * @brief Create work item */ -int +static int hv_queue_work_item( - struct hv_work_queue *wq, void (*callback)(void *), void *context) { struct hv_work_item *w = malloc(sizeof(struct hv_work_item), - M_DEVBUF, M_NOWAIT | M_ZERO); + M_DEVBUF, M_NOWAIT); KASSERT(w != NULL, ("Error VMBUS: Failed to allocate WorkItem\n")); if (w == NULL) return (ENOMEM); w->callback = callback; w->context = context; - w->wq = wq; TASK_INIT(&w->work, 0, work_item_callback, w); - return (taskqueue_enqueue(wq->queue, &w->work)); + return (taskqueue_enqueue(taskqueue_thread, &w->work)); } @@ -224,10 +141,7 @@ hv_vmbus_allocate_channel(void) channel = (hv_vmbus_channel*) malloc( sizeof(hv_vmbus_channel), M_DEVBUF, - M_NOWAIT | M_ZERO); - KASSERT(channel != NULL, ("Error VMBUS: Failed to allocate channel!")); - if (channel == NULL) - return (NULL); + M_WAITOK | M_ZERO); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); @@ -238,16 +152,6 @@ hv_vmbus_allocate_channel(void) } /** - * @brief Release the vmbus channel object itself - */ -static inline void -ReleaseVmbusChannel(void *context) -{ - hv_vmbus_channel* channel = (hv_vmbus_channel*) context; - free(channel, M_DEVBUF); -} - -/** * @brief Release the resources used by the vmbus channel object */ void @@ -255,13 +159,8 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); - /* - * We have to release the channel's workqueue/thread in - * the vmbus's workqueue/thread context - * ie we can't destroy ourselves - */ - hv_queue_work_item(hv_vmbus_g_connection.work_queue, - ReleaseVmbusChannel, (void *) channel); + + free(channel, M_DEVBUF); } /** @@ -459,7 +358,7 @@ static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) { hv_vmbus_channel_offer_channel* offer; - hv_vmbus_channel* new_channel; + hv_vmbus_channel_offer_channel* copied; offer = (hv_vmbus_channel_offer_channel*) hdr; @@ -469,10 +368,25 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) guidType = &offer->offer.interface_type; guidInstance = &offer->offer.interface_instance; + // copy offer data + copied = malloc(sizeof(*copied), M_DEVBUF, M_NOWAIT); + if (copied == NULL) { + printf("fail to allocate memory\n"); + return; + } + + memcpy(copied, hdr, sizeof(*copied)); + hv_queue_work_item(vmbus_channel_on_offer_internal, copied); +} + +static void +vmbus_channel_on_offer_internal(void* context) +{ + hv_vmbus_channel* new_channel; + + hv_vmbus_channel_offer_channel* offer = (hv_vmbus_channel_offer_channel*)context; /* Allocate the channel object and save this offer */ new_channel = hv_vmbus_allocate_channel(); - if (new_channel == NULL) - return; /* * By default we setup state to enable batched @@ -512,6 +426,8 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32; vmbus_channel_process_offer(new_channel); + + free(offer, M_DEVBUF); } /** @@ -529,13 +445,20 @@ vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr) rescind = (hv_vmbus_channel_rescind_offer*) hdr; channel = hv_vmbus_g_connection.channels[rescind->child_rel_id]; - if (channel == NULL) + if (channel == NULL) return; - hv_vmbus_child_device_unregister(channel->device); - mtx_lock(&hv_vmbus_g_connection.channel_lock); + hv_queue_work_item(vmbus_channel_on_offer_rescind_internal, channel); 
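With the private work queues removed, deferred work such as the offer and rescind handling above now rides the system-wide taskqueue_thread queue. A minimal sketch of that pattern, using only taskqueue(9) and malloc(9) calls that appear in this hunk (the my_* names are placeholders, not part of the change):

/*
 * Sketch of the deferred-work pattern used by hv_queue_work_item() above:
 * wrap a callback and its argument in a malloc'ed task, enqueue it on the
 * system taskqueue, and free the wrapper after the callback has run.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/taskqueue.h>

struct my_work {
        struct task     task;
        void            (*fn)(void *);
        void            *arg;
};

static void
my_work_run(void *context, int pending __unused)
{
        struct my_work *w = context;

        w->fn(w->arg);                  /* run the deferred callback */
        free(w, M_DEVBUF);              /* wrapper frees itself, as in work_item_callback() */
}

static int
my_defer(void (*fn)(void *), void *arg)
{
        struct my_work *w;

        w = malloc(sizeof(*w), M_DEVBUF, M_NOWAIT);
        if (w == NULL)
                return (ENOMEM);
        w->fn = fn;
        w->arg = arg;
        TASK_INIT(&w->task, 0, my_work_run, w);
        return (taskqueue_enqueue(taskqueue_thread, &w->task));
}

Because the wrapper frees itself once the callback has run, the enqueueing side never touches it again, which is the same ownership rule the new hv_queue_work_item() follows.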
hv_vmbus_g_connection.channels[rescind->child_rel_id] = NULL; - mtx_unlock(&hv_vmbus_g_connection.channel_lock); +} + +static void +vmbus_channel_on_offer_rescind_internal(void *context) +{ + hv_vmbus_channel* channel; + + channel = (hv_vmbus_channel*)context; + hv_vmbus_child_device_unregister(channel->device); } /** @@ -712,35 +635,6 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr) } /** - * @brief Handler for channel protocol messages. - * - * This is invoked in the vmbus worker thread context. - */ -void -hv_vmbus_on_channel_message(void *context) -{ - hv_vmbus_message* msg; - hv_vmbus_channel_msg_header* hdr; - int size; - - msg = (hv_vmbus_message*) context; - hdr = (hv_vmbus_channel_msg_header*) msg->u.payload; - size = msg->header.payload_size; - - if (hdr->message_type >= HV_CHANNEL_MESSAGE_COUNT) { - free(msg, M_DEVBUF); - return; - } - - if (g_channel_message_table[hdr->message_type].messageHandler) { - g_channel_message_table[hdr->message_type].messageHandler(hdr); - } - - /* Free the msg that was allocated in VmbusOnMsgDPC() */ - free(msg, M_DEVBUF); -} - -/** * @brief Send a request to get all our pending offers. */ int @@ -765,8 +659,7 @@ hv_vmbus_request_channel_offers(void) ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_msg_header)); - if (msg_info) - free(msg_info, M_DEVBUF); + free(msg_info, M_DEVBUF); return (ret); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index cfdc9bb..fb1879d 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -90,12 +90,10 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, hv_vmbus_g_connection.interrupt_page); msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); + hv_vmbus_g_connection.monitor_page_1); - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); + msg->monitor_page_2 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_page_2); /** * Add to list before we send the request since we may receive the @@ -168,8 +166,6 @@ hv_vmbus_connect(void) { * Initialize the vmbus connection */ hv_vmbus_g_connection.connect_state = HV_CONNECTING; - hv_vmbus_g_connection.work_queue = hv_work_queue_create("vmbusQ"); - sema_init(&hv_vmbus_g_connection.control_sema, 1, "control_sema"); TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor); mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg", @@ -183,18 +179,9 @@ hv_vmbus_connect(void) { * Setup the vmbus event connection for channel interrupt abstraction * stuff */ - hv_vmbus_g_connection.interrupt_page = contigmalloc( + hv_vmbus_g_connection.interrupt_page = malloc( PAGE_SIZE, M_DEVBUF, - M_NOWAIT | M_ZERO, 0UL, - BUS_SPACE_MAXADDR, - PAGE_SIZE, 0); - KASSERT(hv_vmbus_g_connection.interrupt_page != NULL, - ("Error VMBUS: malloc failed to allocate Channel" - " Request Event message!")); - if (hv_vmbus_g_connection.interrupt_page == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_WAITOK | M_ZERO); hv_vmbus_g_connection.recv_interrupt_page = hv_vmbus_g_connection.interrupt_page; @@ -207,31 +194,19 @@ hv_vmbus_connect(void) { * Set up the monitor notification facility. 
The 1st page for * parent->child and the 2nd page for child->parent */ - hv_vmbus_g_connection.monitor_pages = contigmalloc( - 2 * PAGE_SIZE, + hv_vmbus_g_connection.monitor_page_1 = malloc( + PAGE_SIZE, M_DEVBUF, - M_NOWAIT | M_ZERO, - 0UL, - BUS_SPACE_MAXADDR, + M_WAITOK | M_ZERO); + hv_vmbus_g_connection.monitor_page_2 = malloc( PAGE_SIZE, - 0); - KASSERT(hv_vmbus_g_connection.monitor_pages != NULL, - ("Error VMBUS: malloc failed to allocate Monitor Pages!")); - if (hv_vmbus_g_connection.monitor_pages == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_DEVBUF, + M_WAITOK | M_ZERO); msg_info = (hv_vmbus_channel_msg_info*) malloc(sizeof(hv_vmbus_channel_msg_info) + sizeof(hv_vmbus_channel_initiate_contact), - M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(msg_info != NULL, - ("Error VMBUS: malloc failed for Initiate Contact message!")); - if (msg_info == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_DEVBUF, M_WAITOK | M_ZERO); hv_vmbus_g_connection.channels = malloc(sizeof(hv_vmbus_channel*) * HV_CHANNEL_MAX_COUNT, @@ -273,26 +248,16 @@ hv_vmbus_connect(void) { hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; - hv_work_queue_close(hv_vmbus_g_connection.work_queue); - sema_destroy(&hv_vmbus_g_connection.control_sema); mtx_destroy(&hv_vmbus_g_connection.channel_lock); mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); if (hv_vmbus_g_connection.interrupt_page != NULL) { - contigfree( - hv_vmbus_g_connection.interrupt_page, - PAGE_SIZE, - M_DEVBUF); + free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF); hv_vmbus_g_connection.interrupt_page = NULL; } - if (hv_vmbus_g_connection.monitor_pages != NULL) { - contigfree( - hv_vmbus_g_connection.monitor_pages, - 2 * PAGE_SIZE, - M_DEVBUF); - hv_vmbus_g_connection.monitor_pages = NULL; - } + free(hv_vmbus_g_connection.monitor_page_1, M_DEVBUF); + free(hv_vmbus_g_connection.monitor_page_2, M_DEVBUF); if (msg_info) { sema_destroy(&msg_info->wait_sema); @@ -309,108 +274,29 @@ hv_vmbus_connect(void) { int hv_vmbus_disconnect(void) { int ret = 0; - hv_vmbus_channel_unload* msg; - - msg = malloc(sizeof(hv_vmbus_channel_unload), - M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(msg != NULL, - ("Error VMBUS: malloc failed to allocate Channel Unload Msg!")); - if (msg == NULL) - return (ENOMEM); - - msg->message_type = HV_CHANNEL_MESSAGE_UNLOAD; + hv_vmbus_channel_unload msg; - ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_unload)); + msg.message_type = HV_CHANNEL_MESSAGE_UNLOAD; + ret = hv_vmbus_post_message(&msg, sizeof(hv_vmbus_channel_unload)); - contigfree(hv_vmbus_g_connection.interrupt_page, PAGE_SIZE, M_DEVBUF); + free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF); mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); - hv_work_queue_close(hv_vmbus_g_connection.work_queue); - sema_destroy(&hv_vmbus_g_connection.control_sema); - free(hv_vmbus_g_connection.channels, M_DEVBUF); hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; - free(msg, M_DEVBUF); - return (ret); } /** - * Process a channel event notification - */ -static void -VmbusProcessChannelEvent(uint32_t relid) -{ - void* arg; - uint32_t bytes_to_read; - hv_vmbus_channel* channel; - boolean_t is_batched_reading; - - /** - * Find the channel based on this relid and invokes - * the channel callback to process the event - */ - - channel = hv_vmbus_g_connection.channels[relid]; - - if (channel == NULL) { - return; - } - /** - * To deal with the race condition where we might - * receive a packet while the relevant driver is - * being unloaded, dispatch the callback while - * holding the 
channel lock. The unloading driver - * will acquire the same channel lock to set the - * callback to NULL. This closes the window. - */ - - /* - * Disable the lock due to newly added WITNESS check in r277723. - * Will seek other way to avoid race condition. - * -- whu - */ - // mtx_lock(&channel->inbound_lock); - if (channel->on_channel_callback != NULL) { - arg = channel->channel_callback_context; - is_batched_reading = channel->batched_reading; - /* - * Optimize host to guest signaling by ensuring: - * 1. While reading the channel, we disable interrupts from - * host. - * 2. Ensure that we process all posted messages from the host - * before returning from this callback. - * 3. Once we return, enable signaling from the host. Once this - * state is set we check to see if additional packets are - * available to read. In this case we repeat the process. - */ - do { - if (is_batched_reading) - hv_ring_buffer_read_begin(&channel->inbound); - - channel->on_channel_callback(arg); - - if (is_batched_reading) - bytes_to_read = - hv_ring_buffer_read_end(&channel->inbound); - else - bytes_to_read = 0; - } while (is_batched_reading && (bytes_to_read != 0)); - } - // mtx_unlock(&channel->inbound_lock); -} - -/** * Handler for events */ void -hv_vmbus_on_events(void *arg) +hv_vmbus_on_events(int cpu) { int bit; - int cpu; int dword; void *page_addr; uint32_t* recv_interrupt_page = NULL; @@ -419,7 +305,6 @@ hv_vmbus_on_events(void *arg) hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - cpu = (int)(long)arg; KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " "cpu out of range!")); @@ -461,8 +346,14 @@ hv_vmbus_on_events(void *arg) */ continue; } else { - VmbusProcessChannelEvent(rel_id); - + hv_vmbus_channel * channel = hv_vmbus_g_connection.channels[rel_id]; + /* if channel is closed or closing */ + if (channel == NULL || channel->rxq == NULL) + continue; + + if (channel->batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + taskqueue_enqueue_fast(channel->rxq, &channel->channel_task); } } } diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index ca5641f..6afc2b8 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -189,11 +189,7 @@ hv_vmbus_init(void) * See if the hypercall page is already set */ hypercall_msr.as_uint64_t = rdmsr(HV_X64_MSR_HYPERCALL); - virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(virt_addr != NULL, - ("Error VMBUS: malloc failed to allocate page during init!")); - if (virt_addr == NULL) - goto cleanup; + virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); hypercall_msr.u.enable = 1; hypercall_msr.u.guest_physical_address = diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index 66a3f39..c8d6894 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -83,8 +83,6 @@ vmbus_msg_swintr(void *arg) hv_vmbus_channel_msg_table_entry *entry; hv_vmbus_channel_msg_type msg_type; hv_vmbus_message* msg; - hv_vmbus_message* copied; - static bool warned = false; cpu = (int)(long)arg; KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " @@ -100,31 +98,15 @@ vmbus_msg_swintr(void *arg) hdr = (hv_vmbus_channel_msg_header *)msg->u.payload; msg_type = hdr->message_type; - if (msg_type >= HV_CHANNEL_MESSAGE_COUNT && !warned) { - warned = true; + if (msg_type >= HV_CHANNEL_MESSAGE_COUNT) { printf("VMBUS: unknown message type = %d\n", msg_type); goto handled; } entry = 
&g_channel_message_table[msg_type]; - if (entry->handler_no_sleep) + if (entry->messageHandler) entry->messageHandler(hdr); - else { - - copied = malloc(sizeof(hv_vmbus_message), - M_DEVBUF, M_NOWAIT); - KASSERT(copied != NULL, - ("Error VMBUS: malloc failed to allocate" - " hv_vmbus_message!")); - if (copied == NULL) - continue; - - memcpy(copied, msg, sizeof(hv_vmbus_message)); - hv_queue_work_item(hv_vmbus_g_connection.work_queue, - hv_vmbus_on_channel_message, - copied); - } handled: msg->header.message_type = HV_MESSAGE_TYPE_NONE; @@ -177,7 +159,7 @@ hv_vmbus_isr(struct trapframe *frame) (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { /* Since we are a child, we only need to check bit 0 */ if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + hv_vmbus_on_events(cpu); } } else { /* @@ -187,7 +169,7 @@ hv_vmbus_isr(struct trapframe *frame) * Directly schedule the event software interrupt on * current cpu. */ - swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + hv_vmbus_on_events(cpu); } /* Check if there are actual msgs to be process */ @@ -225,7 +207,6 @@ hv_vmbus_isr(struct trapframe *frame) return (FILTER_HANDLED); } -uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; u_long *hv_vmbus_intr_cpu[MAXCPU]; void @@ -310,12 +291,7 @@ hv_vmbus_child_device_create( * Allocate the new child device */ child_dev = malloc(sizeof(hv_device), M_DEVBUF, - M_NOWAIT | M_ZERO); - KASSERT(child_dev != NULL, - ("Error VMBUS: malloc failed to allocate hv_device!")); - - if (child_dev == NULL) - return (NULL); + M_WAITOK | M_ZERO); child_dev->channel = channel; memcpy(&child_dev->class_id, &type, sizeof(hv_guid)); @@ -455,6 +431,19 @@ vmbus_vector_free(int vector) #endif /* HYPERV */ +static void +vmbus_cpuset_setthread_task(void *xmask, int pending __unused) +{ + cpuset_t *mask = xmask; + int error; + + error = cpuset_setthread(curthread->td_tid, mask); + if (error) { + panic("curthread=%ju: can't pin; error=%d", + (uintmax_t)curthread->td_tid, error); + } +} + /** * @brief Main vmbus driver initialization routine. * @@ -472,6 +461,7 @@ vmbus_bus_init(void) { int i, j, n, ret; char buf[MAXCOMLEN + 1]; + cpuset_t cpu_mask; if (vmbus_inited) return (0); @@ -508,10 +498,7 @@ vmbus_bus_init(void) setup_args.vector = hv_vmbus_g_context.hv_cb_vector; CPU_FOREACH(j) { - hv_vmbus_swintr_event_cpu[j] = 0; - hv_vmbus_g_context.hv_event_intr_event[j] = NULL; hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; - hv_vmbus_g_context.event_swintr[j] = NULL; hv_vmbus_g_context.msg_swintr[j] = NULL; snprintf(buf, sizeof(buf), "cpu%d:hyperv", j); @@ -525,6 +512,26 @@ vmbus_bus_init(void) * Per cpu setup. */ CPU_FOREACH(j) { + struct task cpuset_task; + + /* + * Setup taskqueue to handle events + */ + hv_vmbus_g_context.hv_event_queue[j] = taskqueue_create_fast("hyperv event", M_WAITOK, + taskqueue_thread_enqueue, &hv_vmbus_g_context.hv_event_queue[j]); + if (hv_vmbus_g_context.hv_event_queue[j] == NULL) { + if (bootverbose) + printf("VMBUS: failed to setup taskqueue\n"); + goto cleanup1; + } + taskqueue_start_threads(&hv_vmbus_g_context.hv_event_queue[j], 1, PI_NET, + "hvevent%d", j); + + CPU_SETOF(j, &cpu_mask); + TASK_INIT(&cpuset_task, 0, vmbus_cpuset_setthread_task, &cpu_mask); + taskqueue_enqueue(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task); + taskqueue_drain(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task); + /* * Setup software interrupt thread and handler for msg handling. 
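The per-CPU event taskqueues created in vmbus_bus_init() above are pinned by enqueueing a one-shot task that calls cpuset_setthread() from the queue's own thread and then draining it. A compact sketch of that trick for a single illustrative queue (the example_* names are placeholders; the taskqueue(9) and cpuset calls are the ones used in the hunk):

/*
 * Sketch: create a single-threaded taskqueue and pin its thread to one CPU.
 * The mask has to be applied from the taskqueue thread itself, so a
 * throw-away task is enqueued and drained to guarantee it has run.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/malloc.h>         /* M_WAITOK */
#include <sys/priority.h>       /* PI_NET */
#include <sys/proc.h>
#include <sys/taskqueue.h>

static struct taskqueue *example_tq;

static void
pin_self_task(void *xmask, int pending __unused)
{
        cpuset_t *mask = xmask;

        /* Runs on the taskqueue thread; bind that thread to the mask. */
        if (cpuset_setthread(curthread->td_tid, mask) != 0)
                panic("failed to pin taskqueue thread");
}

static void
example_tq_create_pinned(int cpu)
{
        struct task pin_task;
        cpuset_t mask;

        example_tq = taskqueue_create_fast("example", M_WAITOK,
            taskqueue_thread_enqueue, &example_tq);
        taskqueue_start_threads(&example_tq, 1, PI_NET, "example%d", cpu);

        CPU_SETOF(cpu, &mask);
        TASK_INIT(&pin_task, 0, pin_self_task, &mask);
        taskqueue_enqueue(example_tq, &pin_task);
        taskqueue_drain(example_tq, &pin_task); /* wait for the pin to land */
}

Draining before returning is what makes the stack-allocated task and mask safe, matching what the CPU_FOREACH loop in vmbus_bus_init() does with its local cpuset_task.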
*/ @@ -543,7 +550,7 @@ vmbus_bus_init(void) */ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], j); - if (ret) { + if (ret) { if(bootverbose) printf("VMBUS: failed to bind msg swi thread " "to cpu %d\n", j); @@ -551,30 +558,11 @@ vmbus_bus_init(void) } /* - * Setup software interrupt thread and handler for - * event handling. - */ - ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], - "hv_event", hv_vmbus_on_events, (void *)(long)j, - SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); - if (ret) { - if(bootverbose) - printf("VMBUS: failed to setup event swi for " - "cpu %d\n", j); - goto cleanup1; - } - - /* * Prepare the per cpu msg and event pages to be called on each cpu. */ for(i = 0; i < 2; i++) { setup_args.page_buffers[2 * j + i] = - malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[2 * j + i] == NULL) { - KASSERT(setup_args.page_buffers[2 * j + i] != NULL, - ("Error VMBUS: malloc failed!")); - goto cleanup1; - } + malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); } } @@ -607,12 +595,11 @@ vmbus_bus_init(void) * remove swi and vmbus callback vector; */ CPU_FOREACH(j) { + if (hv_vmbus_g_context.hv_event_queue[j] != NULL) + taskqueue_free(hv_vmbus_g_context.hv_event_queue[j]); if (hv_vmbus_g_context.msg_swintr[j] != NULL) swi_remove(hv_vmbus_g_context.msg_swintr[j]); - if (hv_vmbus_g_context.event_swintr[j] != NULL) - swi_remove(hv_vmbus_g_context.event_swintr[j]); hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; - hv_vmbus_g_context.hv_event_intr_event[j] = NULL; } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); @@ -677,12 +664,11 @@ vmbus_bus_exit(void) /* remove swi */ CPU_FOREACH(i) { + if (hv_vmbus_g_context.hv_event_queue[i] != NULL) + taskqueue_free(hv_vmbus_g_context.hv_event_queue[i]); if (hv_vmbus_g_context.msg_swintr[i] != NULL) swi_remove(hv_vmbus_g_context.msg_swintr[i]); - if (hv_vmbus_g_context.event_swintr[i] != NULL) - swi_remove(hv_vmbus_g_context.event_swintr[i]); hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; - hv_vmbus_g_context.hv_event_intr_event[i] = NULL; } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 13a35c4..5f62072 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -202,9 +202,8 @@ typedef struct { * Each cpu has its own software interrupt handler for channel * event and msg handling. */ - struct intr_event *hv_event_intr_event[MAXCPU]; + struct taskqueue *hv_event_queue[MAXCPU]; struct intr_event *hv_msg_intr_event[MAXCPU]; - void *event_swintr[MAXCPU]; void *msg_swintr[MAXCPU]; /* * Host use this vector to intrrupt guest for vmbus channel @@ -351,7 +350,8 @@ typedef struct { * notification and 2nd is child->parent * notification */ - void *monitor_pages; + void *monitor_page_1; + void *monitor_page_2; TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** @@ -363,10 +363,8 @@ typedef struct { /** * channel table for fast lookup through id. 
- */ + */ hv_vmbus_channel **channels; - hv_vmbus_handle work_queue; - struct sema control_sema; } hv_vmbus_connection; typedef union { @@ -633,7 +631,6 @@ typedef void (*vmbus_msg_handler)(hv_vmbus_channel_msg_header *msg); typedef struct hv_vmbus_channel_msg_table_entry { hv_vmbus_channel_msg_type messageType; - bool handler_no_sleep; /* true: the handler doesn't sleep */ vmbus_msg_handler messageHandler; } hv_vmbus_channel_msg_table_entry; @@ -683,7 +680,6 @@ uint32_t hv_ring_buffer_read_end( hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); -void hv_vmbus_on_channel_message(void *context); int hv_vmbus_request_channel_offers(void); void hv_vmbus_release_unattached_channels(void); int hv_vmbus_init(void); @@ -717,7 +713,7 @@ int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); int hv_vmbus_set_event(hv_vmbus_channel *channel); -void hv_vmbus_on_events(void *); +void hv_vmbus_on_events(int cpu); /** * Event Timer interfaces diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c index aff048a..cf48c25 100644 --- a/sys/dev/ioat/ioat.c +++ b/sys/dev/ioat/ioat.c @@ -152,8 +152,8 @@ MODULE_VERSION(ioat, 1); * Private data structures */ static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS]; -static int ioat_channel_index = 0; -SYSCTL_INT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, +static unsigned ioat_channel_index = 0; +SYSCTL_UINT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, "Number of IOAT channels attached"); static struct _pcsid @@ -407,7 +407,7 @@ ioat3_attach(device_t device) ioat = DEVICE2SOFTC(device); ioat->capabilities = ioat_read_dmacapability(ioat); - ioat_log_message(1, "Capabilities: %b\n", (int)ioat->capabilities, + ioat_log_message(0, "Capabilities: %b\n", (int)ioat->capabilities, IOAT_DMACAP_STR); xfercap = ioat_read_xfercap(ioat); @@ -742,6 +742,13 @@ ioat_reset_hw_task(void *ctx, int pending __unused) /* * User API functions */ +unsigned +ioat_get_nchannels(void) +{ + + return (ioat_channel_index); +} + bus_dmaengine_t ioat_get_dmaengine(uint32_t index, int flags) { diff --git a/sys/dev/ioat/ioat.h b/sys/dev/ioat/ioat.h index 2e10124..9a0c3e3b 100644 --- a/sys/dev/ioat/ioat.h +++ b/sys/dev/ioat/ioat.h @@ -85,6 +85,8 @@ typedef void *bus_dmaengine_t; struct bus_dmadesc; typedef void (*bus_dmaengine_callback_t)(void *arg, int error); +unsigned ioat_get_nchannels(void); + /* * Called first to acquire a reference to the DMA channel * diff --git a/sys/dev/ioat/ioat_internal.h b/sys/dev/ioat/ioat_internal.h index 322671c..9d0708d 100644 --- a/sys/dev/ioat/ioat_internal.h +++ b/sys/dev/ioat/ioat_internal.h @@ -455,7 +455,7 @@ struct ioat_softc { }) int version; - int chan_idx; + unsigned chan_idx; struct mtx submit_lock; device_t device; diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.c b/sys/dev/ntb/ntb_hw/ntb_hw.c index a4c460e..1ef9749 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw.c +++ b/sys/dev/ntb/ntb_hw/ntb_hw.c @@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/malloc.h> #include <sys/module.h> +#include <sys/mutex.h> +#include <sys/pciio.h> #include <sys/queue.h> #include <sys/rman.h> #include <sys/sbuf.h> @@ -42,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm.h> #include <vm/pmap.h> #include <machine/bus.h> +#include <machine/intr_machdep.h> #include <machine/pmap.h> #include <machine/resource.h> #include <dev/pci/pcireg.h> @@ -70,6 +73,19 @@ __FBSDID("$FreeBSD$"); #define 
DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev)) +#define NTB_MSIX_VER_GUARD 0xaabbccdd +#define NTB_MSIX_RECEIVED 0xe0f0e0f0 +#define ONE_MB (1024u * 1024) + +/* + * PCI constants could be somewhere more generic, but aren't defined/used in + * pci.c. + */ +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR 4 +#define PCI_MSIX_ENTRY_DATA 8 + enum ntb_device_type { NTB_XEON, NTB_ATOM @@ -95,6 +111,18 @@ enum ntb_bar { NTB_MAX_BARS }; +enum { + NTB_MSIX_GUARD = 0, + NTB_MSIX_DATA0, + NTB_MSIX_DATA1, + NTB_MSIX_DATA2, + NTB_MSIX_OFS0, + NTB_MSIX_OFS1, + NTB_MSIX_OFS2, + NTB_MSIX_DONE, + NTB_MAX_MSIX_SPAD +}; + /* Device features and workarounds */ #define HAS_FEATURE(feature) \ ((ntb->features & (feature)) != 0) @@ -131,6 +159,7 @@ struct ntb_int_info { struct ntb_vec { struct ntb_softc *ntb; uint32_t num; + unsigned masked; }; struct ntb_reg { @@ -169,6 +198,11 @@ struct ntb_b2b_addr { uint64_t bar5_addr32; }; +struct ntb_msix_data { + uint32_t nmd_ofs; + uint32_t nmd_data; +}; + struct ntb_softc { device_t device; enum ntb_device_type type; @@ -178,6 +212,13 @@ struct ntb_softc { struct ntb_int_info int_info[MAX_MSIX_INTERRUPTS]; uint32_t allocated_interrupts; + struct ntb_msix_data peer_msix_data[XEON_NONLINK_DB_MSIX_BITS]; + struct ntb_msix_data msix_data[XEON_NONLINK_DB_MSIX_BITS]; + bool peer_msix_good; + bool peer_msix_done; + struct ntb_pci_bar_info *peer_lapic_bar; + struct callout peer_msix_work; + struct callout heartbeat_timer; struct callout lr_timer; @@ -198,6 +239,7 @@ struct ntb_softc { /* Memory window used to access peer bar0 */ #define B2B_MW_DISABLED UINT8_MAX uint8_t b2b_mw_idx; + uint8_t msix_mw_idx; uint8_t mw_count; uint8_t spad_count; @@ -292,6 +334,8 @@ static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t); static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t); static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors); static void ntb_free_msix_vec(struct ntb_softc *ntb); +static void ntb_get_msix_info(struct ntb_softc *ntb); +static void ntb_exchange_msix(void *); static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id); static void ntb_detect_max_mw(struct ntb_softc *ntb); static int ntb_detect_xeon(struct ntb_softc *ntb); @@ -308,7 +352,9 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr, enum ntb_bar idx); static int xeon_setup_b2b_mw(struct ntb_softc *, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr); +static int xeon_setup_msix_bar(struct ntb_softc *); static inline bool link_is_up(struct ntb_softc *ntb); +static inline bool _xeon_link_is_up(struct ntb_softc *ntb); static inline bool atom_link_is_err(struct ntb_softc *ntb); static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *); static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *); @@ -319,6 +365,8 @@ static bool ntb_poll_link(struct ntb_softc *ntb); static void save_bar_parameters(struct ntb_pci_bar_info *bar); static void ntb_sysctl_init(struct ntb_softc *); static int sysctl_handle_features(SYSCTL_HANDLER_ARGS); +static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS); +static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS); static int sysctl_handle_link_status(SYSCTL_HANDLER_ARGS); static int sysctl_handle_register(SYSCTL_HANDLER_ARGS); @@ -397,6 +445,13 @@ ntb_vm_memattr_to_str(vm_memattr_t pat) } } +static int g_ntb_msix_idx = 0; +SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, 
CTLFLAG_RDTUN, &g_ntb_msix_idx, + 0, "Use this memory window to access the peer MSIX message complex on " + "certain Xeon-based NTB systems, as a workaround for a hardware errata. " + "Like b2b_mw_idx, negative values index from the last available memory " + "window. (Applies on Xeon platforms with SB01BASE_LOCKUP errata.)"); + static int g_ntb_mw_idx = -1; TUNABLE_INT("hw.ntb.b2b_mw_idx", &g_ntb_mw_idx); SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx, @@ -604,10 +659,12 @@ ntb_attach(device_t device) ntb->type = p->type; ntb->features = p->features; ntb->b2b_mw_idx = B2B_MW_DISABLED; + ntb->msix_mw_idx = B2B_MW_DISABLED; /* Heartbeat timer for NTB_ATOM since there is no link interrupt */ callout_init(&ntb->heartbeat_timer, CALLOUT_MPSAFE); callout_init(&ntb->lr_timer, CALLOUT_MPSAFE); + callout_init(&ntb->peer_msix_work, 1); mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN); mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF); @@ -632,6 +689,8 @@ ntb_attach(device_t device) if (error != 0) goto out; + ntb_spad_clear(ntb); + ntb_poll_link(ntb); ntb_sysctl_init(ntb); @@ -649,10 +708,14 @@ ntb_detach(device_t device) ntb = DEVICE2SOFTC(device); - if (ntb->self_reg != NULL) - ntb_db_set_mask(ntb, ntb->db_valid_mask); + if (ntb->self_reg != NULL) { + DB_MASK_LOCK(ntb); + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_valid_mask); + DB_MASK_UNLOCK(ntb); + } callout_drain(&ntb->heartbeat_timer); callout_drain(&ntb->lr_timer); + callout_drain(&ntb->peer_msix_work); pci_disable_busmaster(ntb->device); if (ntb->type == NTB_XEON) ntb_teardown_xeon(ntb); @@ -978,9 +1041,12 @@ ntb_init_isr(struct ntb_softc *ntb) ntb->last_ts = ticks; /* - * Mask all doorbell interrupts. + * Mask all doorbell interrupts. (Except link events!) */ - ntb_db_set_mask(ntb, ntb->db_valid_mask); + DB_MASK_LOCK(ntb); + ntb->db_mask = ntb->db_valid_mask; + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + DB_MASK_UNLOCK(ntb); num_vectors = desired_vectors = MIN(pci_msix_count(ntb->device), ntb->db_count); @@ -1005,12 +1071,28 @@ ntb_init_isr(struct ntb_softc *ntb) num_vectors = 1; if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + device_printf(ntb->device, + "Errata workaround does not support MSI or INTX\n"); + return (EINVAL); + } + ntb->db_vec_count = 1; ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT; rc = ntb_setup_legacy_interrupt(ntb); } else { + if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS && + HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + device_printf(ntb->device, + "Errata workaround expects %d doorbell bits\n", + XEON_NONLINK_DB_MSIX_BITS); + return (EINVAL); + } + ntb_create_msix_vec(ntb, num_vectors); rc = ntb_setup_msix(ntb, num_vectors); + if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + ntb_get_msix_info(ntb); } if (rc != 0) { device_printf(ntb->device, @@ -1116,6 +1198,9 @@ void ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return; + DB_MASK_LOCK(ntb); ntb->db_mask |= bits; db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); @@ -1131,6 +1216,9 @@ ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits) (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return; + DB_MASK_LOCK(ntb); ntb->db_mask &= ~bits; db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); @@ -1141,6 +1229,18 @@ uint64_t ntb_db_read(struct ntb_softc *ntb) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + uint64_t res; + unsigned i; + + res = 
0; + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if (ntb->msix_vec[i].masked != 0) + res |= ntb_db_vector_mask(ntb, i); + } + return (res); + } + return (db_ioread(ntb, ntb->self_reg->db_bell)); } @@ -1153,6 +1253,25 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits) (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + unsigned i; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if ((bits & ntb_db_vector_mask(ntb, i)) != 0) { + DB_MASK_LOCK(ntb); + if (ntb->msix_vec[i].masked != 0) { + /* XXX These need a public API. */ +#if 0 + pci_unmask_msix(ntb->device, i); +#endif + ntb->msix_vec[i].masked = 0; + } + DB_MASK_UNLOCK(ntb); + } + } + return; + } + db_iowrite(ntb, ntb->self_reg->db_bell, bits); } @@ -1179,6 +1298,19 @@ ntb_interrupt(struct ntb_softc *ntb, uint32_t vec) ntb_link_event(ntb); } + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + (vec_mask & ntb->db_link_mask) == 0) { + DB_MASK_LOCK(ntb); + if (ntb->msix_vec[vec].masked == 0) { + /* XXX These need a public API. */ +#if 0 + pci_mask_msix(ntb->device, vec); +#endif + ntb->msix_vec[vec].masked = 1; + } + DB_MASK_UNLOCK(ntb); + } + if ((vec_mask & ntb->db_valid_mask) != 0) ntb_db_event(ntb, vec); } @@ -1224,6 +1356,40 @@ ntb_free_msix_vec(struct ntb_softc *ntb) ntb->msix_vec = NULL; } +static void +ntb_get_msix_info(struct ntb_softc *ntb) +{ + struct pci_devinfo *dinfo; + struct pcicfg_msix *msix; + uint32_t laddr, data, i, offset; + + dinfo = device_get_ivars(ntb->device); + msix = &dinfo->cfg.msix; + + laddr = data = 0; + + CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data)); + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + offset = msix->msix_table_offset + i * PCI_MSIX_ENTRY_SIZE; + + laddr = bus_read_4(msix->msix_table_res, offset + + PCI_MSIX_ENTRY_LOWER_ADDR); + ntb_printf(2, "local lower MSIX addr(%u): 0x%x\n", i, laddr); + + KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE, + ("local MSIX addr 0x%x not in MSI base 0x%x", laddr, + MSI_INTEL_ADDR_BASE)); + ntb->msix_data[i].nmd_ofs = laddr & ~MSI_INTEL_ADDR_BASE; + + data = bus_read_4(msix->msix_table_res, offset + + PCI_MSIX_ENTRY_DATA); + ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data); + + ntb->msix_data[i].nmd_data = data; + } +} + static struct ntb_hw_info * ntb_get_device_info(uint32_t device_id) { @@ -1276,9 +1442,12 @@ ntb_detect_xeon(struct ntb_softc *ntb) if ((ppd & XEON_PPD_SPLIT_BAR) != 0) ntb->features |= NTB_SPLIT_BAR; - /* SB01BASE_LOCKUP errata is a superset of SDOORBELL errata */ + /* + * SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP + * errata workaround; only do one at a time. + */ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - ntb->features |= NTB_SDOORBELL_LOCKUP; + ntb->features &= ~NTB_SDOORBELL_LOCKUP; conn_type = ppd & XEON_PPD_CONN_TYPE; switch (conn_type) { @@ -1342,19 +1511,28 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) ntb->peer_reg = &xeon_b2b_reg; ntb->xlat_reg = &xeon_sec_xlat; - /* - * There is a Xeon hardware errata related to writes to SDOORBELL or - * B2BDOORBELL in conjunction with inbound access to NTB MMIO space, - * which may hang the system. To workaround this, use a memory - * window to access the interrupt and scratch pad registers on the - * remote system. 
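ntb_get_msix_info() above walks the device's MSI-X table through the mapped table BAR and records the address/data pair programmed for each vector; the PCI_MSIX_ENTRY_* constants added near the top of the file describe the 16-byte entry layout. The offset arithmetic can be sketched in plain userspace C (the table contents and helper names below are fabricated for illustration; the driver reads the real table with bus_read_4()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PCI_MSIX_ENTRY_SIZE        16
#define PCI_MSIX_ENTRY_LOWER_ADDR  0
#define PCI_MSIX_ENTRY_DATA        8

/* Stand-in for the mapped MSI-X table; the driver reads it via bus_read_4(). */
static uint8_t msix_table[3 * PCI_MSIX_ENTRY_SIZE];

static uint32_t
table_read_4(unsigned off)
{
    uint32_t v;

    memcpy(&v, &msix_table[off], sizeof(v));
    return (v);
}

static void
table_write_4(unsigned off, uint32_t v)
{
    memcpy(&msix_table[off], &v, sizeof(v));
}

int
main(void)
{
    unsigned i, off;

    /* Fabricated entries, only so the read-back prints something. */
    for (i = 0; i < 3; i++) {
        off = i * PCI_MSIX_ENTRY_SIZE;
        table_write_4(off + PCI_MSIX_ENTRY_LOWER_ADDR, 0xfee00000u | (i << 12));
        table_write_4(off + PCI_MSIX_ENTRY_DATA, 0x4020u + i);
    }
    for (i = 0; i < 3; i++) {
        off = i * PCI_MSIX_ENTRY_SIZE;
        printf("vec %u: addr_lo=0x%08x data=0x%08x\n", i,
            table_read_4(off + PCI_MSIX_ENTRY_LOWER_ADDR),
            table_read_4(off + PCI_MSIX_ENTRY_DATA));
    }
    return (0);
}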
- */ - if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) % + ntb->mw_count; + ntb_printf(2, "Setting up MSIX mw idx %d means %u\n", + g_ntb_msix_idx, ntb->msix_mw_idx); + rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx, + VM_MEMATTR_UNCACHEABLE); + KASSERT(rc == 0, ("shouldn't fail")); + } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + /* + * There is a Xeon hardware errata related to writes to SDOORBELL or + * B2BDOORBELL in conjunction with inbound access to NTB MMIO space, + * which may hang the system. To workaround this, use a memory + * window to access the interrupt and scratch pad registers on the + * remote system. + */ ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) % ntb->mw_count; ntb_printf(2, "Setting up b2b mw idx %d means %u\n", g_ntb_mw_idx, ntb->b2b_mw_idx); - rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, VM_MEMATTR_UNCACHEABLE); + rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, + VM_MEMATTR_UNCACHEABLE); KASSERT(rc == 0, ("shouldn't fail")); } else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14)) /* @@ -1385,7 +1563,14 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) /* * Mask all doorbell interrupts. */ - ntb_db_set_mask(ntb, ntb->db_valid_mask); + DB_MASK_LOCK(ntb); + ntb->db_mask = ntb->db_valid_mask; + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + DB_MASK_UNLOCK(ntb); + + rc = xeon_setup_msix_bar(ntb); + if (rc != 0) + return (rc); rc = ntb_init_isr(ntb); return (rc); @@ -1489,6 +1674,15 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx, bar_sz--; else bar_sz = 0; + } else if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx) { + /* Restrict LAPIC BAR to 1MB */ + pci_write_config(ntb->device, bar->psz_off, 20, 1); + pci_write_config(ntb->device, bar->ssz_off, 20, 1); + bar_sz = pci_read_config(ntb->device, bar->psz_off, 1); + bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1); + (void)bar_sz; + return; } pci_write_config(ntb->device, bar->ssz_off, bar_sz, 1); bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1); @@ -1499,28 +1693,37 @@ static void xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr, enum ntb_bar idx, enum ntb_bar regbar) { - uint64_t reg_val; + uint64_t reg_val, lmt_addr; uint32_t base_reg, lmt_reg; bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg); if (idx == regbar) bar_addr += ntb->b2b_off; + lmt_addr = bar_addr; + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx) + lmt_addr += ONE_MB; + + /* + * Set limit registers first to avoid an errata where setting the base + * registers locks the limit registers. 
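Both the existing hw.ntb.b2b_mw_idx tunable and the new hw.ntb.msix_mw_idx tunable accept negative values that count back from the last memory window, and the driver normalizes them with the same (mw_count + idx) % mw_count expression used above. A minimal standalone sketch of that mapping (the window count and sample values are made up):

#include <stdio.h>

/* The same normalization the driver applies to the *_mw_idx tunables. */
static int
resolve_mw_idx(int tunable, int mw_count)
{
    return ((mw_count + tunable) % mw_count);
}

int
main(void)
{
    int samples[] = { 0, 1, -1, -2 };
    int i;

    for (i = 0; i < 4; i++)
        printf("tunable %2d -> window %d of 3\n", samples[i],
            resolve_mw_idx(samples[i], 3));
    /* -1 picks the last window (2); -2 the one before it (1). */
    return (0);
}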
+ */ if (!bar_is_64bit(ntb, idx)) { - ntb_reg_write(4, base_reg, bar_addr); - reg_val = ntb_reg_read(4, base_reg); + ntb_reg_write(4, lmt_reg, lmt_addr); + reg_val = ntb_reg_read(4, lmt_reg); (void)reg_val; - ntb_reg_write(4, lmt_reg, bar_addr); - reg_val = ntb_reg_read(4, lmt_reg); + ntb_reg_write(4, base_reg, bar_addr); + reg_val = ntb_reg_read(4, base_reg); (void)reg_val; } else { - ntb_reg_write(8, base_reg, bar_addr); - reg_val = ntb_reg_read(8, base_reg); + ntb_reg_write(8, lmt_reg, lmt_addr); + reg_val = ntb_reg_read(8, lmt_reg); (void)reg_val; - ntb_reg_write(8, lmt_reg, bar_addr); - reg_val = ntb_reg_read(8, lmt_reg); + ntb_reg_write(8, base_reg, bar_addr); + reg_val = ntb_reg_read(8, base_reg); (void)reg_val; } } @@ -1542,6 +1745,37 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx) } static int +xeon_setup_msix_bar(struct ntb_softc *ntb) +{ + struct ntb_pci_bar_info *lapic_bar; + enum ntb_bar bar_num; + int rc; + + if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return (0); + + bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx); + lapic_bar = &ntb->bar_info[bar_num]; + + /* Restrict LAPIC BAR to 1MB */ + if (lapic_bar->size > ONE_MB) { + rc = bus_adjust_resource(ntb->device, SYS_RES_MEMORY, + lapic_bar->pci_resource, lapic_bar->pbase, + lapic_bar->pbase + ONE_MB - 1); + if (rc == 0) + lapic_bar->size = ONE_MB; + else { + ntb_printf(0, "Failed to shrink LAPIC BAR resource to " + "1 MB: %d\n", rc); + /* Ignore error */ + } + } + + ntb->peer_lapic_bar = lapic_bar; + return (0); +} + +static int xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr) { @@ -1619,6 +1853,43 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0); ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + size_t size, xlatoffset; + + switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) { + case NTB_B2B_BAR_1: + size = 8; + xlatoffset = XEON_SBAR2XLAT_OFFSET; + break; + case NTB_B2B_BAR_2: + xlatoffset = XEON_SBAR4XLAT_OFFSET; + if (HAS_FEATURE(NTB_SPLIT_BAR)) + size = 4; + else + size = 8; + break; + case NTB_B2B_BAR_3: + xlatoffset = XEON_SBAR5XLAT_OFFSET; + size = 4; + break; + default: + KASSERT(false, ("Bogus msix mw idx: %u", + ntb->msix_mw_idx)); + return (EINVAL); + } + + /* + * We point the chosen MSIX MW BAR xlat to remote LAPIC for + * workaround + */ + if (size == 4) + ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE); + else + ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE); + } + (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET); + (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET); + /* Zero outgoing translation limits (whole bar size windows) */ ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0); ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0); @@ -1656,14 +1927,21 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, } static inline bool +_xeon_link_is_up(struct ntb_softc *ntb) +{ + + if (ntb->conn_type == NTB_CONN_TRANSPARENT) + return (true); + return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0); +} + +static inline bool link_is_up(struct ntb_softc *ntb) { - if (ntb->type == NTB_XEON) { - if (ntb->conn_type == NTB_CONN_TRANSPARENT) - return (true); - return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0); - } + if (ntb->type == NTB_XEON) + return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good || + !HAS_FEATURE(NTB_SB01BASE_LOCKUP))); KASSERT(ntb->type == NTB_ATOM, ("ntb type")); return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) 
== 0); @@ -1881,6 +2159,8 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused, { uint32_t cntl; + ntb_printf(2, "%s\n", __func__); + if (ntb->type == NTB_ATOM) { pci_write_config(ntb->device, NTB_PPD_OFFSET, ntb->ppd | ATOM_PPD_INIT_LINK, 4); @@ -1919,6 +2199,8 @@ ntb_link_disable(struct ntb_softc *ntb) { uint32_t cntl; + ntb_printf(2, "%s\n", __func__); + if (ntb->conn_type == NTB_CONN_TRANSPARENT) { ntb_link_event(ntb); return (0); @@ -1934,6 +2216,23 @@ ntb_link_disable(struct ntb_softc *ntb) return (0); } +bool +ntb_link_enabled(struct ntb_softc *ntb) +{ + uint32_t cntl; + + if (ntb->type == NTB_ATOM) { + cntl = pci_read_config(ntb->device, NTB_PPD_OFFSET, 4); + return ((cntl & ATOM_PPD_INIT_LINK) != 0); + } + + if (ntb->conn_type == NTB_CONN_TRANSPARENT) + return (true); + + cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + return ((cntl & NTB_CNTL_LINK_DISABLE) == 0); +} + static void recover_atom_link(void *arg) { @@ -2002,6 +2301,19 @@ ntb_poll_link(struct ntb_softc *ntb) return (false); ntb->lnk_sta = reg_val; + + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (_xeon_link_is_up(ntb)) { + if (!ntb->peer_msix_good) { + callout_reset(&ntb->peer_msix_work, 0, + ntb_exchange_msix, ntb); + return (false); + } + } else { + ntb->peer_msix_good = false; + ntb->peer_msix_done = false; + } + } } return (true); } @@ -2040,16 +2352,26 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0, static void ntb_sysctl_init(struct ntb_softc *ntb) { - struct sysctl_oid_list *tree_par, *regpar, *statpar, *errpar; + struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree, *tmptree; ctx = device_get_sysctl_ctx(ntb->device); - - tree = SYSCTL_ADD_NODE(ctx, - SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device)), OID_AUTO, - "debug_info", CTLFLAG_RD, NULL, - "Driver state, statistics, and HW registers"); + globals = SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device)); + + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "link_status", + CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, + sysctl_handle_link_status_human, "A", + "Link status (human readable)"); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "active", + CTLFLAG_RD | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_status, + "IU", "Link status (1=active, 0=inactive)"); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "admin_up", + CTLFLAG_RW | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_admin, + "IU", "Set/get interface status (1=UP, 0=DOWN)"); + + tree = SYSCTL_ADD_NODE(ctx, globals, OID_AUTO, "debug_info", + CTLFLAG_RD, NULL, "Driver state, statistics, and HW registers"); tree_par = SYSCTL_CHILDREN(tree); SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "conn_type", CTLFLAG_RD, @@ -2081,10 +2403,6 @@ ntb_sysctl_init(struct ntb_softc *ntb) __DEVOLATILE(uint32_t *, &ntb->lnk_sta), 0, "LNK STA register (cached)"); - SYSCTL_ADD_PROC(ctx, tree_par, OID_AUTO, "link_status", - CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, sysctl_handle_link_status, - "A", "Link status"); - #ifdef notyet SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "mw_count", CTLFLAG_RD, &ntb->mw_count, 0, "MW count"); @@ -2332,7 +2650,37 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS) } static int -sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) +sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS) +{ + struct ntb_softc *ntb; + unsigned old, new; + int error; + + error = 0; + ntb = arg1; + + old = ntb_link_enabled(ntb); + + error = SYSCTL_OUT(req, &old, sizeof(old)); + if (error != 0 || req->newptr == NULL) + return (error); + + error = SYSCTL_IN(req, &new, sizeof(new)); + 
if (error != 0) + return (error); + + ntb_printf(0, "Admin set interface state to '%sabled'\n", + (new != 0)? "en" : "dis"); + + if (new != 0) + error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + else + error = ntb_link_disable(ntb); + return (error); +} + +static int +sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS) { struct ntb_softc *ntb; struct sbuf sb; @@ -2360,6 +2708,24 @@ sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) } static int +sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) +{ + struct ntb_softc *ntb; + unsigned res; + int error; + + error = 0; + ntb = arg1; + + res = ntb_link_is_up(ntb, NULL, NULL); + + error = SYSCTL_OUT(req, &res, sizeof(res)); + if (error || !req->newptr) + return (error); + return (EINVAL); +} + +static int sysctl_handle_register(SYSCTL_HANDLER_ARGS) { struct ntb_softc *ntb; @@ -2434,12 +2800,70 @@ static unsigned ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx) { - if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && - uidx >= ntb->b2b_mw_idx) - return (uidx + 1); + if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && + uidx >= ntb->b2b_mw_idx) || + (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx)) + uidx++; + if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && + uidx >= ntb->b2b_mw_idx) && + (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx)) + uidx++; return (uidx); } +static void +ntb_exchange_msix(void *ctx) +{ + struct ntb_softc *ntb; + uint32_t val; + unsigned i; + + ntb = ctx; + + if (ntb->peer_msix_done) + goto msix_done; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i, + ntb->msix_data[i].nmd_data); + ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i, + ntb->msix_data[i].nmd_ofs); + } + ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD); + + ntb_spad_read(ntb, NTB_MSIX_GUARD, &val); + if (val != NTB_MSIX_VER_GUARD) + goto reschedule; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val); + ntb->peer_msix_data[i].nmd_data = val; + ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val); + ntb->peer_msix_data[i].nmd_ofs = val; + } + + ntb->peer_msix_done = true; + +msix_done: + ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED); + ntb_spad_read(ntb, NTB_MSIX_DONE, &val); + if (val != NTB_MSIX_RECEIVED) + goto reschedule; + + ntb->peer_msix_good = true; + + ntb_poll_link(ntb); + ntb_link_event(ntb); + return; + +reschedule: + ntb->lnk_sta = pci_read_config(ntb->device, ntb->reg->lnk_sta, 2); + if (_xeon_link_is_up(ntb)) + callout_reset(&ntb->peer_msix_work, hz / 100, ntb_exchange_msix, ntb); + else + ntb_spad_clear(ntb); +} + /* * Public API to the rest of the OS */ @@ -2469,10 +2893,14 @@ ntb_get_max_spads(struct ntb_softc *ntb) uint8_t ntb_mw_count(struct ntb_softc *ntb) { + uint8_t res; + res = ntb->mw_count; if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0) - return (ntb->mw_count - 1); - return (ntb->mw_count); + res--; + if (ntb->msix_mw_idx != B2B_MW_DISABLED) + res--; + return (res); } /** @@ -2498,6 +2926,18 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val) return (0); } +/* + * Zeros the local scratchpad. 
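ntb_exchange_msix() above is a scratchpad handshake: each side publishes its MSI-X data/offset pairs plus a guard word into the peer's scratchpads, waits until the peer's guard shows up locally, copies the peer's values, and finally acknowledges with a DONE word; anything missing causes the callout to reschedule. The shape of that protocol can be sketched in plain C with two in-memory scratchpad arrays standing in for the hardware registers (purely illustrative; none of these names are the driver's API, and the driver retries via the peer_msix_work callout rather than a loop):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GUARD 0xaabbccdd    /* mirrors NTB_MSIX_VER_GUARD */
#define DONE  0xe0f0e0f0    /* mirrors NTB_MSIX_RECEIVED */
enum { SPAD_GUARD, SPAD_DATA0, SPAD_OFS0, SPAD_DONE, SPAD_MAX };

struct side {
    uint32_t spad[SPAD_MAX];    /* local scratchpad, written by the peer */
    struct side *peer;
    bool done, good;
};

/* One pass of the handshake; returns true once this side is synchronized. */
static bool
exchange(struct side *s, uint32_t my_data, uint32_t my_ofs)
{
    if (!s->done) {
        s->peer->spad[SPAD_DATA0] = my_data;
        s->peer->spad[SPAD_OFS0] = my_ofs;
        s->peer->spad[SPAD_GUARD] = GUARD;
        if (s->spad[SPAD_GUARD] != GUARD)
            return (false);       /* peer hasn't published yet */
        s->done = true;
    }
    s->peer->spad[SPAD_DONE] = DONE;
    if (s->spad[SPAD_DONE] != DONE)
        return (false);
    s->good = true;
    return (true);
}

int
main(void)
{
    struct side a = { .peer = NULL }, b = { .peer = NULL };
    int round = 0;

    a.peer = &b;
    b.peer = &a;
    while (!(a.good && b.good) && round < 5) {
        exchange(&a, 0x4021, 0x10);
        exchange(&b, 0x4022, 0x20);
        round++;
    }
    printf("synchronized after %d rounds\n", round);
    return (0);
}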
+ */ +void +ntb_spad_clear(struct ntb_softc *ntb) +{ + unsigned i; + + for (i = 0; i < ntb->spad_count; i++) + ntb_spad_write(ntb, i, 0); +} + /** * ntb_spad_read() - read from the primary scratchpad register * @ntb: pointer to ntb_softc instance @@ -2826,6 +3266,22 @@ void ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + struct ntb_pci_bar_info *lapic; + unsigned i; + + lapic = ntb->peer_lapic_bar; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if ((bit & ntb_db_vector_mask(ntb, i)) != 0) + bus_space_write_4(lapic->pci_bus_tag, + lapic->pci_bus_handle, + ntb->peer_msix_data[i].nmd_ofs, + ntb->peer_msix_data[i].nmd_data); + } + return; + } + if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit); return; diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h index c35166c..f05acda 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw.h +++ b/sys/dev/ntb/ntb_hw/ntb_hw.h @@ -70,6 +70,7 @@ bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *); void ntb_link_event(struct ntb_softc *); int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width); int ntb_link_disable(struct ntb_softc *); +bool ntb_link_enabled(struct ntb_softc *); int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *); void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **); @@ -86,6 +87,7 @@ int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode); int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode); uint8_t ntb_get_max_spads(struct ntb_softc *ntb); +void ntb_spad_clear(struct ntb_softc *ntb); int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val); int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val); int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h index f50fd93..fb445d7 100644 --- a/sys/dev/ntb/ntb_hw/ntb_regs.h +++ b/sys/dev/ntb/ntb_hw/ntb_regs.h @@ -44,6 +44,7 @@ #define XEON_DB_MSIX_VECTOR_COUNT 4 #define XEON_DB_MSIX_VECTOR_SHIFT 5 #define XEON_DB_LINK_BIT (1 << XEON_DB_LINK) +#define XEON_NONLINK_DB_MSIX_BITS 3 #define XEON_SPCICMD_OFFSET 0x0504 #define XEON_DEVCTRL_OFFSET 0x0598 diff --git a/sys/dev/sound/pci/hda/hdaa.c b/sys/dev/sound/pci/hda/hdaa.c index fe45343..14aee62 100644 --- a/sys/dev/sound/pci/hda/hdaa.c +++ b/sys/dev/sound/pci/hda/hdaa.c @@ -1553,20 +1553,20 @@ hdaa_widget_parse(struct hdaa_widget *w) SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - w, sizeof(w), hdaa_sysctl_caps, "A", "Node capabilities"); + w, 0, hdaa_sysctl_caps, "A", "Node capabilities"); if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { snprintf(buf, sizeof(buf), "nid%d_config", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - &w->wclass.pin.newconf, sizeof(&w->wclass.pin.newconf), - hdaa_sysctl_config, "A", "Current pin configuration"); + &w->wclass.pin.newconf, 0, hdaa_sysctl_config, "A", + "Current pin configuration"); snprintf(buf, sizeof(buf), "nid%d_original", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - &w->wclass.pin.original, sizeof(&w->wclass.pin.original), - 
hdaa_sysctl_config, "A", "Original pin configuration"); + &w->wclass.pin.original, 0, hdaa_sysctl_config, "A", + "Original pin configuration"); } hdaa_lock(w->devinfo); } @@ -6641,38 +6641,32 @@ hdaa_attach(device_t dev) SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - &devinfo->newquirks, sizeof(&devinfo->newquirks), - hdaa_sysctl_quirks, "A", "Configuration options"); + &devinfo->newquirks, 0, hdaa_sysctl_quirks, "A", + "Configuration options"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpi_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpi_state, "A", "GPI state"); + devinfo, 0, hdaa_sysctl_gpi_state, "A", "GPI state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpio_state, "A", "GPIO state"); + devinfo, 0, hdaa_sysctl_gpio_state, "A", "GPIO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpio_config, "A", "GPIO configuration"); + devinfo, 0, hdaa_sysctl_gpio_config, "A", "GPIO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpo_state, "A", "GPO state"); + devinfo, 0, hdaa_sysctl_gpo_state, "A", "GPO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpo_config, "A", "GPO configuration"); + devinfo, 0, hdaa_sysctl_gpo_config, "A", "GPO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "reconfig", CTLTYPE_INT | CTLFLAG_RW, - dev, sizeof(dev), - hdaa_sysctl_reconfig, "I", "Reprocess configuration"); + dev, 0, hdaa_sysctl_reconfig, "I", "Reprocess configuration"); bus_generic_attach(dev); return (0); } diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 80940be..2ec45c3 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -1046,10 +1046,8 @@ vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, - flags) == NULL) { - free(vm, M_VMEM); + flags) == NULL) return (NULL); - } return (vm); } diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile index dfddbce..98e685e 100644 --- a/sys/modules/dummynet/Makefile +++ b/sys/modules/dummynet/Makefile @@ -6,8 +6,9 @@ KMOD= dummynet SRCS= ip_dummynet.c SRCS+= ip_dn_glue.c ip_dn_io.c +SRCS+= dn_aqm_codel.c dn_aqm_pie.c SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c -SRCS+= dn_sched_prio.c +SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c SRCS+= opt_inet6.h .if !defined(KERNBUILDDIR) diff --git a/sys/modules/hyperv/utilities/Makefile b/sys/modules/hyperv/utilities/Makefile index f94e441..c1b6d4f 100644 --- a/sys/modules/hyperv/utilities/Makefile +++ b/sys/modules/hyperv/utilities/Makefile @@ -3,7 +3,7 @@ .PATH: ${.CURDIR}/../../../dev/hyperv/utilities 
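The subr_vmem.c hunk above removes the extra free() in vmem_create()'s failure path; the change only makes sense if vmem_init() already releases the arena itself when it fails, in which case freeing again in the caller would be a double free. A generic sketch of that ownership convention, with made-up names rather than the vmem API:

#include <stdio.h>
#include <stdlib.h>

struct arena { int dummy; };

/* On failure the callee releases 'a' itself and returns NULL. */
static struct arena *
arena_init(struct arena *a, int fail)
{
    if (fail) {
        free(a);
        return (NULL);
    }
    return (a);
}

static struct arena *
arena_create(int fail)
{
    struct arena *a;

    a = malloc(sizeof(*a));
    if (a == NULL)
        return (NULL);
    /* Do NOT free 'a' here on failure: arena_init() already did. */
    if (arena_init(a, fail) == NULL)
        return (NULL);
    return (a);
}

int
main(void)
{
    struct arena *a = arena_create(0);

    printf("create %s\n", a != NULL ? "succeeded" : "failed");
    free(a);
    return (0);
}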
KMOD= hv_utils -SRCS= hv_util.c hv_kvp.c +SRCS= hv_util.c hv_kvp.c hv_timesync.c hv_shutdown.c hv_heartbeat.c SRCS+= bus_if.h device_if.h CFLAGS+= -I${.CURDIR}/../../../dev/hyperv/include \ diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index 202f1e2..377b5b0 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -29,7 +29,7 @@ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H - +#define NEW_AQM /* * Definition of the kernel-userland API for dummynet. * @@ -85,7 +85,13 @@ enum { /* special commands for emulation of sysctl variables */ DN_SYSCTL_GET, DN_SYSCTL_SET, - +#ifdef NEW_AQM + /* subtypes used for setting/getting extra parameters. + * these subtypes used with IP_DUMMYNET3 command (get) + * and DN_TEXT (set). */ + DN_AQM_PARAMS, /* AQM extra params */ + DN_SCH_PARAMS, /* scheduler extra params */ +#endif DN_LAST, }; @@ -105,6 +111,9 @@ enum { /* user flags */ DN_IS_RED = 0x0020, DN_IS_GENTLE_RED= 0x0040, DN_IS_ECN = 0x0080, + #ifdef NEW_AQM + DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */ + #endif DN_PIPE_CMD = 0x1000, /* pipe config... */ }; @@ -210,7 +219,19 @@ struct dn_profile { int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; - +#ifdef NEW_AQM +/* Extra parameters for AQM and scheduler. + * This struct is used to pass and retrieve parameters (configurations) + * to/from AQM and Scheduler. + */ +struct dn_extra_parms { + struct dn_id oid; + char name[16]; + uint32_t nr; +#define DN_MAX_EXTRA_PARM 10 + int64_t par[DN_MAX_EXTRA_PARM]; +}; +#endif /* * Overall structure of dummynet diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 26b1788..f5b0fee 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -350,7 +350,7 @@ do { \ if ((head) != (sav)) { \ ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ (name), (head), (sav))); \ - continue; \ + break; \ } \ } while (0) diff --git a/sys/netpfil/ipfw/dn_aqm.h b/sys/netpfil/ipfw/dn_aqm.h new file mode 100644 index 0000000..d01e98e --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * API for writing an Active Queue Management algorithm for Dummynet + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_AQM_H +#define _IP_DN_AQM_H + + +/* NOW is the current time in millisecond*/ +#define NOW ((dn_cfg.curr_time * tick) / 1000) + +#define AQM_UNOW (dn_cfg.curr_time * tick) +#define AQM_TIME_1US ((aqm_time_t)(1)) +#define AQM_TIME_1MS ((aqm_time_t)(1000)) +#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000)) + +/* aqm time allows to store up to 4294 seconds */ +typedef uint32_t aqm_time_t; +typedef int32_t aqm_stime_t; + +#define DN_AQM_MTAG_TS 55345 + +/* Macro for variable bounding */ +#define BOUND_VAR(x,l,h) ((x) > (h)? (h) : ((x) > (l)? (x) : (l))) + +/* sysctl variable to count number of dropped packets */ +extern unsigned long io_pkt_drop; + +/* + * Structure for holding data and function pointers that together represent a + * AQM algorithm. + */ + struct dn_aqm { +#define DN_AQM_NAME_MAX 50 + char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */ + uint32_t type; /* AQM type number */ + + /* Methods implemented by AQM algorithm: + * + * enqueue enqueue packet 'm' on queue 'q'. + * Return 0 on success, 1 on drop. + * + * dequeue dequeue a packet from queue 'q'. + * Return a packet, NULL if no packet available. + * + * config configure AQM algorithm + * If required, this function should allocate space to store + * the configurations and set 'fs->aqmcfg' to point to this space. + * 'dn_extra_parms' includes array of parameters send + * from ipfw userland command. + * Return 0 on success, non-zero otherwise. + * + * deconfig deconfigure AQM algorithm. + * The allocated configuration memory space should be freed here. + * Return 0 on success, non-zero otherwise. + * + * init initialise AQM status variables of queue 'q' + * This function is used to allocate space and init AQM status for a + * queue and q->aqm_status to point to this space. + * Return 0 on success, non-zero otherwise. + * + * cleanup cleanup AQM status variables of queue 'q' + * The allocated memory space for AQM status should be freed here. + * Return 0 on success, non-zero otherwise. + * + * getconfig retrieve AQM configurations + * This function is used to return AQM parameters to userland + * command. The function should fill 'dn_extra_parms' struct with + * the AQM configurations using 'par' array. + * + */ + + int (*enqueue)(struct dn_queue *, struct mbuf *); + struct mbuf * (*dequeue)(struct dn_queue *); + int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int); + int (*deconfig)(struct dn_fsk *); + int (*init)(struct dn_queue *); + int (*cleanup)(struct dn_queue *); + int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *); + + int ref_count; /*Number of queues instances in the system */ + int cfg_ref_count; /*Number of AQM instances in the system */ + SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */ +}; + +/* Helper function to update queue and scheduler statistics. 
+ * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +update_stats(struct dn_queue *q, int len, int drop) +{ + int inc = 0; + struct dn_flow *sni; + struct dn_flow *qni; + + sni = &q->_si->ni; + qni = &q->ni; + + if (len < 0) + inc = -1; + else if(len > 0) + inc = 1; + + if (drop) { + qni->drops++; + sni->drops++; + io_pkt_drop++; + } else { + /*update queue stats */ + qni->length += inc; + qni->len_bytes += len; + + /*update scheduler instance stats */ + sni->length += inc; + sni->len_bytes += len; + } + /* tot_pkts is updated in dn_enqueue function */ +} + + +/* kernel module related function */ +int +dn_aqm_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNAQM_MODULE(name, dnaqm) \ + static moduledata_t name##_mod = { \ + #name, dn_aqm_modevent, dnaqm \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_codel.c b/sys/netpfil/ipfw/dn_aqm_codel.c new file mode 100644 index 0000000..0080170 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_codel.c @@ -0,0 +1,444 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
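update_stats() above folds the four enqueue/dequeue/drop cases into one helper: a positive length means enqueue, a negative length means dequeue, and the drop flag counts a drop instead of touching the queue lengths. The same convention in a standalone form (the stats structure is flattened here and is not the dummynet type):

#include <stdio.h>

struct qstats {
    long length;    /* packets queued */
    long len_bytes; /* bytes queued */
    long drops;
};

/* len > 0: enqueue, len < 0: dequeue, drop != 0: count a drop instead. */
static void
update_stats(struct qstats *q, int len, int drop)
{
    int inc = (len > 0) - (len < 0);

    if (drop) {
        q->drops++;
        return;
    }
    q->length += inc;
    q->len_bytes += len;
}

int
main(void)
{
    struct qstats q = { 0, 0, 0 };

    update_stats(&q, 1500, 0);   /* enqueue a 1500-byte packet */
    update_stats(&q, -1500, 0);  /* dequeue it again */
    update_stats(&q, 0, 1);      /* tail drop before enqueue */
    printf("len=%ld bytes=%ld drops=%ld\n", q.length, q.len_bytes, q.drops);
    return (0);
}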
+ */ + +#include <sys/cdefs.h> +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> + +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/if_ether.h> /* various ether_* routines */ +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> +#include <netpfil/ipfw/dn_heap.h> + +#ifdef NEW_AQM +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_codel.h> +#include <netpfil/ipfw/dn_sched.h> + +#define DN_AQM_CODEL 1 + +static struct dn_aqm codel_desc; + +/* default codel parameters */ +struct dn_aqm_codel_parms codel_sysctl = {5000 * AQM_TIME_1US, + 100000 * AQM_TIME_1US, 0}; + +static int +codel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.interval = value * AQM_TIME_1US ; + return (0); +} + +static int +codel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + D("%ld", value); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.target = value * AQM_TIME_1US ; + return (0); +} + +/* defining Codel sysctl variables */ +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + codel, CTLFLAG_RW, 0, "CODEL"); + +#ifdef SYSCTL_NODE +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,codel_sysctl_target_handler, "L", + "CoDel target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, codel_sysctl_interval_handler, "L", + "CoDel interval in microsecond"); +#endif + +/* This function computes codel_interval/sqrt(count) + * Newton's method of approximation is used to compute 1/sqrt(count). + * http://betterexplained.com/articles/ + * understanding-quakes-fast-inverse-square-root/ + */ +aqm_time_t +control_law(struct codel_status *cst, struct dn_aqm_codel_parms *cprms, + aqm_time_t t) +{ + uint32_t count; + uint64_t temp; + count = cst->count; + + /* we don't calculate isqrt(1) to get more accurate result*/ + if (count == 1) { + /* prepare isqrt (old guess) for the next iteration i.e. 
1/sqrt(2)*/ + cst->isqrt = (1UL<< FIX_POINT_BITS) * 7/10; + /* return time + isqrt(1)*interval */ + return t + cprms->interval; + } + + /* newguess = g(1.5 - 0.5*c*g^2) + * Multiplying both sides by 2 to make all the constants intergers + * newguess * 2 = g(3 - c*g^2) g=old guess, c=count + * So, newguess = newguess /2 + * Fixed point operations are used here. + */ + + /* Calculate g^2 */ + temp = (uint32_t) cst->isqrt * cst->isqrt; + /* Calculate (3 - c*g^2) i.e. (3 - c * temp) */ + temp = (3ULL<< (FIX_POINT_BITS*2)) - (count * temp); + + /* + * Divide by 2 because we multiplied the original equation by two + * Also, we shift the result by 8 bits to prevent overflow. + * */ + temp >>= (1 + 8); + + /* Now, temp = (1.5 - 0.5*c*g^2) + * Calculate g (1.5 - 0.5*c*g^2) i.e. g * temp + */ + temp = (cst->isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8); + cst->isqrt = temp; + + /* calculate codel_interval/sqrt(count) */ + return t + ((cprms->interval * temp) >> FIX_POINT_BITS); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * Also extract packet's timestamp from mtag. + */ +struct mbuf * +codel_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("Codel timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + +/* + * Enqueue a packet 'm' in queue 'q' + */ +static int +aqm_codel_enqueue(struct dn_queue *q, struct mbuf *m) +{ + struct dn_fs *f; + uint64_t len; + struct codel_status *cst; /*codel status variables */ + struct m_tag *mtag; + + f = &(q->fs->fs); + len = m->m_pkthdr.len; + cst = q->aqm_status; + if(!cst) { + D("Codel queue is not initialized\n"); + goto drop; + } + + /* Finding maximum packet size */ + // XXX we can get MTU from driver instead + if (len > cst->maxpkt_size) + cst->maxpkt_size = len; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (f->flags & DN_QSIZE_BYTES) { + if ( q->ni.len_bytes > f->qsize) + goto drop; + } + else { + if ( q->ni.length >= f->qsize) + goto drop; + } + + /* Add timestamp as mtag */ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + +drop: + update_stats(q, 0, 1); + FREE_PKT(m); + return (1); +} + + +/* Dequeue a pcaket from queue q */ +static struct mbuf * +aqm_codel_dequeue(struct dn_queue *q) +{ + return codel_dequeue(q); +} + +/* + * initialize Codel for queue 'q' + * First allocate memory for codel status. 
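control_law() above computes interval/sqrt(count) without floating point by keeping a Q16 fixed-point estimate of 1/sqrt(count) and refining it with one Newton step, g' = g * (3 - c*g^2) / 2, per call. A self-contained rendition of that iteration, printed next to the floating-point value it approximates (only FIX_POINT_BITS is taken from the header; the rest is illustrative, and, as in the driver, one refinement per call only tracks the exact value approximately because count normally grows by one drop at a time):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define FIX_POINT_BITS 16

/* One Newton refinement of isqrt ~= 1/sqrt(count), in Q16 fixed point. */
static uint32_t
isqrt_step(uint32_t isqrt, uint32_t count)
{
    uint64_t temp;

    temp = (uint64_t)isqrt * isqrt;                        /* g^2 */
    temp = (3ULL << (FIX_POINT_BITS * 2)) - count * temp;  /* 3 - c*g^2 */
    temp >>= 1 + 8;                                        /* /2, drop 8 bits */
    return ((uint32_t)((isqrt * temp) >> (2 * FIX_POINT_BITS - 8)));
}

int
main(void)
{
    uint32_t isqrt = (1UL << FIX_POINT_BITS) * 7 / 10;     /* seed ~ 1/sqrt(2) */
    uint32_t count;

    for (count = 2; count <= 6; count++) {
        isqrt = isqrt_step(isqrt, count);
        printf("count=%u fixed=%.4f exact=%.4f\n", count,
            (double)isqrt / (1 << FIX_POINT_BITS), 1.0 / sqrt((double)count));
    }
    return (0);
}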
+ */ +static int +aqm_codel_init(struct dn_queue *q) +{ + struct codel_status *cst; + + if (!q->fs->aqmcfg) { + D("Codel is not configure!d"); + return EINVAL; + } + + q->aqm_status = malloc(sizeof(struct codel_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("Cannot allocate AQM_codel private data"); + return ENOMEM ; + } + + /* init codel status variables */ + cst = q->aqm_status; + cst->dropping=0; + cst->first_above_time=0; + cst->drop_next_time=0; + cst->count=0; + cst->maxpkt_size = 500; + + /* increase reference counters */ + codel_desc.ref_count++; + + return 0; +} + +/* + * Clean up Codel status for queue 'q' + * Destroy memory allocated for codel status. + */ +static int +aqm_codel_cleanup(struct dn_queue *q) +{ + + if (q && q->aqm_status) { + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + /* decrease reference counters */ + codel_desc.ref_count--; + } + else + D("Codel already cleaned up"); + return 0; +} + +/* + * Config codel parameters + * also allocate memory for codel configurations + */ +static int +aqm_codel_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_codel_parms *ccfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the original allocation + * not the same size as the new one (different AQM type). + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_codel_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate AQM_codel configuration parameters"); + return ENOMEM; + } + + /* configure codel parameters */ + ccfg = fs->aqmcfg; + + if (ep->par[0] < 0) + ccfg->target = codel_sysctl.target; + else + ccfg->target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + ccfg->interval = codel_sysctl.interval; + else + ccfg->interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + ccfg->flags = 0; + else + ccfg->flags = ep->par[2]; + + /* bound codel configurations */ + ccfg->target = BOUND_VAR(ccfg->target,1, 5 * AQM_TIME_1S); + ccfg->interval = BOUND_VAR(ccfg->interval,1, 5 * AQM_TIME_1S); + /* increase config reference counter */ + codel_desc.cfg_ref_count++; + + return 0; +} + +/* + * Deconfigure Codel and free memory allocation + */ +static int +aqm_codel_deconfig(struct dn_fsk* fs) +{ + + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + fs->aqmfp = NULL; + /* decrease config reference counter */ + codel_desc.cfg_ref_count--; + } + + return 0; +} + +/* + * Retrieve Codel configuration parameters. 
+ */ +static int +aqm_codel_getconfig(struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_codel_parms *ccfg; + + if (fs->aqmcfg) { + strcpy(ep->name, codel_desc.name); + ccfg = fs->aqmcfg; + ep->par[0] = ccfg->target / AQM_TIME_1US; + ep->par[1] = ccfg->interval / AQM_TIME_1US; + ep->par[2] = ccfg->flags; + return 0; + } + return 1; +} + +static struct dn_aqm codel_desc = { + _SI( .type = ) DN_AQM_CODEL, + _SI( .name = ) "CODEL", + _SI( .enqueue = ) aqm_codel_enqueue, + _SI( .dequeue = ) aqm_codel_dequeue, + _SI( .config = ) aqm_codel_config, + _SI( .getconfig = ) aqm_codel_getconfig, + _SI( .deconfig = ) aqm_codel_deconfig, + _SI( .init = ) aqm_codel_init, + _SI( .cleanup = ) aqm_codel_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_codel, &codel_desc); + + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_codel.h b/sys/netpfil/ipfw/dn_aqm_codel.h new file mode 100644 index 0000000..f5618e7 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_codel.h @@ -0,0 +1,222 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_AQM_CODEL_H +#define _IP_DN_AQM_CODEL_H + + +// XXX How to choose MTAG? 
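aqm_codel_config() and aqm_codel_getconfig() above translate between the userland dn_extra_parms.par[] array and kernel time units: par[0] and par[1] carry target and interval in microseconds, with negative values meaning "use the sysctl default", and par[2] carries the flags. A simplified userspace sketch of that translation (struct and defaults reduced to the essentials; not the dummynet types):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t aqm_time_t;
#define AQM_TIME_1US ((aqm_time_t)1)

struct codel_parms {
    aqm_time_t target;
    aqm_time_t interval;
    uint32_t flags;
};

/* Defaults matching the codel_sysctl initializer above: 5 ms / 100 ms. */
static const struct codel_parms defaults = {
    5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, 0
};

static void
codel_config(struct codel_parms *cfg, const int64_t par[3])
{
    cfg->target = par[0] < 0 ? defaults.target :
        (aqm_time_t)(par[0] * AQM_TIME_1US);
    cfg->interval = par[1] < 0 ? defaults.interval :
        (aqm_time_t)(par[1] * AQM_TIME_1US);
    cfg->flags = par[2] < 0 ? 0 : (uint32_t)par[2];
}

int
main(void)
{
    /* 20 ms target, default interval, ECN flag set. */
    int64_t par[3] = { 20000, -1, 1 };
    struct codel_parms cfg;

    codel_config(&cfg, par);
    printf("target=%uus interval=%uus flags=%#x\n",
        cfg.target, cfg.interval, cfg.flags);
    return (0);
}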
+#define FIX_POINT_BITS 16 + +enum { + CODEL_ECN_ENABLED = 1 +}; + +/* Codel parameters */ +struct dn_aqm_codel_parms { + aqm_time_t target; + aqm_time_t interval; + uint32_t flags; +}; + +/* codel status variables */ +struct codel_status { + uint32_t count; /* number of dropped pkts since entering drop state */ + uint16_t dropping; /* dropping state */ + aqm_time_t drop_next_time; /* time for next drop */ + aqm_time_t first_above_time; /* time for first ts over target we observed */ + uint16_t isqrt; /* last isqrt for control low */ + uint16_t maxpkt_size; /* max packet size seen so far */ +}; + +struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *); +aqm_time_t control_law(struct codel_status *, + struct dn_aqm_codel_parms *, aqm_time_t ); + +__inline static struct mbuf * +codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop) +{ + struct mbuf * m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = codel_extract_head(q, &pkt_ts); + + cst = q->aqm_status; + + if (m == NULL) { + /* queue is empty - we can't be above target */ + cst->first_above_time= 0; + return m; + } + + cprms = q->fs->aqmcfg; + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) { + /* went below - stay below for at least interval */ + cst->first_above_time = 0; + } else { + if (cst->first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + cst->first_above_time = now + cprms->interval; + } else if (now >= cst->first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* + * Dequeue a packet from queue 'q' + */ +__inline static struct mbuf * +codel_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + + cst = q->aqm_status;; + cprms = q->fs->aqmcfg; + now = AQM_UNOW; + + m = codel_dodequeue(q, now, &ok_to_drop); + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + /* + * Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. 
*/ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + return m; + } + + /* drop the packet */ + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? + cst->count - 2 : 1; + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_pie.c b/sys/netpfil/ipfw/dn_aqm_pie.c new file mode 100644 index 0000000..c4b9401 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_pie.c @@ -0,0 +1,793 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
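codel_dodequeue()/codel_dequeue() above implement the CoDel state machine: a packet only becomes a drop candidate once its sojourn time has stayed above target for a full interval, tracked in first_above_time, and the real code additionally treats a backlog of at most one MTU as "below target" so the link never starves. The "ok to drop" test can be exercised on its own with synthetic timestamps (self-contained sketch, not the dummynet queue types):

#include <stdint.h>
#include <stdio.h>

#define TARGET   5000u      /* 5 ms, in microseconds */
#define INTERVAL 100000u    /* 100 ms */

static uint32_t first_above_time;

/* Return 1 when the sojourn time has been above TARGET for a full INTERVAL. */
static int
ok_to_drop(uint32_t sojourn, uint32_t now)
{
    if (sojourn < TARGET) {
        first_above_time = 0;   /* went below: reset the timer */
        return (0);
    }
    if (first_above_time == 0) {
        first_above_time = now + INTERVAL;  /* just went above */
        return (0);
    }
    return (now >= first_above_time);
}

int
main(void)
{
    uint32_t now;

    /* Sojourn time stuck at 8 ms: drops become eligible after 100 ms. */
    for (now = 0; now <= 200000; now += 50000)
        printf("t=%6uus ok_to_drop=%d\n", now, ok_to_drop(8000, now));
    return (0);
}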
+ */ + +#include <sys/cdefs.h> +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> + +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/if_ether.h> /* various ether_* routines */ +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> +#include <netpfil/ipfw/dn_heap.h> + +#ifdef NEW_AQM +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_pie.h> +#include <netpfil/ipfw/dn_sched.h> + +/* for debugging */ +#include <sys/syslog.h> + +static struct dn_aqm pie_desc; + +/* PIE defaults + * target=15ms, tupdate=15ms, max_burst=150ms, + * max_ecnth=0.1, alpha=0.125, beta=1.25, + */ +struct dn_aqm_pie_parms pie_sysctl = + { 15 * AQM_TIME_1MS, 15 * AQM_TIME_1MS, 150 * AQM_TIME_1MS, + PIE_SCALE/10 , PIE_SCALE * 0.125, PIE_SCALE * 1.25 , + PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED | PIE_DERAND_ENABLED }; + +static int +pie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = pie_sysctl.alpha; + else + value = pie_sysctl.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + pie_sysctl.alpha = value; + else + pie_sysctl.beta = value; + return (0); +} + +static int +pie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = pie_sysctl.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = pie_sysctl.tupdate; + else + value = pie_sysctl.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + pie_sysctl.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + pie_sysctl.tupdate = value; + else + pie_sysctl.max_burst = value; + return (0); +} + +static int +pie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = pie_sysctl.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + pie_sysctl.max_ecnth = value; + return (0); +} + +/* define PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + pie, CTLFLAG_RW, 0, "PIE"); + +#ifdef SYSCTL_NODE 
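The PIE sysctl handlers above expose alpha, beta and max_ecnth as integers scaled by 1000 while storing them internally as PIE_SCALE fixed-point fractions, converting on every read and write. PIE_SCALE itself is defined in dn_aqm_pie.h and is not visible in this hunk, so the sketch below uses a stand-in value purely to show the round trip:

#include <stdio.h>

/* Stand-in fixed-point unit; the real PIE_SCALE lives in dn_aqm_pie.h. */
#define PIE_SCALE 4096L

/* sysctl read: internal fixed point -> value scaled by 1000 */
static long
to_sysctl(long fixed)
{
    return (fixed * 1000 / PIE_SCALE);
}

/* sysctl write: value scaled by 1000 -> internal fixed point */
static long
from_sysctl(long milli)
{
    return (milli * PIE_SCALE / 1000);
}

int
main(void)
{
    long alpha = PIE_SCALE * 0.125;   /* default from the patch */
    long beta = PIE_SCALE * 1.25;

    printf("alpha: internal=%ld sysctl=%ld\n", alpha, to_sysctl(alpha));
    printf("beta : internal=%ld sysctl=%ld\n", beta, to_sysctl(beta));
    printf("write 125 -> internal %ld\n", from_sysctl(125));
    return (0);
}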
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "PIE alpha scaled by 1000"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "beta scaled by 1000"); +#endif + + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of PIE + * status variables as an argument + */ +static void +calculate_drop_prob(void *x) +{ + int64_t p, prob, oldprob; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst = (struct pie_status *) x; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(x, M_DUMMYNET); + //pst->pq->aqm_status = NULL; + pie_desc.ref_count--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)pst->pq->ni.len_bytes * + pst->avg_dq_time) >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */ + p >>= 11 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */ + p >>= 9 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */ + p >>= 7 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */ + p >>= 5 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */ + p >>= 3 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */ + p >>= 1 + PIE_FIX_POINT_BITS + 12; + else + p >>= PIE_FIX_POINT_BITS + 12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1- 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication overflow/underflow */ + if (p>0) { + if (prob<oldprob) { + D("overflow"); + prob= PIE_MAX_PROB; + } + 
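+		/*
+		 * p and prob are int64_t, so with p > 0 a wrap past the
+		 * 63-bit range shows up as prob < oldprob; the probability
+		 * is then pinned to PIE_MAX_PROB, and the mirror case below
+		 * handles p < 0 by pinning it to 0.
+		 */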
} + else + if (prob>oldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current queue delay value in old queue delay*/ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance>0) { + + if (pst->burst_allowance > pprms->tupdate ) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + /* reschedule calculate_drop_prob function */ + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, 0, calculate_drop_prob, pst, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +static struct mbuf * +pie_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts, int getts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Initiate PIE variable and optionally activate it + */ +__inline static void +init_activate_pie(struct pie_status *pst, int resettimer) +{ + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT; + pst->measurement_start = AQM_UNOW; + + if (resettimer) { + pst->sflags |= PIE_ACTIVE; + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, calculate_drop_prob, pst, 0); + } + //DX(2, "PIE Activated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Deactivate PIE and stop probe update callout + */ +__inline static void +deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Dequeue and return a pcaket from queue 'q' or NULL if 'q' is empty. 
+ * Also, caculate depature time or queue delay using timestamp + */ +static struct mbuf * +aqm_pie_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_flow *ni; /* stats for scheduler instance */ + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = q->aqm_status; + pprms = pst->parms; + ni = &q->_si->ni; + + /*we extarct packet ts only when Departure Rate Estimation dis not used*/ + m = pie_extract_head(q, &pkt_ts, !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average depature time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start new measurment cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->ni.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + +/* + * Enqueue a packet in q, subject to space and PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +static int +aqm_pie_enqueue(struct dn_queue *q, struct mbuf* m) +{ + struct dn_fs *f; + uint64_t len; + uint32_t qlen; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = q->aqm_status; + if(!pst) { + DX(2, "PIE queue is not initialized\n"); + update_stats(q, 0, 1); + FREE_PKT(m); + return 1; + } + + f = &(q->fs->fs); + pprms = pst->parms; + t = ENQUE; + + /* get current queue length in bytes or packets*/ + qlen = (f->flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (qlen >= f->qsize) + t = DROP; + /* drop/mark the packet when PIE is active and burst time elapsed */ + else if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance==0 + && drop_early(pst, q->ni.len_bytes) == DROP) { + /* + * if drop_prob over ECN threshold, drop the packet + * otherwise mark and enqueue it. 
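+		 * With the default max_ecnth of PIE_SCALE / 10, the shift by
+		 * (PIE_PROB_BITS - PIE_FIX_POINT_BITS) = 18 bits below places
+		 * the threshold at roughly 10% of PIE_MAX_PROB, i.e. on the
+		 * same scale as drop_prob.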
+ */ + if ((pprms->flags & PIE_ECN_ENABLED) && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && qlen >= pst->one_third_q_size) { + init_activate_pie(pst, 1); + } + + /* Reset burst tolerance and optinally turn PIE off*/ + if ((pst->sflags & PIE_ACTIVE) && pst->drop_prob == 0 && + pst->current_qdelay < (pprms->qdelay_ref >> 1) && + pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if ((pprms->flags & PIE_ON_OFF_MODE_ENABLED) && qlen<=0) + deactivate_pie(pst); + } + + /* Timestamp the packet if Departure Rate Estimation is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + t = DROP; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + + if (t != DROP) { + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + } else { + update_stats(q, 0, 1); + + /* reset accu_prob after packet drop */ + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + return 0; +} + +/* + * initialize PIE for queue 'q' + * First allocate memory for PIE status. + */ +static int +aqm_pie_init(struct dn_queue *q) +{ + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int err = 0; + + pprms = q->fs->aqmcfg; + + do { /* exit with break when error occurs*/ + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + break; + } + + q->aqm_status = malloc(sizeof(struct pie_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("cannot allocate PIE private data"); + err = ENOMEM ; + break; + } + + pst = q->aqm_status; + /* increase reference count for PIE module */ + pie_desc.ref_count++; + + pst->pq = q; + pst->parms = pprms; + + /* For speed optimization, we caculate 1/3 queue size once here */ + // we can use x/3 = (x >>2) + (x >>4) + (x >>7) + pst->one_third_q_size = q->fs->fs.qsize/3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + + pst->current_qdelay = 0; + init_activate_pie(pst, !(pprms->flags & PIE_ON_OFF_MODE_ENABLED)); + + //DX(2, "aqm_PIE_init"); + + } while(0); + + return err; +} + +/* + * Clean up PIE status for queue 'q' + * Destroy memory allocated for PIE status. 
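+ * Returns 0 once the status has been freed, or EBUSY when the
+ * drop-probability callout could not be stopped; in that case the callout
+ * itself frees the status the next time it runs.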
+ */ +static int +aqm_pie_cleanup(struct dn_queue *q) +{ + + if(!q) { + D("q is null"); + return 0; + } + struct pie_status *pst = q->aqm_status; + if(!pst) { + //D("queue is already cleaned up"); + return 0; + } + if(!q->fs || !q->fs->aqmcfg) { + D("fs is null or no cfg"); + return 1; + } + if (q->fs->aqmfp && q->fs->aqmfp->type !=DN_AQM_PIE) { + D("Not PIE fs (%d)", q->fs->fs.fs_nr); + return 1; + } + + mtx_lock(&pst->lock_mtx); + + /* stop callout timer */ + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + pie_desc.ref_count--; + return 0; + } else { + q->aqm_status = NULL; + mtx_unlock(&pst->lock_mtx); + DX(2, "PIE callout has not been stoped from cleanup!"); + return EBUSY; + } + return 0; +} + +/* + * Config PIE parameters + * also allocate memory for PIE configurations + */ +static int +aqm_pie_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_pie_parms *pcfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the orignal allocation + * was used for diffirent AQM type. + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_pie_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate PIE configuration parameters"); + return ENOMEM; + } + + /* par array contains pie configuration as follow + * 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + */ + + /* configure PIE parameters */ + pcfg = fs->aqmcfg; + + if (ep->par[0] < 0) + pcfg->qdelay_ref = pie_sysctl.qdelay_ref * AQM_TIME_1US; + else + pcfg->qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + pcfg->tupdate = pie_sysctl.tupdate * AQM_TIME_1US; + else + pcfg->tupdate = ep->par[1]; + if (ep->par[2] < 0) + pcfg->max_burst = pie_sysctl.max_burst * AQM_TIME_1US; + else + pcfg->max_burst = ep->par[2]; + if (ep->par[3] < 0) + pcfg->max_ecnth = pie_sysctl.max_ecnth; + else + pcfg->max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + pcfg->alpha = pie_sysctl.alpha; + else + pcfg->alpha = ep->par[4]; + if (ep->par[5] < 0) + pcfg->beta = pie_sysctl.beta; + else + pcfg->beta = ep->par[5]; + if (ep->par[6] < 0) + pcfg->flags = pie_sysctl.flags; + else + pcfg->flags = ep->par[6]; + + /* bound PIE configurations */ + pcfg->qdelay_ref = BOUND_VAR(pcfg->qdelay_ref, 1, 10 * AQM_TIME_1S); + pcfg->tupdate = BOUND_VAR(pcfg->tupdate, 1, 10 * AQM_TIME_1S); + pcfg->max_burst = BOUND_VAR(pcfg->max_burst, 0, 10 * AQM_TIME_1S); + pcfg->max_ecnth = BOUND_VAR(pcfg->max_ecnth, 0, PIE_SCALE); + pcfg->alpha = BOUND_VAR(pcfg->alpha, 0, 7 * PIE_SCALE); + pcfg->beta = BOUND_VAR(pcfg->beta, 0 , 7 * PIE_SCALE); + + pie_desc.cfg_ref_count++; + //D("pie cfg_ref_count=%d", pie_desc.cfg_ref_count); + return 0; +} + +/* + * Deconfigure PIE and free memory allocation + */ +static int +aqm_pie_deconfig(struct dn_fsk* fs) +{ + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + pie_desc.cfg_ref_count--; + } + return 0; +} + +/* + * Retrieve PIE configuration parameters. 
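+ * Time values are reported back in microseconds, while max_ecnth, alpha,
+ * beta and the flags are returned in their internal fixed-point/bitmask
+ * form.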
+ */ +static int +aqm_pie_getconfig (struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_pie_parms *pcfg; + if (fs->aqmcfg) { + strcpy(ep->name, pie_desc.name); + pcfg = fs->aqmcfg; + ep->par[0] = pcfg->qdelay_ref / AQM_TIME_1US; + ep->par[1] = pcfg->tupdate / AQM_TIME_1US; + ep->par[2] = pcfg->max_burst / AQM_TIME_1US; + ep->par[3] = pcfg->max_ecnth; + ep->par[4] = pcfg->alpha; + ep->par[5] = pcfg->beta; + ep->par[6] = pcfg->flags; + + return 0; + } + return 1; +} + +static struct dn_aqm pie_desc = { + _SI( .type = ) DN_AQM_PIE, + _SI( .name = ) "PIE", + _SI( .ref_count = ) 0, + _SI( .cfg_ref_count = ) 0, + _SI( .enqueue = ) aqm_pie_enqueue, + _SI( .dequeue = ) aqm_pie_dequeue, + _SI( .config = ) aqm_pie_config, + _SI( .deconfig = ) aqm_pie_deconfig, + _SI( .getconfig = ) aqm_pie_getconfig, + _SI( .init = ) aqm_pie_init, + _SI( .cleanup = ) aqm_pie_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_pie, &pie_desc); +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_pie.h b/sys/netpfil/ipfw/dn_aqm_pie.h new file mode 100644 index 0000000..aa2fceb --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_pie.h @@ -0,0 +1,153 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _IP_DN_AQM_PIE_H +#define _IP_DN_AQM_PIE_H + +#define DN_AQM_PIE 2 +#define PIE_DQ_THRESHOLD_BITS 14 +/* 2^14 =16KB */ +#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS) +#define MEAN_PKTSIZE 800 + +/* 31-bits because random() generates range from 0->(2**31)-1 */ +#define PIE_PROB_BITS 31 +#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1) + +/* for 16-bits, we have 3-bits for integer part and 13-bits for fraction */ +#define PIE_FIX_POINT_BITS 13 +#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS) + + +/* PIE options */ +enum { + PIE_ECN_ENABLED =1, + PIE_CAPDROP_ENABLED = 2, + PIE_ON_OFF_MODE_ENABLED = 4, + PIE_DEPRATEEST_ENABLED = 8, + PIE_DERAND_ENABLED = 16 +}; + +/* PIE parameters */ +struct dn_aqm_pie_parms { + aqm_time_t qdelay_ref; /* AQM Latency Target (default: 15ms) */ + aqm_time_t tupdate; /* a period to calculate drop probability (default:15ms) */ + aqm_time_t max_burst; /* AQM Max Burst Allowance (default: 150ms) */ + uint16_t max_ecnth; /*AQM Max ECN Marking Threshold (default: 10%) */ + uint16_t alpha; /* (default: 1/8) */ + uint16_t beta; /* (default: 1+1/4) */ + uint32_t flags; /* PIE options */ +}; + +/* PIE status variables */ +struct pie_status{ + struct callout aqm_pie_callout; + aqm_time_t burst_allowance; + uint32_t drop_prob; + aqm_time_t current_qdelay; + aqm_time_t qdelay_old; + uint64_t accu_prob; + aqm_time_t measurement_start; + aqm_time_t avg_dq_time; + uint32_t dq_count; + uint32_t sflags; + struct dn_aqm_pie_parms *parms; /* pointer to PIE configurations */ + /* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */ + struct dn_queue *pq; + struct mtx lock_mtx; + uint32_t one_third_q_size; /* 1/3 of queue size, for speed optization */ +}; + +enum { + ENQUE = 1, + DROP, + MARKECN +}; + +/* PIE current state */ +enum { + PIE_ACTIVE = 1, + PIE_INMEASUREMENT = 2 +}; + +/* + * Check if eneque should drop packet to control delay or not based on + * PIe algorithm. + * return DROP if it is time to drop or ENQUE otherwise. + * This function is used by PIE and FQ-PIE. 
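+ * Worked example of the de-randomization below: with drop_prob steady at
+ * PIE_MAX_PROB / 5 (20%) and the queue above the 2 * MEAN_PKTSIZE floor,
+ * accu_prob grows by 0.2 of PIE_MAX_PROB per packet, so the first four
+ * packets stay under the 0.85 threshold and are always enqueued; from the
+ * fifth packet on the random() test gets a chance to drop, and a drop
+ * resets accu_prob to 0.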
+ */ +__inline static int +drop_early(struct pie_status *pst, uint32_t qlen) +{ + struct dn_aqm_pie_parms *pprms; + + pprms = pst->parms; + + /* queue is not congested */ + + if ((pst->qdelay_old < (pprms->qdelay_ref >> 1) + && pst->drop_prob < PIE_MAX_PROB / 5 ) + || qlen <= 2 * MEAN_PKTSIZE) + return ENQUE; + + + if (pst->drop_prob == 0) + pst->accu_prob = 0; + + /* increment accu_prob */ + if (pprms->flags & PIE_DERAND_ENABLED) + pst->accu_prob += pst->drop_prob; + + /* De-randomize option + * if accu_prob < 0.85 -> enqueue + * if accu_prob>8.5 ->drop + * between 0.85 and 8.5 || !De-randomize --> drop on prob + * + * (0.85 = 17/20 ,8.5 = 17/2) + */ + if (pprms->flags & PIE_DERAND_ENABLED) { + if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 17 / 20)) + return ENQUE; + if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 17 / 2)) + return DROP; + } + + if (random() < pst->drop_prob) { + pst->accu_prob = 0; + return DROP; + } + + return ENQUE; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_sched.h b/sys/netpfil/ipfw/dn_sched.h index ab823fe..a359198 100644 --- a/sys/netpfil/ipfw/dn_sched.h +++ b/sys/netpfil/ipfw/dn_sched.h @@ -132,6 +132,10 @@ struct dn_alg { int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); +#ifdef NEW_AQM + /* Getting scheduler extra parameters */ + int (*getconfig)(struct dn_schk *, struct dn_extra_parms *); +#endif /* run-time fields */ int ref_count; /* XXX number of instances in the system */ @@ -165,6 +169,11 @@ dn_dequeue(struct dn_queue *q) struct mbuf *m = q->mq.head; if (m == NULL) return NULL; +#ifdef NEW_AQM + /* Call AQM dequeue function */ + if (q->fs->aqmfp && q->fs->aqmfp->dequeue ) + return q->fs->aqmfp->dequeue(q); +#endif q->mq.head = m->m_nextpkt; /* Update stats for the queue */ diff --git a/sys/netpfil/ipfw/dn_sched_fifo.c b/sys/netpfil/ipfw/dn_sched_fifo.c index e2aa608..a4a2a70 100644 --- a/sys/netpfil/ipfw/dn_sched_fifo.c +++ b/sys/netpfil/ipfw/dn_sched_fifo.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -115,6 +118,9 @@ static struct dn_alg fifo_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c new file mode 100644 index 0000000..c783730 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c @@ -0,0 +1,617 @@ +/* + * FQ_Codel - The FlowQueue-Codel scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +//#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> + +#include <sys/proc.h> +#include <sys/rwlock.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <sys/sysctl.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/queue.h> +#include <sys/hash.h> + +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> + +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_codel.h> +#include <netpfil/ipfw/dn_sched.h> +#include <netpfil/ipfw/dn_sched_fq_codel.h> +#include <netpfil/ipfw/dn_sched_fq_codel_helper.h> + +#else +#include <dn_test.h> +#endif + +/* NOTE: In fq_codel module, we reimplements CoDel AQM functions + * because fq_codel use different flows (sub-queues) structure and + * dn_queue includes many variables not needed by a flow (sub-queue + * )i.e. avoid extra overhead (88 bytes vs 208 bytes). + * Also, CoDel functions manages stats of sub-queues as well as the main queue. 
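+ * The codel_enqueue() and codel_drop_head() helpers below therefore go
+ * through fq_update_stats(), which updates the per-flow counters, the main
+ * queue and the scheduler instance in a single call.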
+ */ + +#define DN_SCHED_FQ_CODEL 6 + +static struct dn_alg fq_codel_desc; + +/* fq_codel default parameters including codel */ +struct dn_sch_fq_codel_parms +fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, + CODEL_ECN_ENABLED}, 1024, 10240, 1514}; + +static int +fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ; + + return (0); +} + +static int +fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ; + + return (0); +} + + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel, + CTLFLAG_RW, 0, "FQ_CODEL"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L", + "FQ_CoDel target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L", + "FQ_CoDel interval in microsecond"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum, + CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows, + CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024, + "Number of queues for FQ_CoDel"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit, + CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit"); +#endif + +/* Drop a packet form the head of codel queue */ +static void +codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + FREE_PKT(m); +} + +/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet. + * Return 1 when unable to add timestamp, otherwise return 0 + */ +static int +codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si) +{ + uint64_t len; + + len = m->m_pkthdr.len; + /* finding maximum packet size */ + if (len > q->cst.maxpkt_size) + q->cst.maxpkt_size = len; + + /* Add timestamp to mbuf as MTAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t), + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + +drop: + fq_update_stats(q, si, len, 1); + m_freem(m); + return 1; +} + +/* + * Classify a packet to queue number using Jenkins hash function. 
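+ * For IPv6 the hash covers a 41-byte tuple and for IPv4 a 17-byte tuple;
+ * the result is reduced modulo the configured number of flows.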
+ * Return: queue number + * the input of the hash are protocol no, perturbation, src IP, dst IP, + * src port, dst port, + */ +static inline int +fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * FQ_CODEL algorithm. + */ +static int +fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_codel_classify_flow(m, param->flows_cnt, si); + /* enqueue packet into appropriate queue using CoDel AQM. + * Note: 'codel_enqueue' function returns 1 only when it unable to + * add timestamp to packet (no limit check)*/ + drop = codel_enqueue(&si->flows[idx], m, si); + + /* codel unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active ,then add it to the tail of + * new flows list, initialize and activate it. 
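+	 * A newly activated flow starts on the new-flows list with a full
+	 * quantum of deficit; fq_codel_dequeue() moves it to the old-flows
+	 * list once it exhausts that deficit or goes empty.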
+ */ + if (!si->flows[idx].active ) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + si->flows[idx].cst.dropping = false; + si->flows[idx].cst.first_above_time = 0; + si->flows[idx].active = 1; + //D("activate %d",idx); + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { D("over limit"); + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub- queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + codel_drop_head(&si->flows[maxidx], si); + D("maxidx = %d",maxidx); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * FQ_CODEL algorithm. + */ +static struct mbuf * +fq_codel_dequeue(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct fq_codel_flow *f; + struct mbuf *mbuf; + struct fq_codel_list *fq_codel_flowlist; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_codel_flowlist = &si->oldflows; + else + fq_codel_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_codel_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_codel_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = fqc_codel_dequeue(f, si); + + /* Codel did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_codel_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_codel scheduler instance. + * also, allocate memory for flows array. 
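+ * Each instance also seeds its own hash perturbation from random(), so
+ * separate scheduler instances spread the same traffic across different
+ * buckets.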
+ */ +static int +fq_codel_new_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct dn_queue *q; + struct fq_codel_schk *schk; + int i; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_codel configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + /* init codel */ + si->flows[i].cst.maxpkt_size = 500; + } + + fq_codel_desc.ref_count++; + return 0; +} + +/* + * Free fq_codel scheduler instance. + */ +static int +fq_codel_free_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si = (struct fq_codel_si *)_si ; + + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + fq_codel_desc.ref_count--; + + return 0; +} + +/* + * Configure fq_codel scheduler. + * the configurations for the scheduler is passed from userland. + */ +static int +fq_codel_config(struct dn_schk *_schk) +{ + struct fq_codel_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_codel_parms *fqc_cfg; + + schk = (struct fq_codel_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_codel configuration as follow + * Codel: 0- target,1- interval, 2- flags + * FQ_CODEL: 3- quantum, 4- limit, 5- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqc_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target; + else + fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval; + else + fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + fqc_cfg->ccfg.flags = 0; + else + fqc_cfg->ccfg.flags = ep->par[2]; + + /* FQ configurations */ + if (ep->par[3] < 0) + fqc_cfg->quantum = fq_codel_sysctl.quantum; + else + fqc_cfg->quantum = ep->par[3]; + + if (ep->par[4] < 0) + fqc_cfg->limit = fq_codel_sysctl.limit; + else + fqc_cfg->limit = ep->par[4]; + + if (ep->par[5] < 0) + fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt; + else + fqc_cfg->flows_cnt = ep->par[5]; + + /* Bound the configurations */ + fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 , + 5 * AQM_TIME_1S); ; + fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1, + 100 * AQM_TIME_1S); + + fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000); + fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480); + fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536); + } + else + return 1; + + return 0; +} + +/* + * Return fq_codel scheduler configurations + * the configurations for the scheduler is passed to userland. 
+ */ +static int +fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1); + struct dn_sch_fq_codel_parms *fqc_cfg; + + fqc_cfg = &schk->cfg; + + strcpy(ep->name, fq_codel_desc.name); + ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US; + ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US; + ep->par[2] = fqc_cfg->ccfg.flags; + + ep->par[3] = fqc_cfg->quantum; + ep->par[4] = fqc_cfg->limit; + ep->par[5] = fqc_cfg->flows_cnt; + + return 0; +} + +/* + * fq_codel scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fq_codel_desc = { + _SI( .type = ) DN_SCHED_FQ_CODEL, + _SI( .name = ) "FQ_CODEL", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_codel_schk), + _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_codel_enqueue, + _SI( .dequeue = ) fq_codel_dequeue, + _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_codel_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc); diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.h b/sys/netpfil/ipfw/dn_sched_fq_codel.h new file mode 100644 index 0000000..4b65781 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * FQ_Codel Structures and helper functions + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_H +#define _IP_DN_SCHED_FQ_CODEL_H + +/* list of queues */ +STAILQ_HEAD(fq_codel_list, fq_codel_flow) ; + +/* fq_codel parameters including codel */ +struct dn_sch_fq_codel_parms { + struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */ + /* FQ_CODEL Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of fq_codel queue size*/ + uint32_t quantum; +}; /* defaults */ + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue).*/ +struct fq_codel_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct codel_status cst; + STAILQ_ENTRY(fq_codel_flow) flowchain; +}; + +/* extra fq_codel scheduler configurations */ +struct fq_codel_schk { + struct dn_sch_fq_codel_parms cfg; +}; + +/* fq_codel scheduler instance */ +struct fq_codel_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + + struct fq_codel_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_codel_list newflows; /* list of new queues */ + struct fq_codel_list oldflows; /* list of old queues */ +}; + +/* Helper function to update queue&main-queue and scheduler statistics. + * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* extract the head of fq_codel sub-queue */ +__inline static struct mbuf * +fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("timestamp tag is not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + + +#endif diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h new file mode 100644 index 0000000..da663dc --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h @@ -0,0 +1,187 
@@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H +#define _IP_DN_SCHED_FQ_CODEL_HELPER_H + +__inline static struct mbuf * +fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop, + struct fq_codel_si *si) +{ + struct mbuf * m; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = fq_codel_extract_head(q, &pkt_ts, si); + + if (m == NULL) { + /*queue is empty - we can't be above target*/ + q->cst.first_above_time= 0; + return m; + } + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. 
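+ * In this implementation the per-flow maxpkt_size tracked by
+ * codel_enqueue() stands in for the MTU, so a flow whose backlog is no
+ * larger than its largest-seen packet is never pushed into the dropping
+ * state.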
+ */ + sojourn_time = now - pkt_ts; + if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) { + /* went below - stay below for at least interval */ + q->cst.first_above_time = 0; + } else { + if (q->cst.first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + q->cst.first_above_time = now + schk->cfg.ccfg.interval; + } else if (now >= q->cst.first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* Codel dequeue function */ +__inline static struct mbuf * +fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + + cst = &q->cst; + cprms = &schk->cfg.ccfg; + + now = AQM_UNOW; + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + + /* Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + return m; + } + + /* drop the packet */ + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop,si); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? 
cst->count - 2 : 1; + + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_sched_fq_pie.c b/sys/netpfil/ipfw/dn_sched_fq_pie.c new file mode 100644 index 0000000..2883cf8 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_pie.c @@ -0,0 +1,1262 @@ +/* + * FQ_PIE - The FlowQueue-PIE scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Important note: + * As there is no an office document for FQ-PIE specification, we used + * FQ-CoDel algorithm with some modifications to implement FQ-PIE. + * This FQ-PIE implementation is a beta version and have not been tested + * extensively. Our FQ-PIE uses stand-alone PIE AQM per sub-queue. By + * default, timestamp is used to calculate queue delay instead of departure + * rate estimation method. Although departure rate estimation is available + * as testing option, the results could be incorrect. Moreover, turning PIE on + * and off option is available but it does not work properly in this version. 
+ */ + + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> + +#include <sys/proc.h> +#include <sys/rwlock.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <sys/sysctl.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/queue.h> +#include <sys/hash.h> + +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> + +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_pie.h> +#include <netpfil/ipfw/dn_sched.h> + +#else +#include <dn_test.h> +#endif + +#define DN_SCHED_FQ_PIE 7 + +/* list of queues */ +STAILQ_HEAD(fq_pie_list, fq_pie_flow) ; + +/* FQ_PIE parameters including PIE */ +struct dn_sch_fq_pie_parms { + struct dn_aqm_pie_parms pcfg; /* PIE configuration Parameters */ + /* FQ_PIE Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of FQ_PIE queue size*/ + uint32_t quantum; +}; + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue)*/ +struct fq_pie_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct pie_status pst; /* pie status variables */ + struct fq_pie_si *psi; /* parent scheduler instance */ + STAILQ_ENTRY(fq_pie_flow) flowchain; +}; + +/* extra fq_pie scheduler configurations */ +struct fq_pie_schk { + struct dn_sch_fq_pie_parms cfg; +}; + +/* fq_pie scheduler instance */ +struct fq_pie_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + uint32_t nr_active_q; + struct fq_pie_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_pie_list newflows; /* list of new queues */ + struct fq_pie_list oldflows; /* list of old queues */ +}; + + +struct mem_to_free { + void *mem_flows; + void *mem_callout; +}; +static struct mtx freemem_mtx; +static struct dn_alg fq_pie_desc; + +/* Default FQ-PIE parameters including PIE */ +/* PIE defaults + * target=15ms, max_burst=150ms, max_ecnth=0.1, + * alpha=0.125, beta=1.25, tupdate=15ms + * FQ- + * flows=1024, limit=10240, quantum =1514 + */ +struct dn_sch_fq_pie_parms + fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US, + 150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125, + PIE_SCALE * 1.25, PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED}, + 1024, 10240, 1514}; + +static int +fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = fq_pie_sysctl.pcfg.alpha; + else + value = fq_pie_sysctl.pcfg.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + fq_pie_sysctl.pcfg.alpha = value; + else 
+ fq_pie_sysctl.pcfg.beta = value; + return (0); +} + +static int +fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = fq_pie_sysctl.pcfg.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = fq_pie_sysctl.pcfg.tupdate; + else + value = fq_pie_sysctl.pcfg.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + fq_pie_sysctl.pcfg.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + fq_pie_sysctl.pcfg.tupdate = value; + else + fq_pie_sysctl.pcfg.max_burst = value; + return (0); +} + +static int +fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_pie_sysctl.pcfg.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + fq_pie_sysctl.pcfg.max_ecnth = value; + return (0); +} + +/* define FQ- PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie, + CTLFLAG_RW, 0, "FQ_PIE"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum, + CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows, + CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit, + CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE"); +#endif + +/* Helper function to update queue&main-queue and scheduler statistics. 
+ * negative len & drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* + * Extract a packet from the head of sub-queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +__inline static struct mbuf * +fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts, + struct fq_pie_si *si, int getts) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of FQ-PIE + * flow as an argument + */ +static void +fq_calculate_drop_prob(void *x) +{ + struct fq_pie_flow *q = (struct fq_pie_flow *) x; + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + int64_t p, prob, oldprob; + aqm_time_t now; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + now = AQM_UNOW; + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)q->stats.len_bytes * pst->avg_dq_time) + >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */ + p >>= 11 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */ + p >>= 9 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */ + p >>= 7 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */ + p >>= 
5 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */ + p >>= 3 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */ + p >>= 1 + PIE_FIX_POINT_BITS + 12; + else + p >>= PIE_FIX_POINT_BITS + 12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1 - 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication over/under flow */ + if (p>0) { + if (prob<oldprob) { + D("overflow"); + prob= PIE_MAX_PROB; + } + } + else + if (prob>oldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current delay value */ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) { + if (pst->burst_allowance > pprms->tupdate) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Reset PIE variables & activate the queue + */ +__inline static void +fq_activate_pie(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE; + pst->measurement_start = AQM_UNOW; + + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + + + /* + * Deactivate PIE and stop the drop probability update callout + */ +__inline static void +fq_deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + + /* + * Initialize PIE for sub-queue 'q' + */ +static int +pie_init(struct fq_pie_flow *q) +{ + struct pie_status *pst=&q->pst; + struct dn_aqm_pie_parms *pprms = pst->parms; + struct fq_pie_schk *fqpie_schk; + + fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1); + int err = 0; + + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + } else { + q->psi->nr_active_q++; + + /* For speed optimization, we calculate 1/3 of the queue size once here */ + // XXX limit divided by number of queues divided by 3 ???
+ pst->one_third_q_size = (fqpie_schk->cfg.limit / + fqpie_schk->cfg.flows_cnt) / 3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + } + + return err; +} + +/* + * Clean up PIE status for sub-queue 'q' + * Stop callout timer and destroy mtx + */ +static int +pie_cleanup(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + + mtx_lock(&pst->lock_mtx); + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + } else { + mtx_unlock(&pst->lock_mtx); + return EBUSY; + } + return 0; +} + +/* + * Dequeue and return a packet from sub-queue 'q' or NULL if 'q' is empty. + * Also, calculate departure time or queue delay using the timestamp + */ + static struct mbuf * +pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = &q->pst; + pprms = q->pst.parms; + + /* we extract the packet ts only when Departure Rate Estimation is not used */ + m = fq_pie_extract_head(q, &pkt_ts, si, + !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average departure time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start a new measurement cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->stats.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + + + /* + * Enqueue a packet in q, subject to space and FQ-PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyway. + */ +static int +pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si) +{ + uint64_t len; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = &q->pst; + pprms = pst->parms; + t = ENQUE; + + /* drop/mark the packet when PIE is active and burst time elapsed */ + if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0 + && drop_early(pst, q->stats.len_bytes) == DROP) { + /* + * if drop_prob is over the ECN threshold, drop the packet, + * otherwise mark and enqueue it.
+ */ + if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >= + pst->one_third_q_size) { + fq_activate_pie(q); + } + + /* reset burst tolerance and optionally turn PIE off */ + if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1) + && pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0) + fq_deactivate_pie(pst); + } + + /* Use timestamp if Departure Rate Estimation mode is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + /* no tag available; drop the packet, it is freed in the DROP path below */ + t = DROP; + } else { + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + } + + if (t != DROP) { + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + } else { + fq_update_stats(q, si, len, 1); + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + + return 0; +} + +/* Drop a packet from the head of the FQ-PIE sub-queue */ +static void +pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + /* reset accu_prob after packet drop */ + q->pst.accu_prob = 0; + + FREE_PKT(m); +} + +/* + * Classify a packet to a queue number using the Jenkins hash function. + * Return: queue number + * the inputs of the hash are protocol no., perturbation, src IP, dst IP, + * src port and dst port + */ +static inline int +fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ?
1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * the FQ-PIE algorithm. + */ +static int +fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_pie_classify_flow(m, param->flows_cnt, si); + + /* enqueue packet into appropriate queue using PIE AQM. + * Note: the 'pie_enqueue' function returns 1 only when it is unable to + * add a timestamp to the packet (no limit check) */ + drop = pie_enqueue(&si->flows[idx], m, si); + + /* pie unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active, then add it to the tail of + * the new flows list, initialize and activate it. + */ + if (!si->flows[idx].active) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + fq_activate_pie(&si->flows[idx]); + si->flows[idx].active = 1; + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub-queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + pie_drop_head(&si->flows[maxidx], si); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * the FQ-PIE algorithm.
+ */ +static struct mbuf * +fq_pie_dequeue(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct fq_pie_flow *f; + struct mbuf *mbuf; + struct fq_pie_list *fq_pie_flowlist; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_pie_flowlist = &si->oldflows; + else + fq_pie_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_pie_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_pie_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = pie_dequeue(f, si); + + /* pie did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_pie_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + fq_deactivate_pie(&f->pst); + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_pie scheduler instance. + * also, allocate memory for flows array. + */ +static int +fq_pie_new_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct dn_queue *q; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_pie configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + si->nr_active_q = 0; + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + si->flows[i].pst.parms = &schk->cfg.pcfg; + si->flows[i].psi = si; + pie_init(&si->flows[i]); + } + + /* init mtx lock and callout function for free memory */ + if (!fq_pie_desc.ref_count) { + mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF); + } + + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count++; + mtx_unlock(&freemem_mtx); + + return 0; +} + +/* + * Free FQ-PIE flows memory callout function. 
+ * This function is scheduled when one or more flows are still active and + * the scheduler is about to be destroyed, to prevent a memory leak. + */ +static void +free_flows(void *_mem) +{ + struct mem_to_free *mem = _mem; + + free(mem->mem_flows, M_DUMMYNET); + free(mem->mem_callout, M_DUMMYNET); + free(_mem, M_DUMMYNET); + + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("mem freed ok!"); +} + +/* + * Free fq_pie scheduler instance. + */ +static int +fq_pie_free_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + for (i = 0; i < schk->cfg.flows_cnt; i++) { + pie_cleanup(&si->flows[i]); + } + + /* if some queues still have a callout pending, we cannot free the flows + * memory. If we did, a panic could happen because the drop probability + * calculation callout uses the flows memory. + */ + if (!si->nr_active_q) { + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("ok!"); + return 0; + } else { + /* a memory leak would happen here, so we register a callout function + * to free the flows memory later. + */ + D("unable to stop all fq_pie sub-queues!"); + mtx_lock(&freemem_mtx); + + struct callout *mem_callout; + struct mem_to_free *mem; + + mem = malloc(sizeof(*mem), M_DUMMYNET, + M_NOWAIT | M_ZERO); + mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET, + M_NOWAIT | M_ZERO); + + callout_init_mtx(mem_callout, &freemem_mtx, + CALLOUT_RETURNUNLOCKED); + + mem->mem_flows = si->flows; + mem->mem_callout = mem_callout; + callout_reset_sbt(mem_callout, + (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US, + 0, free_flows, mem, 0); + + si->flows = NULL; + mtx_unlock(&freemem_mtx); + + return EBUSY; + } +} + +/* + * Configure the FQ-PIE scheduler. + * The scheduler configuration is passed from ipfw userland.
+ */ +static int +fq_pie_config(struct dn_schk *_schk) +{ + struct fq_pie_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_pie_parms *fqp_cfg; + + schk = (struct fq_pie_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_pie configuration as follow + * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + * FQ_PIE: 7- quantum, 8- limit, 9- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqp_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref; + else + fqp_cfg->pcfg.qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate; + else + fqp_cfg->pcfg.tupdate = ep->par[1]; + if (ep->par[2] < 0) + fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst; + else + fqp_cfg->pcfg.max_burst = ep->par[2]; + if (ep->par[3] < 0) + fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth; + else + fqp_cfg->pcfg.max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha; + else + fqp_cfg->pcfg.alpha = ep->par[4]; + if (ep->par[5] < 0) + fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta; + else + fqp_cfg->pcfg.beta = ep->par[5]; + if (ep->par[6] < 0) + fqp_cfg->pcfg.flags = 0; + else + fqp_cfg->pcfg.flags = ep->par[6]; + + /* FQ configurations */ + if (ep->par[7] < 0) + fqp_cfg->quantum = fq_pie_sysctl.quantum; + else + fqp_cfg->quantum = ep->par[7]; + if (ep->par[8] < 0) + fqp_cfg->limit = fq_pie_sysctl.limit; + else + fqp_cfg->limit = ep->par[8]; + if (ep->par[9] < 0) + fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt; + else + fqp_cfg->flows_cnt = ep->par[9]; + + /* Bound the configurations */ + fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst, + 0, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth, + 0, PIE_SCALE); + fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE); + fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE); + + fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000); + fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480); + fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536); + } + else { + D("Wrong parameters for fq_pie scheduler"); + return 1; + } + + return 0; +} + +/* + * Return FQ-PIE scheduler configurations + * the configurations for the scheduler is passed to userland. + */ +static int +fq_pie_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1); + struct dn_sch_fq_pie_parms *fqp_cfg; + + fqp_cfg = &schk->cfg; + + strcpy(ep->name, fq_pie_desc.name); + ep->par[0] = fqp_cfg->pcfg.qdelay_ref; + ep->par[1] = fqp_cfg->pcfg.tupdate; + ep->par[2] = fqp_cfg->pcfg.max_burst; + ep->par[3] = fqp_cfg->pcfg.max_ecnth; + ep->par[4] = fqp_cfg->pcfg.alpha; + ep->par[5] = fqp_cfg->pcfg.beta; + ep->par[6] = fqp_cfg->pcfg.flags; + + ep->par[7] = fqp_cfg->quantum; + ep->par[8] = fqp_cfg->limit; + ep->par[9] = fqp_cfg->flows_cnt; + + return 0; +} + +/* + * FQ-PIE scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. 
+ */ +static struct dn_alg fq_pie_desc = { + _SI( .type = ) DN_SCHED_FQ_PIE, + _SI( .name = ) "FQ_PIE", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_pie_schk), + _SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_pie_enqueue, + _SI( .dequeue = ) fq_pie_dequeue, + _SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_pie_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_pie_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc); diff --git a/sys/netpfil/ipfw/dn_sched_prio.c b/sys/netpfil/ipfw/dn_sched_prio.c index b779515..915b4cb 100644 --- a/sys/netpfil/ipfw/dn_sched_prio.c +++ b/sys/netpfil/ipfw/dn_sched_prio.c @@ -41,6 +41,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -223,6 +226,9 @@ static struct dn_alg prio_desc = { _SI( .new_queue = ) prio_new_queue, _SI( .free_queue = ) prio_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; diff --git a/sys/netpfil/ipfw/dn_sched_qfq.c b/sys/netpfil/ipfw/dn_sched_qfq.c index 5bbff8a..87502d1 100644 --- a/sys/netpfil/ipfw/dn_sched_qfq.c +++ b/sys/netpfil/ipfw/dn_sched_qfq.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -824,6 +827,9 @@ static struct dn_alg qfq_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); diff --git a/sys/netpfil/ipfw/dn_sched_rr.c b/sys/netpfil/ipfw/dn_sched_rr.c index dd608d7..b3658a6 100644 --- a/sys/netpfil/ipfw/dn_sched_rr.c +++ b/sys/netpfil/ipfw/dn_sched_rr.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -301,6 +304,9 @@ static struct dn_alg rr_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; diff --git a/sys/netpfil/ipfw/dn_sched_wf2q.c b/sys/netpfil/ipfw/dn_sched_wf2q.c index a91c1ce..06f92a9 100644 --- a/sys/netpfil/ipfw/dn_sched_wf2q.c +++ b/sys/netpfil/ipfw/dn_sched_wf2q.c @@ -43,6 +43,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -367,6 +370,10 @@ static struct dn_alg wf2qp_desc = { _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif + }; diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c index 7d7e695..d7b04af 100644 --- a/sys/netpfil/ipfw/ip_dn_glue.c +++ 
b/sys/netpfil/ipfw/ip_dn_glue.c @@ -55,6 +55,9 @@ #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* FREEBSD7.2 ip_dummynet.h r191715*/ diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c index 90e2ccf..b7213ce 100644 --- a/sys/netpfil/ipfw/ip_dn_io.c +++ b/sys/netpfil/ipfw/ip_dn_io.c @@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$"); #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* @@ -83,8 +86,12 @@ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; +#ifdef NEW_AQM +unsigned long io_pkt_drop; +#else +static unsigned long io_pkt_drop; +#endif /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events @@ -147,7 +154,11 @@ SYSBEGIN(f4) SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); +#ifdef NEW_AQM +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#else static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#endif /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) @@ -249,6 +260,14 @@ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); +#ifdef NEW_AQM + /* XXX: to skip ts m_tag. For Debugging only*/ + if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) { + m_tag_delete(m,mtag); + mtag = m_tag_first(m); + D("skip TS tag"); + } +#endif KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, @@ -256,6 +275,7 @@ dn_tag_get(struct mbuf *m) return (struct dn_pkt_tag *)(mtag+1); } +#ifndef NEW_AQM static inline void mq_append(struct mq *q, struct mbuf *m) { @@ -266,6 +286,7 @@ mq_append(struct mq *q, struct mbuf *m) q->tail = m; m->m_nextpkt = NULL; } +#endif /* * Dispose a list of packet. 
Use a functions so if we need to do @@ -390,7 +411,10 @@ red_drops (struct dn_queue *q, int len) /* * ECN/ECT Processing (partially adopted from altq) */ -static int +#ifndef NEW_AQM +static +#endif +int ecn_mark(struct mbuf* m) { struct ip *ip; @@ -482,6 +506,11 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) goto drop; if (f->plr && random() < f->plr) goto drop; +#ifdef NEW_AQM + /* Call AQM enqueue function */ + if (q->fs->aqmfp) + return q->fs->aqmfp->enqueue(q ,m); +#endif if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) goto drop; @@ -864,6 +893,10 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; + + /* dn_enqueue already increases io_pkt_drop */ + io_pkt_drop--; + goto dropit; } diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h index 159ddc9..b8b55e8 100644 --- a/sys/netpfil/ipfw/ip_dn_private.h +++ b/sys/netpfil/ipfw/ip_dn_private.h @@ -81,6 +81,10 @@ SLIST_HEAD(dn_fsk_head, dn_fsk); SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); +#ifdef NEW_AQM +SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */ +#endif + struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; }; @@ -135,6 +139,9 @@ struct dn_parms { /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ +#ifdef NEW_AQM + struct dn_aqm_head aqmlist; /* list of AQMs */ +#endif /* Store the fs/sch to scan when draining. The value is the * bucket number of the hash table. Expire can be disabled @@ -231,6 +238,10 @@ struct dn_fsk { /* kernel side of a flowset */ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ +#ifdef NEW_AQM + struct dn_aqm *aqmfp; /* Pointer to AQM functions */ + void *aqmcfg; /* configuration parameters for AQM */ +#endif }; /* @@ -253,6 +264,9 @@ struct dn_queue { int count; /* arrivals since last RED drop */ int random; /* random value (scaled) */ uint64_t q_time; /* start of queue idle time */ +#ifdef NEW_AQM + void *aqm_status; /* per-queue status variables*/ +#endif }; @@ -400,4 +414,20 @@ int do_config(void *p, int l); void dn_drain_scheduler(void); void dn_drain_queue(void); +#ifdef NEW_AQM +int ecn_mark(struct mbuf* m); + +/* moved from ip_dn_io.c to here to be available for AQMs modules*/ +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} +#endif /* NEW_AQM */ + #endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index 420b491..09fbe84 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -1,4 +1,11 @@ /*- + * Codel/FQ_Codel and PIE/FQ-PIE Code: + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. 
* All rights reserved @@ -57,6 +64,9 @@ __FBSDID("$FreeBSD$"); #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* which objects to copy */ @@ -97,6 +107,21 @@ dn_reschedule(void) } /*----- end of callout hooks -----*/ +#ifdef NEW_AQM +/* Return AQM descriptor for given type or name. */ +static struct dn_aqm * +find_aqm_type(int type, char *name) +{ + struct dn_aqm *d; + + SLIST_FOREACH(d, &dn_cfg.aqmlist, next) { + if (d->type == type || (name && !strcasecmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} +#endif + /* Return a scheduler descriptor given the type or name. */ static struct dn_alg * find_sched_type(int type, char *name) @@ -319,7 +344,15 @@ q_new(uintptr_t key, int flags, void *arg) if (fs->sched->fp->new_queue) fs->sched->fp->new_queue(q); + +#ifdef NEW_AQM + /* call AQM init function after creating a queue*/ + if (fs->aqmfp && fs->aqmfp->init) + if(fs->aqmfp->init(q)) + D("unable to init AQM for fs %d", fs->fs.fs_nr); +#endif dn_cfg.queue_count++; + return q; } @@ -333,6 +366,13 @@ dn_delete_queue(struct dn_queue *q, int flags) { struct dn_fsk *fs = q->fs; +#ifdef NEW_AQM + /* clean up AQM status for queue 'q' + * cleanup here is called just with MULTIQUEUE + */ + if (fs && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup(q); +#endif // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) @@ -474,6 +514,16 @@ si_new(uintptr_t key, int flags, void *arg) if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; +#ifdef NEW_AQM + /* init AQM status for !DN_MULTIQUEUE sched*/ + if (!(s->fp->flags & DN_MULTIQUEUE)) + if (s->fs->aqmfp && s->fs->aqmfp->init) + if(s->fs->aqmfp->init((struct dn_queue *)(si + 1))) { + D("unable to init AQM for fs %d", s->fs->fs.fs_nr); + goto error; + } +#endif + dn_cfg.si_count++; return si; @@ -503,6 +553,20 @@ si_destroy(void *_si, void *arg) dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); + +#ifdef NEW_AQM + /* clean up AQM status for !DN_MULTIQUEUE sched + * Note that all queues belong to fs were cleaned up in fsk_detach. + * When drain_scheduler is called s->fs and q->fs are pointing + * to a correct fs, so we can use fs in this case. + */ + if (!(s->fp->flags & DN_MULTIQUEUE)) { + struct dn_queue *q = (struct dn_queue *)(si + 1); + if (q->aqm_status && q->fs->aqmfp) + if (q->fs->aqmfp->cleanup) + q->fs->aqmfp->cleanup(q); + } +#endif if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ @@ -591,6 +655,67 @@ fsk_new(uintptr_t key, int flags, void *arg) return fs; } +#ifdef NEW_AQM +/* callback function for cleaning up AQM queue status belongs to a flowset + * connected to scheduler instance '_si' (for !DN_MULTIQUEUE only). 
+ */ +static int +si_cleanup_q(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if (!(si->sched->fp->flags & DN_MULTIQUEUE)) { + if (si->sched->fs->aqmfp && si->sched->fs->aqmfp->cleanup) + si->sched->fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + return 0; +} + +/* callback to clean up queue AQM status.*/ +static int +q_cleanup_q(void *_q, void *arg) +{ + struct dn_queue *q = _q; + q->fs->aqmfp->cleanup(q); + return 0; +} + +/* Clean up all AQM queues status belongs to flowset 'fs' and then + * deconfig AQM for flowset 'fs' + */ +static void +aqm_cleanup_deconfig_fs(struct dn_fsk *fs) +{ + struct dn_sch_inst *si; + + /* clean up AQM status for all queues for !DN_MULTIQUEUE sched*/ + if (fs->fs.fs_nr > DN_MAX_ID) { + if (fs->sched && !(fs->sched->fp->flags & DN_MULTIQUEUE)) { + if (fs->sched->sch.flags & DN_HAVE_MASK) + dn_ht_scan(fs->sched->siht, si_cleanup_q, NULL); + else { + /* single si i.e. no sched mask */ + si = (struct dn_sch_inst *) fs->sched->siht; + if (si && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + } + } + + /* clean up AQM status for all queues for DN_MULTIQUEUE sched*/ + if (fs->sched && fs->sched->fp->flags & DN_MULTIQUEUE && fs->qht) { + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, q_cleanup_q, NULL); + else + fs->aqmfp->cleanup((struct dn_queue *)(fs->qht)); + } + + /* deconfig AQM */ + if(fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) + fs->aqmfp->deconfig(fs); +} +#endif + /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list @@ -619,6 +744,10 @@ fsk_detach(struct dn_fsk *fs, int flags) free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; qht_delete(fs, flags); +#ifdef NEW_AQM + aqm_cleanup_deconfig_fs(fs); +#endif + if (fs->sched && fs->sched->fp->free_fsk) fs->sched->fp->free_fsk(fs); fs->sched = NULL; @@ -1190,6 +1319,183 @@ update_fs(struct dn_schk *s) } } +#ifdef NEW_AQM +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_aqm_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_fsk *fs; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + fs = dn_ht_find(dn_cfg.fshash, ep->nr, 0, NULL); + if (!fs) { + D("fs %d not found", ep->nr); + err = EINVAL; + break; + } + + if (fs->aqmfp && fs->aqmfp->getconfig) { + if(fs->aqmfp->getconfig(fs, ep)) { + D("Error while trying to get AQM params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + + free(ep, M_DUMMYNET); + return err; +} + +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_sched_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_schk *schk; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + 
sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + schk = locate_scheduler(ep->nr); + if (!schk) { + D("sched %d not found", ep->nr); + err = EINVAL; + break; + } + + if (schk->fp && schk->fp->getconfig) { + if(schk->fp->getconfig(schk, ep)) { + D("Error while trying to get sched params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + free(ep, M_DUMMYNET); + + return err; +} + +/* Configure AQM for flowset 'fs'. + * extra parameters are passed from userland. + */ +static int +config_aqm(struct dn_fsk *fs, struct dn_extra_parms *ep, int busy) +{ + int err = 0; + + do { + /* no configurations */ + if (!ep) { + err = 0; + break; + } + + /* no AQM for this flowset*/ + if (!strcmp(ep->name,"")) { + err = 0; + break; + } + if (ep->oid.len < sizeof(*ep)) { + D("short aqm len %d", ep->oid.len); + err = EINVAL; + break; + } + + if (busy) { + D("Unable to configure flowset, flowset busy!"); + err = EINVAL; + break; + } + + /* deconfigure old aqm if exist */ + if (fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) { + aqm_cleanup_deconfig_fs(fs); + } + + if (!(fs->aqmfp = find_aqm_type(0, ep->name))) { + D("AQM functions not found for type %s!", ep->name); + fs->fs.flags &= ~DN_IS_AQM; + err = EINVAL; + break; + } else + fs->fs.flags |= DN_IS_AQM; + + if (ep->oid.subtype != DN_AQM_PARAMS) { + D("Wrong subtype"); + err = EINVAL; + break; + } + + if (fs->aqmfp->config) { + err = fs->aqmfp->config(fs, ep, ep->oid.len); + if (err) { + D("Unable to configure AQM for FS %d", fs->fs.fs_nr ); + fs->fs.flags &= ~DN_IS_AQM; + fs->aqmfp = NULL; + break; + } + } + } while(0); + + return err; +} +#endif + /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) @@ -1322,6 +1628,14 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); +#ifdef NEW_AQM + /* reconfigure AQM as the parameters can be changed. 
+ * we consider the flowsetis busy if it has scheduler instance(s) + */ + s = locate_scheduler(nfs->sched_nr); + config_aqm(fs, (struct dn_extra_parms *) arg, + s != NULL && s->siht != NULL); +#endif break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ @@ -1340,6 +1654,10 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ +#ifdef NEW_AQM + fs->aqmfp = NULL; + config_aqm(fs, (struct dn_extra_parms *) arg, s != NULL && s->siht != NULL); +#endif if (s != NULL) fsk_attach(fs, s); } while (0); @@ -1865,6 +2183,19 @@ dummynet_get(struct sockopt *sopt, void **compat) // cmd->id = sopt_valsize; D("compatibility mode"); } + +#ifdef NEW_AQM + /* get AQM params */ + if(cmd->subtype == DN_AQM_PARAMS) { + error = get_aqm_parms(sopt); + goto done; + /* get Scheduler params */ + } else if (cmd->subtype == DN_SCH_PARAMS) { + error = get_sched_parms(sopt); + goto done; + } +#endif + a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); @@ -2316,4 +2647,98 @@ MODULE_VERSION(dummynet, 3); */ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); +#ifdef NEW_AQM + +/* modevent helpers for the AQM modules */ +static int +load_dn_aqm(struct dn_aqm *d) +{ + struct dn_aqm *aqm=NULL; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if AQM already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(aqm, &dn_cfg.aqmlist, next) { + if (strcmp(aqm->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* AQM already exists */ + } + } + if (aqm == NULL) + SLIST_INSERT_HEAD(&dn_cfg.aqmlist, d, next); + DN_BH_WUNLOCK(); + D("dn_aqm %s %sloaded", d->name, aqm ? "not ":""); + return aqm ? 1 : 0; +} + + +/* Callback to clean up AQM status for queues connected to a flowset + * and then deconfigure the flowset. + * This function is called before an AQM module is unloaded + */ +static int +fs_cleanup(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + uint32_t type = *(uint32_t *)arg; + + if (fs->aqmfp && fs->aqmfp->type == type) + aqm_cleanup_deconfig_fs(fs); + + return 0; +} + +static int +unload_dn_aqm(struct dn_aqm *aqm) +{ + struct dn_aqm *tmp, *r; + int err = EINVAL; + err = 0; + ND("called for %s", aqm->name); + + DN_BH_WLOCK(); + + /* clean up AQM status and deconfig flowset */ + dn_ht_scan(dn_cfg.fshash, fs_cleanup, &aqm->type); + + SLIST_FOREACH_SAFE(r, &dn_cfg.aqmlist, next, tmp) { + if (strcmp(aqm->name, r->name) != 0) + continue; + ND("ref_count = %d", r->ref_count); + err = (r->ref_count != 0 || r->cfg_ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.aqmlist, r, dn_aqm, next); + break; + } + DN_BH_WUNLOCK(); + D("%s %sunloaded", aqm->name, err ? 
"not ":""); + if (err) + D("ref_count=%d, cfg_ref_count=%d", r->ref_count, r->cfg_ref_count); + return err; +} + +int +dn_aqm_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_aqm *aqm = arg; + + if (cmd == MOD_LOAD) + return load_dn_aqm(aqm); + else if (cmd == MOD_UNLOAD) + return unload_dn_aqm(aqm); + else + return EINVAL; +} +#endif + /* end of file */ + diff --git a/sys/rpc/svc.c b/sys/rpc/svc.c index b436c18..a4cc484 100644 --- a/sys/rpc/svc.c +++ b/sys/rpc/svc.c @@ -847,9 +847,7 @@ svc_xprt_alloc() SVCXPRT_EXT *ext; xprt = mem_alloc(sizeof(SVCXPRT)); - memset(xprt, 0, sizeof(SVCXPRT)); ext = mem_alloc(sizeof(SVCXPRT_EXT)); - memset(ext, 0, sizeof(SVCXPRT_EXT)); xprt->xp_p3 = ext; refcount_init(&xprt->xp_refs, 1); diff --git a/sys/rpc/svc_vc.c b/sys/rpc/svc_vc.c index be8e04e..92a926d 100644 --- a/sys/rpc/svc_vc.c +++ b/sys/rpc/svc_vc.c @@ -189,11 +189,11 @@ svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, SOCKBUF_UNLOCK(&so->so_rcv); return (xprt); + cleanup_svc_vc_create: - if (xprt) { - sx_destroy(&xprt->xp_lock); - svc_xprt_free(xprt); - } + sx_destroy(&xprt->xp_lock); + svc_xprt_free(xprt); + return (NULL); } @@ -203,8 +203,8 @@ cleanup_svc_vc_create: SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { - SVCXPRT *xprt = NULL; - struct cf_conn *cd = NULL; + SVCXPRT *xprt; + struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; @@ -279,12 +279,10 @@ svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) return (xprt); cleanup_svc_vc_create: - if (xprt) { - sx_destroy(&xprt->xp_lock); - svc_xprt_free(xprt); - } - if (cd) - mem_free(cd, sizeof(*cd)); + sx_destroy(&xprt->xp_lock); + svc_xprt_free(xprt); + mem_free(cd, sizeof(*cd)); + return (NULL); } diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h index 1729c7b..0b3ed26 100644 --- a/sys/sys/cdefs.h +++ b/sys/sys/cdefs.h @@ -273,7 +273,8 @@ #define _Alignof(x) __alignof(x) #endif -#if !__has_extension(c_atomic) && !__has_extension(cxx_atomic) +#if !defined(__cplusplus) && !__has_extension(c_atomic) && \ + !__has_extension(cxx_atomic) /* * No native support for _Atomic(). Place object in structure to prevent * most forms of direct non-atomic access. 
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index d2ad920..14bd867 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -183,7 +183,8 @@ static __inline int vm_paging_needed(void) { - return (cnt.v_free_count + cnt.v_cache_count < vm_pageout_wakeup_thresh); + return (cnt.v_free_count + cnt.v_cache_count < + (u_int)vm_pageout_wakeup_thresh); } #endif diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 512151b..c250c5d 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -253,11 +253,11 @@ vm_page_domain_init(struct vm_domain *vmd) *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = + *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = + *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &cnt.v_active_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 7ecb6c7..3ab4c24 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -215,7 +215,7 @@ struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; - int * const pq_vcnt; + u_int * const pq_vcnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index a580f2a..e3c1571 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -284,7 +284,7 @@ lapic_init(vm_paddr_t addr) } #ifdef SMP -#define LOOPS 1000000 +#define LOOPS 100000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which @@ -440,7 +440,7 @@ lapic_setup(int boot) /* Program the CMCI LVT entry if present. */ if (maxlvt >= APIC_LVT_CMCI) lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci); - + intr_restore(saveintr); } @@ -1363,7 +1363,7 @@ static void apic_setup_local(void *dummy __unused) { int retval; - + if (best_enum == NULL) return; |