Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/amd64/pmap.c  11
-rw-r--r--  sys/cddl/boot/zfs/lz4.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  1
-rw-r--r--  sys/conf/files  4
-rw-r--r--  sys/conf/files.amd64  3
-rw-r--r--  sys/conf/files.i386  3
-rw-r--r--  sys/dev/bge/if_bge.c  6
-rw-r--r--  sys/dev/bge/if_bgereg.h  2
-rw-r--r--  sys/dev/hyperv/include/hyperv.h  56
-rw-r--r--  sys/dev/hyperv/netvsc/hv_net_vsc.c  48
-rw-r--r--  sys/dev/hyperv/netvsc/hv_net_vsc.h  99
-rw-r--r--  sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c  1826
-rw-r--r--  sys/dev/hyperv/netvsc/hv_rndis_filter.c  24
-rw-r--r--  sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c  17
-rw-r--r--  sys/dev/hyperv/utilities/hv_heartbeat.c  129
-rw-r--r--  sys/dev/hyperv/utilities/hv_kvp.c  556
-rw-r--r--  sys/dev/hyperv/utilities/hv_kvp.h  13
-rw-r--r--  sys/dev/hyperv/utilities/hv_shutdown.c  151
-rw-r--r--  sys/dev/hyperv/utilities/hv_timesync.c  216
-rw-r--r--  sys/dev/hyperv/utilities/hv_util.c  415
-rw-r--r--  sys/dev/hyperv/utilities/hv_util.h  55
-rw-r--r--  sys/dev/hyperv/vmbus/hv_channel.c  111
-rw-r--r--  sys/dev/hyperv/vmbus/hv_channel_mgmt.c  231
-rw-r--r--  sys/dev/hyperv/vmbus/hv_connection.c  165
-rw-r--r--  sys/dev/hyperv/vmbus/hv_hv.c  6
-rw-r--r--  sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c  104
-rw-r--r--  sys/dev/hyperv/vmbus/hv_vmbus_priv.h  14
-rw-r--r--  sys/dev/ioat/ioat.c  13
-rw-r--r--  sys/dev/ioat/ioat.h  2
-rw-r--r--  sys/dev/ioat/ioat_internal.h  2
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.c  548
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_hw.h  2
-rw-r--r--  sys/dev/ntb/ntb_hw/ntb_regs.h  1
-rw-r--r--  sys/dev/sound/pci/hda/hdaa.c  32
-rw-r--r--  sys/kern/subr_vmem.c  4
-rw-r--r--  sys/modules/dummynet/Makefile  3
-rw-r--r--  sys/modules/hyperv/utilities/Makefile  2
-rw-r--r--  sys/netinet/ip_dummynet.h  27
-rw-r--r--  sys/netipsec/key.c  2
-rw-r--r--  sys/netpfil/ipfw/dn_aqm.h  167
-rw-r--r--  sys/netpfil/ipfw/dn_aqm_codel.c  444
-rw-r--r--  sys/netpfil/ipfw/dn_aqm_codel.h  222
-rw-r--r--  sys/netpfil/ipfw/dn_aqm_pie.c  793
-rw-r--r--  sys/netpfil/ipfw/dn_aqm_pie.h  153
-rw-r--r--  sys/netpfil/ipfw/dn_sched.h  9
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fifo.c  6
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fq_codel.c  617
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fq_codel.h  167
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fq_codel_helper.h  187
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fq_pie.c  1262
-rw-r--r--  sys/netpfil/ipfw/dn_sched_prio.c  6
-rw-r--r--  sys/netpfil/ipfw/dn_sched_qfq.c  6
-rw-r--r--  sys/netpfil/ipfw/dn_sched_rr.c  6
-rw-r--r--  sys/netpfil/ipfw/dn_sched_wf2q.c  7
-rw-r--r--  sys/netpfil/ipfw/ip_dn_glue.c  3
-rw-r--r--  sys/netpfil/ipfw/ip_dn_io.c  37
-rw-r--r--  sys/netpfil/ipfw/ip_dn_private.h  30
-rw-r--r--  sys/netpfil/ipfw/ip_dummynet.c  425
-rw-r--r--  sys/rpc/svc.c  2
-rw-r--r--  sys/rpc/svc_vc.c  22
-rw-r--r--  sys/sys/cdefs.h  3
-rw-r--r--  sys/sys/vmmeter.h  3
-rw-r--r--  sys/vm/vm_page.c  4
-rw-r--r--  sys/vm/vm_page.h  2
-rw-r--r--  sys/x86/x86/local_apic.c  6
66 files changed, 7552 insertions, 1944 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 855f7bc..8136745 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -6480,7 +6480,7 @@ static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
vm_offset_t base, offset, tmpva;
- vm_paddr_t pa_start, pa_end;
+ vm_paddr_t pa_start, pa_end, pa_end1;
pdp_entry_t *pdpe;
pd_entry_t *pde;
pt_entry_t *pte;
@@ -6660,9 +6660,12 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
tmpva += PAGE_SIZE;
}
}
- if (error == 0 && pa_start != pa_end)
- error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
- pa_end - pa_start, mode);
+ if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
+ pa_end1 = MIN(pa_end, dmaplimit);
+ if (pa_start != pa_end1)
+ error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+ pa_end1 - pa_start, mode);
+ }
/*
* Flush CPU caches if required to make sure any data isn't cached that
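The pmap.c hunk above clamps the direct-map (DMAP) alias update to dmaplimit, so a pmap_change_attr() request covering physical addresses beyond the end of the direct map no longer feeds an out-of-range address to PHYS_TO_DMAP(). A minimal sketch of the clamping step in isolation, assuming the surrounding amd64 pmap.c context (dmaplimit, PHYS_TO_DMAP(), pmap_change_attr_locked()) and a hypothetical helper name:

/*
 * Sketch only -- not part of the patch.  Only the portion of
 * [pa_start, pa_end) below dmaplimit has a DMAP alias, so only that
 * portion is passed back into pmap_change_attr_locked().
 */
static int
dmap_alias_change_attr(vm_paddr_t pa_start, vm_paddr_t pa_end, int mode)
{
	vm_paddr_t pa_end1;

	if (pa_start == pa_end || pa_start >= dmaplimit)
		return (0);		/* nothing below the end of the DMAP */
	pa_end1 = MIN(pa_end, dmaplimit);	/* clamp to DMAP coverage */
	return (pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
	    pa_end1 - pa_start, mode));
}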
diff --git a/sys/cddl/boot/zfs/lz4.c b/sys/cddl/boot/zfs/lz4.c
index c29f861..b12122c 100644
--- a/sys/cddl/boot/zfs/lz4.c
+++ b/sys/cddl/boot/zfs/lz4.c
@@ -34,6 +34,8 @@
* $FreeBSD$
*/
+#include <arpa/inet.h>
+
static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
int isize, int maxOutputSize);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 226233e..f6d19fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -139,7 +139,6 @@
#include <zfs_fletcher.h>
#include <sys/sdt.h>
-#include <vm/vm_pageout.h>
#include <machine/vmparam.h>
#ifdef illumos
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 04e1342..2a15cdf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -75,7 +75,6 @@
#include <sys/sched.h>
#include <sys/acl.h>
#include <vm/vm_param.h>
-#include <vm/vm_pageout.h>
/*
* Programming rules.
diff --git a/sys/conf/files b/sys/conf/files
index 8d0453a..e8c8a3a 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3596,8 +3596,12 @@ netipx/spx_usrreq.c optional ipx
netnatm/natm.c optional natm
netnatm/natm_pcb.c optional natm
netnatm/natm_proto.c optional natm
+netpfil/ipfw/dn_aqm_codel.c optional inet dummynet
+netpfil/ipfw/dn_aqm_pie.c optional inet dummynet
netpfil/ipfw/dn_heap.c optional inet dummynet
netpfil/ipfw/dn_sched_fifo.c optional inet dummynet
+netpfil/ipfw/dn_sched_fq_codel.c optional inet dummynet
+netpfil/ipfw/dn_sched_fq_pie.c optional inet dummynet
netpfil/ipfw/dn_sched_prio.c optional inet dummynet
netpfil/ipfw/dn_sched_qfq.c optional inet dummynet
netpfil/ipfw/dn_sched_rr.c optional inet dummynet
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index f96b4f3..533b957 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -271,7 +271,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv
dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv
dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv
dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv
+dev/hyperv/utilities/hv_heartbeat.c optional hyperv
dev/hyperv/utilities/hv_kvp.c optional hyperv
+dev/hyperv/utilities/hv_shutdown.c optional hyperv
+dev/hyperv/utilities/hv_timesync.c optional hyperv
dev/hyperv/utilities/hv_util.c optional hyperv
dev/hyperv/vmbus/hv_channel.c optional hyperv
dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index f79ed58..89b87e3 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -247,7 +247,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv
dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv
dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv
dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv
+dev/hyperv/utilities/hv_heartbeat.c optional hyperv
dev/hyperv/utilities/hv_kvp.c optional hyperv
+dev/hyperv/utilities/hv_shutdown.c optional hyperv
+dev/hyperv/utilities/hv_timesync.c optional hyperv
dev/hyperv/utilities/hv_util.c optional hyperv
dev/hyperv/vmbus/hv_channel.c optional hyperv
dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv
diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c
index b8660a4..def33c3 100644
--- a/sys/dev/bge/if_bge.c
+++ b/sys/dev/bge/if_bge.c
@@ -170,6 +170,7 @@ static const struct bge_type {
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5715 },
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5715S },
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5717 },
+ { BCOM_VENDORID, BCOM_DEVICEID_BCM5717C },
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5718 },
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5719 },
{ BCOM_VENDORID, BCOM_DEVICEID_BCM5720 },
@@ -310,6 +311,7 @@ static const struct bge_revision {
{ BGE_CHIPID_BCM5715_A3, "BCM5715 A3" },
{ BGE_CHIPID_BCM5717_A0, "BCM5717 A0" },
{ BGE_CHIPID_BCM5717_B0, "BCM5717 B0" },
+ { BGE_CHIPID_BCM5717_C0, "BCM5717 C0" },
{ BGE_CHIPID_BCM5719_A0, "BCM5719 A0" },
{ BGE_CHIPID_BCM5720_A0, "BCM5720 A0" },
{ BGE_CHIPID_BCM5755_A0, "BCM5755 A0" },
@@ -2689,6 +2691,10 @@ bge_chipid(device_t dev)
* registers.
*/
switch (pci_get_device(dev)) {
+ case BCOM_DEVICEID_BCM5717C:
+ /* 5717 C0 seems to belong to 5720 line. */
+ id = BGE_CHIPID_BCM5720_A0;
+ break;
case BCOM_DEVICEID_BCM5717:
case BCOM_DEVICEID_BCM5718:
case BCOM_DEVICEID_BCM5719:
diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h
index 37b0459..0cf9ca1 100644
--- a/sys/dev/bge/if_bgereg.h
+++ b/sys/dev/bge/if_bgereg.h
@@ -329,6 +329,7 @@
#define BGE_CHIPID_BCM57780_A1 0x57780001
#define BGE_CHIPID_BCM5717_A0 0x05717000
#define BGE_CHIPID_BCM5717_B0 0x05717100
+#define BGE_CHIPID_BCM5717_C0 0x05717200
#define BGE_CHIPID_BCM5719_A0 0x05719000
#define BGE_CHIPID_BCM5720_A0 0x05720000
#define BGE_CHIPID_BCM5762_A0 0x05762000
@@ -2452,6 +2453,7 @@ struct bge_status_block {
#define BCOM_DEVICEID_BCM5715 0x1678
#define BCOM_DEVICEID_BCM5715S 0x1679
#define BCOM_DEVICEID_BCM5717 0x1655
+#define BCOM_DEVICEID_BCM5717C 0x1665
#define BCOM_DEVICEID_BCM5718 0x1656
#define BCOM_DEVICEID_BCM5719 0x1657
#define BCOM_DEVICEID_BCM5720_PP 0x1658 /* Not released to public. */
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h
index 1a45b7b..f45543b 100644
--- a/sys/dev/hyperv/include/hyperv.h
+++ b/sys/dev/hyperv/include/hyperv.h
@@ -755,6 +755,8 @@ typedef struct hv_vmbus_channel {
struct mtx inbound_lock;
+ struct taskqueue * rxq;
+ struct task channel_task;
hv_vmbus_pfn_channel_callback on_channel_callback;
void* channel_callback_context;
@@ -906,30 +908,6 @@ int hv_vmbus_channel_teardown_gpdal(
struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary);
-/*
- * Work abstraction defines
- */
-typedef struct hv_work_queue {
- struct taskqueue* queue;
- struct proc* proc;
- struct sema* work_sema;
-} hv_work_queue;
-
-typedef struct hv_work_item {
- struct task work;
- void (*callback)(void *);
- void* context;
- hv_work_queue* wq;
-} hv_work_item;
-
-struct hv_work_queue* hv_work_queue_create(char* name);
-
-void hv_work_queue_close(struct hv_work_queue* wq);
-
-int hv_queue_work_item(
- hv_work_queue* wq,
- void (*callback)(void *),
- void* context);
/**
* @brief Get physical address from virtual
*/
@@ -941,35 +919,5 @@ hv_get_phys_addr(void *virt)
return (ret);
}
-
-/**
- * KVP related structures
- *
- */
-typedef struct hv_vmbus_service {
- hv_guid guid; /* Hyper-V GUID */
- char *name; /* name of service */
- boolean_t enabled; /* service enabled */
- hv_work_queue *work_queue; /* background work queue */
-
- /*
- * function to initialize service
- */
- int (*init)(struct hv_vmbus_service *);
-
- /*
- * function to process Hyper-V messages
- */
- void (*callback)(void *);
-} hv_vmbus_service;
-
-extern uint8_t* receive_buffer[];
-extern hv_vmbus_service service_table[];
extern uint32_t hv_vmbus_protocal_version;
-
-void hv_kvp_callback(void *context);
-int hv_kvp_init(hv_vmbus_service *serv);
-void hv_kvp_deinit(void);
-
#endif /* __HYPERV_H__ */
-
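The header changes above drop the hv_work_queue/hv_work_item wrappers and the KVP service table in favor of a per-channel taskqueue (rxq) plus a task (channel_task), i.e. the stock taskqueue(9) machinery. A minimal sketch of that pattern, assuming illustrative names that are not taken from this commit:

/*
 * Sketch only -- generic taskqueue(9) usage that replaces the removed
 * hv_work_queue wrappers; all names here are illustrative.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

struct chan_ctx {
	struct taskqueue	*rxq;		/* per-channel work queue */
	struct task		 channel_task;	/* channel processing task */
};

static void
chan_task_func(void *arg, int pending __unused)
{
	/* Process the channel ring buffer for 'arg' here. */
}

static void
chan_rxq_create(struct chan_ctx *ctx)
{
	ctx->rxq = taskqueue_create("hv chan rx", M_WAITOK,
	    taskqueue_thread_enqueue, &ctx->rxq);
	taskqueue_start_threads(&ctx->rxq, 1, PI_NET, "hvrx");
	TASK_INIT(&ctx->channel_task, 0, chan_task_func, ctx);
}

static void
chan_rxq_destroy(struct chan_ctx *ctx)
{
	/* Wait for any queued work, then tear the queue down. */
	taskqueue_drain(ctx->rxq, &ctx->channel_task);
	taskqueue_free(ctx->rxq);
}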
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c
index 64e7578..9a89b62 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.c
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c
@@ -73,10 +73,7 @@ hv_nv_alloc_net_device(struct hv_device *device)
netvsc_dev *net_dev;
hn_softc_t *sc = device_get_softc(device->device);
- net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_NOWAIT | M_ZERO);
- if (net_dev == NULL) {
- return (NULL);
- }
+ net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_WAITOK | M_ZERO);
net_dev->dev = device;
net_dev->destroy = FALSE;
@@ -135,15 +132,15 @@ hv_nv_get_next_send_section(netvsc_dev *net_dev)
int i;
for (i = 0; i < bitsmap_words; i++) {
- idx = ffs(~bitsmap[i]);
+ idx = ffsl(~bitsmap[i]);
if (0 == idx)
continue;
idx--;
- if (i * BITS_PER_LONG + idx >= net_dev->send_section_count)
- return (ret);
+ KASSERT(i * BITS_PER_LONG + idx < net_dev->send_section_count,
+ ("invalid i %d and idx %lu", i, idx));
- if (synch_test_and_set_bit(idx, &bitsmap[i]))
+ if (atomic_testandset_long(&bitsmap[i], idx))
continue;
ret = i * BITS_PER_LONG + idx;
@@ -223,11 +220,7 @@ hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device)
init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections;
net_dev->rx_sections = malloc(net_dev->rx_section_count *
- sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_NOWAIT);
- if (net_dev->rx_sections == NULL) {
- ret = EINVAL;
- goto cleanup;
- }
+ sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_WAITOK);
memcpy(net_dev->rx_sections,
init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections,
net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section));
@@ -325,11 +318,7 @@ hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device)
BITS_PER_LONG);
net_dev->send_section_bitsmap =
malloc(net_dev->bitsmap_words * sizeof(long), M_NETVSC,
- M_NOWAIT | M_ZERO);
- if (NULL == net_dev->send_section_bitsmap) {
- ret = ENOMEM;
- goto cleanup;
- }
+ M_WAITOK | M_ZERO);
goto exit;
@@ -788,8 +777,27 @@ hv_nv_on_send_completion(netvsc_dev *net_dev,
if (NULL != net_vsc_pkt) {
if (net_vsc_pkt->send_buf_section_idx !=
NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
- synch_change_bit(net_vsc_pkt->send_buf_section_idx,
- net_dev->send_section_bitsmap);
+ u_long mask;
+ int idx;
+
+ idx = net_vsc_pkt->send_buf_section_idx /
+ BITS_PER_LONG;
+ KASSERT(idx < net_dev->bitsmap_words,
+ ("invalid section index %u",
+ net_vsc_pkt->send_buf_section_idx));
+ mask = 1UL <<
+ (net_vsc_pkt->send_buf_section_idx %
+ BITS_PER_LONG);
+
+ KASSERT(net_dev->send_section_bitsmap[idx] &
+ mask,
+ ("index bitmap 0x%lx, section index %u, "
+ "bitmap idx %d, bitmask 0x%lx",
+ net_dev->send_section_bitsmap[idx],
+ net_vsc_pkt->send_buf_section_idx,
+ idx, mask));
+ atomic_clear_long(
+ &net_dev->send_section_bitsmap[idx], mask);
}
/* Notify the layer above us */
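In the hv_net_vsc.c hunks above, the chimney send-section bitmap moves from ffs()/synch_test_and_set_bit() to ffsl()/atomic_testandset_long() for allocation and to atomic_clear_long() for release, scanning a whole long per iteration and dropping the Linux-compatibility synch_* wrappers. A self-contained sketch of that allocate/release pattern, assuming amd64/i386 (where atomic_testandset_long() is available) and illustrative function names:

/*
 * Sketch only -- illustrative names, not part of the patch.  A set bit in
 * 'bitsmap' marks an in-use send section; 'words' is the number of longs
 * and 'count' the number of valid sections.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/atomic.h>

#define	EX_BITS_PER_LONG	(sizeof(long) * NBBY)

static int
section_alloc(u_long *bitsmap, int words, int count)
{
	int i, idx;

	for (i = 0; i < words; i++) {
		idx = ffsl(~bitsmap[i]);	/* first clear bit, 1-based */
		if (idx == 0)
			continue;		/* this word is full */
		idx--;
		if (i * EX_BITS_PER_LONG + idx >= count)
			return (-1);		/* past the last valid section */
		if (atomic_testandset_long(&bitsmap[i], idx))
			continue;		/* raced with another CPU */
		return (i * EX_BITS_PER_LONG + idx);
	}
	return (-1);
}

static void
section_free(u_long *bitsmap, int section)
{
	atomic_clear_long(&bitsmap[section / EX_BITS_PER_LONG],
	    1UL << (section % EX_BITS_PER_LONG));
}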
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h
index e684cc5..95dee17 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.h
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h
@@ -39,9 +39,11 @@
#define __HV_NET_VSC_H__
#include <sys/param.h>
+#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/queue.h>
+#include <sys/taskqueue.h>
#include <sys/sx.h>
#include <machine/bus.h>
@@ -56,6 +58,8 @@
#include <dev/hyperv/include/hyperv.h>
+#define HN_USE_TXDESC_BUFRING
+
MALLOC_DECLARE(M_NETVSC);
#define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF)
@@ -988,8 +992,67 @@ typedef struct {
hv_bool_uint8_t link_state;
} netvsc_device_info;
+#ifndef HN_USE_TXDESC_BUFRING
struct hn_txdesc;
SLIST_HEAD(hn_txdesc_list, hn_txdesc);
+#else
+struct buf_ring;
+#endif
+
+struct hn_rx_ring {
+ struct lro_ctrl hn_lro;
+
+ /* Trust csum verification on host side */
+ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */
+
+ u_long hn_csum_ip;
+ u_long hn_csum_tcp;
+ u_long hn_csum_udp;
+ u_long hn_csum_trusted;
+ u_long hn_lro_tried;
+ u_long hn_small_pkts;
+} __aligned(CACHE_LINE_SIZE);
+
+#define HN_TRUST_HCSUM_IP 0x0001
+#define HN_TRUST_HCSUM_TCP 0x0002
+#define HN_TRUST_HCSUM_UDP 0x0004
+
+struct hn_tx_ring {
+#ifndef HN_USE_TXDESC_BUFRING
+ struct mtx hn_txlist_spin;
+ struct hn_txdesc_list hn_txlist;
+#else
+ struct buf_ring *hn_txdesc_br;
+#endif
+ int hn_txdesc_cnt;
+ int hn_txdesc_avail;
+ int hn_has_txeof;
+
+ int hn_sched_tx;
+ void (*hn_txeof)(struct hn_tx_ring *);
+ struct taskqueue *hn_tx_taskq;
+ struct task hn_tx_task;
+ struct task hn_txeof_task;
+
+ struct mtx hn_tx_lock;
+ struct hn_softc *hn_sc;
+
+ int hn_direct_tx_size;
+ int hn_tx_chimney_size;
+ bus_dma_tag_t hn_tx_data_dtag;
+ uint64_t hn_csum_assist;
+
+ u_long hn_no_txdescs;
+ u_long hn_send_failed;
+ u_long hn_txdma_failed;
+ u_long hn_tx_collapsed;
+ u_long hn_tx_chimney;
+
+ /* Rarely used stuffs */
+ struct hn_txdesc *hn_txdesc;
+ bus_dma_tag_t hn_tx_rndis_dtag;
+ struct sysctl_oid *hn_tx_sysctl_tree;
+} __aligned(CACHE_LINE_SIZE);
/*
* Device-specific softc structure
@@ -1009,44 +1072,22 @@ typedef struct hn_softc {
struct hv_device *hn_dev_obj;
netvsc_dev *net_dev;
- int hn_txdesc_cnt;
- struct hn_txdesc *hn_txdesc;
- bus_dma_tag_t hn_tx_data_dtag;
- bus_dma_tag_t hn_tx_rndis_dtag;
- int hn_tx_chimney_size;
- int hn_tx_chimney_max;
+ int hn_rx_ring_cnt;
+ struct hn_rx_ring *hn_rx_ring;
- struct mtx hn_txlist_spin;
- struct hn_txdesc_list hn_txlist;
- int hn_txdesc_avail;
- int hn_txeof;
-
- struct lro_ctrl hn_lro;
- int hn_lro_hiwat;
-
- /* Trust tcp segments verification on host side */
- int hn_trust_hosttcp;
-
- u_long hn_csum_ip;
- u_long hn_csum_tcp;
- u_long hn_csum_trusted;
- u_long hn_lro_tried;
- u_long hn_small_pkts;
- u_long hn_no_txdescs;
- u_long hn_send_failed;
- u_long hn_txdma_failed;
- u_long hn_tx_collapsed;
- u_long hn_tx_chimney;
+ int hn_tx_ring_cnt;
+ struct hn_tx_ring *hn_tx_ring;
+ int hn_tx_chimney_max;
+ struct taskqueue *hn_tx_taskq;
+ struct sysctl_oid *hn_tx_sysctl_tree;
} hn_softc_t;
-
/*
* Externs
*/
extern int hv_promisc_mode;
void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status);
-void netvsc_xmit_completion(void *context);
void hv_nv_on_receive_completion(struct hv_device *device,
uint64_t tid, uint32_t status);
netvsc_dev *hv_nv_on_device_add(struct hv_device *device,
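With HN_USE_TXDESC_BUFRING defined in the header above, each hn_tx_ring keeps its free TX descriptors on a buf_ring (hn_txdesc_br) instead of an SLIST guarded by a spin mutex. A hedged sketch of the buf_ring(9) usage this relies on; the names and malloc type below are illustrative, not from the patch:

/*
 * Sketch only.  buf_ring(9) provides a multi-producer, single-consumer
 * ring; here it simply holds pointers to free descriptors.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/buf_ring.h>

MALLOC_DEFINE(M_EXTXD, "ex_txd", "example TX descriptor ring");

struct ex_txd;				/* opaque TX descriptor */

static struct buf_ring *
ex_txd_ring_create(int ndesc)
{
	/*
	 * ndesc should be a power of two (buf_ring masks its indices);
	 * the optional lock argument is only used for debugging
	 * assertions, so NULL is acceptable in this sketch.
	 */
	return (buf_ring_alloc(ndesc, M_EXTXD, M_WAITOK, NULL));
}

static void
ex_txd_put(struct buf_ring *br, struct ex_txd *txd)
{
	/*
	 * Cannot overflow as long as the ring is sized to hold every
	 * descriptor, so the return value is ignored here.
	 */
	(void)buf_ring_enqueue(br, txd);
}

static struct ex_txd *
ex_txd_get(struct buf_ring *br)
{
	return (buf_ring_dequeue_sc(br));	/* single-consumer dequeue */
}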
diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
index b3360ea..0f4425e 100644
--- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
@@ -66,10 +66,12 @@ __FBSDID("$FreeBSD$");
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
+#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
+#include <sys/buf_ring.h>
#include <net/if.h>
#include <net/if_arp.h>
@@ -132,6 +134,8 @@ __FBSDID("$FreeBSD$");
/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT 512
+#define HN_LROENT_CNT_DEF 128
+
#define HN_RNDIS_MSG_LEN \
(sizeof(rndis_msg) + \
RNDIS_VLAN_PPI_SIZE + \
@@ -146,10 +150,14 @@ __FBSDID("$FreeBSD$");
#define HN_TX_DATA_SEGCNT_MAX \
(NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)
+#define HN_DIRECT_TX_SIZE_DEF 128
+
struct hn_txdesc {
+#ifndef HN_USE_TXDESC_BUFRING
SLIST_ENTRY(hn_txdesc) link;
+#endif
struct mbuf *m;
- struct hn_softc *sc;
+ struct hn_tx_ring *txr;
int refs;
uint32_t flags; /* HN_TXD_FLAG_ */
netvsc_packet netvsc_pkt; /* XXX to be removed */
@@ -165,23 +173,18 @@ struct hn_txdesc {
#define HN_TXD_FLAG_DMAMAP 0x2
/*
- * A unified flag for all outbound check sum flags is useful,
- * and it helps avoiding unnecessary check sum calculation in
- * network forwarding scenario.
+ * Only enable UDP checksum offloading when it is on 2012R2 or
+ * later. UDP checksum offloading doesn't work on earlier
+ * Windows releases.
*/
-#define HV_CSUM_FOR_OUTBOUND \
- (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO| \
- CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP| \
- CSUM_IP6_TSO|CSUM_IP6_ISCSI)
-
-/* XXX move to netinet/tcp_lro.h */
-#define HN_LRO_HIWAT_MAX 65535
-#define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX
+#define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP)
+#define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP)
+
+#define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
-#define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu)
-#define HN_LRO_HIWAT_ISVALID(sc, hiwat) \
- ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) || \
- (hiwat) <= HN_LRO_HIWAT_MAX)
+#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
+
+#define HN_LRO_ACKCNT_DEF 1
/*
* Be aware that this sleepable mutex will exhibit WITNESS errors when
@@ -205,19 +208,71 @@ struct hn_txdesc {
int hv_promisc_mode = 0; /* normal mode by default */
+SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface");
+
/* Trust tcp segements verification on host side. */
-static int hn_trust_hosttcp = 0;
-TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);
+static int hn_trust_hosttcp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
+ &hn_trust_hosttcp, 0,
+ "Trust tcp segement verification on host side, "
+ "when csum info is missing (global setting)");
+
+/* Trust udp datagrams verification on host side. */
+static int hn_trust_hostudp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
+ &hn_trust_hostudp, 0,
+ "Trust udp datagram verification on host side, "
+ "when csum info is missing (global setting)");
+
+/* Trust ip packets verification on host side. */
+static int hn_trust_hostip = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
+ &hn_trust_hostip, 0,
+ "Trust ip packet verification on host side, "
+ "when csum info is missing (global setting)");
#if __FreeBSD_version >= 1100045
/* Limit TSO burst size */
static int hn_tso_maxlen = 0;
-TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen);
+SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
+ &hn_tso_maxlen, 0, "TSO burst limit");
#endif
/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
-TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size);
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
+ &hn_tx_chimney_size, 0, "Chimney send packet size limit");
+
+/* Limit the size of packet for direct transmission */
+static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
+ &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
+
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
+ &hn_lro_entry_count, 0, "LRO entry count");
+#endif
+#endif
+
+static int hn_share_tx_taskq = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
+ &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
+
+static struct taskqueue *hn_tx_taskq;
+
+#ifndef HN_USE_TXDESC_BUFRING
+static int hn_use_txdesc_bufring = 0;
+#else
+static int hn_use_txdesc_bufring = 1;
+#endif
+SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
+ &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
+
+static int hn_bind_tx_taskq = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
+ &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
/*
* Forward declarations
@@ -226,82 +281,37 @@ static void hn_stop(hn_softc_t *sc);
static void hn_ifinit_locked(hn_softc_t *sc);
static void hn_ifinit(void *xsc);
static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
-static void hn_start_locked(struct ifnet *ifp);
+static int hn_start_locked(struct hn_tx_ring *txr, int len);
static void hn_start(struct ifnet *ifp);
+static void hn_start_txeof(struct hn_tx_ring *);
static int hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
-#ifdef HN_LRO_HIWAT
-static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
+#if __FreeBSD_version >= 1100099
+static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
+static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
-static int hn_check_iplen(const struct mbuf *, int);
-static int hn_create_tx_ring(struct hn_softc *sc);
-static void hn_destroy_tx_ring(struct hn_softc *sc);
-
-static __inline void
-hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
-{
- sc->hn_lro_hiwat = hiwat;
-#ifdef HN_LRO_HIWAT
- sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
-#endif
-}
-
-/*
- * NetVsc get message transport protocol type
- */
-static uint32_t get_transport_proto_type(struct mbuf *m_head)
-{
- uint32_t ret_val = TRANSPORT_TYPE_NOT_IP;
- uint16_t ether_type = 0;
- int ether_len = 0;
- struct ether_vlan_header *eh;
-#ifdef INET
- struct ip *iph;
-#endif
-#ifdef INET6
- struct ip6_hdr *ip6;
-#endif
-
- eh = mtod(m_head, struct ether_vlan_header*);
- if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
- ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
- ether_type = eh->evl_proto;
- } else {
- ether_len = ETHER_HDR_LEN;
- ether_type = eh->evl_encap_proto;
- }
-
- switch (ntohs(ether_type)) {
-#ifdef INET6
- case ETHERTYPE_IPV6:
- ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len);
-
- if (IPPROTO_TCP == ip6->ip6_nxt) {
- ret_val = TRANSPORT_TYPE_IPV6_TCP;
- } else if (IPPROTO_UDP == ip6->ip6_nxt) {
- ret_val = TRANSPORT_TYPE_IPV6_UDP;
- }
- break;
+#if __FreeBSD_version < 1100095
+static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
+#else
+static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
-#ifdef INET
- case ETHERTYPE_IP:
- iph = (struct ip *)(m_head->m_data + ether_len);
-
- if (IPPROTO_TCP == iph->ip_p) {
- ret_val = TRANSPORT_TYPE_IPV4_TCP;
- } else if (IPPROTO_UDP == iph->ip_p) {
- ret_val = TRANSPORT_TYPE_IPV4_UDP;
- }
- break;
-#endif
- default:
- ret_val = TRANSPORT_TYPE_NOT_IP;
- break;
- }
-
- return (ret_val);
-}
+static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_check_iplen(const struct mbuf *, int);
+static int hn_create_tx_ring(struct hn_softc *, int);
+static void hn_destroy_tx_ring(struct hn_tx_ring *);
+static int hn_create_tx_data(struct hn_softc *);
+static void hn_destroy_tx_data(struct hn_softc *);
+static void hn_start_taskfunc(void *, int);
+static void hn_start_txeof_taskfunc(void *, int);
+static void hn_stop_tx_tasks(struct hn_softc *);
+static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
+static void hn_create_rx_data(struct hn_softc *sc);
+static void hn_destroy_rx_data(struct hn_softc *sc);
+static void hn_set_tx_chimney_size(struct hn_softc *, int);
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
@@ -353,6 +363,19 @@ netvsc_probe(device_t dev)
return (ENXIO);
}
+static void
+hn_cpuset_setthread_task(void *xmask, int pending __unused)
+{
+ cpuset_t *mask = xmask;
+ int error;
+
+ error = cpuset_setthread(curthread->td_tid, mask);
+ if (error) {
+ panic("curthread=%ju: can't pin; error=%d",
+ (uintmax_t)curthread->td_tid, error);
+ }
+}
+
/*
* Standard attach entry point.
*
@@ -367,8 +390,6 @@ netvsc_attach(device_t dev)
hn_softc_t *sc;
int unit = device_get_unit(dev);
struct ifnet *ifp = NULL;
- struct sysctl_oid_list *child;
- struct sysctl_ctx_list *ctx;
int error;
#if __FreeBSD_version >= 1100045
int tso_maxlen;
@@ -382,13 +403,28 @@ netvsc_attach(device_t dev)
bzero(sc, sizeof(hn_softc_t));
sc->hn_unit = unit;
sc->hn_dev = dev;
- sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
- sc->hn_trust_hosttcp = hn_trust_hosttcp;
-
- error = hn_create_tx_ring(sc);
- if (error)
- goto failed;
+ if (hn_tx_taskq == NULL) {
+ sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->hn_tx_taskq);
+ taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
+ device_get_nameunit(dev));
+ if (hn_bind_tx_taskq >= 0) {
+ int cpu = hn_bind_tx_taskq;
+ struct task cpuset_task;
+ cpuset_t cpu_set;
+
+ if (cpu > mp_ncpus - 1)
+ cpu = mp_ncpus - 1;
+ CPU_SETOF(cpu, &cpu_set);
+ TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
+ &cpu_set);
+ taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
+ taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
+ }
+ } else {
+ sc->hn_tx_taskq = hn_tx_taskq;
+ }
NV_LOCK_INIT(sc, "NetVSCLock");
sc->hn_dev_obj = device_ctx;
@@ -396,6 +432,12 @@ netvsc_attach(device_t dev)
ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
ifp->if_softc = sc;
+ error = hn_create_tx_data(sc);
+ if (error)
+ goto failed;
+
+ hn_create_rx_data(sc);
+
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
ifp->if_dunit = unit;
ifp->if_dname = NETVSC_DEVNAME;
@@ -426,15 +468,7 @@ netvsc_attach(device_t dev)
ifp->if_capenable |=
IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
IFCAP_LRO;
- /*
- * Only enable UDP checksum offloading when it is on 2012R2 or
- * later. UDP checksum offloading doesn't work on earlier
- * Windows releases.
- */
- if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
- ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
- else
- ifp->if_hwassist = CSUM_TCP | CSUM_TSO;
+ ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO;
error = hv_rf_on_device_add(device_ctx, &device_info);
if (error)
@@ -444,15 +478,6 @@ netvsc_attach(device_t dev)
sc->hn_carrier = 1;
}
-#if defined(INET) || defined(INET6)
- tcp_lro_init(&sc->hn_lro);
- /* Driver private LRO settings */
- sc->hn_lro.ifp = ifp;
-#ifdef HN_LRO_HIWAT
- sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
-#endif
-#endif /* INET || INET6 */
-
#if __FreeBSD_version >= 1100045
tso_maxlen = hn_tso_maxlen;
if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
@@ -472,87 +497,14 @@ netvsc_attach(device_t dev)
#endif
sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
- sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
+ hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max);
if (hn_tx_chimney_size > 0 &&
hn_tx_chimney_size < sc->hn_tx_chimney_max)
- sc->hn_tx_chimney_size = hn_tx_chimney_size;
-
- ctx = device_get_sysctl_ctx(dev);
- child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
-
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
- CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
- CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
- CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
-#ifdef HN_LRO_HIWAT
- SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
- CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
- "I", "LRO high watermark");
-#endif
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp",
- CTLFLAG_RW, &sc->hn_trust_hosttcp, 0,
- "Trust tcp segement verification on host side, "
- "when csum info is missing");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip",
- CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp",
- CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted",
- CTLFLAG_RW, &sc->hn_csum_trusted,
- "# of TCP segements that we trust host's csum verification");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts",
- CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs",
- CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed",
- CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed",
- CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed",
- CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed");
- SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney",
- CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send");
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
- CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs");
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
- CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs");
- SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
- CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
- "Chimney send packet size upper boundary");
- SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
- CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
- "I", "Chimney send packet size limit");
-
- if (unit == 0) {
- struct sysctl_ctx_list *dc_ctx;
- struct sysctl_oid_list *dc_child;
- devclass_t dc;
-
- /*
- * Add sysctl nodes for devclass
- */
- dc = device_get_devclass(dev);
- dc_ctx = devclass_get_sysctl_ctx(dc);
- dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc));
-
- SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp",
- CTLFLAG_RD, &hn_trust_hosttcp, 0,
- "Trust tcp segement verification on host side, "
- "when csum info is missing (global setting)");
- SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size",
- CTLFLAG_RD, &hn_tx_chimney_size, 0,
- "Chimney send packet size limit");
-#if __FreeBSD_version >= 1100045
- SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen",
- CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit");
-#endif
- }
+ hn_set_tx_chimney_size(sc, hn_tx_chimney_size);
return (0);
failed:
- hn_destroy_tx_ring(sc);
+ hn_destroy_tx_data(sc);
if (ifp != NULL)
if_free(ifp);
return (error);
@@ -583,11 +535,14 @@ netvsc_detach(device_t dev)
hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
+ hn_stop_tx_tasks(sc);
+
ifmedia_removeall(&sc->hn_media);
-#if defined(INET) || defined(INET6)
- tcp_lro_free(&sc->hn_lro);
-#endif
- hn_destroy_tx_ring(sc);
+ hn_destroy_rx_data(sc);
+ hn_destroy_tx_data(sc);
+
+ if (sc->hn_tx_taskq != hn_tx_taskq)
+ taskqueue_free(sc->hn_tx_taskq);
return (0);
}
@@ -602,13 +557,13 @@ netvsc_shutdown(device_t dev)
}
static __inline int
-hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
+hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
struct mbuf *m = *m_head;
int error;
- error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap,
+ error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
m, segs, nsegs, BUS_DMA_NOWAIT);
if (error == EFBIG) {
struct mbuf *m_new;
@@ -618,13 +573,13 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
return ENOBUFS;
else
*m_head = m = m_new;
- sc->hn_tx_collapsed++;
+ txr->hn_tx_collapsed++;
- error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag,
+ error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
}
if (!error) {
- bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap,
+ bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
BUS_DMASYNC_PREWRITE);
txd->flags |= HN_TXD_FLAG_DMAMAP;
}
@@ -632,20 +587,20 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
}
static __inline void
-hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd)
+hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
if (txd->flags & HN_TXD_FLAG_DMAMAP) {
- bus_dmamap_sync(sc->hn_tx_data_dtag,
+ bus_dmamap_sync(txr->hn_tx_data_dtag,
txd->data_dmap, BUS_DMASYNC_POSTWRITE);
- bus_dmamap_unload(sc->hn_tx_data_dtag,
+ bus_dmamap_unload(txr->hn_tx_data_dtag,
txd->data_dmap);
txd->flags &= ~HN_TXD_FLAG_DMAMAP;
}
}
static __inline int
-hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
+hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
@@ -655,7 +610,7 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
if (atomic_fetchadd_int(&txd->refs, -1) != 1)
return 0;
- hn_txdesc_dmamap_unload(sc, txd);
+ hn_txdesc_dmamap_unload(txr, txd);
if (txd->m != NULL) {
m_freem(txd->m);
txd->m = NULL;
@@ -663,33 +618,45 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
txd->flags |= HN_TXD_FLAG_ONLIST;
- mtx_lock_spin(&sc->hn_txlist_spin);
- KASSERT(sc->hn_txdesc_avail >= 0 &&
- sc->hn_txdesc_avail < sc->hn_txdesc_cnt,
- ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail));
- sc->hn_txdesc_avail++;
- SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
- mtx_unlock_spin(&sc->hn_txlist_spin);
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_lock_spin(&txr->hn_txlist_spin);
+ KASSERT(txr->hn_txdesc_avail >= 0 &&
+ txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
+ ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
+ txr->hn_txdesc_avail++;
+ SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+ mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+ atomic_add_int(&txr->hn_txdesc_avail, 1);
+ buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif
return 1;
}
static __inline struct hn_txdesc *
-hn_txdesc_get(struct hn_softc *sc)
+hn_txdesc_get(struct hn_tx_ring *txr)
{
struct hn_txdesc *txd;
- mtx_lock_spin(&sc->hn_txlist_spin);
- txd = SLIST_FIRST(&sc->hn_txlist);
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_lock_spin(&txr->hn_txlist_spin);
+ txd = SLIST_FIRST(&txr->hn_txlist);
if (txd != NULL) {
- KASSERT(sc->hn_txdesc_avail > 0,
- ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail));
- sc->hn_txdesc_avail--;
- SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
+ KASSERT(txr->hn_txdesc_avail > 0,
+ ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
+ txr->hn_txdesc_avail--;
+ SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
}
- mtx_unlock_spin(&sc->hn_txlist_spin);
+ mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+ txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
+#endif
if (txd != NULL) {
+#ifdef HN_USE_TXDESC_BUFRING
+ atomic_subtract_int(&txr->hn_txdesc_avail, 1);
+#endif
KASSERT(txd->m == NULL && txd->refs == 0 &&
(txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
txd->flags &= ~HN_TXD_FLAG_ONLIST;
@@ -707,213 +674,133 @@ hn_txdesc_hold(struct hn_txdesc *txd)
atomic_add_int(&txd->refs, 1);
}
-/*
- * Send completion processing
- *
- * Note: It looks like offset 0 of buf is reserved to hold the softc
- * pointer. The sc pointer is not currently needed in this function, and
- * it is not presently populated by the TX function.
- */
-void
-netvsc_xmit_completion(void *context)
+static void
+hn_tx_done(void *xpkt)
{
- netvsc_packet *packet = context;
+ netvsc_packet *packet = xpkt;
struct hn_txdesc *txd;
- struct hn_softc *sc;
+ struct hn_tx_ring *txr;
txd = (struct hn_txdesc *)(uintptr_t)
packet->compl.send.send_completion_tid;
- sc = txd->sc;
- sc->hn_txeof = 1;
- hn_txdesc_put(sc, txd);
+ txr = txd->txr;
+ txr->hn_has_txeof = 1;
+ hn_txdesc_put(txr, txd);
}
void
netvsc_channel_rollup(struct hv_device *device_ctx)
{
struct hn_softc *sc = device_get_softc(device_ctx->device);
- struct ifnet *ifp;
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */
+#if defined(INET) || defined(INET6)
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
+ struct lro_ctrl *lro = &rxr->hn_lro;
+ struct lro_entry *queued;
+
+ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
+ SLIST_REMOVE_HEAD(&lro->lro_active, next);
+ tcp_lro_flush(lro, queued);
+ }
+#endif
- if (!sc->hn_txeof)
+ if (!txr->hn_has_txeof)
return;
- sc->hn_txeof = 0;
- ifp = sc->hn_ifp;
- NV_LOCK(sc);
- ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- hn_start_locked(ifp);
- NV_UNLOCK(sc);
+ txr->hn_has_txeof = 0;
+ txr->hn_txeof(txr);
}
/*
- * Start a transmit of one or more packets
+ * NOTE:
+ * If this function fails, then both txd and m_head0 will be freed.
*/
-static void
-hn_start_locked(struct ifnet *ifp)
+static int
+hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
{
- hn_softc_t *sc = ifp->if_softc;
- struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
- netvsc_dev *net_dev = sc->net_dev;
+ bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
+ int error, nsegs, i;
+ struct mbuf *m_head = *m_head0;
netvsc_packet *packet;
- struct mbuf *m_head, *m;
- struct ether_vlan_header *eh;
rndis_msg *rndis_mesg;
rndis_packet *rndis_pkt;
rndis_per_packet_info *rppi;
- ndis_8021q_info *rppi_vlan_info;
- rndis_tcp_ip_csum_info *csum_info;
- rndis_tcp_tso_info *tso_info;
- int ether_len;
- uint32_t rndis_msg_size = 0;
- uint32_t trans_proto_type;
- uint32_t send_buf_section_idx =
- NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
-
- if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
- IFF_DRV_RUNNING)
- return;
-
- while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
- bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
- int error, nsegs, i, send_failed = 0;
- struct hn_txdesc *txd;
+ uint32_t rndis_msg_size;
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
- if (m_head == NULL)
- break;
+ packet = &txd->netvsc_pkt;
+ packet->is_data_pkt = TRUE;
+ packet->tot_data_buf_len = m_head->m_pkthdr.len;
- txd = hn_txdesc_get(sc);
- if (txd == NULL) {
- sc->hn_no_txdescs++;
- IF_PREPEND(&ifp->if_snd, m_head);
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
- break;
- }
+ /*
+ * extension points to the area reserved for the
+ * rndis_filter_packet, which is placed just after
+ * the netvsc_packet (and rppi struct, if present;
+ * length is updated later).
+ */
+ rndis_mesg = txd->rndis_msg;
+ /* XXX not necessary */
+ memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
+ rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;
- packet = &txd->netvsc_pkt;
- /* XXX not necessary */
- memset(packet, 0, sizeof(*packet));
+ rndis_pkt = &rndis_mesg->msg.packet;
+ rndis_pkt->data_offset = sizeof(rndis_packet);
+ rndis_pkt->data_length = packet->tot_data_buf_len;
+ rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);
- packet->is_data_pkt = TRUE;
+ rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);
- /* Initialize it from the mbuf */
- packet->tot_data_buf_len = m_head->m_pkthdr.len;
+ if (m_head->m_flags & M_VLANTAG) {
+ ndis_8021q_info *rppi_vlan_info;
- /*
- * extension points to the area reserved for the
- * rndis_filter_packet, which is placed just after
- * the netvsc_packet (and rppi struct, if present;
- * length is updated later).
- */
- rndis_mesg = txd->rndis_msg;
- /* XXX not necessary */
- memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
- rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;
+ rndis_msg_size += RNDIS_VLAN_PPI_SIZE;
+ rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
+ ieee_8021q_info);
- rndis_pkt = &rndis_mesg->msg.packet;
- rndis_pkt->data_offset = sizeof(rndis_packet);
- rndis_pkt->data_length = packet->tot_data_buf_len;
- rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);
+ rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi +
+ rppi->per_packet_info_offset);
+ rppi_vlan_info->u1.s1.vlan_id =
+ m_head->m_pkthdr.ether_vtag & 0xfff;
+ }
- rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);
+ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+ rndis_tcp_tso_info *tso_info;
+ struct ether_vlan_header *eh;
+ int ether_len;
/*
- * If the Hyper-V infrastructure needs to embed a VLAN tag,
- * initialize netvsc_packet and rppi struct values as needed.
+ * XXX need m_pullup and use mtodo
*/
- if (m_head->m_flags & M_VLANTAG) {
- /*
- * set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag
- * into the frame.
- */
- rndis_msg_size += RNDIS_VLAN_PPI_SIZE;
-
- rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
- ieee_8021q_info);
-
- /* VLAN info immediately follows rppi struct */
- rppi_vlan_info = (ndis_8021q_info *)((char*)rppi +
- rppi->per_packet_info_offset);
- /* FreeBSD does not support CFI or priority */
- rppi_vlan_info->u1.s1.vlan_id =
- m_head->m_pkthdr.ether_vtag & 0xfff;
- }
-
- /* Only check the flags for outbound and ignore the ones for inbound */
- if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) {
- goto pre_send;
- }
-
eh = mtod(m_head, struct ether_vlan_header*);
- if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+ if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
- } else {
+ else
ether_len = ETHER_HDR_LEN;
- }
-
- trans_proto_type = get_transport_proto_type(m_head);
- if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) {
- goto pre_send;
- }
-
- /*
- * TSO packet needless to setup the send side checksum
- * offload.
- */
- if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
- goto do_tso;
- }
- /* setup checksum offload */
- rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
- rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
- tcpip_chksum_info);
- csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi +
- rppi->per_packet_info_offset);
-
- if (trans_proto_type & (TYPE_IPV4 << 16)) {
- csum_info->xmit.is_ipv4 = 1;
- } else {
- csum_info->xmit.is_ipv6 = 1;
- }
-
- if (trans_proto_type & TYPE_TCP) {
- csum_info->xmit.tcp_csum = 1;
- csum_info->xmit.tcp_header_offset = 0;
- } else if (trans_proto_type & TYPE_UDP) {
- csum_info->xmit.udp_csum = 1;
- }
-
- goto pre_send;
-
-do_tso:
- /* setup TCP segmentation offload */
rndis_msg_size += RNDIS_TSO_PPI_SIZE;
rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE,
tcp_large_send_info);
-
- tso_info = (rndis_tcp_tso_info *)((char *)rppi +
+
+ tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi +
rppi->per_packet_info_offset);
tso_info->lso_v2_xmit.type =
RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
-
+
#ifdef INET
- if (trans_proto_type & (TYPE_IPV4 << 16)) {
+ if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
struct ip *ip =
(struct ip *)(m_head->m_data + ether_len);
unsigned long iph_len = ip->ip_hl << 2;
struct tcphdr *th =
(struct tcphdr *)((caddr_t)ip + iph_len);
-
+
tso_info->lso_v2_xmit.ip_version =
RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
ip->ip_len = 0;
ip->ip_sum = 0;
-
+
th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htons(IPPROTO_TCP));
+ ip->ip_dst.s_addr, htons(IPPROTO_TCP));
}
#endif
#if defined(INET6) && defined(INET)
@@ -921,8 +808,8 @@ do_tso:
#endif
#ifdef INET6
{
- struct ip6_hdr *ip6 =
- (struct ip6_hdr *)(m_head->m_data + ether_len);
+ struct ip6_hdr *ip6 = (struct ip6_hdr *)
+ (m_head->m_data + ether_len);
struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
tso_info->lso_v2_xmit.ip_version =
@@ -933,146 +820,233 @@ do_tso:
#endif
tso_info->lso_v2_xmit.tcp_header_offset = 0;
tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz;
+ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
+ rndis_tcp_ip_csum_info *csum_info;
-pre_send:
- rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
- packet->tot_data_buf_len = rndis_mesg->msg_len;
-
- /* send packet with send buffer */
- if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) {
- send_buf_section_idx =
- hv_nv_get_next_send_section(net_dev);
- if (send_buf_section_idx !=
- NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
- char *dest = ((char *)net_dev->send_buf +
- send_buf_section_idx *
- net_dev->send_section_size);
-
- memcpy(dest, rndis_mesg, rndis_msg_size);
- dest += rndis_msg_size;
- for (m = m_head; m != NULL; m = m->m_next) {
- if (m->m_len) {
- memcpy(dest,
- (void *)mtod(m, vm_offset_t),
- m->m_len);
- dest += m->m_len;
- }
- }
+ rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
+ rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
+ tcpip_chksum_info);
+ csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi +
+ rppi->per_packet_info_offset);
- packet->send_buf_section_idx =
- send_buf_section_idx;
- packet->send_buf_section_size =
- packet->tot_data_buf_len;
- packet->page_buf_count = 0;
- sc->hn_tx_chimney++;
- goto do_send;
- }
- }
+ csum_info->xmit.is_ipv4 = 1;
+ if (m_head->m_pkthdr.csum_flags & CSUM_IP)
+ csum_info->xmit.ip_header_csum = 1;
- error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs);
- if (error) {
- int freed;
+ if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
+ csum_info->xmit.tcp_csum = 1;
+ csum_info->xmit.tcp_header_offset = 0;
+ } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
+ csum_info->xmit.udp_csum = 1;
+ }
+ }
- /*
- * This mbuf is not linked w/ the txd yet, so free
- * it now.
- */
- m_freem(m_head);
- freed = hn_txdesc_put(sc, txd);
- KASSERT(freed != 0,
- ("fail to free txd upon txdma error"));
+ rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
+ packet->tot_data_buf_len = rndis_mesg->msg_len;
- sc->hn_txdma_failed++;
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
- continue;
+ /*
+ * Chimney send, if the packet could fit into one chimney buffer.
+ */
+ if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) {
+ netvsc_dev *net_dev = txr->hn_sc->net_dev;
+ uint32_t send_buf_section_idx;
+
+ send_buf_section_idx =
+ hv_nv_get_next_send_section(net_dev);
+ if (send_buf_section_idx !=
+ NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
+ uint8_t *dest = ((uint8_t *)net_dev->send_buf +
+ (send_buf_section_idx *
+ net_dev->send_section_size));
+
+ memcpy(dest, rndis_mesg, rndis_msg_size);
+ dest += rndis_msg_size;
+ m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
+
+ packet->send_buf_section_idx = send_buf_section_idx;
+ packet->send_buf_section_size =
+ packet->tot_data_buf_len;
+ packet->page_buf_count = 0;
+ txr->hn_tx_chimney++;
+ goto done;
}
+ }
- packet->page_buf_count = nsegs +
- HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
-
- /* send packet with page buffer */
- packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
- packet->page_buffers[0].offset =
- txd->rndis_msg_paddr & PAGE_MASK;
- packet->page_buffers[0].length = rndis_msg_size;
+ error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
+ if (error) {
+ int freed;
/*
- * Fill the page buffers with mbuf info starting at index
- * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
+ * This mbuf is not linked w/ the txd yet, so free it now.
*/
- for (i = 0; i < nsegs; ++i) {
- hv_vmbus_page_buffer *pb = &packet->page_buffers[
- i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];
+ m_freem(m_head);
+ *m_head0 = NULL;
- pb->pfn = atop(segs[i].ds_addr);
- pb->offset = segs[i].ds_addr & PAGE_MASK;
- pb->length = segs[i].ds_len;
- }
+ freed = hn_txdesc_put(txr, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon txdma error"));
+
+ txr->hn_txdma_failed++;
+ if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
+ return error;
+ }
+ *m_head0 = m_head;
+
+ packet->page_buf_count = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
+
+ /* send packet with page buffer */
+ packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
+ packet->page_buffers[0].offset = txd->rndis_msg_paddr & PAGE_MASK;
+ packet->page_buffers[0].length = rndis_msg_size;
- packet->send_buf_section_idx =
- NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
- packet->send_buf_section_size = 0;
+ /*
+ * Fill the page buffers with mbuf info starting at index
+ * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
+ */
+ for (i = 0; i < nsegs; ++i) {
+ hv_vmbus_page_buffer *pb = &packet->page_buffers[
+ i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];
-do_send:
- txd->m = m_head;
+ pb->pfn = atop(segs[i].ds_addr);
+ pb->offset = segs[i].ds_addr & PAGE_MASK;
+ pb->length = segs[i].ds_len;
+ }
+
+ packet->send_buf_section_idx =
+ NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
+ packet->send_buf_section_size = 0;
+done:
+ txd->m = m_head;
+
+ /* Set the completion routine */
+ packet->compl.send.on_send_completion = hn_tx_done;
+ packet->compl.send.send_completion_context = packet;
+ packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd;
+
+ return 0;
+}
- /* Set the completion routine */
- packet->compl.send.on_send_completion = netvsc_xmit_completion;
- packet->compl.send.send_completion_context = packet;
- packet->compl.send.send_completion_tid =
- (uint64_t)(uintptr_t)txd;
+/*
+ * NOTE:
+ * If this function fails, then txd will be freed, but the mbuf
+ * associated w/ the txd will _not_ be freed.
+ */
+static int
+hn_send_pkt(struct ifnet *ifp, struct hv_device *device_ctx,
+ struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+ int error, send_failed = 0;
again:
+ /*
+ * Make sure that txd is not freed before ETHER_BPF_MTAP.
+ */
+ hn_txdesc_hold(txd);
+ error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt);
+ if (!error) {
+ ETHER_BPF_MTAP(ifp, txd->m);
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ }
+ hn_txdesc_put(txr, txd);
+
+ if (__predict_false(error)) {
+ int freed;
+
/*
- * Make sure that txd is not freed before ETHER_BPF_MTAP.
+ * This should "really rarely" happen.
+ *
+ * XXX Too many RX to be acked or too many sideband
+ * commands to run? Ask netvsc_channel_rollup()
+ * to kick start later.
*/
- hn_txdesc_hold(txd);
- error = hv_nv_on_send(device_ctx, packet);
- if (!error) {
- ETHER_BPF_MTAP(ifp, m_head);
- if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ txr->hn_has_txeof = 1;
+ if (!send_failed) {
+ txr->hn_send_failed++;
+ send_failed = 1;
+ /*
+ * Try sending again after set hn_has_txeof;
+ * in case that we missed the last
+ * netvsc_channel_rollup().
+ */
+ goto again;
}
- hn_txdesc_put(sc, txd);
+ if_printf(ifp, "send failed\n");
- if (__predict_false(error)) {
- int freed;
+ /*
+ * Caller will perform further processing on the
+ * associated mbuf, so don't free it in hn_txdesc_put();
+ * only unload it from the DMA map in hn_txdesc_put(),
+ * if it was loaded.
+ */
+ txd->m = NULL;
+ freed = hn_txdesc_put(txr, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon send error"));
- /*
- * This should "really rarely" happen.
- *
- * XXX Too many RX to be acked or too many sideband
- * commands to run? Ask netvsc_channel_rollup()
- * to kick start later.
- */
- sc->hn_txeof = 1;
- if (!send_failed) {
- sc->hn_send_failed++;
- send_failed = 1;
- /*
- * Try sending again after set hn_txeof;
- * in case that we missed the last
- * netvsc_channel_rollup().
- */
- goto again;
- }
- if_printf(ifp, "send failed\n");
+ txr->hn_send_failed++;
+ }
+ return error;
+}
+
+/*
+ * Start a transmit of one or more packets
+ */
+static int
+hn_start_locked(struct hn_tx_ring *txr, int len)
+{
+ struct hn_softc *sc = txr->hn_sc;
+ struct ifnet *ifp = sc->hn_ifp;
+ struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
+
+ KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+ mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING)
+ return 0;
+
+ while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ struct hn_txdesc *txd;
+ struct mbuf *m_head;
+ int error;
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+ if (m_head == NULL)
+ break;
+ if (len > 0 && m_head->m_pkthdr.len > len) {
/*
- * This mbuf will be prepended, don't free it
- * in hn_txdesc_put(); only unload it from the
- * DMA map in hn_txdesc_put(), if it was loaded.
+ * This sending could be time consuming; let callers
+ * dispatch this packet sending (and sending of any
+ * following up packets) to tx taskqueue.
*/
- txd->m = NULL;
- freed = hn_txdesc_put(sc, txd);
- KASSERT(freed != 0,
- ("fail to free txd upon send error"));
-
- sc->hn_send_failed++;
- IF_PREPEND(&ifp->if_snd, m_head);
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ return 1;
+ }
+
+ txd = hn_txdesc_get(txr);
+ if (txd == NULL) {
+ txr->hn_no_txdescs++;
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ break;
+ }
+
+ error = hn_encap(txr, txd, &m_head);
+ if (error) {
+ /* Both txd and m_head are freed */
+ continue;
+ }
+
+ error = hn_send_pkt(ifp, device_ctx, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m_head is not */
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
break;
}
}
+ return 0;
}
/*
@@ -1162,11 +1136,11 @@ int
netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
rndis_tcp_ip_csum_info *csum_info)
{
- hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
+ struct hn_softc *sc = device_get_softc(device_ctx->device);
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
struct mbuf *m_new;
struct ifnet *ifp;
- device_t dev = device_ctx->device;
- int size, do_lro = 0;
+ int size, do_lro = 0, do_csum = 1;
if (sc == NULL) {
return (0); /* TODO: KYS how can this be! */
@@ -1192,7 +1166,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
memcpy(mtod(m_new, void *), packet->data,
packet->tot_data_buf_len);
m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
- sc->hn_small_pkts++;
+ rxr->hn_small_pkts++;
} else {
/*
* Get an mbuf with a cluster. For packets 2K or less,
@@ -1208,7 +1182,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
if (m_new == NULL) {
- device_printf(dev, "alloc mbuf failed.\n");
+ if_printf(ifp, "alloc mbuf failed.\n");
return (0);
}
@@ -1216,21 +1190,28 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
}
m_new->m_pkthdr.rcvif = ifp;
+ if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
+ do_csum = 0;
+
/* receive side checksum offload */
- if (NULL != csum_info) {
+ if (csum_info != NULL) {
/* IP csum offload */
- if (csum_info->receive.ip_csum_succeeded) {
+ if (csum_info->receive.ip_csum_succeeded && do_csum) {
m_new->m_pkthdr.csum_flags |=
(CSUM_IP_CHECKED | CSUM_IP_VALID);
- sc->hn_csum_ip++;
+ rxr->hn_csum_ip++;
}
- /* TCP csum offload */
- if (csum_info->receive.tcp_csum_succeeded) {
+ /* TCP/UDP csum offload */
+ if ((csum_info->receive.tcp_csum_succeeded ||
+ csum_info->receive.udp_csum_succeeded) && do_csum) {
m_new->m_pkthdr.csum_flags |=
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
m_new->m_pkthdr.csum_data = 0xffff;
- sc->hn_csum_tcp++;
+ if (csum_info->receive.tcp_csum_succeeded)
+ rxr->hn_csum_tcp++;
+ else
+ rxr->hn_csum_udp++;
}
if (csum_info->receive.ip_csum_succeeded &&
@@ -1261,8 +1242,10 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
pr = hn_check_iplen(m_new, hoff);
if (pr == IPPROTO_TCP) {
- if (sc->hn_trust_hosttcp) {
- sc->hn_csum_trusted++;
+ if (do_csum &&
+ (rxr->hn_trust_hcsum &
+ HN_TRUST_HCSUM_TCP)) {
+ rxr->hn_csum_trusted++;
m_new->m_pkthdr.csum_flags |=
(CSUM_IP_CHECKED | CSUM_IP_VALID |
CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
@@ -1270,6 +1253,21 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
}
/* Rely on SW csum verification though... */
do_lro = 1;
+ } else if (pr == IPPROTO_UDP) {
+ if (do_csum &&
+ (rxr->hn_trust_hcsum &
+ HN_TRUST_HCSUM_UDP)) {
+ rxr->hn_csum_trusted++;
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m_new->m_pkthdr.csum_data = 0xffff;
+ }
+ } else if (pr != IPPROTO_DONE && do_csum &&
+ (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
+ rxr->hn_csum_trusted++;
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID);
}
}
}
@@ -1289,10 +1287,10 @@ skip:
if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
- struct lro_ctrl *lro = &sc->hn_lro;
+ struct lro_ctrl *lro = &rxr->hn_lro;
if (lro->lro_cnt) {
- sc->hn_lro_tried++;
+ rxr->hn_lro_tried++;
if (tcp_lro_rx(lro, m_new, 0) == 0) {
/* DONE! */
return 0;
@@ -1308,18 +1306,8 @@ skip:
}
void
-netvsc_recv_rollup(struct hv_device *device_ctx)
+netvsc_recv_rollup(struct hv_device *device_ctx __unused)
{
-#if defined(INET) || defined(INET6)
- hn_softc_t *sc = device_get_softc(device_ctx->device);
- struct lro_ctrl *lro = &sc->hn_lro;
- struct lro_entry *queued;
-
- while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
- SLIST_REMOVE_HEAD(&lro->lro_active, next);
- tcp_lro_flush(lro, queued);
- }
-#endif
}
/*
@@ -1377,12 +1365,23 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
/* Obtain and record requested MTU */
ifp->if_mtu = ifr->ifr_mtu;
+
+#if __FreeBSD_version >= 1100099
/*
- * Make sure that LRO high watermark is still valid,
- * after MTU change (the 2*MTU limit).
+ * Make sure that LRO aggregation length limit is still
+ * valid, after the MTU change.
*/
- if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
- hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));
+ NV_LOCK(sc);
+ if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
+ HN_LRO_LENLIM_MIN(ifp)) {
+ int i;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ sc->hn_rx_ring[i].hn_lro.lro_length_lim =
+ HN_LRO_LENLIM_MIN(ifp);
+ }
+ }
+ NV_UNLOCK(sc);
+#endif
do {
NV_LOCK(sc);
@@ -1422,8 +1421,10 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
}
sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
- if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max)
- sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
+ if (sc->hn_tx_ring[0].hn_tx_chimney_size >
+ sc->hn_tx_chimney_max)
+ hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max);
+
hn_ifinit_locked(sc);
NV_LOCK(sc);
@@ -1483,47 +1484,43 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = 0;
break;
case SIOCSIFCAP:
+ NV_LOCK(sc);
+
mask = ifr->ifr_reqcap ^ ifp->if_capenable;
if (mask & IFCAP_TXCSUM) {
- if (IFCAP_TXCSUM & ifp->if_capenable) {
- ifp->if_capenable &= ~IFCAP_TXCSUM;
- ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
+ ifp->if_capenable ^= IFCAP_TXCSUM;
+ if (ifp->if_capenable & IFCAP_TXCSUM) {
+ ifp->if_hwassist |=
+ sc->hn_tx_ring[0].hn_csum_assist;
} else {
- ifp->if_capenable |= IFCAP_TXCSUM;
- /*
- * Only enable UDP checksum offloading on
- * Windows Server 2012R2 or later releases.
- */
- if (hv_vmbus_protocal_version >=
- HV_VMBUS_VERSION_WIN8_1) {
- ifp->if_hwassist |=
- (CSUM_TCP | CSUM_UDP);
- } else {
- ifp->if_hwassist |= CSUM_TCP;
- }
+ ifp->if_hwassist &=
+ ~sc->hn_tx_ring[0].hn_csum_assist;
}
}
- if (mask & IFCAP_RXCSUM) {
- if (IFCAP_RXCSUM & ifp->if_capenable) {
- ifp->if_capenable &= ~IFCAP_RXCSUM;
- } else {
- ifp->if_capenable |= IFCAP_RXCSUM;
- }
- }
+ if (mask & IFCAP_RXCSUM)
+ ifp->if_capenable ^= IFCAP_RXCSUM;
+
if (mask & IFCAP_LRO)
ifp->if_capenable ^= IFCAP_LRO;
if (mask & IFCAP_TSO4) {
ifp->if_capenable ^= IFCAP_TSO4;
- ifp->if_hwassist ^= CSUM_IP_TSO;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_IP_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_IP_TSO;
}
if (mask & IFCAP_TSO6) {
ifp->if_capenable ^= IFCAP_TSO6;
- ifp->if_hwassist ^= CSUM_IP6_TSO;
+ if (ifp->if_capenable & IFCAP_TSO6)
+ ifp->if_hwassist |= CSUM_IP6_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_IP6_TSO;
}
+ NV_UNLOCK(sc);
error = 0;
break;
case SIOCADDMULTI:
@@ -1566,7 +1563,8 @@ hn_stop(hn_softc_t *sc)
if (bootverbose)
printf(" Closing Device ...\n");
- ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+ atomic_clear_int(&ifp->if_drv_flags,
+ (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
if_link_state_change(ifp, LINK_STATE_DOWN);
sc->hn_initdone = 0;
@@ -1579,16 +1577,56 @@ hn_stop(hn_softc_t *sc)
static void
hn_start(struct ifnet *ifp)
{
- hn_softc_t *sc;
+ struct hn_softc *sc = ifp->if_softc;
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
- sc = ifp->if_softc;
- NV_LOCK(sc);
- if (sc->temp_unusable) {
- NV_UNLOCK(sc);
- return;
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ sched = hn_start_locked(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (!sched)
+ return;
+ }
+do_sched:
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+}
+
+static void
+hn_start_txeof(struct hn_tx_ring *txr)
+{
+ struct hn_softc *sc = txr->hn_sc;
+ struct ifnet *ifp = sc->hn_ifp;
+
+ KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ sched = hn_start_locked(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (sched) {
+ taskqueue_enqueue(txr->hn_tx_taskq,
+ &txr->hn_tx_task);
+ }
+ } else {
+do_sched:
+ /*
+ * Release the OACTIVE earlier, with the hope, that
+		 * Release OACTIVE earlier, in the hope that others
+		 * could catch up.  The task will clear the flag again,
+		 * with the hn_tx_lock held, to avoid possible races.
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
}
- hn_start_locked(ifp);
- NV_UNLOCK(sc);
}
/*
@@ -1615,8 +1653,8 @@ hn_ifinit_locked(hn_softc_t *sc)
} else {
sc->hn_initdone = 1;
}
- ifp->if_drv_flags |= IFF_DRV_RUNNING;
- ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
if_link_state_change(ifp, LINK_STATE_UP);
}
@@ -1659,26 +1697,90 @@ hn_watchdog(struct ifnet *ifp)
}
#endif
-#ifdef HN_LRO_HIWAT
+#if __FreeBSD_version >= 1100099
+
+static int
+hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ unsigned int lenlim;
+ int error, i;
+
+ lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
+ error = sysctl_handle_int(oidp, &lenlim, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
+ lenlim > TCP_LRO_LENGTH_MAX)
+ return EINVAL;
+
+ NV_LOCK(sc);
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
+ NV_UNLOCK(sc);
+ return 0;
+}
+
static int
-hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
+hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
{
struct hn_softc *sc = arg1;
- int hiwat, error;
+ int ackcnt, error, i;
- hiwat = sc->hn_lro_hiwat;
- error = sysctl_handle_int(oidp, &hiwat, 0, req);
+ /*
+	 * lro_ackcnt_lim is the append count limit;
+	 * +1 turns it into the aggregation limit.
+ */
+ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
+ error = sysctl_handle_int(oidp, &ackcnt, 0, req);
if (error || req->newptr == NULL)
return error;
- if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))
+ if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
return EINVAL;
- if (sc->hn_lro_hiwat != hiwat)
- hn_set_lro_hiwat(sc, hiwat);
+ /*
+ * Convert aggregation limit back to append
+ * count limit.
+ */
+ --ackcnt;
+ NV_LOCK(sc);
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
+ NV_UNLOCK(sc);
+ return 0;
+}
+
+#endif
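
The two handlers above back the per-device LRO tuning knobs registered later in this patch. A hedged userland sketch of driving them with sysctlbyname(3) follows; the dev.hn.0 OID prefix and the values are illustrative assumptions, not part of this commit:

#include <sys/types.h>
#include <sys/sysctl.h>

/* Hedged sketch only: assumes unit 0; values are arbitrary examples. */
static int
tune_hn_lro(void)
{
	unsigned int lenlim = 65535;	/* max bytes aggregated per LRO entry */
	int ackcnt = 8;			/* max ACKs aggregated per LRO entry */

	if (sysctlbyname("dev.hn.0.lro_length_lim", NULL, NULL,
	    &lenlim, sizeof(lenlim)) == -1)
		return (-1);
	return (sysctlbyname("dev.hn.0.lro_ackcnt_lim", NULL, NULL,
	    &ackcnt, sizeof(ackcnt)));
}
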
+
+static int
+hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int hcsum = arg2;
+ int on, error, i;
+
+ on = 0;
+ if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
+ on = 1;
+
+ error = sysctl_handle_int(oidp, &on, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ NV_LOCK(sc);
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ if (on)
+ rxr->hn_trust_hcsum |= hcsum;
+ else
+ rxr->hn_trust_hcsum &= ~hcsum;
+ }
+ NV_UNLOCK(sc);
return 0;
}
-#endif /* HN_LRO_HIWAT */
static int
hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
@@ -1686,7 +1788,7 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
struct hn_softc *sc = arg1;
int chimney_size, error;
- chimney_size = sc->hn_tx_chimney_size;
+ chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size;
error = sysctl_handle_int(oidp, &chimney_size, 0, req);
if (error || req->newptr == NULL)
return error;
@@ -1694,8 +1796,138 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
return EINVAL;
- if (sc->hn_tx_chimney_size != chimney_size)
- sc->hn_tx_chimney_size = chimney_size;
+ hn_set_tx_chimney_size(sc, chimney_size);
+ return 0;
+}
+
+#if __FreeBSD_version < 1100095
+static int
+hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ uint64_t stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((int *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_64(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((int *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+#else
+static int
+hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ uint64_t stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((uint64_t *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_64(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+
+#endif
+
+static int
+hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ u_long stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((u_long *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_long(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((u_long *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+
+static int
+hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_tx_ring *txr;
+ u_long stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ stat += *((u_long *)((uint8_t *)txr + ofs));
+ }
+
+ error = sysctl_handle_long(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ *((u_long *)((uint8_t *)txr + ofs)) = 0;
+ }
+ return 0;
+}
+
+static int
+hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error, conf;
+ struct hn_tx_ring *txr;
+
+ txr = &sc->hn_tx_ring[0];
+ conf = *((int *)((uint8_t *)txr + ofs));
+
+ error = sysctl_handle_int(oidp, &conf, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ NV_LOCK(sc);
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ *((int *)((uint8_t *)txr + ofs)) = conf;
+ }
+ NV_UNLOCK(sc);
+
return 0;
}
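
The stat handlers above share one behaviour worth noting: on read they sum the per-ring counter named by arg2, and they zero it whenever any new value is written. A hedged userland sketch of a read-and-reset in a single call (the OID name assumes unit 0):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>

/* Read the aggregated counter and trigger the "zero on write" path at once. */
static int
read_and_reset_lro_queued(uint64_t *totalp)
{
	size_t len = sizeof(*totalp);
	uint64_t zero = 0;

	return (sysctlbyname("dev.hn.0.lro_queued", totalp, &len,
	    &zero, sizeof(zero)));
}
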
@@ -1786,17 +2018,191 @@ hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
*paddr = segs->ds_addr;
}
+static void
+hn_create_rx_data(struct hn_softc *sc)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ device_t dev = sc->hn_dev;
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ int lroent_cnt;
+#endif
+#endif
+ int i;
+
+ sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */
+ sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
+ M_NETVSC, M_WAITOK | M_ZERO);
+
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ lroent_cnt = hn_lro_entry_count;
+ if (lroent_cnt < TCP_LRO_ENTRIES)
+ lroent_cnt = TCP_LRO_ENTRIES;
+ device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
+#endif
+#endif /* INET || INET6 */
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ if (hn_trust_hosttcp)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
+ if (hn_trust_hostudp)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
+ if (hn_trust_hostip)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
+
+ /*
+ * Initialize LRO.
+ */
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 0);
+#else
+ tcp_lro_init(&rxr->hn_lro);
+ rxr->hn_lro.ifp = sc->hn_ifp;
+#endif
+#if __FreeBSD_version >= 1100099
+ rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
+ rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
+#endif
+#endif /* INET || INET6 */
+ }
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
+ CTLTYPE_U64 | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
+#if __FreeBSD_version < 1100095
+ hn_rx_stat_int_sysctl,
+#else
+ hn_rx_stat_u64_sysctl,
+#endif
+ "LU", "LRO queued");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
+ CTLTYPE_U64 | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
+#if __FreeBSD_version < 1100095
+ hn_rx_stat_int_sysctl,
+#else
+ hn_rx_stat_u64_sysctl,
+#endif
+ "LU", "LRO flushed");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_lro_tried),
+ hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
+#if __FreeBSD_version >= 1100099
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
+ CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU",
+ "Max # of data bytes to be aggregated by LRO");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
+ CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I",
+ "Max # of ACKs to be aggregated by LRO");
+#endif
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
+ CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP,
+ hn_trust_hcsum_sysctl, "I",
+	    "Trust tcp segment verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
+ CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP,
+ hn_trust_hcsum_sysctl, "I",
+ "Trust udp datagram verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
+ CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP,
+ hn_trust_hcsum_sysctl, "I",
+ "Trust ip packet verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_ip),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_tcp),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_udp),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_trusted),
+ hn_rx_stat_ulong_sysctl, "LU",
+ "# of packets that we trust host's csum verification");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_rx_ring, hn_small_pkts),
+ hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
+}
+
+static void
+hn_destroy_rx_data(struct hn_softc *sc)
+{
+#if defined(INET) || defined(INET6)
+ int i;
+#endif
+
+ if (sc->hn_rx_ring_cnt == 0)
+ return;
+
+#if defined(INET) || defined(INET6)
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ tcp_lro_free(&sc->hn_rx_ring[i].hn_lro);
+#endif
+ free(sc->hn_rx_ring, M_NETVSC);
+ sc->hn_rx_ring = NULL;
+
+ sc->hn_rx_ring_cnt = 0;
+}
+
static int
-hn_create_tx_ring(struct hn_softc *sc)
+hn_create_tx_ring(struct hn_softc *sc, int id)
{
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
bus_dma_tag_t parent_dtag;
int error, i;
- sc->hn_txdesc_cnt = HN_TX_DESC_CNT;
- sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt,
+ txr->hn_sc = sc;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
+#endif
+ mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
+
+ txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
+ txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
M_NETVSC, M_WAITOK | M_ZERO);
- SLIST_INIT(&sc->hn_txlist);
- mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
+#ifndef HN_USE_TXDESC_BUFRING
+ SLIST_INIT(&txr->hn_txlist);
+#else
+ txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
+ M_WAITOK, &txr->hn_tx_lock);
+#endif
+
+ txr->hn_tx_taskq = sc->hn_tx_taskq;
+ TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
+ TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
+
+ txr->hn_direct_tx_size = hn_direct_tx_size;
+ if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
+ txr->hn_csum_assist = HN_CSUM_ASSIST;
+ else
+ txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8;
+
+ /*
+ * Always schedule transmission instead of trying to do direct
+ * transmission. This one gives the best performance so far.
+ */
+ txr->hn_sched_tx = 1;
+
+ txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */
parent_dtag = bus_get_dma_tag(sc->hn_dev);
@@ -1813,7 +2219,7 @@ hn_create_tx_ring(struct hn_softc *sc)
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
- &sc->hn_tx_rndis_dtag);
+ &txr->hn_tx_rndis_dtag);
if (error) {
device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
return error;
@@ -1832,21 +2238,21 @@ hn_create_tx_ring(struct hn_softc *sc)
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
- &sc->hn_tx_data_dtag);
+ &txr->hn_tx_data_dtag);
if (error) {
device_printf(sc->hn_dev, "failed to create data dmatag\n");
return error;
}
- for (i = 0; i < sc->hn_txdesc_cnt; ++i) {
- struct hn_txdesc *txd = &sc->hn_txdesc[i];
+ for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
+ struct hn_txdesc *txd = &txr->hn_txdesc[i];
- txd->sc = sc;
+ txd->txr = txr;
/*
* Allocate and load RNDIS messages.
*/
- error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag,
+ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
(void **)&txd->rndis_msg,
BUS_DMA_WAITOK | BUS_DMA_COHERENT,
&txd->rndis_msg_dmap);
@@ -1856,7 +2262,7 @@ hn_create_tx_ring(struct hn_softc *sc)
return error;
}
- error = bus_dmamap_load(sc->hn_tx_rndis_dtag,
+ error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
txd->rndis_msg_dmap,
txd->rndis_msg, HN_RNDIS_MSG_LEN,
hn_dma_map_paddr, &txd->rndis_msg_paddr,
@@ -1864,59 +2270,277 @@ hn_create_tx_ring(struct hn_softc *sc)
if (error) {
device_printf(sc->hn_dev,
"failed to load rndis_msg, %d\n", i);
- bus_dmamem_free(sc->hn_tx_rndis_dtag,
+ bus_dmamem_free(txr->hn_tx_rndis_dtag,
txd->rndis_msg, txd->rndis_msg_dmap);
return error;
}
/* DMA map for TX data. */
- error = bus_dmamap_create(sc->hn_tx_data_dtag, 0,
+ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
&txd->data_dmap);
if (error) {
device_printf(sc->hn_dev,
"failed to allocate tx data dmamap\n");
- bus_dmamap_unload(sc->hn_tx_rndis_dtag,
+ bus_dmamap_unload(txr->hn_tx_rndis_dtag,
txd->rndis_msg_dmap);
- bus_dmamem_free(sc->hn_tx_rndis_dtag,
+ bus_dmamem_free(txr->hn_tx_rndis_dtag,
txd->rndis_msg, txd->rndis_msg_dmap);
return error;
}
/* All set, put it to list */
txd->flags |= HN_TXD_FLAG_ONLIST;
- SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
+#ifndef HN_USE_TXDESC_BUFRING
+ SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+#else
+ buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif
+ }
+ txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
+
+ if (sc->hn_tx_sysctl_tree != NULL) {
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ char name[16];
+
+ /*
+ * Create per TX ring sysctl tree:
+ * dev.hn.UNIT.tx.RINGID
+ */
+ ctx = device_get_sysctl_ctx(sc->hn_dev);
+ child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
+
+ snprintf(name, sizeof(name), "%d", id);
+ txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
+ name, CTLFLAG_RD, 0, "");
+
+ if (txr->hn_tx_sysctl_tree != NULL) {
+ child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
+
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
+ CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
+ "# of available TX descs");
+ }
}
- sc->hn_txdesc_avail = sc->hn_txdesc_cnt;
return 0;
}
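
When HN_USE_TXDESC_BUFRING is defined, the free-descriptor pool populated above is a lock-free buf_ring(9) rather than the spinlock-protected SLIST. The in-tree hn_txdesc_get()/hn_txdesc_put() live earlier in this file and also handle refcounts and the ONLIST flag; this is only a hedged sketch of the core get/put shape, reusing the driver's own structures:

static __inline struct hn_txdesc *
txd_get_sketch(struct hn_tx_ring *txr)
{
	/* Single-consumer dequeue from the per-ring free descriptor ring. */
	return (buf_ring_dequeue_sc(txr->hn_txdesc_br));
}

static __inline void
txd_put_sketch(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	/* Multi-producer enqueue back onto the free ring. */
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
}
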
static void
-hn_destroy_tx_ring(struct hn_softc *sc)
+hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
+{
+ struct hn_tx_ring *txr = txd->txr;
+
+ KASSERT(txd->m == NULL, ("still has mbuf installed"));
+ KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
+
+ bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap);
+ bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg,
+ txd->rndis_msg_dmap);
+ bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
+}
+
+static void
+hn_destroy_tx_ring(struct hn_tx_ring *txr)
{
struct hn_txdesc *txd;
- while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) {
- KASSERT(txd->m == NULL, ("still has mbuf installed"));
- KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
- ("still dma mapped"));
- SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
+ if (txr->hn_txdesc == NULL)
+ return;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
+ SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
+ hn_txdesc_dmamap_destroy(txd);
+ }
+#else
+ while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
+ hn_txdesc_dmamap_destroy(txd);
+#endif
+
+ if (txr->hn_tx_data_dtag != NULL)
+ bus_dma_tag_destroy(txr->hn_tx_data_dtag);
+ if (txr->hn_tx_rndis_dtag != NULL)
+ bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
+
+#ifdef HN_USE_TXDESC_BUFRING
+ buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
+#endif
+
+ free(txr->hn_txdesc, M_NETVSC);
+ txr->hn_txdesc = NULL;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_destroy(&txr->hn_txlist_spin);
+#endif
+ mtx_destroy(&txr->hn_tx_lock);
+}
+
+static int
+hn_create_tx_data(struct hn_softc *sc)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ int i;
+
+ sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */
+ sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
+ M_NETVSC, M_WAITOK | M_ZERO);
+
+ ctx = device_get_sysctl_ctx(sc->hn_dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
+
+ /* Create dev.hn.UNIT.tx sysctl tree */
+ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
+ CTLFLAG_RD, 0, "");
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ int error;
+
+ error = hn_create_tx_ring(sc, i);
+ if (error)
+ return error;
+ }
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_no_txdescs),
+ hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_send_failed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_txdma_failed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_tx_collapsed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
+ CTLTYPE_ULONG | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_tx_chimney),
+ hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
+ CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
+ "# of total TX descs");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
+ CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
+ "Chimney send packet size upper boundary");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
+ CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
+ "I", "Chimney send packet size limit");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
+ CTLTYPE_INT | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_direct_tx_size),
+ hn_tx_conf_int_sysctl, "I",
+ "Size of the packet for direct transmission");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
+ CTLTYPE_INT | CTLFLAG_RW, sc,
+ __offsetof(struct hn_tx_ring, hn_sched_tx),
+ hn_tx_conf_int_sysctl, "I",
+ "Always schedule transmission "
+ "instead of doing direct transmission");
+
+ return 0;
+}
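
Together with hn_create_tx_ring() above, the sysctl tree this builds (assuming unit 0; names taken verbatim from the SYSCTL_ADD_* calls) looks roughly like:

dev.hn.0.no_txdescs
dev.hn.0.send_failed
dev.hn.0.txdma_failed
dev.hn.0.tx_collapsed
dev.hn.0.tx_chimney
dev.hn.0.txdesc_cnt
dev.hn.0.tx_chimney_max
dev.hn.0.tx_chimney_size
dev.hn.0.direct_tx_size
dev.hn.0.sched_tx
dev.hn.0.tx.0.txdesc_avail
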
+
+static void
+hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size)
+{
+ int i;
- bus_dmamap_unload(sc->hn_tx_rndis_dtag,
- txd->rndis_msg_dmap);
- bus_dmamem_free(sc->hn_tx_rndis_dtag,
- txd->rndis_msg, txd->rndis_msg_dmap);
+ NV_LOCK(sc);
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size;
+ NV_UNLOCK(sc);
+}
+
+static void
+hn_destroy_tx_data(struct hn_softc *sc)
+{
+ int i;
+
+ if (sc->hn_tx_ring_cnt == 0)
+ return;
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ hn_destroy_tx_ring(&sc->hn_tx_ring[i]);
+
+ free(sc->hn_tx_ring, M_NETVSC);
+ sc->hn_tx_ring = NULL;
+
+ sc->hn_tx_ring_cnt = 0;
+}
+
+static void
+hn_start_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ hn_start_locked(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ hn_start_locked(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_stop_tx_tasks(struct hn_softc *sc)
+{
+ int i;
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+ taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
+ taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
+ }
+}
- bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap);
+static void
+hn_tx_taskq_create(void *arg __unused)
+{
+ if (!hn_share_tx_taskq)
+ return;
+
+ hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
+ taskqueue_thread_enqueue, &hn_tx_taskq);
+ taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
+ if (hn_bind_tx_taskq >= 0) {
+ int cpu = hn_bind_tx_taskq;
+ struct task cpuset_task;
+ cpuset_t cpu_set;
+
+ if (cpu > mp_ncpus - 1)
+ cpu = mp_ncpus - 1;
+ CPU_SETOF(cpu, &cpu_set);
+ TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
+ taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
+ taskqueue_drain(hn_tx_taskq, &cpuset_task);
}
+}
+SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
+ hn_tx_taskq_create, NULL);
- if (sc->hn_tx_data_dtag != NULL)
- bus_dma_tag_destroy(sc->hn_tx_data_dtag);
- if (sc->hn_tx_rndis_dtag != NULL)
- bus_dma_tag_destroy(sc->hn_tx_rndis_dtag);
- free(sc->hn_txdesc, M_NETVSC);
- mtx_destroy(&sc->hn_txlist_spin);
+static void
+hn_tx_taskq_destroy(void *arg __unused)
+{
+ if (hn_tx_taskq != NULL)
+ taskqueue_free(hn_tx_taskq);
}
+SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
+ hn_tx_taskq_destroy, NULL);
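
The hn_share_tx_taskq and hn_bind_tx_taskq knobs consumed by hn_tx_taskq_create() are assumed to be loader tunables defined earlier in this file (not shown in this hunk); a typical declaration pattern, with the tunable names as illustrative guesses only:

static int hn_share_tx_taskq = 0;
TUNABLE_INT("hw.hn.share_tx_taskq", &hn_share_tx_taskq);	/* hypothetical OID */

static int hn_bind_tx_taskq = -1;
TUNABLE_INT("hw.hn.bind_tx_taskq", &hn_bind_tx_taskq);		/* hypothetical OID */
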
static device_method_t netvsc_methods[] = {
/* Device interface */
diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
index 29d8c8f..31ddbc0 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c
+++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
@@ -136,12 +136,9 @@ hv_get_rndis_device(void)
{
rndis_device *device;
- device = malloc(sizeof(rndis_device), M_NETVSC, M_NOWAIT | M_ZERO);
- if (device == NULL) {
- return (NULL);
- }
+ device = malloc(sizeof(rndis_device), M_NETVSC, M_WAITOK | M_ZERO);
- mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_DEF);
/* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */
STAILQ_INIT(&device->myrequest_list);
@@ -172,10 +169,7 @@ hv_rndis_request(rndis_device *device, uint32_t message_type,
rndis_msg *rndis_mesg;
rndis_set_request *set;
- request = malloc(sizeof(rndis_request), M_NETVSC, M_NOWAIT | M_ZERO);
- if (request == NULL) {
- return (NULL);
- }
+ request = malloc(sizeof(rndis_request), M_NETVSC, M_WAITOK | M_ZERO);
sema_init(&request->wait_sema, 0, "rndis sema");
@@ -194,9 +188,9 @@ hv_rndis_request(rndis_device *device, uint32_t message_type,
set->request_id += 1;
/* Add to the request list */
- mtx_lock_spin(&device->req_lock);
+ mtx_lock(&device->req_lock);
STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry);
- mtx_unlock_spin(&device->req_lock);
+ mtx_unlock(&device->req_lock);
return (request);
}
@@ -207,14 +201,14 @@ hv_rndis_request(rndis_device *device, uint32_t message_type,
static inline void
hv_put_rndis_request(rndis_device *device, rndis_request *request)
{
- mtx_lock_spin(&device->req_lock);
+ mtx_lock(&device->req_lock);
/* Fixme: Has O(n) performance */
/*
* XXXKYS: Use Doubly linked lists.
*/
STAILQ_REMOVE(&device->myrequest_list, request, rndis_request_,
mylist_entry);
- mtx_unlock_spin(&device->req_lock);
+ mtx_unlock(&device->req_lock);
sema_destroy(&request->wait_sema);
free(request, M_NETVSC);
@@ -271,7 +265,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response)
rndis_request *next_request;
boolean_t found = FALSE;
- mtx_lock_spin(&device->req_lock);
+ mtx_lock(&device->req_lock);
request = STAILQ_FIRST(&device->myrequest_list);
while (request != NULL) {
/*
@@ -286,7 +280,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response)
next_request = STAILQ_NEXT(request, mylist_entry);
request = next_request;
}
- mtx_unlock_spin(&device->req_lock);
+ mtx_unlock(&device->req_lock);
if (found) {
if (response->msg_len <= sizeof(rndis_msg)) {
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
index a780f9e..27fb3fd 100644
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -856,8 +856,8 @@ hv_storvsc_rescan_target(struct storvsc_softc *sc)
if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid,
CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
- printf("unable to create path for rescan, pathid: %d,"
- "targetid: %d\n", pathid, targetid);
+		printf("unable to create path for rescan, pathid: %u, "
+		    "targetid: %u\n", pathid, targetid);
xpt_free_ccb(ccb);
return;
}
@@ -1561,13 +1561,12 @@ static void
storvsc_destroy_bounce_buffer(struct sglist *sgl)
{
struct hv_sgl_node *sgl_node = NULL;
-
- sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
- LIST_REMOVE(sgl_node, link);
- if (NULL == sgl_node) {
+ if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) {
printf("storvsc error: not enough in use sgl\n");
return;
}
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
+ LIST_REMOVE(sgl_node, link);
sgl_node->sgl_data = sgl;
LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link);
}
@@ -1593,12 +1592,12 @@ storvsc_create_bounce_buffer(uint16_t seg_count, int write)
struct hv_sgl_node *sgl_node = NULL;
/* get struct sglist from free_sgl_list */
- sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
- LIST_REMOVE(sgl_node, link);
- if (NULL == sgl_node) {
+ if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
printf("storvsc error: not enough free sgl\n");
return NULL;
}
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
bounce_sgl = sgl_node->sgl_data;
LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link);
diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c
new file mode 100644
index 0000000..c1b6da5
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_heartbeat.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2014 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/timetc.h>
+#include <sys/syscallsubr.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include "hv_util.h"
+
+/* Heartbeat Service */
+static hv_guid service_guid = { .data =
+ {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e,
+ 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} };
+
+/**
+ * Process heartbeat message
+ */
+static void
+hv_heartbeat_cb(void *context)
+{
+ uint8_t* buf;
+ hv_vmbus_channel* channel;
+ uint32_t recvlen;
+ uint64_t requestid;
+ int ret;
+
+ struct hv_vmbus_heartbeat_msg_data* heartbeat_msg;
+ struct hv_vmbus_icmsg_hdr* icmsghdrp;
+ hv_util_sc *softc;
+
+ softc = (hv_util_sc*)context;
+	buf = softc->receive_buffer;
+ channel = softc->hv_dev->channel;
+
+ ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen,
+ &requestid);
+
+ if ((ret == 0) && recvlen > 0) {
+
+ icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
+ &buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+ if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+ hv_negotiate_version(icmsghdrp, NULL, buf);
+
+ } else {
+ heartbeat_msg =
+ (struct hv_vmbus_heartbeat_msg_data *)
+ &buf[sizeof(struct hv_vmbus_pipe_hdr) +
+ sizeof(struct hv_vmbus_icmsg_hdr)];
+
+ heartbeat_msg->seq_num += 1;
+ }
+
+ icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
+ HV_ICMSGHDRFLAG_RESPONSE;
+
+ hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid,
+ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
+ }
+}
+
+static int
+hv_heartbeat_probe(device_t dev)
+{
+ const char *p = vmbus_get_type(dev);
+ if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
+ device_set_desc(dev, "Hyper-V Heartbeat Service");
+ return BUS_PROBE_DEFAULT;
+ }
+
+ return ENXIO;
+}
+
+static int
+hv_heartbeat_attach(device_t dev)
+{
+ hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev);
+
+ softc->callback = hv_heartbeat_cb;
+
+ return hv_util_attach(dev);
+}
+
+static device_method_t heartbeat_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_heartbeat_probe),
+ DEVMETHOD(device_attach, hv_heartbeat_attach),
+ DEVMETHOD(device_detach, hv_util_detach),
+ { 0, 0 }
+};
+
+static driver_t heartbeat_driver = { "hvheartbeat", heartbeat_methods, sizeof(hv_util_sc)};
+
+static devclass_t heartbeat_devclass;
+
+DRIVER_MODULE(hv_heartbeat, vmbus, heartbeat_driver, heartbeat_devclass, NULL, NULL);
+MODULE_VERSION(hv_heartbeat, 1);
+MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1);
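
The heartbeat driver above is the template each hv_util sub-service in this patch follows: probe matches the service GUID, attach installs a channel callback, and hv_util_attach() performs the common channel and receive-buffer setup. A hedged sketch of what yet another service's attach would look like (names are hypothetical):

static int
hv_example_attach(device_t dev)
{
	hv_util_sc *softc = (hv_util_sc *)device_get_softc(dev);

	softc->callback = hv_example_cb;	/* per-service channel callback */
	return hv_util_attach(dev);		/* common vmbus plumbing */
}
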
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c
index 58d565c4..8517918 100644
--- a/sys/dev/hyperv/utilities/hv_kvp.c
+++ b/sys/dev/hyperv/utilities/hv_kvp.c
@@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/netvsc/hv_net_vsc.h>
+#include "hv_util.h"
#include "unicode.h"
#include "hv_kvp.h"
@@ -74,8 +75,6 @@ __FBSDID("$FreeBSD$");
/* hv_kvp debug control */
static int hv_kvp_log = 0;
-SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0,
- "hv_kvp log");
#define hv_kvp_log_error(...) do { \
if (hv_kvp_log > 0) \
@@ -87,6 +86,10 @@ SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0,
log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \
} while (0)
+static hv_guid service_guid = { .data =
+ {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d,
+ 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6} };
+
/* character device prototypes */
static d_open_t hv_kvp_dev_open;
static d_close_t hv_kvp_dev_close;
@@ -94,12 +97,6 @@ static d_read_t hv_kvp_dev_daemon_read;
static d_write_t hv_kvp_dev_daemon_write;
static d_poll_t hv_kvp_dev_daemon_poll;
-/* hv_kvp prototypes */
-static int hv_kvp_req_in_progress(void);
-static void hv_kvp_transaction_init(uint32_t, hv_vmbus_channel *, uint64_t, uint8_t *);
-static void hv_kvp_send_msg_to_daemon(void);
-static void hv_kvp_process_request(void *context);
-
/* hv_kvp character device structure */
static struct cdevsw hv_kvp_cdevsw =
{
@@ -111,70 +108,67 @@ static struct cdevsw hv_kvp_cdevsw =
.d_poll = hv_kvp_dev_daemon_poll,
.d_name = "hv_kvp_dev",
};
-static struct cdev *hv_kvp_dev;
-static struct hv_kvp_msg *hv_kvp_dev_buf;
-struct proc *daemon_task;
-static struct selinfo hv_kvp_selinfo;
/*
* Global state to track and synchronize multiple
* KVP transaction requests from the host.
*/
-static struct {
-
- /* Pre-allocated work item for queue */
- hv_work_item work_item;
+typedef struct hv_kvp_sc {
+ struct hv_util_sc util_sc;
- /* Unless specified the pending mutex should be
+ /* Unless specified the pending mutex should be
	 * used to alter the values of the following parameters:
* 1. req_in_progress
* 2. req_timed_out
- * 3. pending_reqs.
*/
- struct mtx pending_mutex;
-
+ struct mtx pending_mutex;
+
+ struct task task;
+
/* To track if transaction is active or not */
- boolean_t req_in_progress;
+ boolean_t req_in_progress;
/* Tracks if daemon did not reply back in time */
- boolean_t req_timed_out;
+ boolean_t req_timed_out;
/* Tracks if daemon is serving a request currently */
boolean_t daemon_busy;
- /* Count of KVP requests from Hyper-V. */
- uint64_t pending_reqs;
-
-
- /* Length of host message */
- uint32_t host_msg_len;
- /* Pointer to channel */
- hv_vmbus_channel *channelp;
+ /* Length of host message */
+ uint32_t host_msg_len;
/* Host message id */
- uint64_t host_msg_id;
-
+ uint64_t host_msg_id;
+
/* Current kvp message from the host */
- struct hv_kvp_msg *host_kvp_msg;
-
+ struct hv_kvp_msg *host_kvp_msg;
+
/* Current kvp message for daemon */
- struct hv_kvp_msg daemon_kvp_msg;
-
+ struct hv_kvp_msg daemon_kvp_msg;
+
/* Rcv buffer for communicating with the host*/
- uint8_t *rcv_buf;
-
+ uint8_t *rcv_buf;
+
/* Device semaphore to control communication */
- struct sema dev_sema;
-
+ struct sema dev_sema;
+
/* Indicates if daemon registered with driver */
- boolean_t register_done;
-
+ boolean_t register_done;
+
/* Character device status */
- boolean_t dev_accessed;
-} kvp_globals;
+ boolean_t dev_accessed;
+
+ struct cdev *hv_kvp_dev;
+
+ struct proc *daemon_task;
-/* global vars */
-MALLOC_DECLARE(M_HV_KVP_DEV_BUF);
-MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev module");
+ struct selinfo hv_kvp_selinfo;
+} hv_kvp_sc;
+
+/* hv_kvp prototypes */
+static int hv_kvp_req_in_progress(hv_kvp_sc *sc);
+static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *);
+static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc);
+static void hv_kvp_process_request(void *context, int pending);
/*
* hv_kvp low level functions
@@ -184,10 +178,10 @@ MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev modu
 * Check if kvp transaction is in progress
*/
static int
-hv_kvp_req_in_progress(void)
+hv_kvp_req_in_progress(hv_kvp_sc *sc)
{
- return (kvp_globals.req_in_progress);
+ return (sc->req_in_progress);
}
@@ -195,18 +189,17 @@ hv_kvp_req_in_progress(void)
* This routine is called whenever a message is received from the host
*/
static void
-hv_kvp_transaction_init(uint32_t rcv_len, hv_vmbus_channel *rcv_channel,
+hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len,
uint64_t request_id, uint8_t *rcv_buf)
{
-
+
/* Store all the relevant message details in the global structure */
/* Do not need to use mutex for req_in_progress here */
- kvp_globals.req_in_progress = true;
- kvp_globals.host_msg_len = rcv_len;
- kvp_globals.channelp = rcv_channel;
- kvp_globals.host_msg_id = request_id;
- kvp_globals.rcv_buf = rcv_buf;
- kvp_globals.host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[
+ sc->req_in_progress = true;
+ sc->host_msg_len = rcv_len;
+ sc->host_msg_id = request_id;
+ sc->rcv_buf = rcv_buf;
+ sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[
sizeof(struct hv_vmbus_pipe_hdr) +
sizeof(struct hv_vmbus_icmsg_hdr)];
}
@@ -258,12 +251,12 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp,
* Convert ip related info in umsg from utf8 to utf16 and store in hmsg
*/
static int
-hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg,
+hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg,
struct hv_kvp_ip_msg *host_ip_msg)
{
int err_ip, err_subnet, err_gway, err_dns, err_adap;
int UNUSED_FLAG = 1;
-
+
utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr,
MAX_IP_ADDR_SIZE,
(char *)umsg->body.kvp_ip_val.ip_addr,
@@ -294,7 +287,7 @@ hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg,
strlen((char *)umsg->body.kvp_ip_val.adapter_id),
UNUSED_FLAG,
&err_adap);
-
+
host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled;
host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family;
@@ -389,7 +382,7 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
MAX_IP_ADDR_SIZE,
UNUSED_FLAG,
&err_subnet);
-
+
utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE,
(uint16_t *)host_ip_msg->kvp_ip_val.gate_way,
MAX_GATEWAY_SIZE,
@@ -411,16 +404,13 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
* Ensure utf16_utf8 takes care of the additional string terminating char!!
*/
static void
-hv_kvp_convert_hostmsg_to_usermsg(void)
+hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg)
{
int utf_err = 0;
uint32_t value_type;
- struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)
- kvp_globals.host_kvp_msg;
-
- struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg;
- struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg;
+ struct hv_kvp_ip_msg *host_ip_msg;
+ host_ip_msg = (struct hv_kvp_ip_msg*)hmsg;
memset(umsg, 0, sizeof(struct hv_kvp_msg));
umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation;
@@ -525,14 +515,12 @@ hv_kvp_convert_hostmsg_to_usermsg(void)
* Prepare a host kvp msg based on user kvp msg (utf8 to utf16)
*/
static int
-hv_kvp_convert_usermsg_to_hostmsg(void)
+hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg)
{
int hkey_len = 0, hvalue_len = 0, utf_err = 0;
struct hv_kvp_exchg_msg_value *host_exchg_data;
char *key_name, *value;
- struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg;
- struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg;
struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg;
switch (hmsg->kvp_hdr.operation) {
@@ -564,7 +552,7 @@ hv_kvp_convert_usermsg_to_hostmsg(void)
if ((hkey_len < 0) || (hvalue_len < 0))
return (HV_KVP_E_FAIL);
-
+
return (KVP_SUCCESS);
case HV_KVP_OP_GET:
@@ -580,9 +568,9 @@ hv_kvp_convert_usermsg_to_hostmsg(void)
/* Use values by string */
host_exchg_data->value_type = HV_REG_SZ;
- if ((hkey_len < 0) || (hvalue_len < 0))
+ if ((hkey_len < 0) || (hvalue_len < 0))
return (HV_KVP_E_FAIL);
-
+
return (KVP_SUCCESS);
default:
@@ -595,22 +583,22 @@ hv_kvp_convert_usermsg_to_hostmsg(void)
* Send the response back to the host.
*/
static void
-hv_kvp_respond_host(int error)
+hv_kvp_respond_host(hv_kvp_sc *sc, int error)
{
struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp;
hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *)
- &kvp_globals.rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)];
+ &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)];
if (error)
error = HV_KVP_E_FAIL;
hv_icmsg_hdrp->status = error;
hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
-
- error = hv_vmbus_channel_send_packet(kvp_globals.channelp,
- kvp_globals.rcv_buf,
- kvp_globals.host_msg_len, kvp_globals.host_msg_id,
+
+ error = hv_vmbus_channel_send_packet(sc->util_sc.hv_dev->channel,
+ sc->rcv_buf,
+ sc->host_msg_len, sc->host_msg_id,
HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
if (error)
@@ -624,16 +612,19 @@ hv_kvp_respond_host(int error)
* and the host
*/
static void
-hv_kvp_send_msg_to_daemon(void)
+hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc)
{
+ struct hv_kvp_msg *hmsg = sc->host_kvp_msg;
+ struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg;
+
/* Prepare kvp_msg to be sent to user */
- hv_kvp_convert_hostmsg_to_usermsg();
+ hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg);
	/* Send the msg to user via function daemon_read - setting sema */
- sema_post(&kvp_globals.dev_sema);
+ sema_post(&sc->dev_sema);
/* We should wake up the daemon, in case it's doing poll() */
- selwakeup(&hv_kvp_selinfo);
+ selwakeup(&sc->hv_kvp_selinfo);
}
@@ -642,98 +633,83 @@ hv_kvp_send_msg_to_daemon(void)
* and interact with daemon
*/
static void
-hv_kvp_process_request(void *context)
+hv_kvp_process_request(void *context, int pending)
{
uint8_t *kvp_buf;
- hv_vmbus_channel *channel = context;
+ hv_vmbus_channel *channel;
uint32_t recvlen = 0;
uint64_t requestid;
struct hv_vmbus_icmsg_hdr *icmsghdrp;
int ret = 0;
- uint64_t pending_cnt = 1;
-
+ hv_kvp_sc *sc;
+
hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__);
- kvp_buf = receive_buffer[HV_KVP];
+
+ sc = (hv_kvp_sc*)context;
+	kvp_buf = sc->util_sc.receive_buffer;
+ channel = sc->util_sc.hv_dev->channel;
+
ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE,
&recvlen, &requestid);
- /*
- * We start counting only after the daemon registers
- * and therefore there could be requests pending in
- * the VMBus that are not reflected in pending_cnt.
- * Therefore we continue reading as long as either of
- * the below conditions is true.
- */
+ while ((ret == 0) && (recvlen > 0)) {
+
+ icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
+ &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+ hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf);
+ if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+ hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf);
+ hv_kvp_respond_host(sc, ret);
+
+ /*
+ * It is ok to not acquire the mutex before setting
+ * req_in_progress here because negotiation is the
+ * first thing that happens and hence there is no
+ * chance of a race condition.
+ */
+
+ sc->req_in_progress = false;
+ hv_kvp_log_info("%s :version negotiated\n", __func__);
+
+ } else {
+ if (!sc->daemon_busy) {
+
+				hv_kvp_log_info("%s: issuing query to daemon\n", __func__);
+ mtx_lock(&sc->pending_mutex);
+ sc->req_timed_out = false;
+ sc->daemon_busy = true;
+ mtx_unlock(&sc->pending_mutex);
- while ((pending_cnt>0) || ((ret == 0) && (recvlen > 0))) {
-
- if ((ret == 0) && (recvlen>0)) {
-
- icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
- &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)];
-
- hv_kvp_transaction_init(recvlen, channel, requestid, kvp_buf);
- if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
- hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf);
- hv_kvp_respond_host(ret);
-
- /*
- * It is ok to not acquire the mutex before setting
- * req_in_progress here because negotiation is the
- * first thing that happens and hence there is no
- * chance of a race condition.
- */
-
- kvp_globals.req_in_progress = false;
- hv_kvp_log_info("%s :version negotiated\n", __func__);
-
- } else {
- if (!kvp_globals.daemon_busy) {
-
- hv_kvp_log_info("%s: issuing qury to daemon\n", __func__);
- mtx_lock(&kvp_globals.pending_mutex);
- kvp_globals.req_timed_out = false;
- kvp_globals.daemon_busy = true;
- mtx_unlock(&kvp_globals.pending_mutex);
-
- hv_kvp_send_msg_to_daemon();
- hv_kvp_log_info("%s: waiting for daemon\n", __func__);
- }
-
- /* Wait 5 seconds for daemon to respond back */
- tsleep(&kvp_globals, 0, "kvpworkitem", 5 * hz);
- hv_kvp_log_info("%s: came out of wait\n", __func__);
+ hv_kvp_send_msg_to_daemon(sc);
+ hv_kvp_log_info("%s: waiting for daemon\n", __func__);
}
+
+ /* Wait 5 seconds for daemon to respond back */
+ tsleep(sc, 0, "kvpworkitem", 5 * hz);
+ hv_kvp_log_info("%s: came out of wait\n", __func__);
}
- mtx_lock(&kvp_globals.pending_mutex);
-
+ mtx_lock(&sc->pending_mutex);
+
/* Notice that once req_timed_out is set to true
* it will remain true until the next request is
* sent to the daemon. The response from daemon
- * is forwarded to host only when this flag is
- * false.
+ * is forwarded to host only when this flag is
+ * false.
*/
- kvp_globals.req_timed_out = true;
+ sc->req_timed_out = true;
/*
* Cancel request if so need be.
*/
- if (hv_kvp_req_in_progress()) {
+ if (hv_kvp_req_in_progress(sc)) {
hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__);
- hv_kvp_respond_host(HV_KVP_E_FAIL);
- kvp_globals.req_in_progress = false;
- }
-
- /*
- * Decrement pending request count and
- */
- if (kvp_globals.pending_reqs>0) {
- kvp_globals.pending_reqs = kvp_globals.pending_reqs - 1;
+ hv_kvp_respond_host(sc, HV_KVP_E_FAIL);
+ sc->req_in_progress = false;
}
- pending_cnt = kvp_globals.pending_reqs;
-
- mtx_unlock(&kvp_globals.pending_mutex);
+
+ mtx_unlock(&sc->pending_mutex);
/*
* Try reading next buffer
@@ -741,109 +717,43 @@ hv_kvp_process_request(void *context)
recvlen = 0;
ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE,
&recvlen, &requestid);
- hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n",
- __func__, context, (unsigned long long)pending_cnt, ret, recvlen);
- }
+ hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
+ __func__, context, ret, recvlen);
+ }
}
/*
* Callback routine that gets called whenever there is a message from host
*/
-void
+static void
hv_kvp_callback(void *context)
{
- uint64_t pending_cnt = 0;
-
- if (kvp_globals.register_done == false) {
-
- kvp_globals.channelp = context;
- } else {
-
- mtx_lock(&kvp_globals.pending_mutex);
- kvp_globals.pending_reqs = kvp_globals.pending_reqs + 1;
- pending_cnt = kvp_globals.pending_reqs;
- mtx_unlock(&kvp_globals.pending_mutex);
- if (pending_cnt == 1) {
- hv_kvp_log_info("%s: Queuing work item\n", __func__);
- hv_queue_work_item(
- service_table[HV_KVP].work_queue,
- hv_kvp_process_request,
- context
- );
- }
- }
-}
-
-
-/*
- * This function is called by the hv_kvp_init -
- * creates character device hv_kvp_dev
- * allocates memory to hv_kvp_dev_buf
- *
- */
-static int
-hv_kvp_dev_init(void)
-{
- int error = 0;
-
- /* initialize semaphore */
- sema_init(&kvp_globals.dev_sema, 0, "hv_kvp device semaphore");
- /* create character device */
- error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
- &hv_kvp_dev,
- &hv_kvp_cdevsw,
- 0,
- UID_ROOT,
- GID_WHEEL,
- 0640,
- "hv_kvp_dev");
-
- if (error != 0)
- return (error);
-
+ hv_kvp_sc *sc = (hv_kvp_sc*)context;
/*
- * Malloc with M_WAITOK flag will never fail.
- */
- hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_HV_KVP_DEV_BUF, M_WAITOK |
- M_ZERO);
-
- return (0);
-}
-
-
-/*
- * This function is called by the hv_kvp_deinit -
- * destroy character device
- */
-static void
-hv_kvp_dev_destroy(void)
-{
-
- if (daemon_task != NULL) {
- PROC_LOCK(daemon_task);
- kern_psignal(daemon_task, SIGKILL);
- PROC_UNLOCK(daemon_task);
+	 The first request from the host will not be handled until the daemon is registered.
+	 When the callback is triggered without a registered daemon, the callback just returns.
+	 When a new daemon gets registered, this callback is triggered from the _write op.
+ */
+ if (sc->register_done) {
+ hv_kvp_log_info("%s: Queuing work item\n", __func__);
+ taskqueue_enqueue(taskqueue_thread, &sc->task);
}
-
- destroy_dev(hv_kvp_dev);
- free(hv_kvp_dev_buf, M_HV_KVP_DEV_BUF);
- return;
}
-
static int
hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype,
struct thread *td)
{
-
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__);
- if (kvp_globals.dev_accessed)
+ if (sc->dev_accessed)
return (-EBUSY);
-
- daemon_task = curproc;
- kvp_globals.dev_accessed = true;
- kvp_globals.daemon_busy = false;
+
+ sc->daemon_task = curproc;
+ sc->dev_accessed = true;
+ sc->daemon_busy = false;
return (0);
}
@@ -852,10 +762,11 @@ static int
hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused,
struct thread *td __unused)
{
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__);
- kvp_globals.dev_accessed = false;
- kvp_globals.register_done = false;
+ sc->dev_accessed = false;
+ sc->register_done = false;
return (0);
}
@@ -865,18 +776,21 @@ hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __un
* acts as a send to daemon
*/
static int
-hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __unused)
+hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused)
{
size_t amt;
int error = 0;
+ struct hv_kvp_msg *hv_kvp_dev_buf;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
/* Check hv_kvp daemon registration status*/
- if (!kvp_globals.register_done)
+ if (!sc->register_done)
return (KVP_ERROR);
- sema_wait(&kvp_globals.dev_sema);
+ sema_wait(&sc->dev_sema);
- memcpy(hv_kvp_dev_buf, &kvp_globals.daemon_kvp_msg, sizeof(struct hv_kvp_msg));
+ hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK);
+ memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg));
amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 0 :
BUFFERSIZE + 1 - uio->uio_offset);
@@ -884,6 +798,7 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __
if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0)
hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__);
+ free(hv_kvp_dev_buf, M_TEMP);
return (error);
}
@@ -893,29 +808,30 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __
 * acts as a receive from daemon
*/
static int
-hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag __unused)
+hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused)
{
size_t amt;
int error = 0;
+ struct hv_kvp_msg *hv_kvp_dev_buf;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
uio->uio_offset = 0;
+ hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK);
amt = MIN(uio->uio_resid, BUFFERSIZE);
error = uiomove(hv_kvp_dev_buf, amt, uio);
- if (error != 0)
+ if (error != 0) {
+ free(hv_kvp_dev_buf, M_TEMP);
return (error);
+ }
+ memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg));
- memcpy(&kvp_globals.daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg));
-
- if (kvp_globals.register_done == false) {
- if (kvp_globals.daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) {
-
- kvp_globals.register_done = true;
- if (kvp_globals.channelp) {
-
- hv_kvp_callback(kvp_globals.channelp);
- }
+ free(hv_kvp_dev_buf, M_TEMP);
+ if (sc->register_done == false) {
+ if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) {
+ sc->register_done = true;
+ hv_kvp_callback(dev->si_drv1);
}
else {
hv_kvp_log_info("%s, KVP Registration Failed\n", __func__);
@@ -923,18 +839,20 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _
}
} else {
- mtx_lock(&kvp_globals.pending_mutex);
+ mtx_lock(&sc->pending_mutex);
- if(!kvp_globals.req_timed_out) {
+ if(!sc->req_timed_out) {
+ struct hv_kvp_msg *hmsg = sc->host_kvp_msg;
+ struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg;
- hv_kvp_convert_usermsg_to_hostmsg();
- hv_kvp_respond_host(KVP_SUCCESS);
- wakeup(&kvp_globals);
- kvp_globals.req_in_progress = false;
+ hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg);
+ hv_kvp_respond_host(sc, KVP_SUCCESS);
+ wakeup(sc);
+ sc->req_in_progress = false;
}
- kvp_globals.daemon_busy = false;
- mtx_unlock(&kvp_globals.pending_mutex);
+ sc->daemon_busy = false;
+ mtx_unlock(&sc->pending_mutex);
}
return (error);
@@ -946,66 +864,106 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _
* for daemon to read.
*/
static int
-hv_kvp_dev_daemon_poll(struct cdev *dev __unused, int events, struct thread *td)
+hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
{
int revents = 0;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
- mtx_lock(&kvp_globals.pending_mutex);
+ mtx_lock(&sc->pending_mutex);
/*
	 * We check the global flag daemon_busy for data availability for
	 * userland to read. Daemon_busy is set to true before the driver has
	 * data for the daemon to read, and set to false after the daemon
	 * sends the response back to the driver.
*/
- if (kvp_globals.daemon_busy == true)
+ if (sc->daemon_busy == true)
revents = POLLIN;
else
- selrecord(td, &hv_kvp_selinfo);
+ selrecord(td, &sc->hv_kvp_selinfo);
- mtx_unlock(&kvp_globals.pending_mutex);
+ mtx_unlock(&sc->pending_mutex);
return (revents);
}
-
-/*
- * hv_kvp initialization function
- * called from hv_util service.
- *
- */
-int
-hv_kvp_init(hv_vmbus_service *srv)
+static int
+hv_kvp_probe(device_t dev)
{
- int error = 0;
- hv_work_queue *work_queue = NULL;
-
- memset(&kvp_globals, 0, sizeof(kvp_globals));
-
- work_queue = hv_work_queue_create("KVP Service");
- if (work_queue == NULL) {
- hv_kvp_log_info("%s: Work queue alloc failed\n", __func__);
- error = ENOMEM;
- hv_kvp_log_error("%s: ENOMEM\n", __func__);
- goto Finish;
+ const char *p = vmbus_get_type(dev);
+ if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
+ device_set_desc(dev, "Hyper-V KVP Service");
+ return BUS_PROBE_DEFAULT;
}
- srv->work_queue = work_queue;
- error = hv_kvp_dev_init();
- mtx_init(&kvp_globals.pending_mutex, "hv-kvp pending mutex",
- NULL, MTX_DEF);
- kvp_globals.pending_reqs = 0;
+ return ENXIO;
+}
+
+static int
+hv_kvp_attach(device_t dev)
+{
+ int error;
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+
+ hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev);
+ sc->util_sc.callback = hv_kvp_callback;
+ sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore");
+ mtx_init(&sc->pending_mutex, "hv-kvp pending mutex",
+ NULL, MTX_DEF);
-Finish:
- return (error);
-}
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log",
+	    CTLFLAG_RW, &hv_kvp_log, 0, "Hyper-V KVP service log level");
-void
-hv_kvp_deinit(void)
+ TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc);
+
+ /* create character device */
+ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+ &sc->hv_kvp_dev,
+ &hv_kvp_cdevsw,
+ 0,
+ UID_ROOT,
+ GID_WHEEL,
+ 0640,
+ "hv_kvp_dev");
+
+ if (error != 0)
+ return (error);
+ sc->hv_kvp_dev->si_drv1 = sc;
+
+ return hv_util_attach(dev);
+}
+
+static int
+hv_kvp_detach(device_t dev)
{
- hv_kvp_dev_destroy();
- mtx_destroy(&kvp_globals.pending_mutex);
+ hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev);
- return;
+ if (sc->daemon_task != NULL) {
+ PROC_LOCK(sc->daemon_task);
+ kern_psignal(sc->daemon_task, SIGKILL);
+ PROC_UNLOCK(sc->daemon_task);
+ }
+
+ destroy_dev(sc->hv_kvp_dev);
+ return hv_util_detach(dev);
}
+
+static device_method_t kvp_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_kvp_probe),
+ DEVMETHOD(device_attach, hv_kvp_attach),
+ DEVMETHOD(device_detach, hv_kvp_detach),
+ { 0, 0 }
+};
+
+static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)};
+
+static devclass_t kvp_devclass;
+
+DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL);
+MODULE_VERSION(hv_kvp, 1);
+MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1);
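
The /dev/hv_kvp_dev interface added above follows a register-then-poll protocol: the daemon's first write must carry HV_KVP_OP_REGISTER, after which it polls for POLLIN (raised while daemon_busy is set), reads the pending request, and writes the response back. A minimal userland sketch under those assumptions (struct hv_kvp_msg and HV_KVP_OP_REGISTER are taken from the shared KVP header; the include path, function name, and error handling are illustrative):

#include <sys/types.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

#include "hv_kvp.h"	/* assumed to provide struct hv_kvp_msg, HV_KVP_OP_REGISTER */

static int
kvp_daemon_loop(void)
{
	struct hv_kvp_msg msg;
	struct pollfd pfd;
	int fd;

	fd = open("/dev/hv_kvp_dev", O_RDWR);
	if (fd < 0)
		return (-1);

	/* The first write must be a REGISTER message; it sets register_done. */
	memset(&msg, 0, sizeof(msg));
	msg.kvp_hdr.operation = HV_KVP_OP_REGISTER;
	if (write(fd, &msg, sizeof(msg)) < 0)
		return (-1);

	for (;;) {
		pfd.fd = fd;
		pfd.events = POLLIN;
		if (poll(&pfd, 1, -1) < 0)		/* wakes when daemon_busy is set */
			break;
		if (read(fd, &msg, sizeof(msg)) < 0)	/* request copied from daemon_kvp_msg */
			break;
		/* ... perform the requested KVP operation and fill in the reply ... */
		if (write(fd, &msg, sizeof(msg)) < 0)	/* reply; clears daemon_busy */
			break;
	}
	close(fd);
	return (0);
}
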
diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h
index b67373fa..b62149e 100644
--- a/sys/dev/hyperv/utilities/hv_kvp.h
+++ b/sys/dev/hyperv/utilities/hv_kvp.h
@@ -238,17 +238,4 @@ struct hv_kvp_ip_msg {
struct hv_kvp_ipaddr_value kvp_ip_val;
} __attribute__((packed));
-
-#define HV_SHUT_DOWN 0
-#define HV_TIME_SYNCH 1
-#define HV_HEART_BEAT 2
-#define HV_KVP 3
-#define HV_MAX_UTIL_SERVICES 4
-
-#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */
-#define HV_ICTIMESYNCFLAG_PROBE 0
-#define HV_ICTIMESYNCFLAG_SYNC 1
-#define HV_ICTIMESYNCFLAG_SAMPLE 2
-#define HV_NANO_SEC_PER_SEC 1000000000
-
#endif /* _KVP_H */
diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c
new file mode 100644
index 0000000..20bc65e
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_shutdown.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2014 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Hyper-V Shutdown Service.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/timetc.h>
+#include <sys/syscallsubr.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include "hv_util.h"
+
+static hv_guid service_guid = { .data =
+ {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49,
+ 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB} };
+
+/**
+ * Shutdown
+ */
+static void
+hv_shutdown_cb(void *context)
+{
+ uint8_t* buf;
+ hv_vmbus_channel* channel;
+ uint8_t execute_shutdown = 0;
+ hv_vmbus_icmsg_hdr* icmsghdrp;
+ uint32_t recv_len;
+ uint64_t request_id;
+ int ret;
+ hv_vmbus_shutdown_msg_data* shutdown_msg;
+ hv_util_sc *softc;
+
+ softc = (hv_util_sc*)context;
+	buf = softc->receive_buffer;
+ channel = softc->hv_dev->channel;
+ ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE,
+ &recv_len, &request_id);
+
+ if ((ret == 0) && recv_len > 0) {
+
+ icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
+ &buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+ if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+ hv_negotiate_version(icmsghdrp, NULL, buf);
+
+ } else {
+ shutdown_msg =
+ (struct hv_vmbus_shutdown_msg_data *)
+ &buf[sizeof(struct hv_vmbus_pipe_hdr) +
+ sizeof(struct hv_vmbus_icmsg_hdr)];
+
+ switch (shutdown_msg->flags) {
+ case 0:
+ case 1:
+ icmsghdrp->status = HV_S_OK;
+ execute_shutdown = 1;
+ if(bootverbose)
+ printf("Shutdown request received -"
+ " graceful shutdown initiated\n");
+ break;
+ default:
+ icmsghdrp->status = HV_E_FAIL;
+ execute_shutdown = 0;
+ printf("Shutdown request received -"
+ " Invalid request\n");
+ break;
+ }
+ }
+
+ icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
+ HV_ICMSGHDRFLAG_RESPONSE;
+
+ hv_vmbus_channel_send_packet(channel, buf,
+ recv_len, request_id,
+ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
+ }
+
+ if (execute_shutdown)
+ shutdown_nice(RB_POWEROFF);
+}
+
+static int
+hv_shutdown_probe(device_t dev)
+{
+ const char *p = vmbus_get_type(dev);
+ if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
+ device_set_desc(dev, "Hyper-V Shutdown Service");
+ return BUS_PROBE_DEFAULT;
+ }
+
+ return ENXIO;
+}
+
+static int
+hv_shutdown_attach(device_t dev)
+{
+ hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev);
+
+ softc->callback = hv_shutdown_cb;
+
+ return hv_util_attach(dev);
+}
+
+static device_method_t shutdown_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_shutdown_probe),
+ DEVMETHOD(device_attach, hv_shutdown_attach),
+ DEVMETHOD(device_detach, hv_util_detach),
+ { 0, 0 }
+};
+
+static driver_t shutdown_driver = { "hvshutdown", shutdown_methods, sizeof(hv_util_sc)};
+
+static devclass_t shutdown_devclass;
+
+DRIVER_MODULE(hv_shutdown, vmbus, shutdown_driver, shutdown_devclass, NULL, NULL);
+MODULE_VERSION(hv_shutdown, 1);
+MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c
new file mode 100644
index 0000000..d1ea904
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_timesync.c
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 2014 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Hyper-V Time Synchronization Service.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/timetc.h>
+#include <sys/syscallsubr.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include "hv_util.h"
+
+#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */
+#define HV_ICTIMESYNCFLAG_PROBE 0
+#define HV_ICTIMESYNCFLAG_SYNC 1
+#define HV_ICTIMESYNCFLAG_SAMPLE 2
+#define HV_NANO_SEC_PER_SEC 1000000000
+
+/* Time Sync data */
+typedef struct {
+ uint64_t data;
+} time_sync_data;
+
+ /* Time Synch Service */
+static hv_guid service_guid = {.data =
+ {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49,
+ 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } };
+
+struct hv_ictimesync_data {
+ uint64_t parenttime;
+ uint64_t childtime;
+ uint64_t roundtriptime;
+ uint8_t flags;
+} __packed;
+
+typedef struct hv_timesync_sc {
+ hv_util_sc util_sc;
+ struct task task;
+ time_sync_data time_msg;
+} hv_timesync_sc;
+
+/**
+ * Set host time based on time sync message from host
+ */
+static void
+hv_set_host_time(void *context, int pending)
+{
+ hv_timesync_sc *softc = (hv_timesync_sc*)context;
+ uint64_t hosttime = softc->time_msg.data;
+ struct timespec guest_ts, host_ts;
+ uint64_t host_tns;
+ int64_t diff;
+ int error;
+
+ host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
+ host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC);
+ host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC);
+
+ nanotime(&guest_ts);
+
+ diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec;
+
+ /*
+	 * If the host time is off by more than 5 seconds, catch the guest up
+ */
+ if (diff > 5 || diff < -5) {
+ error = kern_clock_settime(curthread, CLOCK_REALTIME,
+ &host_ts);
+ }
+}
+
+/**
+ * @brief Synchronize time with host after reboot, restore, etc.
+ *
+ * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM.
+ * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time
+ * message after the timesync channel is opened. Since the hv_utils module is
+ * loaded after hv_vmbus, the first message is usually missed. In addition,
+ * the system time is automatically set from the emulated hardware clock,
+ * which may not be UTC or in the same time zone. To override these effects,
+ * we use the first 50 time samples for the initial system time setting.
+ */
+static inline
+void hv_adj_guesttime(hv_timesync_sc *sc, uint64_t hosttime, uint8_t flags)
+{
+ sc->time_msg.data = hosttime;
+
+ if (((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) ||
+ ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0)) {
+ taskqueue_enqueue(taskqueue_thread, &sc->task);
+ }
+}
+
+/**
+ * Time Sync Channel message handler
+ */
+static void
+hv_timesync_cb(void *context)
+{
+ hv_vmbus_channel* channel;
+ hv_vmbus_icmsg_hdr* icmsghdrp;
+ uint32_t recvlen;
+ uint64_t requestId;
+ int ret;
+ uint8_t* time_buf;
+ struct hv_ictimesync_data* timedatap;
+ hv_timesync_sc *softc;
+
+ softc = (hv_timesync_sc*)context;
+ channel = softc->util_sc.hv_dev->channel;
+ time_buf = softc->util_sc.receive_buffer;
+
+ ret = hv_vmbus_channel_recv_packet(channel, time_buf,
+ PAGE_SIZE, &recvlen, &requestId);
+
+ if ((ret == 0) && recvlen > 0) {
+ icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[
+ sizeof(struct hv_vmbus_pipe_hdr)];
+
+ if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+ hv_negotiate_version(icmsghdrp, NULL, time_buf);
+ } else {
+ timedatap = (struct hv_ictimesync_data *) &time_buf[
+ sizeof(struct hv_vmbus_pipe_hdr) +
+ sizeof(struct hv_vmbus_icmsg_hdr)];
+ hv_adj_guesttime(softc, timedatap->parenttime, timedatap->flags);
+ }
+
+ icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION
+ | HV_ICMSGHDRFLAG_RESPONSE;
+
+ hv_vmbus_channel_send_packet(channel, time_buf,
+ recvlen, requestId,
+ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
+ }
+}
+
+static int
+hv_timesync_probe(device_t dev)
+{
+ const char *p = vmbus_get_type(dev);
+ if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
+ device_set_desc(dev, "Hyper-V Time Synch Service");
+ return BUS_PROBE_DEFAULT;
+ }
+
+ return ENXIO;
+}
+
+static int
+hv_timesync_attach(device_t dev)
+{
+ hv_timesync_sc *softc = device_get_softc(dev);
+
+ softc->util_sc.callback = hv_timesync_cb;
+ TASK_INIT(&softc->task, 1, hv_set_host_time, softc);
+
+ return hv_util_attach(dev);
+}
+
+static int
+hv_timesync_detach(device_t dev)
+{
+ hv_timesync_sc *softc = device_get_softc(dev);
+ taskqueue_drain(taskqueue_thread, &softc->task);
+
+ return hv_util_detach(dev);
+}
+
+static device_method_t timesync_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_timesync_probe),
+ DEVMETHOD(device_attach, hv_timesync_attach),
+ DEVMETHOD(device_detach, hv_timesync_detach),
+ { 0, 0 }
+};
+
+static driver_t timesync_driver = { "hvtimesync", timesync_methods, sizeof(hv_timesync_sc)};
+
+static devclass_t timesync_devclass;
+
+DRIVER_MODULE(hv_timesync, vmbus, timesync_driver, timesync_devclass, NULL, NULL);
+MODULE_VERSION(hv_timesync, 1);
+MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1);
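
The arithmetic in hv_set_host_time() converts the host timestamp, a count of 100-nanosecond intervals since January 1, 1601 (the Windows epoch), into a Unix struct timespec: subtracting HV_WLTIMEDELTA rebases the count to the 1970 epoch, and multiplying by 100 yields nanoseconds. A standalone sketch of the same conversion (constants copied from this file; the helper name is illustrative):

#include <stdint.h>
#include <time.h>

#define HV_WLTIMEDELTA		116444736000000000L	/* 1601 -> 1970, in 100ns units */
#define HV_NANO_SEC_PER_SEC	1000000000

/* Convert a Hyper-V host timestamp (100ns ticks since 1601) to a timespec. */
static struct timespec
hv_filetime_to_timespec(uint64_t hosttime)
{
	uint64_t host_tns;
	struct timespec ts;

	host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
	ts.tv_sec = (time_t)(host_tns / HV_NANO_SEC_PER_SEC);
	ts.tv_nsec = (long)(host_tns % HV_NANO_SEC_PER_SEC);
	return (ts);
}
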
diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c
index dc4b1e2..7d19b3f 100644
--- a/sys/dev/hyperv/utilities/hv_util.c
+++ b/sys/dev/hyperv/utilities/hv_util.c
@@ -40,85 +40,9 @@
#include <sys/syscallsubr.h>
#include <dev/hyperv/include/hyperv.h>
-#include "hv_kvp.h"
+#include "hv_util.h"
-/* Time Sync data */
-typedef struct {
- uint64_t data;
-} time_sync_data;
-
-static void hv_shutdown_cb(void *context);
-static void hv_heartbeat_cb(void *context);
-static void hv_timesync_cb(void *context);
-
-static int hv_timesync_init(hv_vmbus_service *serv);
-
-/*
- * Note: GUID codes below are predefined by the host hypervisor
- * (Hyper-V and Azure)interface and required for correct operation.
- */
-hv_vmbus_service service_table[] = {
- /* Shutdown Service */
- { .guid.data = {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49,
- 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB},
- .name = "Hyper-V Shutdown Service\n",
- .enabled = TRUE,
- .callback = hv_shutdown_cb,
- },
-
- /* Time Synch Service */
- { .guid.data = {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49,
- 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf},
- .name = "Hyper-V Time Synch Service\n",
- .enabled = TRUE,
- .init = hv_timesync_init,
- .callback = hv_timesync_cb,
- },
-
- /* Heartbeat Service */
- { .guid.data = {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e,
- 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d},
- .name = "Hyper-V Heartbeat Service\n",
- .enabled = TRUE,
- .callback = hv_heartbeat_cb,
- },
-
- /* KVP (Key Value Pair) Service */
- { .guid.data = {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d,
- 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6},
- .name = "Hyper-V KVP Service\n",
- .enabled = TRUE,
- .init = hv_kvp_init,
- .callback = hv_kvp_callback,
- },
-};
-
-/*
- * Receive buffer pointers. There is one buffer per utility service. The
- * buffer is allocated during attach().
- */
-uint8_t *receive_buffer[HV_MAX_UTIL_SERVICES];
-
-static boolean_t destroyed_kvp = FALSE;
-
-struct hv_ictimesync_data {
- uint64_t parenttime;
- uint64_t childtime;
- uint64_t roundtriptime;
- uint8_t flags;
-} __packed;
-
-static int
-hv_timesync_init(hv_vmbus_service *serv)
-{
-
- serv->work_queue = hv_work_queue_create("Time Sync");
- if (serv->work_queue == NULL)
- return (ENOMEM);
- return (0);
-}
-
-static void
+void
hv_negotiate_version(
struct hv_vmbus_icmsg_hdr* icmsghdrp,
struct hv_vmbus_icmsg_negotiate* negop,
@@ -147,267 +71,19 @@ hv_negotiate_version(
negop->icmsg_vercnt = 1;
}
-
-/**
- * Set host time based on time sync message from host
- */
-static void
-hv_set_host_time(void *context)
-{
- time_sync_data* time_msg = (time_sync_data*) context;
- uint64_t hosttime = time_msg->data;
- struct timespec guest_ts, host_ts;
- uint64_t host_tns;
- int64_t diff;
- int error;
-
- host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
- host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC);
- host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC);
-
- nanotime(&guest_ts);
-
- diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec;
-
- /*
- * If host differs by 5 seconds then make the guest catch up
- */
- if (diff > 5 || diff < -5) {
- error = kern_clock_settime(curthread, CLOCK_REALTIME,
- &host_ts);
- }
-
- /*
- * Free the hosttime that was allocated in hv_adj_guesttime()
- */
- free(time_msg, M_DEVBUF);
-}
-
-/**
- * @brief Synchronize time with host after reboot, restore, etc.
- *
- * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM.
- * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time
- * message after the timesync channel is opened. Since the hv_utils module is
- * loaded after hv_vmbus, the first message is usually missed. The other
- * thing is, systime is automatically set to emulated hardware clock which may
- * not be UTC time or in the same time zone. So, to override these effects, we
- * use the first 50 time samples for initial system time setting.
- */
-static inline
-void hv_adj_guesttime(uint64_t hosttime, uint8_t flags)
-{
- time_sync_data* time_msg;
-
- time_msg = malloc(sizeof(time_sync_data), M_DEVBUF, M_NOWAIT);
-
- if (time_msg == NULL)
- return;
-
- time_msg->data = hosttime;
-
- if ((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) {
- hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue,
- hv_set_host_time, time_msg);
- } else if ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0) {
- hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue,
- hv_set_host_time, time_msg);
- } else {
- free(time_msg, M_DEVBUF);
- }
-}
-
-/**
- * Time Sync Channel message handler
- */
-static void
-hv_timesync_cb(void *context)
-{
- hv_vmbus_channel* channel = context;
- hv_vmbus_icmsg_hdr* icmsghdrp;
- uint32_t recvlen;
- uint64_t requestId;
- int ret;
- uint8_t* time_buf;
- struct hv_ictimesync_data* timedatap;
-
- time_buf = receive_buffer[HV_TIME_SYNCH];
-
- ret = hv_vmbus_channel_recv_packet(channel, time_buf,
- PAGE_SIZE, &recvlen, &requestId);
-
- if ((ret == 0) && recvlen > 0) {
- icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[
- sizeof(struct hv_vmbus_pipe_hdr)];
-
- if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
- hv_negotiate_version(icmsghdrp, NULL, time_buf);
- } else {
- timedatap = (struct hv_ictimesync_data *) &time_buf[
- sizeof(struct hv_vmbus_pipe_hdr) +
- sizeof(struct hv_vmbus_icmsg_hdr)];
- hv_adj_guesttime(timedatap->parenttime, timedatap->flags);
- }
-
- icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION
- | HV_ICMSGHDRFLAG_RESPONSE;
-
- hv_vmbus_channel_send_packet(channel, time_buf,
- recvlen, requestId,
- HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
- }
-}
-
-/**
- * Shutdown
- */
-static void
-hv_shutdown_cb(void *context)
-{
- uint8_t* buf;
- hv_vmbus_channel* channel = context;
- uint8_t execute_shutdown = 0;
- hv_vmbus_icmsg_hdr* icmsghdrp;
- uint32_t recv_len;
- uint64_t request_id;
- int ret;
- hv_vmbus_shutdown_msg_data* shutdown_msg;
-
- buf = receive_buffer[HV_SHUT_DOWN];
-
- ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE,
- &recv_len, &request_id);
-
- if ((ret == 0) && recv_len > 0) {
-
- icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
- &buf[sizeof(struct hv_vmbus_pipe_hdr)];
-
- if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
- hv_negotiate_version(icmsghdrp, NULL, buf);
-
- } else {
- shutdown_msg =
- (struct hv_vmbus_shutdown_msg_data *)
- &buf[sizeof(struct hv_vmbus_pipe_hdr) +
- sizeof(struct hv_vmbus_icmsg_hdr)];
-
- switch (shutdown_msg->flags) {
- case 0:
- case 1:
- icmsghdrp->status = HV_S_OK;
- execute_shutdown = 1;
- if(bootverbose)
- printf("Shutdown request received -"
- " graceful shutdown initiated\n");
- break;
- default:
- icmsghdrp->status = HV_E_FAIL;
- execute_shutdown = 0;
- printf("Shutdown request received -"
- " Invalid request\n");
- break;
- }
- }
-
- icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
- HV_ICMSGHDRFLAG_RESPONSE;
-
- hv_vmbus_channel_send_packet(channel, buf,
- recv_len, request_id,
- HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
- }
-
- if (execute_shutdown)
- shutdown_nice(RB_POWEROFF);
-}
-
-/**
- * Process heartbeat message
- */
-static void
-hv_heartbeat_cb(void *context)
-{
- uint8_t* buf;
- hv_vmbus_channel* channel = context;
- uint32_t recvlen;
- uint64_t requestid;
- int ret;
-
- struct hv_vmbus_heartbeat_msg_data* heartbeat_msg;
- struct hv_vmbus_icmsg_hdr* icmsghdrp;
-
- buf = receive_buffer[HV_HEART_BEAT];
-
- ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen,
- &requestid);
-
- if ((ret == 0) && recvlen > 0) {
-
- icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
- &buf[sizeof(struct hv_vmbus_pipe_hdr)];
-
- if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
- hv_negotiate_version(icmsghdrp, NULL, buf);
-
- } else {
- heartbeat_msg =
- (struct hv_vmbus_heartbeat_msg_data *)
- &buf[sizeof(struct hv_vmbus_pipe_hdr) +
- sizeof(struct hv_vmbus_icmsg_hdr)];
-
- heartbeat_msg->seq_num += 1;
- }
-
- icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
- HV_ICMSGHDRFLAG_RESPONSE;
-
- hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid,
- HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
- }
-}
-
-
-static int
-hv_util_probe(device_t dev)
-{
- int i;
- int rtn_value = ENXIO;
-
- for (i = 0; i < HV_MAX_UTIL_SERVICES; i++) {
- const char *p = vmbus_get_type(dev);
- if (service_table[i].enabled && !memcmp(p, &service_table[i].guid, sizeof(hv_guid))) {
- device_set_softc(dev, (void *) (&service_table[i]));
- rtn_value = BUS_PROBE_DEFAULT;
- }
- }
-
- return rtn_value;
-}
-
-static int
+int
hv_util_attach(device_t dev)
{
- struct hv_device* hv_dev;
- struct hv_vmbus_service* service;
- int ret;
- size_t receive_buffer_offset;
+ struct hv_device* hv_dev;
+ struct hv_util_sc* softc;
+ int ret;
hv_dev = vmbus_get_devctx(dev);
- service = device_get_softc(dev);
- receive_buffer_offset = service - &service_table[0];
- device_printf(dev, "Hyper-V Service attaching: %s\n", service->name);
- receive_buffer[receive_buffer_offset] =
+ softc = device_get_softc(dev);
+ softc->hv_dev = hv_dev;
+ softc->receive_buffer =
malloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
- if (service->init != NULL) {
- ret = service->init(service);
- if (ret) {
- ret = ENODEV;
- goto error0;
- }
- }
-
/*
* These services are not performance critical and do not need
* batched reading. Furthermore, some services such as KVP can
@@ -418,83 +94,30 @@ hv_util_attach(device_t dev)
hv_set_channel_read_state(hv_dev->channel, FALSE);
ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE,
- 4 * PAGE_SIZE, NULL, 0,
- service->callback, hv_dev->channel);
+ 4 * PAGE_SIZE, NULL, 0,
+ softc->callback, softc);
if (ret)
- goto error0;
+ goto error0;
return (0);
- error0:
-
- free(receive_buffer[receive_buffer_offset], M_DEVBUF);
- receive_buffer[receive_buffer_offset] = NULL;
-
+error0:
+ free(softc->receive_buffer, M_DEVBUF);
return (ret);
}
-static int
+int
hv_util_detach(device_t dev)
{
- struct hv_device* hv_dev;
- struct hv_vmbus_service* service;
- size_t receive_buffer_offset;
-
- if (!destroyed_kvp) {
- hv_kvp_deinit();
- destroyed_kvp = TRUE;
- }
+ struct hv_device* hv_dev;
+ struct hv_util_sc* softc;
hv_dev = vmbus_get_devctx(dev);
hv_vmbus_channel_close(hv_dev->channel);
- service = device_get_softc(dev);
- receive_buffer_offset = service - &service_table[0];
+ softc = device_get_softc(dev);
- if (service->work_queue != NULL)
- hv_work_queue_close(service->work_queue);
-
- free(receive_buffer[receive_buffer_offset], M_DEVBUF);
- receive_buffer[receive_buffer_offset] = NULL;
+ free(softc->receive_buffer, M_DEVBUF);
return (0);
}
-
-static void
-hv_util_init(void)
-{
-}
-
-static int
-hv_util_modevent(module_t mod, int event, void *arg)
-{
- switch (event) {
- case MOD_LOAD:
- break;
- case MOD_UNLOAD:
- break;
- default:
- break;
- }
- return (0);
-}
-
-static device_method_t util_methods[] = {
- /* Device interface */
- DEVMETHOD(device_probe, hv_util_probe),
- DEVMETHOD(device_attach, hv_util_attach),
- DEVMETHOD(device_detach, hv_util_detach),
- DEVMETHOD(device_shutdown, bus_generic_shutdown),
- { 0, 0 } }
-;
-
-static driver_t util_driver = { "hyperv-utils", util_methods, 0 };
-
-static devclass_t util_devclass;
-
-DRIVER_MODULE(hv_utils, vmbus, util_driver, util_devclass, hv_util_modevent, 0);
-MODULE_VERSION(hv_utils, 1);
-MODULE_DEPEND(hv_utils, vmbus, 1, 1, 1);
-
-SYSINIT(hv_util_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1,
- hv_util_init, NULL);
diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h
new file mode 100644
index 0000000..708dca8
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_util.h
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HVUTIL_H_
+#define _HVUTIL_H_
+
+/**
+ * hv_util related structures
+ *
+ */
+typedef struct hv_util_sc {
+ /*
+ * function to process Hyper-V messages
+ */
+ void (*callback)(void *);
+
+ struct hv_device* hv_dev;
+ uint8_t *receive_buffer;
+} hv_util_sc;
+
+void hv_negotiate_version(
+ struct hv_vmbus_icmsg_hdr* icmsghdrp,
+ struct hv_vmbus_icmsg_negotiate* negop,
+ uint8_t* buf);
+
+int hv_util_attach(device_t dev);
+int hv_util_detach(device_t dev);
+#endif
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c
index 7037768..bb777cc 100644
--- a/sys/dev/hyperv/vmbus/hv_channel.c
+++ b/sys/dev/hyperv/vmbus/hv_channel.c
@@ -52,6 +52,7 @@ static int vmbus_channel_create_gpadl_header(
uint32_t* message_count);
static void vmbus_channel_set_event(hv_vmbus_channel* channel);
+static void VmbusProcessChannelEvent(void* channel, int pending);
/**
* @brief Trigger an event notification on the specified channel
@@ -68,9 +69,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel)
+ ((channel->offer_msg.child_rel_id >> 5))));
monitor_page = (hv_vmbus_monitor_page *)
- hv_vmbus_g_connection.monitor_pages;
-
- monitor_page++; /* Get the child to parent monitor page */
+ hv_vmbus_g_connection.monitor_page_2;
synch_set_bit(channel->monitor_bit,
(uint32_t *)&monitor_page->
@@ -115,6 +114,9 @@ hv_vmbus_channel_open(
new_channel->on_channel_callback = pfn_on_channel_callback;
new_channel->channel_callback_context = context;
+ new_channel->rxq = hv_vmbus_g_context.hv_event_queue[new_channel->target_cpu];
+ TASK_INIT(&new_channel->channel_task, 0, VmbusProcessChannelEvent, new_channel);
+
/* Allocate the ring buffer */
out = contigmalloc((send_ring_buffer_size + recv_ring_buffer_size),
M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
@@ -518,6 +520,7 @@ static void
hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
{
int ret = 0;
+ struct taskqueue *rxq = channel->rxq;
hv_vmbus_channel_close_channel* msg;
hv_vmbus_channel_msg_info* info;
@@ -525,6 +528,11 @@ hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
channel->sc_creation_callback = NULL;
/*
+	 * Set rxq to NULL so that no more requests are scheduled
+ */
+ channel->rxq = NULL;
+ taskqueue_drain(rxq, &channel->channel_task);
+ /*
	 * Grab the lock to prevent a race condition where a packet is
	 * received while the driver is being unloaded.
*/
@@ -666,11 +674,11 @@ hv_vmbus_channel_send_packet_pagebuffer(
{
int ret = 0;
- int i = 0;
boolean_t need_sig;
uint32_t packet_len;
+ uint32_t page_buflen;
uint32_t packetLen_aligned;
- hv_vmbus_sg_buffer_list buffer_list[3];
+ hv_vmbus_sg_buffer_list buffer_list[4];
hv_vmbus_channel_packet_page_buffer desc;
uint32_t descSize;
uint64_t alignedData = 0;
@@ -682,36 +690,33 @@ hv_vmbus_channel_send_packet_pagebuffer(
* Adjust the size down since hv_vmbus_channel_packet_page_buffer
* is the largest size we support
*/
- descSize = sizeof(hv_vmbus_channel_packet_page_buffer) -
- ((HV_MAX_PAGE_BUFFER_COUNT - page_count) *
- sizeof(hv_vmbus_page_buffer));
- packet_len = descSize + buffer_len;
+ descSize = __offsetof(hv_vmbus_channel_packet_page_buffer, range);
+ page_buflen = sizeof(hv_vmbus_page_buffer) * page_count;
+ packet_len = descSize + page_buflen + buffer_len;
packetLen_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t));
/* Setup the descriptor */
desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT;
desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
- desc.data_offset8 = descSize >> 3; /* in 8-bytes granularity */
+ /* in 8-bytes granularity */
+ desc.data_offset8 = (descSize + page_buflen) >> 3;
desc.length8 = (uint16_t) (packetLen_aligned >> 3);
desc.transaction_id = request_id;
desc.range_count = page_count;
- for (i = 0; i < page_count; i++) {
- desc.range[i].length = page_buffers[i].length;
- desc.range[i].offset = page_buffers[i].offset;
- desc.range[i].pfn = page_buffers[i].pfn;
- }
-
buffer_list[0].data = &desc;
buffer_list[0].length = descSize;
- buffer_list[1].data = buffer;
- buffer_list[1].length = buffer_len;
+ buffer_list[1].data = page_buffers;
+ buffer_list[1].length = page_buflen;
- buffer_list[2].data = &alignedData;
- buffer_list[2].length = packetLen_aligned - packet_len;
+ buffer_list[2].data = buffer;
+ buffer_list[2].length = buffer_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ buffer_list[3].data = &alignedData;
+ buffer_list[3].length = packetLen_aligned - packet_len;
+
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 4,
&need_sig);
/* TODO: We should determine if this is optional */
@@ -880,3 +885,67 @@ hv_vmbus_channel_recv_packet_raw(
return (0);
}
+
+
+/**
+ * Process a channel event notification
+ */
+static void
+VmbusProcessChannelEvent(void* context, int pending)
+{
+ void* arg;
+ uint32_t bytes_to_read;
+ hv_vmbus_channel* channel = (hv_vmbus_channel*)context;
+ boolean_t is_batched_reading;
+
+ /**
+	 * Invoke the channel callback to process the event pending on
+	 * this channel
+ */
+
+ if (channel == NULL) {
+ return;
+ }
+ /**
+ * To deal with the race condition where we might
+ * receive a packet while the relevant driver is
+ * being unloaded, dispatch the callback while
+ * holding the channel lock. The unloading driver
+ * will acquire the same channel lock to set the
+ * callback to NULL. This closes the window.
+ */
+
+ /*
+ * Disable the lock due to newly added WITNESS check in r277723.
+ * Will seek other way to avoid race condition.
+ * -- whu
+ */
+ // mtx_lock(&channel->inbound_lock);
+ if (channel->on_channel_callback != NULL) {
+ arg = channel->channel_callback_context;
+ is_batched_reading = channel->batched_reading;
+ /*
+ * Optimize host to guest signaling by ensuring:
+ * 1. While reading the channel, we disable interrupts from
+ * host.
+ * 2. Ensure that we process all posted messages from the host
+ * before returning from this callback.
+ * 3. Once we return, enable signaling from the host. Once this
+ * state is set we check to see if additional packets are
+ * available to read. In this case we repeat the process.
+ */
+ do {
+ if (is_batched_reading)
+ hv_ring_buffer_read_begin(&channel->inbound);
+
+ channel->on_channel_callback(arg);
+
+ if (is_batched_reading)
+ bytes_to_read =
+ hv_ring_buffer_read_end(&channel->inbound);
+ else
+ bytes_to_read = 0;
+ } while (is_batched_reading && (bytes_to_read != 0));
+ }
+ // mtx_unlock(&channel->inbound_lock);
+}
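
With the reworked hv_vmbus_channel_send_packet_pagebuffer() above, the packet is written as four scatter entries — the fixed descriptor header, the page-buffer array, the payload, and the alignment padding — and data_offset8 covers the header plus the page array in 8-byte units. A worked example, assuming the usual packed sizes of a 24-byte fixed header and 16-byte hv_vmbus_page_buffer entries (check the structure definitions; the numbers are only illustrative), with page_count = 2 and buffer_len = 100:

	descSize          = 24
	page_buflen       = 2 * 16              = 32
	desc.data_offset8 = (24 + 32) >> 3      = 7	/* 56 bytes precede the payload */
	packet_len        = 24 + 32 + 100       = 156
	packetLen_aligned = HV_ALIGN_UP(156, 8) = 160
	desc.length8      = 160 >> 3            = 20
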
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index 4ccb647..ab6e8ad 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -39,8 +39,10 @@ __FBSDID("$FreeBSD$");
*/
static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_offer_internal(void* context);
static void vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_offer_rescind_internal(void* context);
static void vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr);
@@ -52,41 +54,46 @@ static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr);
hv_vmbus_channel_msg_table_entry
g_channel_message_table[HV_CHANNEL_MESSAGE_COUNT] = {
{ HV_CHANNEL_MESSAGE_INVALID,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_OFFER_CHANNEL,
- 0, vmbus_channel_on_offer },
+ vmbus_channel_on_offer },
{ HV_CHANNEL_MESSAGE_RESCIND_CHANNEL_OFFER,
- 0, vmbus_channel_on_offer_rescind },
+ vmbus_channel_on_offer_rescind },
{ HV_CHANNEL_MESSAGE_REQUEST_OFFERS,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_ALL_OFFERS_DELIVERED,
- 1, vmbus_channel_on_offers_delivered },
+ vmbus_channel_on_offers_delivered },
{ HV_CHANNEL_MESSAGE_OPEN_CHANNEL,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_OPEN_CHANNEL_RESULT,
- 1, vmbus_channel_on_open_result },
+ vmbus_channel_on_open_result },
{ HV_CHANNEL_MESSAGE_CLOSE_CHANNEL,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGEL_GPADL_HEADER,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_GPADL_BODY,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_GPADL_CREATED,
- 1, vmbus_channel_on_gpadl_created },
+ vmbus_channel_on_gpadl_created },
{ HV_CHANNEL_MESSAGE_GPADL_TEARDOWN,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_GPADL_TORNDOWN,
- 1, vmbus_channel_on_gpadl_torndown },
+ vmbus_channel_on_gpadl_torndown },
{ HV_CHANNEL_MESSAGE_REL_ID_RELEASED,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_INITIATED_CONTACT,
- 0, NULL },
+ NULL },
{ HV_CHANNEL_MESSAGE_VERSION_RESPONSE,
- 1, vmbus_channel_on_version_response },
+ vmbus_channel_on_version_response },
{ HV_CHANNEL_MESSAGE_UNLOAD,
- 0, NULL }
+ NULL }
};
+typedef struct hv_work_item {
+ struct task work;
+ void (*callback)(void *);
+ void* context;
+} hv_work_item;
/**
* Implementation of the work abstraction.
@@ -96,120 +103,30 @@ work_item_callback(void *work, int pending)
{
struct hv_work_item *w = (struct hv_work_item *)work;
- /*
- * Serialize work execution.
- */
- if (w->wq->work_sema != NULL) {
- sema_wait(w->wq->work_sema);
- }
-
w->callback(w->context);
- if (w->wq->work_sema != NULL) {
- sema_post(w->wq->work_sema);
- }
-
free(w, M_DEVBUF);
}
-struct hv_work_queue*
-hv_work_queue_create(char* name)
-{
- static unsigned int qid = 0;
- char qname[64];
- int pri;
- struct hv_work_queue* wq;
-
- wq = malloc(sizeof(struct hv_work_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
- KASSERT(wq != NULL, ("Error VMBUS: Failed to allocate work_queue\n"));
- if (wq == NULL)
- return (NULL);
-
- /*
- * We use work abstraction to handle messages
- * coming from the host and these are typically offers.
- * Some FreeBsd drivers appear to have a concurrency issue
- * where probe/attach needs to be serialized. We ensure that
- * by having only one thread process work elements in a
- * specific queue by serializing work execution.
- *
- */
- if (strcmp(name, "vmbusQ") == 0) {
- pri = PI_DISK;
- } else { /* control */
- pri = PI_NET;
- /*
- * Initialize semaphore for this queue by pointing
- * to the globale semaphore used for synchronizing all
- * control messages.
- */
- wq->work_sema = &hv_vmbus_g_connection.control_sema;
- }
-
- sprintf(qname, "hv_%s_%u", name, qid);
-
- /*
- * Fixme: FreeBSD 8.2 has a different prototype for
- * taskqueue_create(), and for certain other taskqueue functions.
- * We need to research the implications of these changes.
- * Fixme: Not sure when the changes were introduced.
- */
- wq->queue = taskqueue_create(qname, M_NOWAIT, taskqueue_thread_enqueue,
- &wq->queue
- #if __FreeBSD_version < 800000
- , &wq->proc
- #endif
- );
-
- if (wq->queue == NULL) {
- free(wq, M_DEVBUF);
- return (NULL);
- }
-
- if (taskqueue_start_threads(&wq->queue, 1, pri, "%s taskq", qname)) {
- taskqueue_free(wq->queue);
- free(wq, M_DEVBUF);
- return (NULL);
- }
-
- qid++;
-
- return (wq);
-}
-
-void
-hv_work_queue_close(struct hv_work_queue *wq)
-{
- /*
- * KYS: Need to drain the taskqueue
- * before we close the hv_work_queue.
- */
- /*KYS: taskqueue_drain(wq->tq, ); */
- taskqueue_free(wq->queue);
- free(wq, M_DEVBUF);
-}
-
/**
* @brief Create work item
*/
-int
+static int
hv_queue_work_item(
- struct hv_work_queue *wq,
void (*callback)(void *), void *context)
{
struct hv_work_item *w = malloc(sizeof(struct hv_work_item),
- M_DEVBUF, M_NOWAIT | M_ZERO);
+ M_DEVBUF, M_NOWAIT);
KASSERT(w != NULL, ("Error VMBUS: Failed to allocate WorkItem\n"));
if (w == NULL)
return (ENOMEM);
w->callback = callback;
w->context = context;
- w->wq = wq;
TASK_INIT(&w->work, 0, work_item_callback, w);
- return (taskqueue_enqueue(wq->queue, &w->work));
+ return (taskqueue_enqueue(taskqueue_thread, &w->work));
}
@@ -224,10 +141,7 @@ hv_vmbus_allocate_channel(void)
channel = (hv_vmbus_channel*) malloc(
sizeof(hv_vmbus_channel),
M_DEVBUF,
- M_NOWAIT | M_ZERO);
- KASSERT(channel != NULL, ("Error VMBUS: Failed to allocate channel!"));
- if (channel == NULL)
- return (NULL);
+ M_WAITOK | M_ZERO);
mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF);
mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF);
@@ -238,16 +152,6 @@ hv_vmbus_allocate_channel(void)
}
/**
- * @brief Release the vmbus channel object itself
- */
-static inline void
-ReleaseVmbusChannel(void *context)
-{
- hv_vmbus_channel* channel = (hv_vmbus_channel*) context;
- free(channel, M_DEVBUF);
-}
-
-/**
* @brief Release the resources used by the vmbus channel object
*/
void
@@ -255,13 +159,8 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
{
mtx_destroy(&channel->sc_lock);
mtx_destroy(&channel->inbound_lock);
- /*
- * We have to release the channel's workqueue/thread in
- * the vmbus's workqueue/thread context
- * ie we can't destroy ourselves
- */
- hv_queue_work_item(hv_vmbus_g_connection.work_queue,
- ReleaseVmbusChannel, (void *) channel);
+
+ free(channel, M_DEVBUF);
}
/**
@@ -459,7 +358,7 @@ static void
vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
{
hv_vmbus_channel_offer_channel* offer;
- hv_vmbus_channel* new_channel;
+ hv_vmbus_channel_offer_channel* copied;
offer = (hv_vmbus_channel_offer_channel*) hdr;
@@ -469,10 +368,25 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
guidType = &offer->offer.interface_type;
guidInstance = &offer->offer.interface_instance;
+	/* Copy the offer data */
+	copied = malloc(sizeof(*copied), M_DEVBUF, M_NOWAIT);
+	if (copied == NULL) {
+		printf("failed to allocate memory\n");
+ return;
+ }
+
+ memcpy(copied, hdr, sizeof(*copied));
+ hv_queue_work_item(vmbus_channel_on_offer_internal, copied);
+}
+
+static void
+vmbus_channel_on_offer_internal(void* context)
+{
+ hv_vmbus_channel* new_channel;
+
+ hv_vmbus_channel_offer_channel* offer = (hv_vmbus_channel_offer_channel*)context;
/* Allocate the channel object and save this offer */
new_channel = hv_vmbus_allocate_channel();
- if (new_channel == NULL)
- return;
/*
* By default we setup state to enable batched
@@ -512,6 +426,8 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32;
vmbus_channel_process_offer(new_channel);
+
+ free(offer, M_DEVBUF);
}
/**
@@ -529,13 +445,20 @@ vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr)
rescind = (hv_vmbus_channel_rescind_offer*) hdr;
channel = hv_vmbus_g_connection.channels[rescind->child_rel_id];
- if (channel == NULL)
+ if (channel == NULL)
return;
- hv_vmbus_child_device_unregister(channel->device);
- mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ hv_queue_work_item(vmbus_channel_on_offer_rescind_internal, channel);
hv_vmbus_g_connection.channels[rescind->child_rel_id] = NULL;
- mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+}
+
+static void
+vmbus_channel_on_offer_rescind_internal(void *context)
+{
+ hv_vmbus_channel* channel;
+
+ channel = (hv_vmbus_channel*)context;
+ hv_vmbus_child_device_unregister(channel->device);
}
/**
@@ -712,35 +635,6 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr)
}
/**
- * @brief Handler for channel protocol messages.
- *
- * This is invoked in the vmbus worker thread context.
- */
-void
-hv_vmbus_on_channel_message(void *context)
-{
- hv_vmbus_message* msg;
- hv_vmbus_channel_msg_header* hdr;
- int size;
-
- msg = (hv_vmbus_message*) context;
- hdr = (hv_vmbus_channel_msg_header*) msg->u.payload;
- size = msg->header.payload_size;
-
- if (hdr->message_type >= HV_CHANNEL_MESSAGE_COUNT) {
- free(msg, M_DEVBUF);
- return;
- }
-
- if (g_channel_message_table[hdr->message_type].messageHandler) {
- g_channel_message_table[hdr->message_type].messageHandler(hdr);
- }
-
- /* Free the msg that was allocated in VmbusOnMsgDPC() */
- free(msg, M_DEVBUF);
-}
-
-/**
* @brief Send a request to get all our pending offers.
*/
int
@@ -765,8 +659,7 @@ hv_vmbus_request_channel_offers(void)
ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_msg_header));
- if (msg_info)
- free(msg_info, M_DEVBUF);
+ free(msg_info, M_DEVBUF);
return (ret);
}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index cfdc9bb..fb1879d 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -90,12 +90,10 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
hv_vmbus_g_connection.interrupt_page);
msg->monitor_page_1 = hv_get_phys_addr(
- hv_vmbus_g_connection.monitor_pages);
+ hv_vmbus_g_connection.monitor_page_1);
- msg->monitor_page_2 =
- hv_get_phys_addr(
- ((uint8_t *) hv_vmbus_g_connection.monitor_pages
- + PAGE_SIZE));
+ msg->monitor_page_2 = hv_get_phys_addr(
+ hv_vmbus_g_connection.monitor_page_2);
/**
* Add to list before we send the request since we may receive the
@@ -168,8 +166,6 @@ hv_vmbus_connect(void) {
* Initialize the vmbus connection
*/
hv_vmbus_g_connection.connect_state = HV_CONNECTING;
- hv_vmbus_g_connection.work_queue = hv_work_queue_create("vmbusQ");
- sema_init(&hv_vmbus_g_connection.control_sema, 1, "control_sema");
TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor);
mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg",
@@ -183,18 +179,9 @@ hv_vmbus_connect(void) {
* Setup the vmbus event connection for channel interrupt abstraction
* stuff
*/
- hv_vmbus_g_connection.interrupt_page = contigmalloc(
+ hv_vmbus_g_connection.interrupt_page = malloc(
PAGE_SIZE, M_DEVBUF,
- M_NOWAIT | M_ZERO, 0UL,
- BUS_SPACE_MAXADDR,
- PAGE_SIZE, 0);
- KASSERT(hv_vmbus_g_connection.interrupt_page != NULL,
- ("Error VMBUS: malloc failed to allocate Channel"
- " Request Event message!"));
- if (hv_vmbus_g_connection.interrupt_page == NULL) {
- ret = ENOMEM;
- goto cleanup;
- }
+ M_WAITOK | M_ZERO);
hv_vmbus_g_connection.recv_interrupt_page =
hv_vmbus_g_connection.interrupt_page;
@@ -207,31 +194,19 @@ hv_vmbus_connect(void) {
* Set up the monitor notification facility. The 1st page for
* parent->child and the 2nd page for child->parent
*/
- hv_vmbus_g_connection.monitor_pages = contigmalloc(
- 2 * PAGE_SIZE,
+ hv_vmbus_g_connection.monitor_page_1 = malloc(
+ PAGE_SIZE,
M_DEVBUF,
- M_NOWAIT | M_ZERO,
- 0UL,
- BUS_SPACE_MAXADDR,
+ M_WAITOK | M_ZERO);
+ hv_vmbus_g_connection.monitor_page_2 = malloc(
PAGE_SIZE,
- 0);
- KASSERT(hv_vmbus_g_connection.monitor_pages != NULL,
- ("Error VMBUS: malloc failed to allocate Monitor Pages!"));
- if (hv_vmbus_g_connection.monitor_pages == NULL) {
- ret = ENOMEM;
- goto cleanup;
- }
+ M_DEVBUF,
+ M_WAITOK | M_ZERO);
msg_info = (hv_vmbus_channel_msg_info*)
malloc(sizeof(hv_vmbus_channel_msg_info) +
sizeof(hv_vmbus_channel_initiate_contact),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- KASSERT(msg_info != NULL,
- ("Error VMBUS: malloc failed for Initiate Contact message!"));
- if (msg_info == NULL) {
- ret = ENOMEM;
- goto cleanup;
- }
+ M_DEVBUF, M_WAITOK | M_ZERO);
hv_vmbus_g_connection.channels = malloc(sizeof(hv_vmbus_channel*) *
HV_CHANNEL_MAX_COUNT,
@@ -273,26 +248,16 @@ hv_vmbus_connect(void) {
hv_vmbus_g_connection.connect_state = HV_DISCONNECTED;
- hv_work_queue_close(hv_vmbus_g_connection.work_queue);
- sema_destroy(&hv_vmbus_g_connection.control_sema);
mtx_destroy(&hv_vmbus_g_connection.channel_lock);
mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock);
if (hv_vmbus_g_connection.interrupt_page != NULL) {
- contigfree(
- hv_vmbus_g_connection.interrupt_page,
- PAGE_SIZE,
- M_DEVBUF);
+ free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF);
hv_vmbus_g_connection.interrupt_page = NULL;
}
- if (hv_vmbus_g_connection.monitor_pages != NULL) {
- contigfree(
- hv_vmbus_g_connection.monitor_pages,
- 2 * PAGE_SIZE,
- M_DEVBUF);
- hv_vmbus_g_connection.monitor_pages = NULL;
- }
+ free(hv_vmbus_g_connection.monitor_page_1, M_DEVBUF);
+ free(hv_vmbus_g_connection.monitor_page_2, M_DEVBUF);
if (msg_info) {
sema_destroy(&msg_info->wait_sema);
@@ -309,108 +274,29 @@ hv_vmbus_connect(void) {
int
hv_vmbus_disconnect(void) {
int ret = 0;
- hv_vmbus_channel_unload* msg;
-
- msg = malloc(sizeof(hv_vmbus_channel_unload),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- KASSERT(msg != NULL,
- ("Error VMBUS: malloc failed to allocate Channel Unload Msg!"));
- if (msg == NULL)
- return (ENOMEM);
-
- msg->message_type = HV_CHANNEL_MESSAGE_UNLOAD;
+ hv_vmbus_channel_unload msg;
- ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_unload));
+ msg.message_type = HV_CHANNEL_MESSAGE_UNLOAD;
+ ret = hv_vmbus_post_message(&msg, sizeof(hv_vmbus_channel_unload));
- contigfree(hv_vmbus_g_connection.interrupt_page, PAGE_SIZE, M_DEVBUF);
+ free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF);
mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock);
- hv_work_queue_close(hv_vmbus_g_connection.work_queue);
- sema_destroy(&hv_vmbus_g_connection.control_sema);
-
free(hv_vmbus_g_connection.channels, M_DEVBUF);
hv_vmbus_g_connection.connect_state = HV_DISCONNECTED;
- free(msg, M_DEVBUF);
-
return (ret);
}
/**
- * Process a channel event notification
- */
-static void
-VmbusProcessChannelEvent(uint32_t relid)
-{
- void* arg;
- uint32_t bytes_to_read;
- hv_vmbus_channel* channel;
- boolean_t is_batched_reading;
-
- /**
- * Find the channel based on this relid and invokes
- * the channel callback to process the event
- */
-
- channel = hv_vmbus_g_connection.channels[relid];
-
- if (channel == NULL) {
- return;
- }
- /**
- * To deal with the race condition where we might
- * receive a packet while the relevant driver is
- * being unloaded, dispatch the callback while
- * holding the channel lock. The unloading driver
- * will acquire the same channel lock to set the
- * callback to NULL. This closes the window.
- */
-
- /*
- * Disable the lock due to newly added WITNESS check in r277723.
- * Will seek other way to avoid race condition.
- * -- whu
- */
- // mtx_lock(&channel->inbound_lock);
- if (channel->on_channel_callback != NULL) {
- arg = channel->channel_callback_context;
- is_batched_reading = channel->batched_reading;
- /*
- * Optimize host to guest signaling by ensuring:
- * 1. While reading the channel, we disable interrupts from
- * host.
- * 2. Ensure that we process all posted messages from the host
- * before returning from this callback.
- * 3. Once we return, enable signaling from the host. Once this
- * state is set we check to see if additional packets are
- * available to read. In this case we repeat the process.
- */
- do {
- if (is_batched_reading)
- hv_ring_buffer_read_begin(&channel->inbound);
-
- channel->on_channel_callback(arg);
-
- if (is_batched_reading)
- bytes_to_read =
- hv_ring_buffer_read_end(&channel->inbound);
- else
- bytes_to_read = 0;
- } while (is_batched_reading && (bytes_to_read != 0));
- }
- // mtx_unlock(&channel->inbound_lock);
-}
-
-/**
* Handler for events
*/
void
-hv_vmbus_on_events(void *arg)
+hv_vmbus_on_events(int cpu)
{
int bit;
- int cpu;
int dword;
void *page_addr;
uint32_t* recv_interrupt_page = NULL;
@@ -419,7 +305,6 @@ hv_vmbus_on_events(void *arg)
hv_vmbus_synic_event_flags *event;
/* int maxdword = PAGE_SIZE >> 3; */
- cpu = (int)(long)arg;
KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: "
"cpu out of range!"));
@@ -461,8 +346,14 @@ hv_vmbus_on_events(void *arg)
*/
continue;
} else {
- VmbusProcessChannelEvent(rel_id);
-
+ hv_vmbus_channel * channel = hv_vmbus_g_connection.channels[rel_id];
+ /* if channel is closed or closing */
+ if (channel == NULL || channel->rxq == NULL)
+ continue;
+
+ if (channel->batched_reading)
+ hv_ring_buffer_read_begin(&channel->inbound);
+ taskqueue_enqueue_fast(channel->rxq, &channel->channel_task);
}
}
}
diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c
index ca5641f..6afc2b8 100644
--- a/sys/dev/hyperv/vmbus/hv_hv.c
+++ b/sys/dev/hyperv/vmbus/hv_hv.c
@@ -189,11 +189,7 @@ hv_vmbus_init(void)
* See if the hypercall page is already set
*/
hypercall_msr.as_uint64_t = rdmsr(HV_X64_MSR_HYPERCALL);
- virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
- KASSERT(virt_addr != NULL,
- ("Error VMBUS: malloc failed to allocate page during init!"));
- if (virt_addr == NULL)
- goto cleanup;
+ virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
hypercall_msr.u.enable = 1;
hypercall_msr.u.guest_physical_address =
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
index 66a3f39..c8d6894 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -83,8 +83,6 @@ vmbus_msg_swintr(void *arg)
hv_vmbus_channel_msg_table_entry *entry;
hv_vmbus_channel_msg_type msg_type;
hv_vmbus_message* msg;
- hv_vmbus_message* copied;
- static bool warned = false;
cpu = (int)(long)arg;
KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: "
@@ -100,31 +98,15 @@ vmbus_msg_swintr(void *arg)
hdr = (hv_vmbus_channel_msg_header *)msg->u.payload;
msg_type = hdr->message_type;
- if (msg_type >= HV_CHANNEL_MESSAGE_COUNT && !warned) {
- warned = true;
+ if (msg_type >= HV_CHANNEL_MESSAGE_COUNT) {
printf("VMBUS: unknown message type = %d\n", msg_type);
goto handled;
}
entry = &g_channel_message_table[msg_type];
- if (entry->handler_no_sleep)
+ if (entry->messageHandler)
entry->messageHandler(hdr);
- else {
-
- copied = malloc(sizeof(hv_vmbus_message),
- M_DEVBUF, M_NOWAIT);
- KASSERT(copied != NULL,
- ("Error VMBUS: malloc failed to allocate"
- " hv_vmbus_message!"));
- if (copied == NULL)
- continue;
-
- memcpy(copied, msg, sizeof(hv_vmbus_message));
- hv_queue_work_item(hv_vmbus_g_connection.work_queue,
- hv_vmbus_on_channel_message,
- copied);
- }
handled:
msg->header.message_type = HV_MESSAGE_TYPE_NONE;
@@ -177,7 +159,7 @@ hv_vmbus_isr(struct trapframe *frame)
(hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
/* Since we are a child, we only need to check bit 0 */
if (synch_test_and_clear_bit(0, &event->flags32[0])) {
- swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
+ hv_vmbus_on_events(cpu);
}
} else {
/*
@@ -187,7 +169,7 @@ hv_vmbus_isr(struct trapframe *frame)
* Directly schedule the event software interrupt on
* current cpu.
*/
- swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
+ hv_vmbus_on_events(cpu);
}
/* Check if there are actual msgs to be process */
@@ -225,7 +207,6 @@ hv_vmbus_isr(struct trapframe *frame)
return (FILTER_HANDLED);
}
-uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
u_long *hv_vmbus_intr_cpu[MAXCPU];
void
@@ -310,12 +291,7 @@ hv_vmbus_child_device_create(
* Allocate the new child device
*/
child_dev = malloc(sizeof(hv_device), M_DEVBUF,
- M_NOWAIT | M_ZERO);
- KASSERT(child_dev != NULL,
- ("Error VMBUS: malloc failed to allocate hv_device!"));
-
- if (child_dev == NULL)
- return (NULL);
+ M_WAITOK | M_ZERO);
child_dev->channel = channel;
memcpy(&child_dev->class_id, &type, sizeof(hv_guid));
@@ -455,6 +431,19 @@ vmbus_vector_free(int vector)
#endif /* HYPERV */
+static void
+vmbus_cpuset_setthread_task(void *xmask, int pending __unused)
+{
+ cpuset_t *mask = xmask;
+ int error;
+
+ error = cpuset_setthread(curthread->td_tid, mask);
+ if (error) {
+ panic("curthread=%ju: can't pin; error=%d",
+ (uintmax_t)curthread->td_tid, error);
+ }
+}
+
/**
* @brief Main vmbus driver initialization routine.
*
@@ -472,6 +461,7 @@ vmbus_bus_init(void)
{
int i, j, n, ret;
char buf[MAXCOMLEN + 1];
+ cpuset_t cpu_mask;
if (vmbus_inited)
return (0);
@@ -508,10 +498,7 @@ vmbus_bus_init(void)
setup_args.vector = hv_vmbus_g_context.hv_cb_vector;
CPU_FOREACH(j) {
- hv_vmbus_swintr_event_cpu[j] = 0;
- hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
- hv_vmbus_g_context.event_swintr[j] = NULL;
hv_vmbus_g_context.msg_swintr[j] = NULL;
snprintf(buf, sizeof(buf), "cpu%d:hyperv", j);
@@ -525,6 +512,26 @@ vmbus_bus_init(void)
* Per cpu setup.
*/
CPU_FOREACH(j) {
+ struct task cpuset_task;
+
+ /*
+ * Setup taskqueue to handle events
+ */
+ hv_vmbus_g_context.hv_event_queue[j] = taskqueue_create_fast("hyperv event", M_WAITOK,
+ taskqueue_thread_enqueue, &hv_vmbus_g_context.hv_event_queue[j]);
+ if (hv_vmbus_g_context.hv_event_queue[j] == NULL) {
+ if (bootverbose)
+ printf("VMBUS: failed to setup taskqueue\n");
+ goto cleanup1;
+ }
+ taskqueue_start_threads(&hv_vmbus_g_context.hv_event_queue[j], 1, PI_NET,
+ "hvevent%d", j);
+
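+		/*
+		 * Pin this taskqueue's thread to CPU j: the task below runs
+		 * on the queue's own thread, so cpuset_setthread() there
+		 * binds that thread; the drain waits for it to complete.
+		 */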
+ CPU_SETOF(j, &cpu_mask);
+ TASK_INIT(&cpuset_task, 0, vmbus_cpuset_setthread_task, &cpu_mask);
+ taskqueue_enqueue(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task);
+ taskqueue_drain(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task);
+
/*
* Setup software interrupt thread and handler for msg handling.
*/
@@ -543,7 +550,7 @@ vmbus_bus_init(void)
*/
ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j],
j);
- if (ret) {
+ if (ret) {
if(bootverbose)
printf("VMBUS: failed to bind msg swi thread "
"to cpu %d\n", j);
@@ -551,30 +558,11 @@ vmbus_bus_init(void)
}
/*
- * Setup software interrupt thread and handler for
- * event handling.
- */
- ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j],
- "hv_event", hv_vmbus_on_events, (void *)(long)j,
- SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]);
- if (ret) {
- if(bootverbose)
- printf("VMBUS: failed to setup event swi for "
- "cpu %d\n", j);
- goto cleanup1;
- }
-
- /*
* Prepare the per cpu msg and event pages to be called on each cpu.
*/
for(i = 0; i < 2; i++) {
setup_args.page_buffers[2 * j + i] =
- malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (setup_args.page_buffers[2 * j + i] == NULL) {
- KASSERT(setup_args.page_buffers[2 * j + i] != NULL,
- ("Error VMBUS: malloc failed!"));
- goto cleanup1;
- }
+ malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
}
}
@@ -607,12 +595,11 @@ vmbus_bus_init(void)
* remove swi and vmbus callback vector;
*/
CPU_FOREACH(j) {
+ if (hv_vmbus_g_context.hv_event_queue[j] != NULL)
+ taskqueue_free(hv_vmbus_g_context.hv_event_queue[j]);
if (hv_vmbus_g_context.msg_swintr[j] != NULL)
swi_remove(hv_vmbus_g_context.msg_swintr[j]);
- if (hv_vmbus_g_context.event_swintr[j] != NULL)
- swi_remove(hv_vmbus_g_context.event_swintr[j]);
hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
- hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
}
vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
@@ -677,12 +664,11 @@ vmbus_bus_exit(void)
/* remove swi */
CPU_FOREACH(i) {
+ if (hv_vmbus_g_context.hv_event_queue[i] != NULL)
+ taskqueue_free(hv_vmbus_g_context.hv_event_queue[i]);
if (hv_vmbus_g_context.msg_swintr[i] != NULL)
swi_remove(hv_vmbus_g_context.msg_swintr[i]);
- if (hv_vmbus_g_context.event_swintr[i] != NULL)
- swi_remove(hv_vmbus_g_context.event_swintr[i]);
hv_vmbus_g_context.hv_msg_intr_event[i] = NULL;
- hv_vmbus_g_context.hv_event_intr_event[i] = NULL;
}
vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 13a35c4..5f62072 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -202,9 +202,8 @@ typedef struct {
* Each cpu has its own software interrupt handler for channel
* event and msg handling.
*/
- struct intr_event *hv_event_intr_event[MAXCPU];
+ struct taskqueue *hv_event_queue[MAXCPU];
struct intr_event *hv_msg_intr_event[MAXCPU];
- void *event_swintr[MAXCPU];
void *msg_swintr[MAXCPU];
/*
 	 * Host uses this vector to interrupt the guest for vmbus channel
@@ -351,7 +350,8 @@ typedef struct {
* notification and 2nd is child->parent
* notification
*/
- void *monitor_pages;
+ void *monitor_page_1;
+ void *monitor_page_2;
TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor;
struct mtx channel_msg_lock;
/**
@@ -363,10 +363,8 @@ typedef struct {
/**
* channel table for fast lookup through id.
- */
+ */
hv_vmbus_channel **channels;
- hv_vmbus_handle work_queue;
- struct sema control_sema;
} hv_vmbus_connection;
typedef union {
@@ -633,7 +631,6 @@ typedef void (*vmbus_msg_handler)(hv_vmbus_channel_msg_header *msg);
typedef struct hv_vmbus_channel_msg_table_entry {
hv_vmbus_channel_msg_type messageType;
- bool handler_no_sleep; /* true: the handler doesn't sleep */
vmbus_msg_handler messageHandler;
} hv_vmbus_channel_msg_table_entry;
@@ -683,7 +680,6 @@ uint32_t hv_ring_buffer_read_end(
hv_vmbus_channel* hv_vmbus_allocate_channel(void);
void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel);
-void hv_vmbus_on_channel_message(void *context);
int hv_vmbus_request_channel_offers(void);
void hv_vmbus_release_unattached_channels(void);
int hv_vmbus_init(void);
@@ -717,7 +713,7 @@ int hv_vmbus_connect(void);
int hv_vmbus_disconnect(void);
int hv_vmbus_post_message(void *buffer, size_t buf_size);
int hv_vmbus_set_event(hv_vmbus_channel *channel);
-void hv_vmbus_on_events(void *);
+void hv_vmbus_on_events(int cpu);
/**
* Event Timer interfaces
diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c
index aff048a..cf48c25 100644
--- a/sys/dev/ioat/ioat.c
+++ b/sys/dev/ioat/ioat.c
@@ -152,8 +152,8 @@ MODULE_VERSION(ioat, 1);
* Private data structures
*/
static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS];
-static int ioat_channel_index = 0;
-SYSCTL_INT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0,
+static unsigned ioat_channel_index = 0;
+SYSCTL_UINT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0,
"Number of IOAT channels attached");
static struct _pcsid
@@ -407,7 +407,7 @@ ioat3_attach(device_t device)
ioat = DEVICE2SOFTC(device);
ioat->capabilities = ioat_read_dmacapability(ioat);
- ioat_log_message(1, "Capabilities: %b\n", (int)ioat->capabilities,
+ ioat_log_message(0, "Capabilities: %b\n", (int)ioat->capabilities,
IOAT_DMACAP_STR);
xfercap = ioat_read_xfercap(ioat);
@@ -742,6 +742,13 @@ ioat_reset_hw_task(void *ctx, int pending __unused)
/*
* User API functions
*/
+unsigned
+ioat_get_nchannels(void)
+{
+
+ return (ioat_channel_index);
+}
+
bus_dmaengine_t
ioat_get_dmaengine(uint32_t index, int flags)
{
diff --git a/sys/dev/ioat/ioat.h b/sys/dev/ioat/ioat.h
index 2e10124..9a0c3e3b 100644
--- a/sys/dev/ioat/ioat.h
+++ b/sys/dev/ioat/ioat.h
@@ -85,6 +85,8 @@ typedef void *bus_dmaengine_t;
struct bus_dmadesc;
typedef void (*bus_dmaengine_callback_t)(void *arg, int error);
+unsigned ioat_get_nchannels(void);
+
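A consumer can use the new accessor to size per-channel state before acquiring engines. A hedged usage sketch, assuming the flags argument follows the M_NOWAIT/M_WAITOK convention and pairing with the existing ioat_put_dmaengine() release call:

	unsigned i, n;

	n = ioat_get_nchannels();
	for (i = 0; i < n; i++) {
		bus_dmaengine_t eng;

		/* Sketch only: flags assumed to take M_NOWAIT/M_WAITOK. */
		eng = ioat_get_dmaengine(i, M_NOWAIT);
		if (eng == NULL)
			continue;
		/* ... queue operations on this channel ... */
		ioat_put_dmaengine(eng);
	}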
/*
* Called first to acquire a reference to the DMA channel
*
diff --git a/sys/dev/ioat/ioat_internal.h b/sys/dev/ioat/ioat_internal.h
index 322671c..9d0708d 100644
--- a/sys/dev/ioat/ioat_internal.h
+++ b/sys/dev/ioat/ioat_internal.h
@@ -455,7 +455,7 @@ struct ioat_softc {
})
int version;
- int chan_idx;
+ unsigned chan_idx;
struct mtx submit_lock;
device_t device;
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.c b/sys/dev/ntb/ntb_hw/ntb_hw.c
index a4c460e..1ef9749 100644
--- a/sys/dev/ntb/ntb_hw/ntb_hw.c
+++ b/sys/dev/ntb/ntb_hw/ntb_hw.c
@@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$");
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/pciio.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/sbuf.h>
@@ -42,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
+#include <machine/intr_machdep.h>
#include <machine/pmap.h>
#include <machine/resource.h>
#include <dev/pci/pcireg.h>
@@ -70,6 +73,19 @@ __FBSDID("$FreeBSD$");
#define DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev))
+#define NTB_MSIX_VER_GUARD 0xaabbccdd
+#define NTB_MSIX_RECEIVED 0xe0f0e0f0
+#define ONE_MB (1024u * 1024)
+
+/*
+ * PCI constants could be somewhere more generic, but aren't defined/used in
+ * pci.c.
+ */
+#define PCI_MSIX_ENTRY_SIZE 16
+#define PCI_MSIX_ENTRY_LOWER_ADDR 0
+#define PCI_MSIX_ENTRY_UPPER_ADDR 4
+#define PCI_MSIX_ENTRY_DATA 8
+
enum ntb_device_type {
NTB_XEON,
NTB_ATOM
@@ -95,6 +111,18 @@ enum ntb_bar {
NTB_MAX_BARS
};
+enum {
+ NTB_MSIX_GUARD = 0,
+ NTB_MSIX_DATA0,
+ NTB_MSIX_DATA1,
+ NTB_MSIX_DATA2,
+ NTB_MSIX_OFS0,
+ NTB_MSIX_OFS1,
+ NTB_MSIX_OFS2,
+ NTB_MSIX_DONE,
+ NTB_MAX_MSIX_SPAD
+};
+
/* Device features and workarounds */
#define HAS_FEATURE(feature) \
((ntb->features & (feature)) != 0)
@@ -131,6 +159,7 @@ struct ntb_int_info {
struct ntb_vec {
struct ntb_softc *ntb;
uint32_t num;
+ unsigned masked;
};
struct ntb_reg {
@@ -169,6 +198,11 @@ struct ntb_b2b_addr {
uint64_t bar5_addr32;
};
+struct ntb_msix_data {
+ uint32_t nmd_ofs;
+ uint32_t nmd_data;
+};
+
struct ntb_softc {
device_t device;
enum ntb_device_type type;
@@ -178,6 +212,13 @@ struct ntb_softc {
struct ntb_int_info int_info[MAX_MSIX_INTERRUPTS];
uint32_t allocated_interrupts;
+ struct ntb_msix_data peer_msix_data[XEON_NONLINK_DB_MSIX_BITS];
+ struct ntb_msix_data msix_data[XEON_NONLINK_DB_MSIX_BITS];
+ bool peer_msix_good;
+ bool peer_msix_done;
+ struct ntb_pci_bar_info *peer_lapic_bar;
+ struct callout peer_msix_work;
+
struct callout heartbeat_timer;
struct callout lr_timer;
@@ -198,6 +239,7 @@ struct ntb_softc {
/* Memory window used to access peer bar0 */
#define B2B_MW_DISABLED UINT8_MAX
uint8_t b2b_mw_idx;
+ uint8_t msix_mw_idx;
uint8_t mw_count;
uint8_t spad_count;
@@ -292,6 +334,8 @@ static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t);
static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t);
static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
static void ntb_free_msix_vec(struct ntb_softc *ntb);
+static void ntb_get_msix_info(struct ntb_softc *ntb);
+static void ntb_exchange_msix(void *);
static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id);
static void ntb_detect_max_mw(struct ntb_softc *ntb);
static int ntb_detect_xeon(struct ntb_softc *ntb);
@@ -308,7 +352,9 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr,
enum ntb_bar idx);
static int xeon_setup_b2b_mw(struct ntb_softc *,
const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr);
+static int xeon_setup_msix_bar(struct ntb_softc *);
static inline bool link_is_up(struct ntb_softc *ntb);
+static inline bool _xeon_link_is_up(struct ntb_softc *ntb);
static inline bool atom_link_is_err(struct ntb_softc *ntb);
static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *);
static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *);
@@ -319,6 +365,8 @@ static bool ntb_poll_link(struct ntb_softc *ntb);
static void save_bar_parameters(struct ntb_pci_bar_info *bar);
static void ntb_sysctl_init(struct ntb_softc *);
static int sysctl_handle_features(SYSCTL_HANDLER_ARGS);
+static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS);
+static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_link_status(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_register(SYSCTL_HANDLER_ARGS);
@@ -397,6 +445,13 @@ ntb_vm_memattr_to_str(vm_memattr_t pat)
}
}
+static int g_ntb_msix_idx = 0;
+SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, CTLFLAG_RDTUN, &g_ntb_msix_idx,
+ 0, "Use this memory window to access the peer MSIX message complex on "
+ "certain Xeon-based NTB systems, as a workaround for a hardware errata. "
+ "Like b2b_mw_idx, negative values index from the last available memory "
+ "window. (Applies on Xeon platforms with SB01BASE_LOCKUP errata.)");
+
static int g_ntb_mw_idx = -1;
TUNABLE_INT("hw.ntb.b2b_mw_idx", &g_ntb_mw_idx);
SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx,
@@ -604,10 +659,12 @@ ntb_attach(device_t device)
ntb->type = p->type;
ntb->features = p->features;
ntb->b2b_mw_idx = B2B_MW_DISABLED;
+ ntb->msix_mw_idx = B2B_MW_DISABLED;
/* Heartbeat timer for NTB_ATOM since there is no link interrupt */
callout_init(&ntb->heartbeat_timer, CALLOUT_MPSAFE);
callout_init(&ntb->lr_timer, CALLOUT_MPSAFE);
+ callout_init(&ntb->peer_msix_work, 1);
mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN);
mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF);
@@ -632,6 +689,8 @@ ntb_attach(device_t device)
if (error != 0)
goto out;
+ ntb_spad_clear(ntb);
+
ntb_poll_link(ntb);
ntb_sysctl_init(ntb);
@@ -649,10 +708,14 @@ ntb_detach(device_t device)
ntb = DEVICE2SOFTC(device);
- if (ntb->self_reg != NULL)
- ntb_db_set_mask(ntb, ntb->db_valid_mask);
+ if (ntb->self_reg != NULL) {
+ DB_MASK_LOCK(ntb);
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_valid_mask);
+ DB_MASK_UNLOCK(ntb);
+ }
callout_drain(&ntb->heartbeat_timer);
callout_drain(&ntb->lr_timer);
+ callout_drain(&ntb->peer_msix_work);
pci_disable_busmaster(ntb->device);
if (ntb->type == NTB_XEON)
ntb_teardown_xeon(ntb);
@@ -978,9 +1041,12 @@ ntb_init_isr(struct ntb_softc *ntb)
ntb->last_ts = ticks;
/*
- * Mask all doorbell interrupts.
+ * Mask all doorbell interrupts. (Except link events!)
*/
- ntb_db_set_mask(ntb, ntb->db_valid_mask);
+ DB_MASK_LOCK(ntb);
+ ntb->db_mask = ntb->db_valid_mask;
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ DB_MASK_UNLOCK(ntb);
num_vectors = desired_vectors = MIN(pci_msix_count(ntb->device),
ntb->db_count);
@@ -1005,12 +1071,28 @@ ntb_init_isr(struct ntb_softc *ntb)
num_vectors = 1;
if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) {
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ device_printf(ntb->device,
+ "Errata workaround does not support MSI or INTX\n");
+ return (EINVAL);
+ }
+
ntb->db_vec_count = 1;
ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT;
rc = ntb_setup_legacy_interrupt(ntb);
} else {
+ if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS &&
+ HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ device_printf(ntb->device,
+ "Errata workaround expects %d doorbell bits\n",
+ XEON_NONLINK_DB_MSIX_BITS);
+ return (EINVAL);
+ }
+
ntb_create_msix_vec(ntb, num_vectors);
rc = ntb_setup_msix(ntb, num_vectors);
+ if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ ntb_get_msix_info(ntb);
}
if (rc != 0) {
device_printf(ntb->device,
@@ -1116,6 +1198,9 @@ void
ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits)
{
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ return;
+
DB_MASK_LOCK(ntb);
ntb->db_mask |= bits;
db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
@@ -1131,6 +1216,9 @@ ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits)
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ return;
+
DB_MASK_LOCK(ntb);
ntb->db_mask &= ~bits;
db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
@@ -1141,6 +1229,18 @@ uint64_t
ntb_db_read(struct ntb_softc *ntb)
{
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ uint64_t res;
+ unsigned i;
+
+ res = 0;
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ if (ntb->msix_vec[i].masked != 0)
+ res |= ntb_db_vector_mask(ntb, i);
+ }
+ return (res);
+ }
+
return (db_ioread(ntb, ntb->self_reg->db_bell));
}
@@ -1153,6 +1253,25 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits)
(uintmax_t)(bits & ~ntb->db_valid_mask),
(uintmax_t)ntb->db_valid_mask));
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ unsigned i;
+
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ if ((bits & ntb_db_vector_mask(ntb, i)) != 0) {
+ DB_MASK_LOCK(ntb);
+ if (ntb->msix_vec[i].masked != 0) {
+ /* XXX These need a public API. */
+#if 0
+ pci_unmask_msix(ntb->device, i);
+#endif
+ ntb->msix_vec[i].masked = 0;
+ }
+ DB_MASK_UNLOCK(ntb);
+ }
+ }
+ return;
+ }
+
db_iowrite(ntb, ntb->self_reg->db_bell, bits);
}
@@ -1179,6 +1298,19 @@ ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
ntb_link_event(ntb);
}
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+ (vec_mask & ntb->db_link_mask) == 0) {
+ DB_MASK_LOCK(ntb);
+ if (ntb->msix_vec[vec].masked == 0) {
+ /* XXX These need a public API. */
+#if 0
+ pci_mask_msix(ntb->device, vec);
+#endif
+ ntb->msix_vec[vec].masked = 1;
+ }
+ DB_MASK_UNLOCK(ntb);
+ }
+
if ((vec_mask & ntb->db_valid_mask) != 0)
ntb_db_event(ntb, vec);
}
@@ -1224,6 +1356,40 @@ ntb_free_msix_vec(struct ntb_softc *ntb)
ntb->msix_vec = NULL;
}
+static void
+ntb_get_msix_info(struct ntb_softc *ntb)
+{
+ struct pci_devinfo *dinfo;
+ struct pcicfg_msix *msix;
+ uint32_t laddr, data, i, offset;
+
+ dinfo = device_get_ivars(ntb->device);
+ msix = &dinfo->cfg.msix;
+
+ laddr = data = 0;
+
+ CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data));
+
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ offset = msix->msix_table_offset + i * PCI_MSIX_ENTRY_SIZE;
+
+ laddr = bus_read_4(msix->msix_table_res, offset +
+ PCI_MSIX_ENTRY_LOWER_ADDR);
+ ntb_printf(2, "local lower MSIX addr(%u): 0x%x\n", i, laddr);
+
+ KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE,
+ ("local MSIX addr 0x%x not in MSI base 0x%x", laddr,
+ MSI_INTEL_ADDR_BASE));
+ ntb->msix_data[i].nmd_ofs = laddr & ~MSI_INTEL_ADDR_BASE;
+
+ data = bus_read_4(msix->msix_table_res, offset +
+ PCI_MSIX_ENTRY_DATA);
+ ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data);
+
+ ntb->msix_data[i].nmd_data = data;
+ }
+}
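The (offset, data) pairs captured here are exchanged with the peer over scratchpad registers (ntb_exchange_msix() below); the peer then raises the corresponding MSI by writing nmd_data at nmd_ofs through its LAPIC memory window (ntb_peer_db_set() below). A standalone sketch of how one table entry decomposes, with made-up values; 0xfee00000 stands in for MSI_INTEL_ADDR_BASE on x86:

#include <stdint.h>
#include <stdio.h>

#define EX_MSI_ADDR_BASE	0xfee00000u	/* assumed MSI_INTEL_ADDR_BASE */
#define EX_MSIX_ENTRY_SIZE	16		/* PCI_MSIX_ENTRY_SIZE above */

int
main(void)
{
	uint32_t table_offset = 0x2000;	/* hypothetical msix_table_offset */
	uint32_t laddr = 0xfee0100cu;	/* hypothetical lower address, entry 1 */
	unsigned i = 1;

	printf("entry %u read at table offset 0x%x\n",
	    i, table_offset + i * EX_MSIX_ENTRY_SIZE);
	printf("nmd_ofs = 0x%x (laddr & ~base)\n", laddr & ~EX_MSI_ADDR_BASE);
	return (0);
}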
+
static struct ntb_hw_info *
ntb_get_device_info(uint32_t device_id)
{
@@ -1276,9 +1442,12 @@ ntb_detect_xeon(struct ntb_softc *ntb)
if ((ppd & XEON_PPD_SPLIT_BAR) != 0)
ntb->features |= NTB_SPLIT_BAR;
- /* SB01BASE_LOCKUP errata is a superset of SDOORBELL errata */
+ /*
+ * SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP
+ * errata workaround; only do one at a time.
+ */
if (HAS_FEATURE(NTB_SB01BASE_LOCKUP))
- ntb->features |= NTB_SDOORBELL_LOCKUP;
+ ntb->features &= ~NTB_SDOORBELL_LOCKUP;
conn_type = ppd & XEON_PPD_CONN_TYPE;
switch (conn_type) {
@@ -1342,19 +1511,28 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
ntb->peer_reg = &xeon_b2b_reg;
ntb->xlat_reg = &xeon_sec_xlat;
- /*
- * There is a Xeon hardware errata related to writes to SDOORBELL or
- * B2BDOORBELL in conjunction with inbound access to NTB MMIO space,
- * which may hang the system. To workaround this, use a memory
- * window to access the interrupt and scratch pad registers on the
- * remote system.
- */
- if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) %
+ ntb->mw_count;
+ ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
+ g_ntb_msix_idx, ntb->msix_mw_idx);
+ rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
+ VM_MEMATTR_UNCACHEABLE);
+ KASSERT(rc == 0, ("shouldn't fail"));
+ } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
+ /*
+		 * There is a Xeon hardware erratum related to writes to SDOORBELL or
+		 * B2BDOORBELL in conjunction with inbound access to NTB MMIO space,
+		 * which may hang the system.  To work around this, use a memory
+ * window to access the interrupt and scratch pad registers on the
+ * remote system.
+ */
ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) %
ntb->mw_count;
ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
g_ntb_mw_idx, ntb->b2b_mw_idx);
- rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, VM_MEMATTR_UNCACHEABLE);
+ rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
+ VM_MEMATTR_UNCACHEABLE);
KASSERT(rc == 0, ("shouldn't fail"));
} else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14))
/*
@@ -1385,7 +1563,14 @@ ntb_xeon_init_dev(struct ntb_softc *ntb)
/*
* Mask all doorbell interrupts.
*/
- ntb_db_set_mask(ntb, ntb->db_valid_mask);
+ DB_MASK_LOCK(ntb);
+ ntb->db_mask = ntb->db_valid_mask;
+ db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
+ DB_MASK_UNLOCK(ntb);
+
+ rc = xeon_setup_msix_bar(ntb);
+ if (rc != 0)
+ return (rc);
rc = ntb_init_isr(ntb);
return (rc);
@@ -1489,6 +1674,15 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx,
bar_sz--;
else
bar_sz = 0;
+ } else if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+ ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx) {
+ /* Restrict LAPIC BAR to 1MB */
+ pci_write_config(ntb->device, bar->psz_off, 20, 1);
+ pci_write_config(ntb->device, bar->ssz_off, 20, 1);
+ bar_sz = pci_read_config(ntb->device, bar->psz_off, 1);
+ bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1);
+ (void)bar_sz;
+ return;
}
pci_write_config(ntb->device, bar->ssz_off, bar_sz, 1);
bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1);
@@ -1499,28 +1693,37 @@ static void
xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr,
enum ntb_bar idx, enum ntb_bar regbar)
{
- uint64_t reg_val;
+ uint64_t reg_val, lmt_addr;
uint32_t base_reg, lmt_reg;
bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg);
if (idx == regbar)
bar_addr += ntb->b2b_off;
+ lmt_addr = bar_addr;
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) &&
+ ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx)
+ lmt_addr += ONE_MB;
+
+ /*
+ * Set limit registers first to avoid an errata where setting the base
+ * registers locks the limit registers.
+ */
if (!bar_is_64bit(ntb, idx)) {
- ntb_reg_write(4, base_reg, bar_addr);
- reg_val = ntb_reg_read(4, base_reg);
+ ntb_reg_write(4, lmt_reg, lmt_addr);
+ reg_val = ntb_reg_read(4, lmt_reg);
(void)reg_val;
- ntb_reg_write(4, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(4, lmt_reg);
+ ntb_reg_write(4, base_reg, bar_addr);
+ reg_val = ntb_reg_read(4, base_reg);
(void)reg_val;
} else {
- ntb_reg_write(8, base_reg, bar_addr);
- reg_val = ntb_reg_read(8, base_reg);
+ ntb_reg_write(8, lmt_reg, lmt_addr);
+ reg_val = ntb_reg_read(8, lmt_reg);
(void)reg_val;
- ntb_reg_write(8, lmt_reg, bar_addr);
- reg_val = ntb_reg_read(8, lmt_reg);
+ ntb_reg_write(8, base_reg, bar_addr);
+ reg_val = ntb_reg_read(8, base_reg);
(void)reg_val;
}
}
@@ -1542,6 +1745,37 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx)
}
static int
+xeon_setup_msix_bar(struct ntb_softc *ntb)
+{
+ struct ntb_pci_bar_info *lapic_bar;
+ enum ntb_bar bar_num;
+ int rc;
+
+ if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP))
+ return (0);
+
+ bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
+ lapic_bar = &ntb->bar_info[bar_num];
+
+ /* Restrict LAPIC BAR to 1MB */
+ if (lapic_bar->size > ONE_MB) {
+ rc = bus_adjust_resource(ntb->device, SYS_RES_MEMORY,
+ lapic_bar->pci_resource, lapic_bar->pbase,
+ lapic_bar->pbase + ONE_MB - 1);
+ if (rc == 0)
+ lapic_bar->size = ONE_MB;
+ else {
+ ntb_printf(0, "Failed to shrink LAPIC BAR resource to "
+ "1 MB: %d\n", rc);
+ /* Ignore error */
+ }
+ }
+
+ ntb->peer_lapic_bar = lapic_bar;
+ return (0);
+}
+
+static int
xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
const struct ntb_b2b_addr *peer_addr)
{
@@ -1619,6 +1853,43 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ size_t size, xlatoffset;
+
+ switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) {
+ case NTB_B2B_BAR_1:
+ size = 8;
+ xlatoffset = XEON_SBAR2XLAT_OFFSET;
+ break;
+ case NTB_B2B_BAR_2:
+ xlatoffset = XEON_SBAR4XLAT_OFFSET;
+ if (HAS_FEATURE(NTB_SPLIT_BAR))
+ size = 4;
+ else
+ size = 8;
+ break;
+ case NTB_B2B_BAR_3:
+ xlatoffset = XEON_SBAR5XLAT_OFFSET;
+ size = 4;
+ break;
+ default:
+ KASSERT(false, ("Bogus msix mw idx: %u",
+ ntb->msix_mw_idx));
+ return (EINVAL);
+ }
+
+ /*
+		 * We point the chosen MSIX MW BAR xlat to the remote LAPIC as
+		 * part of the errata workaround.
+ */
+ if (size == 4)
+ ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE);
+ else
+ ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE);
+ }
+ (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
+ (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
+
/* Zero outgoing translation limits (whole bar size windows) */
ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
@@ -1656,14 +1927,21 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
}
static inline bool
+_xeon_link_is_up(struct ntb_softc *ntb)
+{
+
+ if (ntb->conn_type == NTB_CONN_TRANSPARENT)
+ return (true);
+ return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0);
+}
+
+static inline bool
link_is_up(struct ntb_softc *ntb)
{
- if (ntb->type == NTB_XEON) {
- if (ntb->conn_type == NTB_CONN_TRANSPARENT)
- return (true);
- return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0);
- }
+ if (ntb->type == NTB_XEON)
+ return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good ||
+ !HAS_FEATURE(NTB_SB01BASE_LOCKUP)));
KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) == 0);
@@ -1881,6 +2159,8 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused,
{
uint32_t cntl;
+ ntb_printf(2, "%s\n", __func__);
+
if (ntb->type == NTB_ATOM) {
pci_write_config(ntb->device, NTB_PPD_OFFSET,
ntb->ppd | ATOM_PPD_INIT_LINK, 4);
@@ -1919,6 +2199,8 @@ ntb_link_disable(struct ntb_softc *ntb)
{
uint32_t cntl;
+ ntb_printf(2, "%s\n", __func__);
+
if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
ntb_link_event(ntb);
return (0);
@@ -1934,6 +2216,23 @@ ntb_link_disable(struct ntb_softc *ntb)
return (0);
}
+bool
+ntb_link_enabled(struct ntb_softc *ntb)
+{
+ uint32_t cntl;
+
+ if (ntb->type == NTB_ATOM) {
+ cntl = pci_read_config(ntb->device, NTB_PPD_OFFSET, 4);
+ return ((cntl & ATOM_PPD_INIT_LINK) != 0);
+ }
+
+ if (ntb->conn_type == NTB_CONN_TRANSPARENT)
+ return (true);
+
+ cntl = ntb_reg_read(4, ntb->reg->ntb_ctl);
+ return ((cntl & NTB_CNTL_LINK_DISABLE) == 0);
+}
+
static void
recover_atom_link(void *arg)
{
@@ -2002,6 +2301,19 @@ ntb_poll_link(struct ntb_softc *ntb)
return (false);
ntb->lnk_sta = reg_val;
+
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ if (_xeon_link_is_up(ntb)) {
+ if (!ntb->peer_msix_good) {
+ callout_reset(&ntb->peer_msix_work, 0,
+ ntb_exchange_msix, ntb);
+ return (false);
+ }
+ } else {
+ ntb->peer_msix_good = false;
+ ntb->peer_msix_done = false;
+ }
+ }
}
return (true);
}
@@ -2040,16 +2352,26 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0,
static void
ntb_sysctl_init(struct ntb_softc *ntb)
{
- struct sysctl_oid_list *tree_par, *regpar, *statpar, *errpar;
+ struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree, *tmptree;
ctx = device_get_sysctl_ctx(ntb->device);
-
- tree = SYSCTL_ADD_NODE(ctx,
- SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device)), OID_AUTO,
- "debug_info", CTLFLAG_RD, NULL,
- "Driver state, statistics, and HW registers");
+ globals = SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device));
+
+ SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "link_status",
+ CTLFLAG_RD | CTLTYPE_STRING, ntb, 0,
+ sysctl_handle_link_status_human, "A",
+ "Link status (human readable)");
+ SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "active",
+ CTLFLAG_RD | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_status,
+ "IU", "Link status (1=active, 0=inactive)");
+ SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "admin_up",
+ CTLFLAG_RW | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_admin,
+ "IU", "Set/get interface status (1=UP, 0=DOWN)");
+
+ tree = SYSCTL_ADD_NODE(ctx, globals, OID_AUTO, "debug_info",
+ CTLFLAG_RD, NULL, "Driver state, statistics, and HW registers");
tree_par = SYSCTL_CHILDREN(tree);
SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "conn_type", CTLFLAG_RD,
@@ -2081,10 +2403,6 @@ ntb_sysctl_init(struct ntb_softc *ntb)
__DEVOLATILE(uint32_t *, &ntb->lnk_sta), 0,
"LNK STA register (cached)");
- SYSCTL_ADD_PROC(ctx, tree_par, OID_AUTO, "link_status",
- CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, sysctl_handle_link_status,
- "A", "Link status");
-
#ifdef notyet
SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "mw_count", CTLFLAG_RD,
&ntb->mw_count, 0, "MW count");
@@ -2332,7 +2650,37 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS)
}
static int
-sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
+sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
+{
+ struct ntb_softc *ntb;
+ unsigned old, new;
+ int error;
+
+ error = 0;
+ ntb = arg1;
+
+ old = ntb_link_enabled(ntb);
+
+ error = SYSCTL_OUT(req, &old, sizeof(old));
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ error = SYSCTL_IN(req, &new, sizeof(new));
+ if (error != 0)
+ return (error);
+
+ ntb_printf(0, "Admin set interface state to '%sabled'\n",
+ (new != 0)? "en" : "dis");
+
+ if (new != 0)
+ error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+ else
+ error = ntb_link_disable(ntb);
+ return (error);
+}
+
+static int
+sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
{
struct ntb_softc *ntb;
struct sbuf sb;
@@ -2360,6 +2708,24 @@ sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
}
static int
+sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
+{
+ struct ntb_softc *ntb;
+ unsigned res;
+ int error;
+
+ error = 0;
+ ntb = arg1;
+
+ res = ntb_link_is_up(ntb, NULL, NULL);
+
+ error = SYSCTL_OUT(req, &res, sizeof(res));
+ if (error || !req->newptr)
+ return (error);
+ return (EINVAL);
+}
+
+static int
sysctl_handle_register(SYSCTL_HANDLER_ARGS)
{
struct ntb_softc *ntb;
@@ -2434,12 +2800,70 @@ static unsigned
ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
{
- if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
- uidx >= ntb->b2b_mw_idx)
- return (uidx + 1);
+ if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
+ uidx >= ntb->b2b_mw_idx) ||
+ (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx))
+ uidx++;
+ if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
+ uidx >= ntb->b2b_mw_idx) &&
+ (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx))
+ uidx++;
return (uidx);
}
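The two adjustments above handle the case where both a B2B window and an MSIX window are reserved. A standalone sketch of the resulting mapping for a hypothetical device with b2b_mw_idx = 1 (whole window consumed, b2b_off == 0) and msix_mw_idx = 2:

#include <stdio.h>

#define EX_MW_DISABLED	0xff	/* stands in for B2B_MW_DISABLED */

/* Mirror of ntb_user_mw_to_idx() above, with b2b_off assumed to be 0. */
static unsigned
ex_user_mw_to_idx(unsigned uidx, unsigned b2b_idx, unsigned msix_idx)
{
	if ((b2b_idx != EX_MW_DISABLED && uidx >= b2b_idx) ||
	    (msix_idx != EX_MW_DISABLED && uidx >= msix_idx))
		uidx++;
	if ((b2b_idx != EX_MW_DISABLED && uidx >= b2b_idx) &&
	    (msix_idx != EX_MW_DISABLED && uidx >= msix_idx))
		uidx++;
	return (uidx);
}

int
main(void)
{
	unsigned u;

	/*
	 * With internal windows 1 and 2 reserved, the two user-visible
	 * windows map to internal windows 0 and 3.
	 */
	for (u = 0; u < 2; u++)
		printf("user mw %u -> internal mw %u\n",
		    u, ex_user_mw_to_idx(u, 1, 2));
	return (0);
}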
+static void
+ntb_exchange_msix(void *ctx)
+{
+ struct ntb_softc *ntb;
+ uint32_t val;
+ unsigned i;
+
+ ntb = ctx;
+
+ if (ntb->peer_msix_done)
+ goto msix_done;
+
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i,
+ ntb->msix_data[i].nmd_data);
+ ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i,
+ ntb->msix_data[i].nmd_ofs);
+ }
+ ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
+
+ ntb_spad_read(ntb, NTB_MSIX_GUARD, &val);
+ if (val != NTB_MSIX_VER_GUARD)
+ goto reschedule;
+
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val);
+ ntb->peer_msix_data[i].nmd_data = val;
+ ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val);
+ ntb->peer_msix_data[i].nmd_ofs = val;
+ }
+
+ ntb->peer_msix_done = true;
+
+msix_done:
+ ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
+ ntb_spad_read(ntb, NTB_MSIX_DONE, &val);
+ if (val != NTB_MSIX_RECEIVED)
+ goto reschedule;
+
+ ntb->peer_msix_good = true;
+
+ ntb_poll_link(ntb);
+ ntb_link_event(ntb);
+ return;
+
+reschedule:
+ ntb->lnk_sta = pci_read_config(ntb->device, ntb->reg->lnk_sta, 2);
+ if (_xeon_link_is_up(ntb))
+ callout_reset(&ntb->peer_msix_work, hz / 100, ntb_exchange_msix, ntb);
+ else
+ ntb_spad_clear(ntb);
+}
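The exchange is a scratchpad handshake: each side publishes its MSI-X (data, offset) pairs into the peer's scratchpads, sets the guard word last, and keeps rescheduling itself until it has both seen the peer's guard and received the DONE/RECEIVED acknowledgement. A standalone sketch of that ordering, with plain arrays standing in for the scratchpad registers; only one vector slot is modelled, the slot names abbreviate the NTB_MSIX_* enum above, and all values are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { EX_GUARD, EX_DATA0, EX_OFS0, EX_DONE, EX_NSPAD };
#define EX_VER_GUARD	0xaabbccdd	/* NTB_MSIX_VER_GUARD */
#define EX_RECEIVED	0xe0f0e0f0	/* NTB_MSIX_RECEIVED */

struct ex_side {
	uint32_t spad[EX_NSPAD];	/* local scratchpads; the peer writes these */
	struct ex_side *peer;
	bool done, good;
};

/* One invocation of the exchange, mirroring ntb_exchange_msix() above. */
static void
ex_exchange_step(struct ex_side *s, uint32_t data, uint32_t ofs)
{
	if (!s->done) {
		s->peer->spad[EX_DATA0] = data;		/* ntb_peer_spad_write() */
		s->peer->spad[EX_OFS0] = ofs;
		s->peer->spad[EX_GUARD] = EX_VER_GUARD;
		if (s->spad[EX_GUARD] != EX_VER_GUARD)
			return;				/* reschedule */
		s->done = true;				/* peer's pairs now in s->spad[] */
	}
	s->peer->spad[EX_DONE] = EX_RECEIVED;
	if (s->spad[EX_DONE] != EX_RECEIVED)
		return;					/* reschedule */
	s->good = true;
}

int
main(void)
{
	struct ex_side a = {{0}}, b = {{0}};
	int rounds = 0;

	a.peer = &b;
	b.peer = &a;
	while (!a.good || !b.good) {
		ex_exchange_step(&a, 0x4021, 0x45);
		ex_exchange_step(&b, 0x4022, 0x55);
		rounds++;
	}
	printf("both sides good after %d rounds; A sees peer data 0x%x\n",
	    rounds, a.spad[EX_DATA0]);
	return (0);
}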
+
/*
* Public API to the rest of the OS
*/
@@ -2469,10 +2893,14 @@ ntb_get_max_spads(struct ntb_softc *ntb)
uint8_t
ntb_mw_count(struct ntb_softc *ntb)
{
+ uint8_t res;
+ res = ntb->mw_count;
if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0)
- return (ntb->mw_count - 1);
- return (ntb->mw_count);
+ res--;
+ if (ntb->msix_mw_idx != B2B_MW_DISABLED)
+ res--;
+ return (res);
}
/**
@@ -2498,6 +2926,18 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val)
return (0);
}
+/*
+ * Zeros the local scratchpad.
+ */
+void
+ntb_spad_clear(struct ntb_softc *ntb)
+{
+ unsigned i;
+
+ for (i = 0; i < ntb->spad_count; i++)
+ ntb_spad_write(ntb, i, 0);
+}
+
/**
* ntb_spad_read() - read from the primary scratchpad register
* @ntb: pointer to ntb_softc instance
@@ -2826,6 +3266,22 @@ void
ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit)
{
+ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) {
+ struct ntb_pci_bar_info *lapic;
+ unsigned i;
+
+ lapic = ntb->peer_lapic_bar;
+
+ for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
+ if ((bit & ntb_db_vector_mask(ntb, i)) != 0)
+ bus_space_write_4(lapic->pci_bus_tag,
+ lapic->pci_bus_handle,
+ ntb->peer_msix_data[i].nmd_ofs,
+ ntb->peer_msix_data[i].nmd_data);
+ }
+ return;
+ }
+
if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) {
ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
return;
diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h
index c35166c..f05acda 100644
--- a/sys/dev/ntb/ntb_hw/ntb_hw.h
+++ b/sys/dev/ntb/ntb_hw/ntb_hw.h
@@ -70,6 +70,7 @@ bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *);
void ntb_link_event(struct ntb_softc *);
int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width);
int ntb_link_disable(struct ntb_softc *);
+bool ntb_link_enabled(struct ntb_softc *);
int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *);
void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **);
@@ -86,6 +87,7 @@ int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode);
int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode);
uint8_t ntb_get_max_spads(struct ntb_softc *ntb);
+void ntb_spad_clear(struct ntb_softc *ntb);
int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val);
int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val);
int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx,
diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h
index f50fd93..fb445d7 100644
--- a/sys/dev/ntb/ntb_hw/ntb_regs.h
+++ b/sys/dev/ntb/ntb_hw/ntb_regs.h
@@ -44,6 +44,7 @@
#define XEON_DB_MSIX_VECTOR_COUNT 4
#define XEON_DB_MSIX_VECTOR_SHIFT 5
#define XEON_DB_LINK_BIT (1 << XEON_DB_LINK)
+#define XEON_NONLINK_DB_MSIX_BITS 3
#define XEON_SPCICMD_OFFSET 0x0504
#define XEON_DEVCTRL_OFFSET 0x0598
diff --git a/sys/dev/sound/pci/hda/hdaa.c b/sys/dev/sound/pci/hda/hdaa.c
index fe45343..14aee62 100644
--- a/sys/dev/sound/pci/hda/hdaa.c
+++ b/sys/dev/sound/pci/hda/hdaa.c
@@ -1553,20 +1553,20 @@ hdaa_widget_parse(struct hdaa_widget *w)
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- w, sizeof(w), hdaa_sysctl_caps, "A", "Node capabilities");
+ w, 0, hdaa_sysctl_caps, "A", "Node capabilities");
if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) {
snprintf(buf, sizeof(buf), "nid%d_config", w->nid);
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
buf, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
- &w->wclass.pin.newconf, sizeof(&w->wclass.pin.newconf),
- hdaa_sysctl_config, "A", "Current pin configuration");
+ &w->wclass.pin.newconf, 0, hdaa_sysctl_config, "A",
+ "Current pin configuration");
snprintf(buf, sizeof(buf), "nid%d_original", w->nid);
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- &w->wclass.pin.original, sizeof(&w->wclass.pin.original),
- hdaa_sysctl_config, "A", "Original pin configuration");
+ &w->wclass.pin.original, 0, hdaa_sysctl_config, "A",
+ "Original pin configuration");
}
hdaa_lock(w->devinfo);
}
@@ -6641,38 +6641,32 @@ hdaa_attach(device_t dev)
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
- &devinfo->newquirks, sizeof(&devinfo->newquirks),
- hdaa_sysctl_quirks, "A", "Configuration options");
+ &devinfo->newquirks, 0, hdaa_sysctl_quirks, "A",
+ "Configuration options");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"gpi_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- devinfo, sizeof(devinfo),
- hdaa_sysctl_gpi_state, "A", "GPI state");
+ devinfo, 0, hdaa_sysctl_gpi_state, "A", "GPI state");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"gpio_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- devinfo, sizeof(devinfo),
- hdaa_sysctl_gpio_state, "A", "GPIO state");
+ devinfo, 0, hdaa_sysctl_gpio_state, "A", "GPIO state");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"gpio_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
- devinfo, sizeof(devinfo),
- hdaa_sysctl_gpio_config, "A", "GPIO configuration");
+ devinfo, 0, hdaa_sysctl_gpio_config, "A", "GPIO configuration");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"gpo_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- devinfo, sizeof(devinfo),
- hdaa_sysctl_gpo_state, "A", "GPO state");
+ devinfo, 0, hdaa_sysctl_gpo_state, "A", "GPO state");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"gpo_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
- devinfo, sizeof(devinfo),
- hdaa_sysctl_gpo_config, "A", "GPO configuration");
+ devinfo, 0, hdaa_sysctl_gpo_config, "A", "GPO configuration");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
"reconfig", CTLTYPE_INT | CTLFLAG_RW,
- dev, sizeof(dev),
- hdaa_sysctl_reconfig, "I", "Reprocess configuration");
+ dev, 0, hdaa_sysctl_reconfig, "I", "Reprocess configuration");
bus_generic_attach(dev);
return (0);
}
diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c
index 80940be..2ec45c3 100644
--- a/sys/kern/subr_vmem.c
+++ b/sys/kern/subr_vmem.c
@@ -1046,10 +1046,8 @@ vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
if (vm == NULL)
return (NULL);
if (vmem_init(vm, name, base, size, quantum, qcache_max,
- flags) == NULL) {
- free(vm, M_VMEM);
+ flags) == NULL)
return (NULL);
- }
return (vm);
}
diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile
index dfddbce..98e685e 100644
--- a/sys/modules/dummynet/Makefile
+++ b/sys/modules/dummynet/Makefile
@@ -6,8 +6,9 @@
KMOD= dummynet
SRCS= ip_dummynet.c
SRCS+= ip_dn_glue.c ip_dn_io.c
+SRCS+= dn_aqm_codel.c dn_aqm_pie.c
SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c
-SRCS+= dn_sched_prio.c
+SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c
SRCS+= opt_inet6.h
.if !defined(KERNBUILDDIR)
diff --git a/sys/modules/hyperv/utilities/Makefile b/sys/modules/hyperv/utilities/Makefile
index f94e441..c1b6d4f 100644
--- a/sys/modules/hyperv/utilities/Makefile
+++ b/sys/modules/hyperv/utilities/Makefile
@@ -3,7 +3,7 @@
.PATH: ${.CURDIR}/../../../dev/hyperv/utilities
KMOD= hv_utils
-SRCS= hv_util.c hv_kvp.c
+SRCS= hv_util.c hv_kvp.c hv_timesync.c hv_shutdown.c hv_heartbeat.c
SRCS+= bus_if.h device_if.h
CFLAGS+= -I${.CURDIR}/../../../dev/hyperv/include \
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index 202f1e2..377b5b0 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -29,7 +29,7 @@
#ifndef _IP_DUMMYNET_H
#define _IP_DUMMYNET_H
-
+#define NEW_AQM
/*
* Definition of the kernel-userland API for dummynet.
*
@@ -85,7 +85,13 @@ enum {
/* special commands for emulation of sysctl variables */
DN_SYSCTL_GET,
DN_SYSCTL_SET,
-
+#ifdef NEW_AQM
+	/* Subtypes used for setting/getting extra parameters.
+	 * These subtypes are used with the IP_DUMMYNET3 command (get)
+	 * and DN_TEXT (set). */
+ DN_AQM_PARAMS, /* AQM extra params */
+ DN_SCH_PARAMS, /* scheduler extra params */
+#endif
DN_LAST,
};
@@ -105,6 +111,9 @@ enum { /* user flags */
DN_IS_RED = 0x0020,
DN_IS_GENTLE_RED= 0x0040,
DN_IS_ECN = 0x0080,
+ #ifdef NEW_AQM
+ DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */
+ #endif
DN_PIPE_CMD = 0x1000, /* pipe config... */
};
@@ -210,7 +219,19 @@ struct dn_profile {
int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
-
+#ifdef NEW_AQM
+/* Extra parameters for AQM and scheduler.
+ * This struct is used to pass and retrieve parameters (configurations)
+ * to/from AQM and Scheduler.
+ */
+struct dn_extra_parms {
+ struct dn_id oid;
+ char name[16];
+ uint32_t nr;
+#define DN_MAX_EXTRA_PARM 10
+ int64_t par[DN_MAX_EXTRA_PARM];
+};
+#endif
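For the CoDel module added later in this patch, par[0] and par[1] carry the target and interval in microseconds, par[2] carries the flags word, and a negative entry selects the sysctl default. A hedged sketch of how the block could be filled (for example by the ipfw userland code); the oid fields and the helper name are illustrative:

static void
example_fill_codel_parms(struct dn_extra_parms *ep)
{
	memset(ep, 0, sizeof(*ep));
	ep->oid.len = sizeof(*ep);
	ep->oid.type = DN_AQM_PARAMS;
	strlcpy(ep->name, "CODEL", sizeof(ep->name));
	ep->par[0] = 5000;	/* target: 5 ms, the codel_sysctl default */
	ep->par[1] = 100000;	/* interval: 100 ms */
	ep->par[2] = 1;		/* CODEL_ECN_ENABLED */
	/* par[3..] unused for CoDel */
}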
/*
* Overall structure of dummynet
diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c
index 26b1788..f5b0fee 100644
--- a/sys/netipsec/key.c
+++ b/sys/netipsec/key.c
@@ -350,7 +350,7 @@ do { \
if ((head) != (sav)) { \
ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \
(name), (head), (sav))); \
- continue; \
+ break; \
} \
} while (0)
diff --git a/sys/netpfil/ipfw/dn_aqm.h b/sys/netpfil/ipfw/dn_aqm.h
new file mode 100644
index 0000000..d01e98e
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * API for writing an Active Queue Management algorithm for Dummynet
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_AQM_H
+#define _IP_DN_AQM_H
+
+
+/* NOW is the current time in milliseconds */
+#define NOW ((dn_cfg.curr_time * tick) / 1000)
+
+#define AQM_UNOW (dn_cfg.curr_time * tick)
+#define AQM_TIME_1US ((aqm_time_t)(1))
+#define AQM_TIME_1MS ((aqm_time_t)(1000))
+#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000))
+
+/* aqm_time_t allows storing up to 4294 seconds */
+typedef uint32_t aqm_time_t;
+typedef int32_t aqm_stime_t;
+
+#define DN_AQM_MTAG_TS 55345
+
+/* Macro for variable bounding */
+#define BOUND_VAR(x,l,h) ((x) > (h)? (h) : ((x) > (l)? (x) : (l)))
+
+/* sysctl variable to count number of dropped packets */
+extern unsigned long io_pkt_drop;
+
+/*
+ * Structure for holding data and function pointers that together represent
+ * an AQM algorithm.
+ */
+ struct dn_aqm {
+#define DN_AQM_NAME_MAX 50
+ char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */
+ uint32_t type; /* AQM type number */
+
+ /* Methods implemented by AQM algorithm:
+ *
+ * enqueue enqueue packet 'm' on queue 'q'.
+ * Return 0 on success, 1 on drop.
+ *
+ * dequeue dequeue a packet from queue 'q'.
+ * Return a packet, NULL if no packet available.
+ *
+ * config configure AQM algorithm
+ * If required, this function should allocate space to store
+ * the configurations and set 'fs->aqmcfg' to point to this space.
+ *	'dn_extra_parms' includes an array of parameters sent
+ *	from the ipfw userland command.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * deconfig deconfigure AQM algorithm.
+ * The allocated configuration memory space should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * init initialise AQM status variables of queue 'q'
+ *	This function is used to allocate space, initialise the AQM status
+ *	for a queue, and set q->aqm_status to point to this space.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * cleanup cleanup AQM status variables of queue 'q'
+ * The allocated memory space for AQM status should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * getconfig retrieve AQM configurations
+ *	This function is used to return AQM parameters to the userland
+ *	command. The function should fill the 'dn_extra_parms' struct with
+ *	the AQM configurations using the 'par' array.
+ *
+ */
+
+ int (*enqueue)(struct dn_queue *, struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_queue *);
+ int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int);
+ int (*deconfig)(struct dn_fsk *);
+ int (*init)(struct dn_queue *);
+ int (*cleanup)(struct dn_queue *);
+ int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *);
+
+	int ref_count;		/* Number of queue instances in the system */
+	int cfg_ref_count;	/* Number of AQM configurations in the system */
+ SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */
+};
+
+/* Helper function to update queue and scheduler statistics.
+ * negative len + drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+update_stats(struct dn_queue *q, int len, int drop)
+{
+ int inc = 0;
+ struct dn_flow *sni;
+ struct dn_flow *qni;
+
+ sni = &q->_si->ni;
+ qni = &q->ni;
+
+ if (len < 0)
+ inc = -1;
+ else if(len > 0)
+ inc = 1;
+
+ if (drop) {
+ qni->drops++;
+ sni->drops++;
+ io_pkt_drop++;
+ } else {
+ /*update queue stats */
+ qni->length += inc;
+ qni->len_bytes += len;
+
+ /*update scheduler instance stats */
+ sni->length += inc;
+ sni->len_bytes += len;
+ }
+ /* tot_pkts is updated in dn_enqueue function */
+}
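For reference, the sign convention above maps onto three call shapes; these mirror the calls made by the CoDel code added later in this patch ('q' and 'm' are the queue and mbuf in scope at each call site):

	update_stats(q, m->m_pkthdr.len, 0);	/* enqueue: packet accounted in */
	update_stats(q, -m->m_pkthdr.len, 0);	/* dequeue: packet accounted out */
	update_stats(q, 0, 1);			/* drop: only the drop counters move */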
+
+
+/* kernel module related function */
+int
+dn_aqm_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNAQM_MODULE(name, dnaqm) \
+ static moduledata_t name##_mod = { \
+ #name, dn_aqm_modevent, dnaqm \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3)
+
+#endif
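A minimal sketch of an AQM module written against this API, assuming the usual dummynet #includes from dn_aqm_codel.c below: it provides the mandatory hooks, keeps no private state, and simply tail-drops at the configured queue limit. The type number and names are hypothetical; the CoDel and PIE modules added in this patch are the real examples.

static int
nullaqm_enqueue(struct dn_queue *q, struct mbuf *m)
{
	struct dn_fs *f = &(q->fs->fs);

	/* Honour the configured limit, in bytes or packets. */
	if ((f->flags & DN_QSIZE_BYTES) ?
	    q->ni.len_bytes > f->qsize : q->ni.length >= f->qsize) {
		update_stats(q, 0, 1);
		FREE_PKT(m);
		return (1);
	}
	mq_append(&q->mq, m);
	update_stats(q, m->m_pkthdr.len, 0);
	return (0);
}

static struct mbuf *
nullaqm_dequeue(struct dn_queue *q)
{
	struct mbuf *m = q->mq.head;

	if (m == NULL)
		return (NULL);
	q->mq.head = m->m_nextpkt;
	update_stats(q, -m->m_pkthdr.len, 0);
	if (q->ni.length == 0)		/* queue is now idle */
		q->q_time = dn_cfg.curr_time;
	return (m);
}

/* No per-queue or per-flowset state, so the remaining hooks are trivial. */
static int nullaqm_init(struct dn_queue *q __unused)		{ return (0); }
static int nullaqm_cleanup(struct dn_queue *q __unused)		{ return (0); }
static int nullaqm_config(struct dn_fsk *fs __unused,
    struct dn_extra_parms *ep __unused, int len __unused)	{ return (0); }
static int nullaqm_deconfig(struct dn_fsk *fs __unused)		{ return (0); }
static int nullaqm_getconfig(struct dn_fsk *fs __unused,
    struct dn_extra_parms *ep __unused)				{ return (0); }

static struct dn_aqm nullaqm_desc = {
	.type = 0x7f,			/* hypothetical, unused AQM type number */
	.name = "NULLAQM",
	.enqueue = nullaqm_enqueue,
	.dequeue = nullaqm_dequeue,
	.config = nullaqm_config,
	.deconfig = nullaqm_deconfig,
	.getconfig = nullaqm_getconfig,
	.init = nullaqm_init,
	.cleanup = nullaqm_cleanup,
};

DECLARE_DNAQM_MODULE(dn_aqm_null, &nullaqm_desc);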
diff --git a/sys/netpfil/ipfw/dn_aqm_codel.c b/sys/netpfil/ipfw/dn_aqm_codel.c
new file mode 100644
index 0000000..0080170
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_codel.c
@@ -0,0 +1,444 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/if_ether.h> /* various ether_* routines */
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+#include <netpfil/ipfw/dn_heap.h>
+
+#ifdef NEW_AQM
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_codel.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+#define DN_AQM_CODEL 1
+
+static struct dn_aqm codel_desc;
+
+/* default codel parameters */
+struct dn_aqm_codel_parms codel_sysctl = {5000 * AQM_TIME_1US,
+ 100000 * AQM_TIME_1US, 0};
+
+static int
+codel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = codel_sysctl.interval;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 100 * AQM_TIME_1S)
+ return (EINVAL);
+ codel_sysctl.interval = value * AQM_TIME_1US ;
+ return (0);
+}
+
+static int
+codel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = codel_sysctl.target;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ D("%ld", value);
+ if (value < 1 || value > 5 * AQM_TIME_1S)
+ return (EINVAL);
+ codel_sysctl.target = value * AQM_TIME_1US ;
+ return (0);
+}
+
+/* defining Codel sysctl variables */
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO,
+ codel, CTLFLAG_RW, 0, "CODEL");
+
+#ifdef SYSCTL_NODE
+SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,codel_sysctl_target_handler, "L",
+ "CoDel target in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, interval,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, codel_sysctl_interval_handler, "L",
+ "CoDel interval in microsecond");
+#endif
+
+/* This function computes codel_interval/sqrt(count)
+ * Newton's method of approximation is used to compute 1/sqrt(count).
+ * http://betterexplained.com/articles/
+ * understanding-quakes-fast-inverse-square-root/
+ */
+aqm_time_t
+control_law(struct codel_status *cst, struct dn_aqm_codel_parms *cprms,
+ aqm_time_t t)
+{
+ uint32_t count;
+ uint64_t temp;
+ count = cst->count;
+
+	/* we don't calculate isqrt(1), to get a more accurate result */
+ if (count == 1) {
+ /* prepare isqrt (old guess) for the next iteration i.e. 1/sqrt(2)*/
+ cst->isqrt = (1UL<< FIX_POINT_BITS) * 7/10;
+ /* return time + isqrt(1)*interval */
+ return t + cprms->interval;
+ }
+
+ /* newguess = g(1.5 - 0.5*c*g^2)
+	 * Multiplying both sides by 2 to make all the constants integers
+ * newguess * 2 = g(3 - c*g^2) g=old guess, c=count
+ * So, newguess = newguess /2
+ * Fixed point operations are used here.
+ */
+
+ /* Calculate g^2 */
+ temp = (uint32_t) cst->isqrt * cst->isqrt;
+ /* Calculate (3 - c*g^2) i.e. (3 - c * temp) */
+ temp = (3ULL<< (FIX_POINT_BITS*2)) - (count * temp);
+
+ /*
+ * Divide by 2 because we multiplied the original equation by two
+ * Also, we shift the result by 8 bits to prevent overflow.
+ * */
+ temp >>= (1 + 8);
+
+ /* Now, temp = (1.5 - 0.5*c*g^2)
+ * Calculate g (1.5 - 0.5*c*g^2) i.e. g * temp
+ */
+ temp = (cst->isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8);
+ cst->isqrt = temp;
+
+ /* calculate codel_interval/sqrt(count) */
+ return t + ((cprms->interval * temp) >> FIX_POINT_BITS);
+}
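A standalone userland sketch of the same fixed-point step, compared against 1/sqrt() from libm (the seed and loop bounds are arbitrary; build with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define FIX_POINT_BITS 16	/* as in dn_aqm_codel.h */

int
main(void)
{
	uint32_t isqrt = (1UL << FIX_POINT_BITS) * 7 / 10;	/* seed: ~1/sqrt(2) */
	uint64_t temp;
	uint32_t count;

	for (count = 2; count <= 6; count++) {
		/* one Newton step: g' = g * (3 - c*g^2) / 2, in fixed point */
		temp = (uint64_t)isqrt * isqrt;
		temp = (3ULL << (FIX_POINT_BITS * 2)) - count * temp;
		temp >>= (1 + 8);
		temp = (isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8);
		isqrt = temp;
		printf("count=%u fixed=%.4f libm=%.4f\n", count,
		    isqrt / (double)(1 << FIX_POINT_BITS), 1.0 / sqrt(count));
	}
	return (0);
}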
+
+/*
+ * Extract a packet from the head of queue 'q'
+ * Return a packet or NULL if the queue is empty.
+ * Also extract packet's timestamp from mtag.
+ */
+struct mbuf *
+codel_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts)
+{
+ struct m_tag *mtag;
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return m;
+ q->mq.head = m->m_nextpkt;
+
+ /* Update stats */
+ update_stats(q, -m->m_pkthdr.len, 0);
+
+ if (q->ni.length == 0) /* queue is now idle */
+ q->q_time = dn_cfg.curr_time;
+
+ /* extract packet TS*/
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL) {
+ D("Codel timestamp mtag not found!");
+ *pkt_ts = 0;
+ } else {
+ *pkt_ts = *(aqm_time_t *)(mtag + 1);
+ m_tag_delete(m,mtag);
+ }
+
+ return m;
+}
+
+/*
+ * Enqueue a packet 'm' in queue 'q'
+ */
+static int
+aqm_codel_enqueue(struct dn_queue *q, struct mbuf *m)
+{
+ struct dn_fs *f;
+ uint64_t len;
+ struct codel_status *cst; /*codel status variables */
+ struct m_tag *mtag;
+
+ f = &(q->fs->fs);
+ len = m->m_pkthdr.len;
+ cst = q->aqm_status;
+ if(!cst) {
+ D("Codel queue is not initialized\n");
+ goto drop;
+ }
+
+ /* Finding maximum packet size */
+ // XXX we can get MTU from driver instead
+ if (len > cst->maxpkt_size)
+ cst->maxpkt_size = len;
+
+	/* check the queue size and drop the tail if it exceeds the queue limit */
+ if (f->flags & DN_QSIZE_BYTES) {
+ if ( q->ni.len_bytes > f->qsize)
+ goto drop;
+ }
+ else {
+ if ( q->ni.length >= f->qsize)
+ goto drop;
+ }
+
+ /* Add timestamp as mtag */
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL)
+ mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+ sizeof(aqm_time_t), M_NOWAIT);
+ if (mtag == NULL) {
+ m_freem(m);
+ goto drop;
+ }
+
+ *(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+ m_tag_prepend(m, mtag);
+
+ mq_append(&q->mq, m);
+ update_stats(q, len, 0);
+ return (0);
+
+drop:
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ return (1);
+}
+
+
+/* Dequeue a packet from queue q */
+static struct mbuf *
+aqm_codel_dequeue(struct dn_queue *q)
+{
+ return codel_dequeue(q);
+}
+
+/*
+ * initialize Codel for queue 'q'
+ * First allocate memory for codel status.
+ */
+static int
+aqm_codel_init(struct dn_queue *q)
+{
+ struct codel_status *cst;
+
+ if (!q->fs->aqmcfg) {
+ D("Codel is not configure!d");
+ return EINVAL;
+ }
+
+ q->aqm_status = malloc(sizeof(struct codel_status),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q->aqm_status == NULL) {
+ D("Cannot allocate AQM_codel private data");
+ return ENOMEM ;
+ }
+
+ /* init codel status variables */
+ cst = q->aqm_status;
+ cst->dropping=0;
+ cst->first_above_time=0;
+ cst->drop_next_time=0;
+ cst->count=0;
+ cst->maxpkt_size = 500;
+
+ /* increase reference counters */
+ codel_desc.ref_count++;
+
+ return 0;
+}
+
+/*
+ * Clean up Codel status for queue 'q'
+ * Destroy memory allocated for codel status.
+ */
+static int
+aqm_codel_cleanup(struct dn_queue *q)
+{
+
+ if (q && q->aqm_status) {
+ free(q->aqm_status, M_DUMMYNET);
+ q->aqm_status = NULL;
+ /* decrease reference counters */
+ codel_desc.ref_count--;
+ }
+ else
+ D("Codel already cleaned up");
+ return 0;
+}
+
+/*
+ * Config codel parameters
+ * also allocate memory for codel configurations
+ */
+static int
+aqm_codel_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len)
+{
+ struct dn_aqm_codel_parms *ccfg;
+
+ int l = sizeof(struct dn_extra_parms);
+ if (len < l) {
+ D("invalid sched parms length got %d need %d", len, l);
+ return EINVAL;
+ }
+	/* we free the old cfg because the original allocation may not be
+	 * the same size as the new one (different AQM type).
+ */
+ if (fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ }
+
+ fs->aqmcfg = malloc(sizeof(struct dn_aqm_codel_parms),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs->aqmcfg== NULL) {
+ D("cannot allocate AQM_codel configuration parameters");
+ return ENOMEM;
+ }
+
+ /* configure codel parameters */
+ ccfg = fs->aqmcfg;
+
+ if (ep->par[0] < 0)
+ ccfg->target = codel_sysctl.target;
+ else
+ ccfg->target = ep->par[0] * AQM_TIME_1US;
+
+ if (ep->par[1] < 0)
+ ccfg->interval = codel_sysctl.interval;
+ else
+ ccfg->interval = ep->par[1] * AQM_TIME_1US;
+
+ if (ep->par[2] < 0)
+ ccfg->flags = 0;
+ else
+ ccfg->flags = ep->par[2];
+
+ /* bound codel configurations */
+ ccfg->target = BOUND_VAR(ccfg->target,1, 5 * AQM_TIME_1S);
+ ccfg->interval = BOUND_VAR(ccfg->interval,1, 5 * AQM_TIME_1S);
+ /* increase config reference counter */
+ codel_desc.cfg_ref_count++;
+
+ return 0;
+}
+
+/*
+ * Deconfigure Codel and free memory allocation
+ */
+static int
+aqm_codel_deconfig(struct dn_fsk* fs)
+{
+
+ if (fs && fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ fs->aqmfp = NULL;
+ /* decrease config reference counter */
+ codel_desc.cfg_ref_count--;
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieve Codel configuration parameters.
+ */
+static int
+aqm_codel_getconfig(struct dn_fsk *fs, struct dn_extra_parms * ep)
+{
+ struct dn_aqm_codel_parms *ccfg;
+
+ if (fs->aqmcfg) {
+ strcpy(ep->name, codel_desc.name);
+ ccfg = fs->aqmcfg;
+ ep->par[0] = ccfg->target / AQM_TIME_1US;
+ ep->par[1] = ccfg->interval / AQM_TIME_1US;
+ ep->par[2] = ccfg->flags;
+ return 0;
+ }
+ return 1;
+}
+
+static struct dn_aqm codel_desc = {
+ _SI( .type = ) DN_AQM_CODEL,
+ _SI( .name = ) "CODEL",
+ _SI( .enqueue = ) aqm_codel_enqueue,
+ _SI( .dequeue = ) aqm_codel_dequeue,
+ _SI( .config = ) aqm_codel_config,
+ _SI( .getconfig = ) aqm_codel_getconfig,
+ _SI( .deconfig = ) aqm_codel_deconfig,
+ _SI( .init = ) aqm_codel_init,
+ _SI( .cleanup = ) aqm_codel_cleanup,
+};
+
+DECLARE_DNAQM_MODULE(dn_aqm_codel, &codel_desc);
+
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_codel.h b/sys/netpfil/ipfw/dn_aqm_codel.h
new file mode 100644
index 0000000..f5618e7
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_codel.h
@@ -0,0 +1,222 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_CODEL_H
+#define _IP_DN_AQM_CODEL_H
+
+
+// XXX How to choose MTAG?
+#define FIX_POINT_BITS 16
+
+enum {
+ CODEL_ECN_ENABLED = 1
+};
+
+/* Codel parameters */
+struct dn_aqm_codel_parms {
+ aqm_time_t target;
+ aqm_time_t interval;
+ uint32_t flags;
+};
+
+/* codel status variables */
+struct codel_status {
+ uint32_t count; /* number of dropped pkts since entering drop state */
+ uint16_t dropping; /* dropping state */
+ aqm_time_t drop_next_time; /* time for next drop */
+ aqm_time_t first_above_time; /* time for first ts over target we observed */
+	uint16_t isqrt; /* last isqrt for control law */
+ uint16_t maxpkt_size; /* max packet size seen so far */
+};
+
+struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *);
+aqm_time_t control_law(struct codel_status *,
+ struct dn_aqm_codel_parms *, aqm_time_t );
+
+__inline static struct mbuf *
+codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop)
+{
+ struct mbuf * m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t pkt_ts, sojourn_time;
+
+ *ok_to_drop = 0;
+ m = codel_extract_head(q, &pkt_ts);
+
+ cst = q->aqm_status;
+
+ if (m == NULL) {
+ /* queue is empty - we can't be above target */
+ cst->first_above_time= 0;
+ return m;
+ }
+
+ cprms = q->fs->aqmcfg;
+
+ /* To span a large range of bandwidths, CoDel runs two
+ * different AQMs in parallel. One is sojourn-time-based
+ * and takes effect when the time to send an MTU-sized
+ * packet is less than target. The 1st term of the "if"
+ * below does this. The other is backlog-based and takes
+ * effect when the time to send an MTU-sized packet is >=
+ * target. The goal here is to keep the output link
+ * utilization high by never allowing the queue to get
+ * smaller than the amount that arrives in a typical
+ * interarrival time (MTU-sized packets arriving spaced
+ * by the amount of time it takes to send such a packet on
+ * the bottleneck). The 2nd term of the "if" does this.
+ */
+ sojourn_time = now - pkt_ts;
+ if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) {
+ /* went below - stay below for at least interval */
+ cst->first_above_time = 0;
+ } else {
+ if (cst->first_above_time == 0) {
+ /* just went above from below. if still above at
+ * first_above_time, will say it's ok to drop. */
+ cst->first_above_time = now + cprms->interval;
+ } else if (now >= cst->first_above_time) {
+ *ok_to_drop = 1;
+ }
+ }
+ return m;
+}
+
+/*
+ * Dequeue a packet from queue 'q'
+ */
+__inline static struct mbuf *
+codel_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t now;
+ uint16_t ok_to_drop;
+
+	cst = q->aqm_status;
+ cprms = q->fs->aqmcfg;
+ now = AQM_UNOW;
+
+ m = codel_dodequeue(q, now, &ok_to_drop);
+ if (cst->dropping) {
+ if (!ok_to_drop) {
+ /* sojourn time below target - leave dropping state */
+ cst->dropping = false;
+ }
+ /*
+ * Time for the next drop. Drop current packet and dequeue
+ * next. If the dequeue doesn't take us out of dropping
+ * state, schedule the next drop. A large backlog might
+ * result in drop rates so high that the next drop should
+ * happen now, hence the 'while' loop.
+ */
+ while (now >= cst->drop_next_time && cst->dropping) {
+
+ /* mark the packet */
+ if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+ cst->count++;
+ /* schedule the next mark. */
+ cst->drop_next_time = control_law(cst, cprms,
+ cst->drop_next_time);
+ return m;
+ }
+
+ /* drop the packet */
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ m = codel_dodequeue(q, now, &ok_to_drop);
+
+ if (!ok_to_drop) {
+ /* leave dropping state */
+ cst->dropping = false;
+ } else {
+ cst->count++;
+ /* schedule the next drop. */
+ cst->drop_next_time = control_law(cst, cprms,
+ cst->drop_next_time);
+ }
+ }
+ /* If we get here we're not in dropping state. The 'ok_to_drop'
+ * return from dodequeue means that the sojourn time has been
+ * above 'target' for 'interval' so enter dropping state.
+ */
+ } else if (ok_to_drop) {
+
+ /* if ECN option is disabled or the packet cannot be marked,
+ * drop the packet and extract another.
+ */
+ if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ m = codel_dodequeue(q, now, &ok_to_drop);
+ }
+
+ cst->dropping = true;
+
+ /* If min went above target close to when it last went
+ * below, assume that the drop rate that controlled the
+ * queue on the last cycle is a good starting point to
+ * control it now. ('drop_next' will be at most 'interval'
+ * later than the time of the last drop so 'now - drop_next'
+ * is a good approximation of the time from the last drop
+ * until now.)
+ */
+ cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+ (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)?
+ cst->count - 2 : 1;
+	/* we don't have to set an initial guess for Newton's method isqrt, as
+	 * we initialize isqrt in the control_law function when count == 1 */
+ cst->drop_next_time = control_law(cst, cprms, now);
+ }
+
+ return m;
+}
+
+#endif
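
The dequeue path above defers to control_law(), implemented in dn_aqm_codel.c, which schedules the next drop interval/sqrt(count) after the previous one using a fixed-point inverse square root refined by Newton's method. The snippet below is only a rough floating-point sketch of that law; the function name next_drop_time and the use of libm sqrt() are illustrative, not the kernel implementation.

#include <math.h>
#include <stdint.h>

typedef uint64_t aqm_time_t;

/*
 * Illustrative-only version of CoDel's control law: the next drop is
 * scheduled interval/sqrt(count) after time 't', so the drop rate
 * rises gradually while the sojourn time stays above target.
 */
static aqm_time_t
next_drop_time(aqm_time_t t, aqm_time_t interval, uint32_t count)
{
	return (t + (aqm_time_t)(interval / sqrt((double)count)));
}
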
diff --git a/sys/netpfil/ipfw/dn_aqm_pie.c b/sys/netpfil/ipfw/dn_aqm_pie.c
new file mode 100644
index 0000000..c4b9401
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_pie.c
@@ -0,0 +1,793 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/if_ether.h> /* various ether_* routines */
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+#include <netpfil/ipfw/dn_heap.h>
+
+#ifdef NEW_AQM
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_pie.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+/* for debugging */
+#include <sys/syslog.h>
+
+static struct dn_aqm pie_desc;
+
+/* PIE defaults
+ * target=15ms, tupdate=15ms, max_burst=150ms,
+ * max_ecnth=0.1, alpha=0.125, beta=1.25,
+ */
+struct dn_aqm_pie_parms pie_sysctl =
+ { 15 * AQM_TIME_1MS, 15 * AQM_TIME_1MS, 150 * AQM_TIME_1MS,
+ PIE_SCALE/10 , PIE_SCALE * 0.125, PIE_SCALE * 1.25 ,
+ PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED | PIE_DERAND_ENABLED };
+
+static int
+pie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ if (!strcmp(oidp->oid_name,"alpha"))
+ value = pie_sysctl.alpha;
+ else
+ value = pie_sysctl.beta;
+
+ value = value * 1000 / PIE_SCALE;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 7 * PIE_SCALE)
+ return (EINVAL);
+ value = (value * PIE_SCALE) / 1000;
+ if (!strcmp(oidp->oid_name,"alpha"))
+ pie_sysctl.alpha = value;
+ else
+ pie_sysctl.beta = value;
+ return (0);
+}
+
+static int
+pie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ if (!strcmp(oidp->oid_name,"target"))
+ value = pie_sysctl.qdelay_ref;
+ else if (!strcmp(oidp->oid_name,"tupdate"))
+ value = pie_sysctl.tupdate;
+ else
+ value = pie_sysctl.max_burst;
+
+ value = value / AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 10 * AQM_TIME_1S)
+ return (EINVAL);
+ value = value * AQM_TIME_1US;
+
+ if (!strcmp(oidp->oid_name,"target"))
+ pie_sysctl.qdelay_ref = value;
+ else if (!strcmp(oidp->oid_name,"tupdate"))
+ pie_sysctl.tupdate = value;
+ else
+ pie_sysctl.max_burst = value;
+ return (0);
+}
+
+static int
+pie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = pie_sysctl.max_ecnth;
+ value = value * 1000 / PIE_SCALE;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > PIE_SCALE)
+ return (EINVAL);
+ value = (value * PIE_SCALE) / 1000;
+ pie_sysctl.max_ecnth = value;
+ return (0);
+}
+
+/* define PIE sysctl variables */
+SYSBEGIN(f4)
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO,
+ pie, CTLFLAG_RW, 0, "PIE");
+
+#ifdef SYSCTL_NODE
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "queue target in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, tupdate,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "the frequency of drop probability calculation in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_burst,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "Burst allowance interval in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_ecnth,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_max_ecnth_handler, "L",
+ "ECN safeguard threshold scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, alpha,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_alpha_beta_handler, "L",
+ "PIE alpha scaled by 1000");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, beta,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_alpha_beta_handler, "L",
+ "beta scaled by 1000");
+#endif
+
+
+/*
+ * Callout function for drop probability calculation.
+ * This function is called every tupdate interval and takes a pointer
+ * to the PIE status variables as an argument.
+ */
+static void
+calculate_drop_prob(void *x)
+{
+ int64_t p, prob, oldprob;
+ struct dn_aqm_pie_parms *pprms;
+ struct pie_status *pst = (struct pie_status *) x;
+
+ /* dealing with race condition */
+ if (callout_pending(&pst->aqm_pie_callout)) {
+ /* callout was reset */
+ mtx_unlock(&pst->lock_mtx);
+ return;
+ }
+
+ if (!callout_active(&pst->aqm_pie_callout)) {
+ /* callout was stopped */
+ mtx_unlock(&pst->lock_mtx);
+ mtx_destroy(&pst->lock_mtx);
+ free(x, M_DUMMYNET);
+ //pst->pq->aqm_status = NULL;
+ pie_desc.ref_count--;
+ return;
+ }
+ callout_deactivate(&pst->aqm_pie_callout);
+
+ pprms = pst->parms;
+ prob = pst->drop_prob;
+
+ /* calculate current qdelay */
+ if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+ pst->current_qdelay = ((uint64_t)pst->pq->ni.len_bytes *
+ pst->avg_dq_time) >> PIE_DQ_THRESHOLD_BITS;
+ }
+
+ /* calculate drop probability */
+ p = (int64_t)pprms->alpha *
+ ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref);
+ p +=(int64_t) pprms->beta *
+ ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old);
+
+	/* We shift PIE_MAX_PROB by 12 bits to increase the division precision */
+ p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S;
+
+ /* auto-tune drop probability */
+ if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */
+ p >>= 11 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */
+ p >>= 9 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */
+ p >>= 7 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */
+ p >>= 5 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */
+ p >>= 3 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */
+ p >>= 1 + PIE_FIX_POINT_BITS + 12;
+ else
+ p >>= PIE_FIX_POINT_BITS + 12;
+
+ oldprob = prob;
+
+ /* Cap Drop adjustment */
+ if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10
+ && p > PIE_MAX_PROB / 50 )
+ p = PIE_MAX_PROB / 50;
+
+ prob = prob + p;
+
+ /* decay the drop probability exponentially */
+ if (pst->current_qdelay == 0 && pst->qdelay_old == 0)
+ /* 0.98 ~= 1- 1/64 */
+ prob = prob - (prob >> 6);
+
+
+	/* check for multiplication overflow/underflow */
+	if (p > 0) {
+		if (prob < oldprob) {
+			D("overflow");
+			prob = PIE_MAX_PROB;
+		}
+	} else if (prob > oldprob) {
+		prob = 0;
+		D("underflow");
+	}
+
+ /* make drop probability between 0 and PIE_MAX_PROB*/
+ if (prob < 0)
+ prob = 0;
+ else if (prob > PIE_MAX_PROB)
+ prob = PIE_MAX_PROB;
+
+ pst->drop_prob = prob;
+
+ /* store current queue delay value in old queue delay*/
+ pst->qdelay_old = pst->current_qdelay;
+
+ /* update burst allowance */
+ if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance>0) {
+
+ if (pst->burst_allowance > pprms->tupdate )
+ pst->burst_allowance -= pprms->tupdate;
+ else
+ pst->burst_allowance = 0;
+ }
+
+ /* reschedule calculate_drop_prob function */
+ if (pst->sflags & PIE_ACTIVE)
+ callout_reset_sbt(&pst->aqm_pie_callout,
+ (uint64_t)pprms->tupdate * SBT_1US, 0, calculate_drop_prob, pst, 0);
+
+ mtx_unlock(&pst->lock_mtx);
+}
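
For readers following the fixed-point arithmetic above, calculate_drop_prob() implements the PIE update p = alpha * (qdelay - qdelay_ref) + beta * (qdelay - qdelay_old), with the adjustment auto-tuned down while the probability is small. The following floating-point sketch of the same update is illustrative only (times in seconds, probability in [0, 1]); it ignores the fixed-point scaling and the burst-allowance bookkeeping done by the kernel code.

/*
 * Floating-point sketch of the drop-probability update performed by
 * calculate_drop_prob().  Not kernel code.
 */
static double
pie_update_prob(double prob, double qdelay, double qdelay_ref,
    double qdelay_old, double alpha, double beta)
{
	double p = alpha * (qdelay - qdelay_ref) +
	    beta * (qdelay - qdelay_old);

	/* auto-tuning: damp the adjustment while prob is small */
	if (prob < 0.000001)
		p /= 2048;
	else if (prob < 0.00001)
		p /= 512;
	else if (prob < 0.0001)
		p /= 128;
	else if (prob < 0.001)
		p /= 32;
	else if (prob < 0.01)
		p /= 8;
	else if (prob < 0.1)
		p /= 2;

	prob += p;
	if (qdelay == 0.0 && qdelay_old == 0.0)
		prob -= prob / 64;	/* exponential decay when the queue is idle */
	if (prob < 0.0)
		prob = 0.0;
	else if (prob > 1.0)
		prob = 1.0;
	return (prob);
}
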
+
+/*
+ * Extract a packet from the head of queue 'q'
+ * Return a packet or NULL if the queue is empty.
+ * If getts is set, also extract packet's timestamp from mtag.
+ */
+static struct mbuf *
+pie_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts, int getts)
+{
+ struct m_tag *mtag;
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return m;
+ q->mq.head = m->m_nextpkt;
+
+ /* Update stats */
+ update_stats(q, -m->m_pkthdr.len, 0);
+
+ if (q->ni.length == 0) /* queue is now idle */
+ q->q_time = dn_cfg.curr_time;
+
+ if (getts) {
+ /* extract packet TS*/
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL) {
+ D("PIE timestamp mtag not found!");
+ *pkt_ts = 0;
+ } else {
+ *pkt_ts = *(aqm_time_t *)(mtag + 1);
+ m_tag_delete(m,mtag);
+ }
+ }
+ return m;
+}
+
+/*
+ * Initialize PIE variables and optionally activate it
+ */
+__inline static void
+init_activate_pie(struct pie_status *pst, int resettimer)
+{
+ struct dn_aqm_pie_parms *pprms;
+
+ mtx_lock(&pst->lock_mtx);
+ pprms = pst->parms;
+ pst->drop_prob = 0;
+ pst->qdelay_old = 0;
+ pst->burst_allowance = pprms->max_burst;
+ pst->accu_prob = 0;
+ pst->dq_count = 0;
+ pst->avg_dq_time = 0;
+ pst->sflags = PIE_INMEASUREMENT;
+ pst->measurement_start = AQM_UNOW;
+
+ if (resettimer) {
+ pst->sflags |= PIE_ACTIVE;
+ callout_reset_sbt(&pst->aqm_pie_callout,
+ (uint64_t)pprms->tupdate * SBT_1US,
+ 0, calculate_drop_prob, pst, 0);
+ }
+ //DX(2, "PIE Activated");
+ mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Deactivate PIE and stop probe update callout
+ */
+__inline static void
+deactivate_pie(struct pie_status *pst)
+{
+ mtx_lock(&pst->lock_mtx);
+ pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT);
+ callout_stop(&pst->aqm_pie_callout);
+ //D("PIE Deactivated");
+ mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Dequeue and return a packet from queue 'q', or NULL if 'q' is empty.
+ * Also, calculate the departure time or queue delay using the packet timestamp.
+ */
+static struct mbuf *
+aqm_pie_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m;
+ struct dn_flow *ni; /* stats for scheduler instance */
+ struct dn_aqm_pie_parms *pprms;
+ struct pie_status *pst;
+ aqm_time_t now;
+ aqm_time_t pkt_ts, dq_time;
+ int32_t w;
+
+ pst = q->aqm_status;
+ pprms = pst->parms;
+ ni = &q->_si->ni;
+
+	/* we extract the packet ts only when Departure Rate Estimation is not used */
+ m = pie_extract_head(q, &pkt_ts, !(pprms->flags & PIE_DEPRATEEST_ENABLED));
+
+ if (!m || !(pst->sflags & PIE_ACTIVE))
+ return m;
+
+ now = AQM_UNOW;
+ if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		/* calculate average departure time */
+ if(pst->sflags & PIE_INMEASUREMENT) {
+ pst->dq_count += m->m_pkthdr.len;
+
+ if (pst->dq_count >= PIE_DQ_THRESHOLD) {
+ dq_time = now - pst->measurement_start;
+
+ /*
+				 * if we don't have an old avg dq_time, i.e. PIE was (re)initialized,
+				 * don't use the weight to calculate the new avg_dq_time
+ */
+ if(pst->avg_dq_time == 0)
+ pst->avg_dq_time = dq_time;
+ else {
+ /*
+ * weight = PIE_DQ_THRESHOLD/2^6, but we scaled
+ * weight by 2^8. Thus, scaled
+ * weight = PIE_DQ_THRESHOLD /2^8
+ * */
+ w = PIE_DQ_THRESHOLD >> 8;
+ pst->avg_dq_time = (dq_time* w
+ + (pst->avg_dq_time * ((1L << 8) - w))) >> 8;
+ pst->sflags &= ~PIE_INMEASUREMENT;
+ }
+ }
+ }
+
+ /*
+	 * Start a new measurement cycle when the queue has
+ * PIE_DQ_THRESHOLD worth of bytes.
+ */
+ if(!(pst->sflags & PIE_INMEASUREMENT) &&
+ q->ni.len_bytes >= PIE_DQ_THRESHOLD) {
+ pst->sflags |= PIE_INMEASUREMENT;
+ pst->measurement_start = now;
+ pst->dq_count = 0;
+ }
+ }
+ /* Optionally, use packet timestamp to estimate queue delay */
+ else
+ pst->current_qdelay = now - pkt_ts;
+
+ return m;
+}
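
When departure rate estimation is enabled, the smoothed avg_dq_time computed above is the time taken to drain PIE_DQ_THRESHOLD (16 KB) from the queue, and calculate_drop_prob() turns it into a delay estimate by scaling it with the current backlog. A floating-point illustration of that estimate follows; it is a sketch only and the function name pie_qdelay_estimate is hypothetical.

/*
 * Sketch of the departure-rate based queue delay estimate:
 * qdelay ~= backlog_bytes * avg_dq_time / PIE_DQ_THRESHOLD,
 * where avg_dq_time is the smoothed time to drain 16 KB.
 */
static double
pie_qdelay_estimate(double backlog_bytes, double avg_dq_time)
{
	return (backlog_bytes * avg_dq_time / (double)PIE_DQ_THRESHOLD);
}
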
+
+/*
+ * Enqueue a packet in q, subject to space and PIE queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyway.
+ */
+static int
+aqm_pie_enqueue(struct dn_queue *q, struct mbuf* m)
+{
+ struct dn_fs *f;
+ uint64_t len;
+ uint32_t qlen;
+ struct pie_status *pst;
+ struct dn_aqm_pie_parms *pprms;
+ int t;
+
+ len = m->m_pkthdr.len;
+ pst = q->aqm_status;
+ if(!pst) {
+ DX(2, "PIE queue is not initialized\n");
+ update_stats(q, 0, 1);
+ FREE_PKT(m);
+ return 1;
+ }
+
+ f = &(q->fs->fs);
+ pprms = pst->parms;
+ t = ENQUE;
+
+ /* get current queue length in bytes or packets*/
+ qlen = (f->flags & DN_QSIZE_BYTES) ?
+ q->ni.len_bytes : q->ni.length;
+
+ /* check for queue size and drop the tail if exceed queue limit*/
+ if (qlen >= f->qsize)
+ t = DROP;
+ /* drop/mark the packet when PIE is active and burst time elapsed */
+ else if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance==0
+ && drop_early(pst, q->ni.len_bytes) == DROP) {
+ /*
+ * if drop_prob over ECN threshold, drop the packet
+ * otherwise mark and enqueue it.
+ */
+ if ((pprms->flags & PIE_ECN_ENABLED) && pst->drop_prob <
+ (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS))
+ && ecn_mark(m))
+ t = ENQUE;
+ else
+ t = DROP;
+ }
+
+ /* Turn PIE on when 1/3 of the queue is full */
+ if (!(pst->sflags & PIE_ACTIVE) && qlen >= pst->one_third_q_size) {
+ init_activate_pie(pst, 1);
+ }
+
+	/* Reset burst tolerance and optionally turn PIE off */
+ if ((pst->sflags & PIE_ACTIVE) && pst->drop_prob == 0 &&
+ pst->current_qdelay < (pprms->qdelay_ref >> 1) &&
+ pst->qdelay_old < (pprms->qdelay_ref >> 1)) {
+
+ pst->burst_allowance = pprms->max_burst;
+ if ((pprms->flags & PIE_ON_OFF_MODE_ENABLED) && qlen<=0)
+ deactivate_pie(pst);
+ }
+
+ /* Timestamp the packet if Departure Rate Estimation is disabled */
+ if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) {
+ /* Add TS to mbuf as a TAG */
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL)
+ mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+ sizeof(aqm_time_t), M_NOWAIT);
+		if (mtag == NULL) {
+			m_freem(m);
+			t = DROP;
+		} else {
+			*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+			m_tag_prepend(m, mtag);
+		}
+ }
+
+ if (t != DROP) {
+ mq_append(&q->mq, m);
+ update_stats(q, len, 0);
+ return (0);
+ } else {
+ update_stats(q, 0, 1);
+
+ /* reset accu_prob after packet drop */
+ pst->accu_prob = 0;
+ FREE_PKT(m);
+ return 1;
+ }
+ return 0;
+}
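
The ECN test above compares drop_prob, scaled to PIE_PROB_BITS (31 bits), against max_ecnth, scaled to PIE_FIX_POINT_BITS (13 bits), by shifting the threshold left by the 18-bit difference. Below is a small sketch of that comparison; the helper name pie_should_mark is illustrative only and not part of this code.

/*
 * Sketch of the ECN-threshold check used in aqm_pie_enqueue().
 * With the default max_ecnth = PIE_SCALE / 10 (10%), packets are
 * ECN-marked instead of dropped while drop_prob stays below roughly
 * 0.1 * PIE_MAX_PROB.
 */
static int
pie_should_mark(uint32_t drop_prob, uint16_t max_ecnth)
{
	uint64_t th;

	th = (uint64_t)max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS);
	return (drop_prob < th);
}
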
+
+/*
+ * initialize PIE for queue 'q'
+ * First allocate memory for PIE status.
+ */
+static int
+aqm_pie_init(struct dn_queue *q)
+{
+ struct pie_status *pst;
+ struct dn_aqm_pie_parms *pprms;
+ int err = 0;
+
+ pprms = q->fs->aqmcfg;
+
+ do { /* exit with break when error occurs*/
+ if (!pprms){
+ D("AQM_PIE is not configured");
+ err = EINVAL;
+ break;
+ }
+
+ q->aqm_status = malloc(sizeof(struct pie_status),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q->aqm_status == NULL) {
+ D("cannot allocate PIE private data");
+ err = ENOMEM ;
+ break;
+ }
+
+ pst = q->aqm_status;
+ /* increase reference count for PIE module */
+ pie_desc.ref_count++;
+
+ pst->pq = q;
+ pst->parms = pprms;
+
+		/* For speed optimization, we calculate 1/3 of the queue size once here */
+ // we can use x/3 = (x >>2) + (x >>4) + (x >>7)
+ pst->one_third_q_size = q->fs->fs.qsize/3;
+
+ mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF);
+ callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx,
+ CALLOUT_RETURNUNLOCKED);
+
+ pst->current_qdelay = 0;
+ init_activate_pie(pst, !(pprms->flags & PIE_ON_OFF_MODE_ENABLED));
+
+ //DX(2, "aqm_PIE_init");
+
+ } while(0);
+
+ return err;
+}
+
+/*
+ * Clean up PIE status for queue 'q'
+ * Destroy memory allocated for PIE status.
+ */
+static int
+aqm_pie_cleanup(struct dn_queue *q)
+{
+
+ if(!q) {
+ D("q is null");
+ return 0;
+ }
+ struct pie_status *pst = q->aqm_status;
+ if(!pst) {
+ //D("queue is already cleaned up");
+ return 0;
+ }
+ if(!q->fs || !q->fs->aqmcfg) {
+ D("fs is null or no cfg");
+ return 1;
+ }
+ if (q->fs->aqmfp && q->fs->aqmfp->type !=DN_AQM_PIE) {
+ D("Not PIE fs (%d)", q->fs->fs.fs_nr);
+ return 1;
+ }
+
+ mtx_lock(&pst->lock_mtx);
+
+ /* stop callout timer */
+ if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) {
+ mtx_unlock(&pst->lock_mtx);
+ mtx_destroy(&pst->lock_mtx);
+ free(q->aqm_status, M_DUMMYNET);
+ q->aqm_status = NULL;
+ pie_desc.ref_count--;
+ return 0;
+ } else {
+ q->aqm_status = NULL;
+ mtx_unlock(&pst->lock_mtx);
+		DX(2, "PIE callout has not been stopped from cleanup!");
+ return EBUSY;
+ }
+ return 0;
+}
+
+/*
+ * Config PIE parameters
+ * also allocate memory for PIE configurations
+ */
+static int
+aqm_pie_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len)
+{
+ struct dn_aqm_pie_parms *pcfg;
+
+ int l = sizeof(struct dn_extra_parms);
+ if (len < l) {
+ D("invalid sched parms length got %d need %d", len, l);
+ return EINVAL;
+ }
+	/* we free the old cfg because the original allocation
+	 * may have been used for a different AQM type.
+ */
+ if (fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ }
+
+ fs->aqmcfg = malloc(sizeof(struct dn_aqm_pie_parms),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs->aqmcfg== NULL) {
+ D("cannot allocate PIE configuration parameters");
+ return ENOMEM;
+ }
+
+	/* par array contains the PIE configuration as follows:
+	 * 0- qdelay_ref, 1- tupdate, 2- max_burst
+ * 3- max_ecnth, 4- alpha, 5- beta, 6- flags
+ */
+
+ /* configure PIE parameters */
+ pcfg = fs->aqmcfg;
+
+ if (ep->par[0] < 0)
+ pcfg->qdelay_ref = pie_sysctl.qdelay_ref * AQM_TIME_1US;
+ else
+ pcfg->qdelay_ref = ep->par[0];
+ if (ep->par[1] < 0)
+ pcfg->tupdate = pie_sysctl.tupdate * AQM_TIME_1US;
+ else
+ pcfg->tupdate = ep->par[1];
+ if (ep->par[2] < 0)
+ pcfg->max_burst = pie_sysctl.max_burst * AQM_TIME_1US;
+ else
+ pcfg->max_burst = ep->par[2];
+ if (ep->par[3] < 0)
+ pcfg->max_ecnth = pie_sysctl.max_ecnth;
+ else
+ pcfg->max_ecnth = ep->par[3];
+ if (ep->par[4] < 0)
+ pcfg->alpha = pie_sysctl.alpha;
+ else
+ pcfg->alpha = ep->par[4];
+ if (ep->par[5] < 0)
+ pcfg->beta = pie_sysctl.beta;
+ else
+ pcfg->beta = ep->par[5];
+ if (ep->par[6] < 0)
+ pcfg->flags = pie_sysctl.flags;
+ else
+ pcfg->flags = ep->par[6];
+
+ /* bound PIE configurations */
+ pcfg->qdelay_ref = BOUND_VAR(pcfg->qdelay_ref, 1, 10 * AQM_TIME_1S);
+ pcfg->tupdate = BOUND_VAR(pcfg->tupdate, 1, 10 * AQM_TIME_1S);
+ pcfg->max_burst = BOUND_VAR(pcfg->max_burst, 0, 10 * AQM_TIME_1S);
+ pcfg->max_ecnth = BOUND_VAR(pcfg->max_ecnth, 0, PIE_SCALE);
+ pcfg->alpha = BOUND_VAR(pcfg->alpha, 0, 7 * PIE_SCALE);
+ pcfg->beta = BOUND_VAR(pcfg->beta, 0 , 7 * PIE_SCALE);
+
+ pie_desc.cfg_ref_count++;
+ //D("pie cfg_ref_count=%d", pie_desc.cfg_ref_count);
+ return 0;
+}
+
+/*
+ * Deconfigure PIE and free memory allocation
+ */
+static int
+aqm_pie_deconfig(struct dn_fsk* fs)
+{
+ if (fs && fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ pie_desc.cfg_ref_count--;
+ }
+ return 0;
+}
+
+/*
+ * Retrieve PIE configuration parameters.
+ */
+static int
+aqm_pie_getconfig (struct dn_fsk *fs, struct dn_extra_parms * ep)
+{
+ struct dn_aqm_pie_parms *pcfg;
+ if (fs->aqmcfg) {
+ strcpy(ep->name, pie_desc.name);
+ pcfg = fs->aqmcfg;
+ ep->par[0] = pcfg->qdelay_ref / AQM_TIME_1US;
+ ep->par[1] = pcfg->tupdate / AQM_TIME_1US;
+ ep->par[2] = pcfg->max_burst / AQM_TIME_1US;
+ ep->par[3] = pcfg->max_ecnth;
+ ep->par[4] = pcfg->alpha;
+ ep->par[5] = pcfg->beta;
+ ep->par[6] = pcfg->flags;
+
+ return 0;
+ }
+ return 1;
+}
+
+static struct dn_aqm pie_desc = {
+ _SI( .type = ) DN_AQM_PIE,
+ _SI( .name = ) "PIE",
+ _SI( .ref_count = ) 0,
+ _SI( .cfg_ref_count = ) 0,
+ _SI( .enqueue = ) aqm_pie_enqueue,
+ _SI( .dequeue = ) aqm_pie_dequeue,
+ _SI( .config = ) aqm_pie_config,
+ _SI( .deconfig = ) aqm_pie_deconfig,
+ _SI( .getconfig = ) aqm_pie_getconfig,
+ _SI( .init = ) aqm_pie_init,
+ _SI( .cleanup = ) aqm_pie_cleanup,
+};
+
+DECLARE_DNAQM_MODULE(dn_aqm_pie, &pie_desc);
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_pie.h b/sys/netpfil/ipfw/dn_aqm_pie.h
new file mode 100644
index 0000000..aa2fceb
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_pie.h
@@ -0,0 +1,153 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_PIE_H
+#define _IP_DN_AQM_PIE_H
+
+#define DN_AQM_PIE 2
+#define PIE_DQ_THRESHOLD_BITS 14
+/* 2^14 =16KB */
+#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS)
+#define MEAN_PKTSIZE 800
+
+/* 31-bits because random() generates range from 0->(2**31)-1 */
+#define PIE_PROB_BITS 31
+#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1)
+
+/* for 16-bits, we have 3-bits for integer part and 13-bits for fraction */
+#define PIE_FIX_POINT_BITS 13
+#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS)
+
+
+/* PIE options */
+enum {
+ PIE_ECN_ENABLED =1,
+ PIE_CAPDROP_ENABLED = 2,
+ PIE_ON_OFF_MODE_ENABLED = 4,
+ PIE_DEPRATEEST_ENABLED = 8,
+ PIE_DERAND_ENABLED = 16
+};
+
+/* PIE parameters */
+struct dn_aqm_pie_parms {
+ aqm_time_t qdelay_ref; /* AQM Latency Target (default: 15ms) */
+ aqm_time_t tupdate; /* a period to calculate drop probability (default:15ms) */
+ aqm_time_t max_burst; /* AQM Max Burst Allowance (default: 150ms) */
+ uint16_t max_ecnth; /*AQM Max ECN Marking Threshold (default: 10%) */
+ uint16_t alpha; /* (default: 1/8) */
+ uint16_t beta; /* (default: 1+1/4) */
+ uint32_t flags; /* PIE options */
+};
+
+/* PIE status variables */
+struct pie_status{
+ struct callout aqm_pie_callout;
+ aqm_time_t burst_allowance;
+ uint32_t drop_prob;
+ aqm_time_t current_qdelay;
+ aqm_time_t qdelay_old;
+ uint64_t accu_prob;
+ aqm_time_t measurement_start;
+ aqm_time_t avg_dq_time;
+ uint32_t dq_count;
+ uint32_t sflags;
+ struct dn_aqm_pie_parms *parms; /* pointer to PIE configurations */
+ /* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */
+ struct dn_queue *pq;
+ struct mtx lock_mtx;
+	uint32_t one_third_q_size; /* 1/3 of queue size, for speed optimization */
+};
+
+enum {
+ ENQUE = 1,
+ DROP,
+ MARKECN
+};
+
+/* PIE current state */
+enum {
+ PIE_ACTIVE = 1,
+ PIE_INMEASUREMENT = 2
+};
+
+/*
+ * Check whether the enqueue operation should drop the packet, based on
+ * the PIE algorithm, to control delay.
+ * Return DROP if it is time to drop, or ENQUE otherwise.
+ * This function is used by PIE and FQ-PIE.
+ */
+__inline static int
+drop_early(struct pie_status *pst, uint32_t qlen)
+{
+ struct dn_aqm_pie_parms *pprms;
+
+ pprms = pst->parms;
+
+ /* queue is not congested */
+
+ if ((pst->qdelay_old < (pprms->qdelay_ref >> 1)
+ && pst->drop_prob < PIE_MAX_PROB / 5 )
+ || qlen <= 2 * MEAN_PKTSIZE)
+ return ENQUE;
+
+
+ if (pst->drop_prob == 0)
+ pst->accu_prob = 0;
+
+ /* increment accu_prob */
+ if (pprms->flags & PIE_DERAND_ENABLED)
+ pst->accu_prob += pst->drop_prob;
+
+	/* De-randomize option:
+	 * if accu_prob < 0.85 -> enqueue
+	 * if accu_prob > 8.5 -> drop
+	 * between 0.85 and 8.5, or if de-randomization is disabled,
+	 * drop based on probability.
+	 *
+	 * (0.85 = 17/20, 8.5 = 17/2)
+	 */
+ if (pprms->flags & PIE_DERAND_ENABLED) {
+ if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 17 / 20))
+ return ENQUE;
+ if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 17 / 2))
+ return DROP;
+ }
+
+ if (random() < pst->drop_prob) {
+ pst->accu_prob = 0;
+ return DROP;
+ }
+
+ return ENQUE;
+}
+
+#endif
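
As a worked example of the de-randomization bounds in drop_early() above: assuming drop_prob stays at 5% of PIE_MAX_PROB, accu_prob reaches the lower bound (0.85 * PIE_MAX_PROB) after roughly 0.85 / 0.05 = 17 packets, so about 17 packets after a drop are always enqueued, and it would take around 170 packets to reach the upper bound (8.5 * PIE_MAX_PROB) that forces a drop. The short standalone program below is not kernel code; it simply confirms the first figure (integer rounding can add one packet).

#include <stdint.h>
#include <stdio.h>

#define PIE_MAX_PROB	((1ULL << 31) - 1)

int
main(void)
{
	uint64_t drop_prob = PIE_MAX_PROB / 20;		/* 5% */
	uint64_t accu = 0;
	int pkts = 0;

	/* count packets until accu_prob reaches 0.85 * PIE_MAX_PROB */
	while (accu < PIE_MAX_PROB * 17 / 20) {
		accu += drop_prob;
		pkts++;
	}
	/* prints 17 or 18 depending on integer rounding */
	printf("%d packets before random drops start\n", pkts);
	return (0);
}
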
diff --git a/sys/netpfil/ipfw/dn_sched.h b/sys/netpfil/ipfw/dn_sched.h
index ab823fe..a359198 100644
--- a/sys/netpfil/ipfw/dn_sched.h
+++ b/sys/netpfil/ipfw/dn_sched.h
@@ -132,6 +132,10 @@ struct dn_alg {
int (*free_fsk)(struct dn_fsk *f);
int (*new_queue)(struct dn_queue *q);
int (*free_queue)(struct dn_queue *q);
+#ifdef NEW_AQM
+ /* Getting scheduler extra parameters */
+ int (*getconfig)(struct dn_schk *, struct dn_extra_parms *);
+#endif
/* run-time fields */
int ref_count; /* XXX number of instances in the system */
@@ -165,6 +169,11 @@ dn_dequeue(struct dn_queue *q)
struct mbuf *m = q->mq.head;
if (m == NULL)
return NULL;
+#ifdef NEW_AQM
+ /* Call AQM dequeue function */
+ if (q->fs->aqmfp && q->fs->aqmfp->dequeue )
+ return q->fs->aqmfp->dequeue(q);
+#endif
q->mq.head = m->m_nextpkt;
/* Update stats for the queue */
diff --git a/sys/netpfil/ipfw/dn_sched_fifo.c b/sys/netpfil/ipfw/dn_sched_fifo.c
index e2aa608..a4a2a70 100644
--- a/sys/netpfil/ipfw/dn_sched_fifo.c
+++ b/sys/netpfil/ipfw/dn_sched_fifo.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -115,6 +118,9 @@ static struct dn_alg fifo_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) NULL,
_SI( .free_queue = ) NULL,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c
new file mode 100644
index 0000000..c783730
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c
@@ -0,0 +1,617 @@
+/*
+ * FQ_Codel - The FlowQueue-Codel scheduler/AQM
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+//#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <sys/sysctl.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <sys/queue.h>
+#include <sys/hash.h>
+
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_codel.h>
+#include <netpfil/ipfw/dn_sched.h>
+#include <netpfil/ipfw/dn_sched_fq_codel.h>
+#include <netpfil/ipfw/dn_sched_fq_codel_helper.h>
+
+#else
+#include <dn_test.h>
+#endif
+
+/* NOTE: In the fq_codel module, we reimplement the CoDel AQM functions
+ * because fq_codel uses a different flow (sub-queue) structure and
+ * dn_queue includes many variables not needed by a flow (sub-queue),
+ * i.e. we avoid extra overhead (88 bytes vs 208 bytes).
+ * Also, the CoDel functions manage the stats of the sub-queues as well
+ * as the main queue.
+ */
+
+#define DN_SCHED_FQ_CODEL 6
+
+static struct dn_alg fq_codel_desc;
+
+/* fq_codel default parameters including codel */
+struct dn_sch_fq_codel_parms
+fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US,
+ CODEL_ECN_ENABLED}, 1024, 10240, 1514};
+
+static int
+fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = fq_codel_sysctl.ccfg.interval;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 100 * AQM_TIME_1S)
+ return (EINVAL);
+ fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ;
+
+ return (0);
+}
+
+static int
+fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = fq_codel_sysctl.ccfg.target;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 5 * AQM_TIME_1S)
+ return (EINVAL);
+ fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ;
+
+ return (0);
+}
+
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel,
+ CTLFLAG_RW, 0, "FQ_CODEL");
+
+#ifdef SYSCTL_NODE
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L",
+ "FQ_CoDel target in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L",
+ "FQ_CoDel interval in microsecond");
+
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum,
+ CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows,
+ CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024,
+ "Number of queues for FQ_CoDel");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit,
+ CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit");
+#endif
+
+/* Drop a packet from the head of the codel queue */
+static void
+codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return;
+ q->mq.head = m->m_nextpkt;
+
+ fq_update_stats(q, si, -m->m_pkthdr.len, 1);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+
+ FREE_PKT(m);
+}
+
+/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet.
+ * Return 1 when unable to add timestamp, otherwise return 0
+ */
+static int
+codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si)
+{
+ uint64_t len;
+
+ len = m->m_pkthdr.len;
+ /* finding maximum packet size */
+ if (len > q->cst.maxpkt_size)
+ q->cst.maxpkt_size = len;
+
+ /* Add timestamp to mbuf as MTAG */
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL)
+ mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t),
+ M_NOWAIT);
+	if (mtag == NULL)
+		goto drop;
+ *(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+ m_tag_prepend(m, mtag);
+
+ mq_append(&q->mq, m);
+ fq_update_stats(q, si, len, 0);
+ return 0;
+
+drop:
+ fq_update_stats(q, si, len, 1);
+ m_freem(m);
+ return 1;
+}
+
+/*
+ * Classify a packet into a queue number using the Jenkins hash function.
+ * The hash input is the protocol number, the perturbation value,
+ * src IP, dst IP, src port and dst port.
+ * Return: queue number.
+ */
+static inline int
+fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si)
+{
+ struct ip *ip;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ uint8_t tuple[41];
+ uint16_t hash=0;
+
+//#ifdef INET6
+ struct ip6_hdr *ip6;
+ int isip6;
+ isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+
+ if(isip6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ *((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
+ memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
+
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = th->th_dport;
+ *((uint16_t *) &tuple[39]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = uh->uh_dport;
+ *((uint16_t *) &tuple[39]) = uh->uh_sport;
+ break;
+ default:
+ memset(&tuple[37], 0, 4);
+
+ }
+
+ hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;
+ return hash;
+ }
+//#endif
+
+ /* IPv4 */
+ ip = mtod(m, struct ip *);
+ *((uint8_t *) &tuple[0]) = ip->ip_p;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
+ *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = th->th_dport;
+ *((uint16_t *) &tuple[15]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = uh->uh_dport;
+ *((uint16_t *) &tuple[15]) = uh->uh_sport;
+ break;
+ default:
+ memset(&tuple[13], 0, 4);
+
+ }
+ hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;
+
+ return hash;
+}
+
+/*
+ * Enqueue a packet into an appropriate queue according to
+ * FQ_CODEL algorithm.
+ */
+static int
+fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
+ struct mbuf *m)
+{
+ struct fq_codel_si *si;
+ struct fq_codel_schk *schk;
+ struct dn_sch_fq_codel_parms *param;
+ struct dn_queue *mainq;
+ int idx, drop, i, maxidx;
+
+ mainq = (struct dn_queue *)(_si + 1);
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ /* classify a packet to queue number*/
+ idx = fq_codel_classify_flow(m, param->flows_cnt, si);
+	/* enqueue the packet into the appropriate queue using CoDel AQM.
+	 * Note: the 'codel_enqueue' function returns 1 only when it is unable
+	 * to add a timestamp to the packet (no limit check). */
+ drop = codel_enqueue(&si->flows[idx], m, si);
+
+ /* codel unable to timestamp a packet */
+ if (drop)
+ return 1;
+
+	/* If the flow (sub-queue) is not active, then add it to the tail of
+	 * the new flows list, initialize it and activate it.
+ */
+ if (!si->flows[idx].active ) {
+ STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
+ si->flows[idx].deficit = param->quantum;
+ si->flows[idx].cst.dropping = false;
+ si->flows[idx].cst.first_above_time = 0;
+ si->flows[idx].active = 1;
+ //D("activate %d",idx);
+ }
+
+ /* check the limit for all queues and remove a packet from the
+ * largest one
+ */
+ if (mainq->ni.length > schk->cfg.limit) { D("over limit");
+ /* find first active flow */
+ for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
+ if (si->flows[maxidx].active)
+ break;
+ if (maxidx < schk->cfg.flows_cnt) {
+			/* find the largest sub-queue */
+ for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
+ if (si->flows[i].active && si->flows[i].stats.length >
+ si->flows[maxidx].stats.length)
+ maxidx = i;
+ codel_drop_head(&si->flows[maxidx], si);
+ D("maxidx = %d",maxidx);
+ drop = 1;
+ }
+ }
+
+ return drop;
+}
+
+/*
+ * Dequeue a packet from an appropriate queue according to
+ * FQ_CODEL algorithm.
+ */
+static struct mbuf *
+fq_codel_dequeue(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si;
+ struct fq_codel_schk *schk;
+ struct dn_sch_fq_codel_parms *param;
+ struct fq_codel_flow *f;
+ struct mbuf *mbuf;
+ struct fq_codel_list *fq_codel_flowlist;
+
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ do {
+ /* select a list to start with */
+ if (STAILQ_EMPTY(&si->newflows))
+ fq_codel_flowlist = &si->oldflows;
+ else
+ fq_codel_flowlist = &si->newflows;
+
+ /* Both new and old queue lists are empty, return NULL */
+ if (STAILQ_EMPTY(fq_codel_flowlist))
+ return NULL;
+
+ f = STAILQ_FIRST(fq_codel_flowlist);
+ while (f != NULL) {
+			/* If the flow (sub-queue) has exhausted its deficit,
+			 * increase the deficit by quantum, move the flow to the
+			 * tail of the old flows list and try another flow.
+			 * Otherwise, the flow will be used for dequeue.
+			 */
+ if (f->deficit < 0) {
+ f->deficit += param->quantum;
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else
+ break;
+
+ f = STAILQ_FIRST(fq_codel_flowlist);
+ }
+
+ /* the new flows list is empty, try old flows list */
+ if (STAILQ_EMPTY(fq_codel_flowlist))
+ continue;
+
+ /* Dequeue a packet from the selected flow */
+ mbuf = fqc_codel_dequeue(f, si);
+
+ /* Codel did not return a packet */
+ if (!mbuf) {
+			/* If the selected flow belongs to the new flows list, then
+			 * move it to the tail of the old flows list. Otherwise,
+			 * deactivate it and remove it from the old flows list.
+			 */
+ if (fq_codel_flowlist == &si->newflows) {
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else {
+ f->active = 0;
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ }
+ /* start again */
+ continue;
+ }
+
+ /* we have a packet to return,
+ * update flow deficit and return the packet*/
+ f->deficit -= mbuf->m_pkthdr.len;
+ return mbuf;
+
+ } while (1);
+
+ /* unreachable point */
+ return NULL;
+}
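
The loop above is a standard deficit round-robin: a flow may keep dequeuing while its deficit is non-negative, and once the deficit goes negative the flow is credited one quantum (1514 bytes by default) and pushed to the back of the old-flows list. A minimal sketch of that rule in isolation follows; the helper name drr_may_dequeue is hypothetical and not part of the scheduler code.

/*
 * Minimal sketch (not kernel code) of the deficit round-robin rule:
 * returns 1 if the flow may send 'pkt_len' bytes now, or 0 if it must
 * be credited one quantum and wait for the next round.
 */
static int
drr_may_dequeue(int *deficit, int pkt_len, int quantum)
{
	if (*deficit < 0) {
		*deficit += quantum;	/* credit and defer to next round */
		return (0);
	}
	*deficit -= pkt_len;		/* send now and pay for the bytes */
	return (1);
}
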
+
+/*
+ * Initialize fq_codel scheduler instance.
+ * also, allocate memory for flows array.
+ */
+static int
+fq_codel_new_sched(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si;
+ struct dn_queue *q;
+ struct fq_codel_schk *schk;
+ int i;
+
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(_si->sched+1);
+
+ if(si->flows) {
+ D("si already configured!");
+ return 0;
+ }
+
+ /* init the main queue */
+ q = &si->main_q;
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = _si;
+ q->fs = _si->sched->fs;
+
+ /* allocate memory for flows array */
+ si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si->flows == NULL) {
+ D("cannot allocate memory for fq_codel configuration parameters");
+ return ENOMEM ;
+ }
+
+ /* init perturbation for this si */
+ si->perturbation = random();
+
+ /* init the old and new flows lists */
+ STAILQ_INIT(&si->newflows);
+ STAILQ_INIT(&si->oldflows);
+
+ /* init the flows (sub-queues) */
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ /* init codel */
+ si->flows[i].cst.maxpkt_size = 500;
+ }
+
+ fq_codel_desc.ref_count++;
+ return 0;
+}
+
+/*
+ * Free fq_codel scheduler instance.
+ */
+static int
+fq_codel_free_sched(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si = (struct fq_codel_si *)_si ;
+
+ /* free the flows array */
+ free(si->flows , M_DUMMYNET);
+ si->flows = NULL;
+ fq_codel_desc.ref_count--;
+
+ return 0;
+}
+
+/*
+ * Configure the fq_codel scheduler.
+ * The configuration for the scheduler is passed from userland.
+ */
+static int
+fq_codel_config(struct dn_schk *_schk)
+{
+ struct fq_codel_schk *schk;
+ struct dn_extra_parms *ep;
+ struct dn_sch_fq_codel_parms *fqc_cfg;
+
+ schk = (struct fq_codel_schk *)(_schk+1);
+ ep = (struct dn_extra_parms *) _schk->cfg;
+
+	/* par array contains the fq_codel configuration as follows:
+ * Codel: 0- target,1- interval, 2- flags
+ * FQ_CODEL: 3- quantum, 4- limit, 5- flows
+ */
+ if (ep && ep->oid.len ==sizeof(*ep) &&
+ ep->oid.subtype == DN_SCH_PARAMS) {
+
+ fqc_cfg = &schk->cfg;
+ if (ep->par[0] < 0)
+ fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target;
+ else
+ fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US;
+
+ if (ep->par[1] < 0)
+ fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval;
+ else
+ fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US;
+
+ if (ep->par[2] < 0)
+ fqc_cfg->ccfg.flags = 0;
+ else
+ fqc_cfg->ccfg.flags = ep->par[2];
+
+ /* FQ configurations */
+ if (ep->par[3] < 0)
+ fqc_cfg->quantum = fq_codel_sysctl.quantum;
+ else
+ fqc_cfg->quantum = ep->par[3];
+
+ if (ep->par[4] < 0)
+ fqc_cfg->limit = fq_codel_sysctl.limit;
+ else
+ fqc_cfg->limit = ep->par[4];
+
+ if (ep->par[5] < 0)
+ fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt;
+ else
+ fqc_cfg->flows_cnt = ep->par[5];
+
+ /* Bound the configurations */
+		fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1,
+			5 * AQM_TIME_1S);
+ fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1,
+ 100 * AQM_TIME_1S);
+
+ fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000);
+ fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480);
+ fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536);
+ }
+ else
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Return the fq_codel scheduler configuration.
+ * The configuration for the scheduler is passed to userland.
+ */
+static int
+fq_codel_getconfig(struct dn_schk *_schk, struct dn_extra_parms *ep)
+{
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1);
+ struct dn_sch_fq_codel_parms *fqc_cfg;
+
+ fqc_cfg = &schk->cfg;
+
+ strcpy(ep->name, fq_codel_desc.name);
+ ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US;
+ ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US;
+ ep->par[2] = fqc_cfg->ccfg.flags;
+
+ ep->par[3] = fqc_cfg->quantum;
+ ep->par[4] = fqc_cfg->limit;
+ ep->par[5] = fqc_cfg->flows_cnt;
+
+ return 0;
+}
+
+/*
+ * fq_codel scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fq_codel_desc = {
+ _SI( .type = ) DN_SCHED_FQ_CODEL,
+ _SI( .name = ) "FQ_CODEL",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) sizeof(struct fq_codel_schk),
+ _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fq_codel_enqueue,
+ _SI( .dequeue = ) fq_codel_dequeue,
+ _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/
+ _SI( .destroy = ) NULL, /*sched x delete */
+ _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */
+ _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+ _SI( .getconfig = ) fq_codel_getconfig,
+ _SI( .ref_count = ) 0
+};
+
+DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc);
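
The casts used throughout this file, such as (struct fq_codel_schk *)(_schk + 1) and (struct dn_queue *)(_si + 1), rely on dummynet placing the scheduler-private areas declared through schk_datalen and si_datalen immediately after the generic dn_schk and dn_sch_inst objects, and on struct fq_codel_si embedding struct dn_sch_inst as its first member. The two accessors below are only a sketch of that convention; the helper names are not part of the code above.

/*
 * Sketch of the dummynet layout convention: private scheduler data
 * follows the generic object it extends, so "+ 1" pointer arithmetic
 * reaches it.  Helper names are illustrative.
 */
static inline struct fq_codel_schk *
fq_codel_schk_priv(struct dn_schk *_schk)
{
	/* per-scheduler config lives right after struct dn_schk */
	return ((struct fq_codel_schk *)(_schk + 1));
}

static inline struct fq_codel_si *
fq_codel_si_priv(struct dn_sch_inst *_si)
{
	/* struct fq_codel_si embeds struct dn_sch_inst as its first member */
	return ((struct fq_codel_si *)_si);
}
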
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.h b/sys/netpfil/ipfw/dn_sched_fq_codel.h
new file mode 100644
index 0000000..4b65781
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * FQ_Codel Structures and helper functions
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_H
+#define _IP_DN_SCHED_FQ_CODEL_H
+
+/* list of queues */
+STAILQ_HEAD(fq_codel_list, fq_codel_flow) ;
+
+/* fq_codel parameters including codel */
+struct dn_sch_fq_codel_parms {
+ struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */
+ /* FQ_CODEL Parameters */
+ uint32_t flows_cnt; /* number of flows */
+ uint32_t limit; /* hard limit of fq_codel queue size*/
+ uint32_t quantum;
+}; /* defaults */
+
+/* flow (sub-queue) stats */
+struct flow_stats {
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+ uint32_t length; /* Queue length, in packets */
+ uint32_t len_bytes; /* Queue length, in bytes */
+ uint32_t drops;
+};
+
+/* A flow of packets (sub-queue).*/
+struct fq_codel_flow {
+ struct mq mq; /* list of packets */
+ struct flow_stats stats; /* statistics */
+ int deficit;
+ int active; /* 1: flow is active (in a list) */
+ struct codel_status cst;
+ STAILQ_ENTRY(fq_codel_flow) flowchain;
+};
+
+/* extra fq_codel scheduler configurations */
+struct fq_codel_schk {
+ struct dn_sch_fq_codel_parms cfg;
+};
+
+/* fq_codel scheduler instance */
+struct fq_codel_si {
+ struct dn_sch_inst _si; /* standard scheduler instance */
+ struct dn_queue main_q; /* main queue is after si directly */
+
+ struct fq_codel_flow *flows; /* array of flows (queues) */
+ uint32_t perturbation; /* random value */
+ struct fq_codel_list newflows; /* list of new queues */
+ struct fq_codel_list oldflows; /* list of old queues */
+};
+
+/* Helper function to update queue&main-queue and scheduler statistics.
+ * negative len + drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len,
+ int drop)
+{
+ int inc = 0;
+
+ if (len < 0)
+ inc = -1;
+ else if (len > 0)
+ inc = 1;
+
+ if (drop) {
+ si->main_q.ni.drops ++;
+ q->stats.drops ++;
+ si->_si.ni.drops ++;
+ io_pkt_drop ++;
+ }
+
+ if (!drop || (drop && len < 0)) {
+ /* Update stats for the main queue */
+ si->main_q.ni.length += inc;
+ si->main_q.ni.len_bytes += len;
+
+ /*update sub-queue stats */
+ q->stats.length += inc;
+ q->stats.len_bytes += len;
+
+ /*update scheduler instance stats */
+ si->_si.ni.length += inc;
+ si->_si.ni.len_bytes += len;
+ }
+
+ if (inc > 0) {
+ si->main_q.ni.tot_bytes += len;
+ si->main_q.ni.tot_pkts ++;
+
+ q->stats.tot_bytes +=len;
+ q->stats.tot_pkts++;
+
+ si->_si.ni.tot_bytes +=len;
+ si->_si.ni.tot_pkts ++;
+ }
+
+}
+
+/* extract the head of fq_codel sub-queue */
+__inline static struct mbuf *
+fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return m;
+ q->mq.head = m->m_nextpkt;
+
+ fq_update_stats(q, si, -m->m_pkthdr.len, 0);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+
+ /* extract packet timestamp*/
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL){
+ D("timestamp tag is not found!");
+ *pkt_ts = 0;
+ } else {
+ *pkt_ts = *(aqm_time_t *)(mtag + 1);
+ m_tag_delete(m,mtag);
+ }
+
+ return m;
+}
+
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
new file mode 100644
index 0000000..da663dc
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
@@ -0,0 +1,187 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H
+#define _IP_DN_SCHED_FQ_CODEL_HELPER_H
+
+__inline static struct mbuf *
+fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop,
+ struct fq_codel_si *si)
+{
+ struct mbuf * m;
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ aqm_time_t pkt_ts, sojourn_time;
+
+ *ok_to_drop = 0;
+ m = fq_codel_extract_head(q, &pkt_ts, si);
+
+ if (m == NULL) {
+		/* queue is empty - we can't be above target */
+		q->cst.first_above_time = 0;
+ return m;
+ }
+
+ /* To span a large range of bandwidths, CoDel runs two
+ * different AQMs in parallel. One is sojourn-time-based
+ * and takes effect when the time to send an MTU-sized
+ * packet is less than target. The 1st term of the "if"
+ * below does this. The other is backlog-based and takes
+ * effect when the time to send an MTU-sized packet is >=
+ * target. The goal here is to keep the output link
+ * utilization high by never allowing the queue to get
+ * smaller than the amount that arrives in a typical
+ * interarrival time (MTU-sized packets arriving spaced
+ * by the amount of time it takes to send such a packet on
+ * the bottleneck). The 2nd term of the "if" does this.
+ */
+ sojourn_time = now - pkt_ts;
+ if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) {
+ /* went below - stay below for at least interval */
+ q->cst.first_above_time = 0;
+ } else {
+ if (q->cst.first_above_time == 0) {
+ /* just went above from below. if still above at
+ * first_above_time, will say it's ok to drop. */
+ q->cst.first_above_time = now + schk->cfg.ccfg.interval;
+ } else if (now >= q->cst.first_above_time) {
+ *ok_to_drop = 1;
+ }
+ }
+ return m;
+}
+
+/* Codel dequeue function */
+__inline static struct mbuf *
+fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si)
+{
+ struct mbuf *m;
+ struct dn_aqm_codel_parms *cprms;
+ struct codel_status *cst;
+ aqm_time_t now;
+ uint16_t ok_to_drop;
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+
+ cst = &q->cst;
+ cprms = &schk->cfg.ccfg;
+
+ now = AQM_UNOW;
+ m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+ if (cst->dropping) {
+ if (!ok_to_drop) {
+ /* sojourn time below target - leave dropping state */
+ cst->dropping = false;
+ }
+
+ /* Time for the next drop. Drop current packet and dequeue
+ * next. If the dequeue doesn't take us out of dropping
+ * state, schedule the next drop. A large backlog might
+ * result in drop rates so high that the next drop should
+ * happen now, hence the 'while' loop.
+ */
+ while (now >= cst->drop_next_time && cst->dropping) {
+
+ /* mark the packet */
+ if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+ cst->count++;
+ /* schedule the next mark. */
+ cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+ return m;
+ }
+
+ /* drop the packet */
+ fq_update_stats(q, si, 0, 1);
+ m_freem(m);
+ m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+ if (!ok_to_drop) {
+ /* leave dropping state */
+ cst->dropping = false;
+ } else {
+ cst->count++;
+ /* schedule the next drop. */
+ cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+ }
+ }
+ /* If we get here we're not in dropping state. The 'ok_to_drop'
+ * return from dodequeue means that the sojourn time has been
+ * above 'target' for 'interval' so enter dropping state.
+ */
+ } else if (ok_to_drop) {
+
+ /* if ECN option is disabled or the packet cannot be marked,
+ * drop the packet and extract another.
+ */
+ if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+ fq_update_stats(q, si, 0, 1);
+ m_freem(m);
+ m = fqc_dodequeue(q, now, &ok_to_drop,si);
+ }
+
+ cst->dropping = true;
+
+ /* If min went above target close to when it last went
+ * below, assume that the drop rate that controlled the
+ * queue on the last cycle is a good starting point to
+ * control it now. ('drop_next' will be at most 'interval'
+ * later than the time of the last drop so 'now - drop_next'
+ * is a good approximation of the time from the last drop
+ * until now.)
+ */
+ cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+ (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? cst->count - 2 : 1;
+
+ /* we don't have to set initial guess for Newton's method isqrt as
+ * we initilaize isqrt in control_law function when count == 1 */
+ cst->drop_next_time = control_law(cst, cprms, now);
+ }
+
+ return m;
+}
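+
+/*
+ * Reading aid for the state machine above: a flow enters the dropping state
+ * once the sojourn time has stayed above 'target' for 'interval'; while
+ * dropping, packets are dropped (or ECN-marked) at the times returned by
+ * control_law(), and the state is left as soon as a dequeue finds the
+ * sojourn time below target or the backlog at or below one MTU.
+ */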
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_sched_fq_pie.c b/sys/netpfil/ipfw/dn_sched_fq_pie.c
new file mode 100644
index 0000000..2883cf8
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_pie.c
@@ -0,0 +1,1262 @@
+/*
+ * FQ_PIE - The FlowQueue-PIE scheduler/AQM
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Important note:
+ * As there is no official document for the FQ-PIE specification, we used
+ * the FQ-CoDel algorithm with some modifications to implement FQ-PIE.
+ * This FQ-PIE implementation is a beta version and has not been tested
+ * extensively. Our FQ-PIE uses a stand-alone PIE AQM per sub-queue. By
+ * default, a timestamp is used to calculate queue delay instead of the
+ * departure rate estimation method. Although departure rate estimation is
+ * available as a testing option, its results could be incorrect. Moreover,
+ * the option to turn PIE on and off is available but does not work
+ * properly in this version.
+ */
+
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <sys/sysctl.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <sys/queue.h>
+#include <sys/hash.h>
+
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_pie.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_FQ_PIE 7
+
+/* list of queues */
+STAILQ_HEAD(fq_pie_list, fq_pie_flow) ;
+
+/* FQ_PIE parameters including PIE */
+struct dn_sch_fq_pie_parms {
+ struct dn_aqm_pie_parms pcfg; /* PIE configuration Parameters */
+ /* FQ_PIE Parameters */
+ uint32_t flows_cnt; /* number of flows */
+ uint32_t limit; /* hard limit of FQ_PIE queue size*/
+ uint32_t quantum;
+};
+
+/* flow (sub-queue) stats */
+struct flow_stats {
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+ uint32_t length; /* Queue length, in packets */
+ uint32_t len_bytes; /* Queue length, in bytes */
+ uint32_t drops;
+};
+
+/* A flow of packets (sub-queue)*/
+struct fq_pie_flow {
+ struct mq mq; /* list of packets */
+ struct flow_stats stats; /* statistics */
+ int deficit;
+ int active; /* 1: flow is active (in a list) */
+ struct pie_status pst; /* pie status variables */
+ struct fq_pie_si *psi; /* parent scheduler instance */
+ STAILQ_ENTRY(fq_pie_flow) flowchain;
+};
+
+/* extra fq_pie scheduler configurations */
+struct fq_pie_schk {
+ struct dn_sch_fq_pie_parms cfg;
+};
+
+/* fq_pie scheduler instance */
+struct fq_pie_si {
+ struct dn_sch_inst _si; /* standard scheduler instance */
+ struct dn_queue main_q; /* main queue is after si directly */
+ uint32_t nr_active_q;
+ struct fq_pie_flow *flows; /* array of flows (queues) */
+ uint32_t perturbation; /* random value */
+ struct fq_pie_list newflows; /* list of new queues */
+ struct fq_pie_list oldflows; /* list of old queues */
+};
+
+
+struct mem_to_free {
+ void *mem_flows;
+ void *mem_callout;
+};
+static struct mtx freemem_mtx;
+static struct dn_alg fq_pie_desc;
+
+/* Default FQ-PIE parameters including PIE */
+/* PIE defaults
+ * target=15ms, max_burst=150ms, max_ecnth=0.1,
+ * alpha=0.125, beta=1.25, tupdate=15ms
+ * FQ defaults:
+ * flows=1024, limit=10240, quantum=1514
+ */
+struct dn_sch_fq_pie_parms
+ fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US,
+ 150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125,
+ PIE_SCALE * 1.25, PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED},
+ 1024, 10240, 1514};
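+
+/*
+ * Note: the initializer above depends on the field order of
+ * struct dn_aqm_pie_parms (assumed to be qdelay_ref, tupdate, max_burst,
+ * max_ecnth, alpha, beta, flags), followed by flows_cnt, limit and quantum.
+ */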
+
+static int
+fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ if (!strcmp(oidp->oid_name,"alpha"))
+ value = fq_pie_sysctl.pcfg.alpha;
+ else
+ value = fq_pie_sysctl.pcfg.beta;
+
+ value = value * 1000 / PIE_SCALE;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 7 * PIE_SCALE)
+ return (EINVAL);
+ value = (value * PIE_SCALE) / 1000;
+ if (!strcmp(oidp->oid_name,"alpha"))
+ fq_pie_sysctl.pcfg.alpha = value;
+ else
+ fq_pie_sysctl.pcfg.beta = value;
+ return (0);
+}
+
+static int
+fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ if (!strcmp(oidp->oid_name,"target"))
+ value = fq_pie_sysctl.pcfg.qdelay_ref;
+ else if (!strcmp(oidp->oid_name,"tupdate"))
+ value = fq_pie_sysctl.pcfg.tupdate;
+ else
+ value = fq_pie_sysctl.pcfg.max_burst;
+
+ value = value / AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > 10 * AQM_TIME_1S)
+ return (EINVAL);
+ value = value * AQM_TIME_1US;
+
+ if (!strcmp(oidp->oid_name,"target"))
+ fq_pie_sysctl.pcfg.qdelay_ref = value;
+ else if (!strcmp(oidp->oid_name,"tupdate"))
+ fq_pie_sysctl.pcfg.tupdate = value;
+ else
+ fq_pie_sysctl.pcfg.max_burst = value;
+ return (0);
+}
+
+static int
+fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ value = fq_pie_sysctl.pcfg.max_ecnth;
+ value = value * 1000 / PIE_SCALE;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 1 || value > PIE_SCALE)
+ return (EINVAL);
+ value = (value * PIE_SCALE) / 1000;
+ fq_pie_sysctl.pcfg.max_ecnth = value;
+ return (0);
+}
+
+/* define FQ-PIE sysctl variables */
+SYSBEGIN(f4)
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie,
+ CTLFLAG_RW, 0, "FQ_PIE");
+
+#ifdef SYSCTL_NODE
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "queue target in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "the frequency of drop probability calculation in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "Burst allowance interval in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_max_ecnth_handler, "L",
+ "ECN safeguard threshold scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000");
+
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum,
+ CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows,
+ CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit,
+ CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE");
+#endif
+
+/* Helper function to update queue&main-queue and scheduler statistics.
+ * negative len & drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len,
+ int drop)
+{
+ int inc = 0;
+
+ if (len < 0)
+ inc = -1;
+ else if (len > 0)
+ inc = 1;
+
+ if (drop) {
+ si->main_q.ni.drops ++;
+ q->stats.drops ++;
+ si->_si.ni.drops ++;
+ io_pkt_drop ++;
+ }
+
+ if (!drop || (drop && len < 0)) {
+ /* Update stats for the main queue */
+ si->main_q.ni.length += inc;
+ si->main_q.ni.len_bytes += len;
+
+ /*update sub-queue stats */
+ q->stats.length += inc;
+ q->stats.len_bytes += len;
+
+ /*update scheduler instance stats */
+ si->_si.ni.length += inc;
+ si->_si.ni.len_bytes += len;
+ }
+
+ if (inc > 0) {
+ si->main_q.ni.tot_bytes += len;
+ si->main_q.ni.tot_pkts ++;
+
+ q->stats.tot_bytes +=len;
+ q->stats.tot_pkts++;
+
+ si->_si.ni.tot_bytes +=len;
+ si->_si.ni.tot_pkts ++;
+ }
+
+}
+
+/*
+ * Extract a packet from the head of sub-queue 'q'
+ * Return a packet or NULL if the queue is empty.
+ * If getts is set, also extract packet's timestamp from mtag.
+ */
+__inline static struct mbuf *
+fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts,
+ struct fq_pie_si *si, int getts)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return m;
+ q->mq.head = m->m_nextpkt;
+
+ fq_update_stats(q, si, -m->m_pkthdr.len, 0);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+
+ if (getts) {
+ /* extract packet timestamp*/
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL){
+ D("PIE timestamp mtag not found!");
+ *pkt_ts = 0;
+ } else {
+ *pkt_ts = *(aqm_time_t *)(mtag + 1);
+ m_tag_delete(m,mtag);
+ }
+ }
+ return m;
+}
+
+/*
+ * Callout function for drop probability calculation
+ * This function is scheduled every 'tupdate' interval and takes a pointer
+ * to an FQ-PIE flow as an argument.
+ */
+static void
+fq_calculate_drop_prob(void *x)
+{
+ struct fq_pie_flow *q = (struct fq_pie_flow *) x;
+ struct pie_status *pst = &q->pst;
+ struct dn_aqm_pie_parms *pprms;
+ int64_t p, prob, oldprob;
+ aqm_time_t now;
+
+ /* dealing with race condition */
+ if (callout_pending(&pst->aqm_pie_callout)) {
+ /* callout was reset */
+ mtx_unlock(&pst->lock_mtx);
+ return;
+ }
+
+ if (!callout_active(&pst->aqm_pie_callout)) {
+ /* callout was stopped */
+ mtx_unlock(&pst->lock_mtx);
+ mtx_destroy(&pst->lock_mtx);
+ q->psi->nr_active_q--;
+ return;
+ }
+ callout_deactivate(&pst->aqm_pie_callout);
+
+ now = AQM_UNOW;
+ pprms = pst->parms;
+ prob = pst->drop_prob;
+
+ /* calculate current qdelay */
+ if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+ pst->current_qdelay = ((uint64_t)q->stats.len_bytes * pst->avg_dq_time)
+ >> PIE_DQ_THRESHOLD_BITS;
+ }
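+	/* Reading aid: with departure rate estimation, queue delay is
+	 * estimated as backlog / drain rate. avg_dq_time is the (averaged)
+	 * time needed to dequeue PIE_DQ_THRESHOLD bytes, so, assuming
+	 * PIE_DQ_THRESHOLD == 1 << PIE_DQ_THRESHOLD_BITS, the expression
+	 * above is len_bytes * avg_dq_time / PIE_DQ_THRESHOLD. */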
+
+ /* calculate drop probability */
+ p = (int64_t)pprms->alpha *
+ ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref);
+ p +=(int64_t) pprms->beta *
+ ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old);
+
+	/* PIE_MAX_PROB is shifted left by 12 bits to increase the division precision */
+ p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S;
+
+ /* auto-tune drop probability */
+ if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */
+ p >>= 11 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */
+ p >>= 9 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */
+ p >>= 7 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */
+ p >>= 5 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */
+ p >>= 3 + PIE_FIX_POINT_BITS + 12;
+ else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */
+ p >>= 1 + PIE_FIX_POINT_BITS + 12;
+ else
+ p >>= PIE_FIX_POINT_BITS + 12;
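+	/* Reading aid: the extra shifts of 11, 9, 7, 5, 3 and 1 bits scale the
+	 * adjustment by 1/2048, 1/512, 1/128, 1/32, 1/8 and 1/2 respectively,
+	 * matching the auto-tuning scale factors used by PIE. */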
+
+ oldprob = prob;
+
+ /* Cap Drop adjustment */
+ if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10
+ && p > PIE_MAX_PROB / 50 )
+ p = PIE_MAX_PROB / 50;
+
+ prob = prob + p;
+
+ /* decay the drop probability exponentially */
+ if (pst->current_qdelay == 0 && pst->qdelay_old == 0)
+ /* 0.98 ~= 1- 1/64 */
+ prob = prob - (prob >> 6);
+
+
+	/* check for multiplication overflow/underflow */
+	if (p > 0) {
+		if (prob < oldprob) {
+			D("overflow");
+			prob = PIE_MAX_PROB;
+		}
+	} else {
+		if (prob > oldprob) {
+			prob = 0;
+			D("underflow");
+		}
+	}
+
+ /* make drop probability between 0 and PIE_MAX_PROB*/
+ if (prob < 0)
+ prob = 0;
+ else if (prob > PIE_MAX_PROB)
+ prob = PIE_MAX_PROB;
+
+ pst->drop_prob = prob;
+
+ /* store current delay value */
+ pst->qdelay_old = pst->current_qdelay;
+
+ /* update burst allowance */
+ if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) {
+ if (pst->burst_allowance > pprms->tupdate)
+ pst->burst_allowance -= pprms->tupdate;
+ else
+ pst->burst_allowance = 0;
+ }
+
+ if (pst->sflags & PIE_ACTIVE)
+ callout_reset_sbt(&pst->aqm_pie_callout,
+ (uint64_t)pprms->tupdate * SBT_1US,
+ 0, fq_calculate_drop_prob, q, 0);
+
+ mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Reset PIE variables & activate the queue
+ */
+__inline static void
+fq_activate_pie(struct fq_pie_flow *q)
+{
+ struct pie_status *pst = &q->pst;
+ struct dn_aqm_pie_parms *pprms;
+
+ mtx_lock(&pst->lock_mtx);
+ pprms = pst->parms;
+
+ pst->drop_prob = 0;
+ pst->qdelay_old = 0;
+ pst->burst_allowance = pprms->max_burst;
+ pst->accu_prob = 0;
+ pst->dq_count = 0;
+ pst->avg_dq_time = 0;
+ pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE;
+ pst->measurement_start = AQM_UNOW;
+
+ callout_reset_sbt(&pst->aqm_pie_callout,
+ (uint64_t)pprms->tupdate * SBT_1US,
+ 0, fq_calculate_drop_prob, q, 0);
+
+ mtx_unlock(&pst->lock_mtx);
+}
+
+
+/*
+ * Deactivate PIE and stop the drop probability update callout
+ */
+__inline static void
+fq_deactivate_pie(struct pie_status *pst)
+{
+ mtx_lock(&pst->lock_mtx);
+ pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT);
+ callout_stop(&pst->aqm_pie_callout);
+ //D("PIE Deactivated");
+ mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Initialize PIE for sub-queue 'q'
+ */
+static int
+pie_init(struct fq_pie_flow *q)
+{
+ struct pie_status *pst=&q->pst;
+ struct dn_aqm_pie_parms *pprms = pst->parms;
+ struct fq_pie_schk *fqpie_schk;
+
+ fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1);
+ int err = 0;
+
+ if (!pprms){
+ D("AQM_PIE is not configured");
+ err = EINVAL;
+ } else {
+ q->psi->nr_active_q++;
+
+		/* For speed optimization, we calculate 1/3 of the queue size once here */
+ // XXX limit divided by number of queues divided by 3 ???
+ pst->one_third_q_size = (fqpie_schk->cfg.limit /
+ fqpie_schk->cfg.flows_cnt) / 3;
+
+ mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF);
+ callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx,
+ CALLOUT_RETURNUNLOCKED);
+ }
+
+ return err;
+}
+
+/*
+ * Clean up PIE status for sub-queue 'q'
+ * Stop callout timer and destroy mtx
+ */
+static int
+pie_cleanup(struct fq_pie_flow *q)
+{
+ struct pie_status *pst = &q->pst;
+
+ mtx_lock(&pst->lock_mtx);
+ if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) {
+ mtx_unlock(&pst->lock_mtx);
+ mtx_destroy(&pst->lock_mtx);
+ q->psi->nr_active_q--;
+ } else {
+ mtx_unlock(&pst->lock_mtx);
+ return EBUSY;
+ }
+ return 0;
+}
+
+/*
+ * Dequeue and return a packet from sub-queue 'q', or NULL if 'q' is empty.
+ * Also, calculate the departure time or queue delay using the timestamp.
+ */
+static struct mbuf *
+pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si)
+{
+ struct mbuf *m;
+ struct dn_aqm_pie_parms *pprms;
+ struct pie_status *pst;
+ aqm_time_t now;
+ aqm_time_t pkt_ts, dq_time;
+ int32_t w;
+
+ pst = &q->pst;
+ pprms = q->pst.parms;
+
+	/* we extract the packet ts only when Departure Rate Estimation is not used */
+ m = fq_pie_extract_head(q, &pkt_ts, si,
+ !(pprms->flags & PIE_DEPRATEEST_ENABLED));
+
+ if (!m || !(pst->sflags & PIE_ACTIVE))
+ return m;
+
+ now = AQM_UNOW;
+ if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		/* calculate average departure time */
+ if(pst->sflags & PIE_INMEASUREMENT) {
+ pst->dq_count += m->m_pkthdr.len;
+
+ if (pst->dq_count >= PIE_DQ_THRESHOLD) {
+ dq_time = now - pst->measurement_start;
+
+ /*
+ * if we don't have old avg dq_time i.e PIE is (re)initialized,
+ * don't use weight to calculate new avg_dq_time
+ */
+ if(pst->avg_dq_time == 0)
+ pst->avg_dq_time = dq_time;
+ else {
+ /*
+ * weight = PIE_DQ_THRESHOLD/2^6, but we scaled
+ * weight by 2^8. Thus, scaled
+ * weight = PIE_DQ_THRESHOLD /2^8
+ * */
+ w = PIE_DQ_THRESHOLD >> 8;
+ pst->avg_dq_time = (dq_time* w
+ + (pst->avg_dq_time * ((1L << 8) - w))) >> 8;
+ pst->sflags &= ~PIE_INMEASUREMENT;
+ }
+ }
+ }
+
+ /*
+		 * Start a new measurement cycle when the queue has
+ * PIE_DQ_THRESHOLD worth of bytes.
+ */
+ if(!(pst->sflags & PIE_INMEASUREMENT) &&
+ q->stats.len_bytes >= PIE_DQ_THRESHOLD) {
+ pst->sflags |= PIE_INMEASUREMENT;
+ pst->measurement_start = now;
+ pst->dq_count = 0;
+ }
+ }
+ /* Optionally, use packet timestamp to estimate queue delay */
+ else
+ pst->current_qdelay = now - pkt_ts;
+
+ return m;
+}
+
+
+ /*
+ * Enqueue a packet in q, subject to space and FQ-PIE queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyway.
+ */
+static int
+pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si)
+{
+ uint64_t len;
+ struct pie_status *pst;
+ struct dn_aqm_pie_parms *pprms;
+ int t;
+
+ len = m->m_pkthdr.len;
+ pst = &q->pst;
+ pprms = pst->parms;
+ t = ENQUE;
+
+ /* drop/mark the packet when PIE is active and burst time elapsed */
+ if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0
+ && drop_early(pst, q->stats.len_bytes) == DROP) {
+ /*
+ * if drop_prob over ECN threshold, drop the packet
+ * otherwise mark and enqueue it.
+ */
+ if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob <
+ (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS))
+ && ecn_mark(m))
+ t = ENQUE;
+ else
+ t = DROP;
+ }
+
+ /* Turn PIE on when 1/3 of the queue is full */
+ if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >=
+ pst->one_third_q_size) {
+ fq_activate_pie(q);
+ }
+
+	/* reset burst tolerance and optionally turn PIE off */
+ if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1)
+ && pst->qdelay_old < (pprms->qdelay_ref >> 1)) {
+
+ pst->burst_allowance = pprms->max_burst;
+ if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0)
+ fq_deactivate_pie(pst);
+ }
+
+ /* Use timestamp if Departure Rate Estimation mode is disabled */
+ if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) {
+ /* Add TS to mbuf as a TAG */
+ struct m_tag *mtag;
+ mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+ if (mtag == NULL)
+ mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+ sizeof(aqm_time_t), M_NOWAIT);
+		if (mtag == NULL) {
+			/* cannot allocate the timestamp tag; drop the packet
+			 * via the drop path below, which frees the mbuf */
+			t = DROP;
+		} else {
+			*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+			m_tag_prepend(m, mtag);
+		}
+ }
+
+ if (t != DROP) {
+ mq_append(&q->mq, m);
+ fq_update_stats(q, si, len, 0);
+ return 0;
+ } else {
+ fq_update_stats(q, si, len, 1);
+ pst->accu_prob = 0;
+ FREE_PKT(m);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Drop a packet from the head of the FQ-PIE sub-queue */
+static void
+pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return;
+ q->mq.head = m->m_nextpkt;
+
+ fq_update_stats(q, si, -m->m_pkthdr.len, 1);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+ /* reset accu_prob after packet drop */
+ q->pst.accu_prob = 0;
+
+ FREE_PKT(m);
+}
+
+/*
+ * Classify a packet into a queue number using the Jenkins hash function.
+ * Return: queue number
+ * The inputs to the hash are protocol no., perturbation, src IP, dst IP,
+ * src port and dst port.
+ */
+static inline int
+fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si)
+{
+ struct ip *ip;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ uint8_t tuple[41];
+ uint16_t hash=0;
+
+//#ifdef INET6
+ struct ip6_hdr *ip6;
+ int isip6;
+ isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+
+ if(isip6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ *((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
+ memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
+
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = th->th_dport;
+ *((uint16_t *) &tuple[39]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = uh->uh_dport;
+ *((uint16_t *) &tuple[39]) = uh->uh_sport;
+ break;
+ default:
+ memset(&tuple[37], 0, 4);
+ }
+
+ hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;
+ return hash;
+ }
+//#endif
+
+ /* IPv4 */
+ ip = mtod(m, struct ip *);
+ *((uint8_t *) &tuple[0]) = ip->ip_p;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
+ *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = th->th_dport;
+ *((uint16_t *) &tuple[15]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = uh->uh_dport;
+ *((uint16_t *) &tuple[15]) = uh->uh_sport;
+ break;
+ default:
+ memset(&tuple[13], 0, 4);
+ }
+ hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;
+
+ return hash;
+}
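+
+/*
+ * Note: the hash input is a flat byte array: protocol (1 byte), perturbation
+ * (4 bytes), then addresses and ports, giving 17 bytes for IPv4 and 41 bytes
+ * for IPv6 as used in the jenkins_hash() calls above. The per-instance random
+ * perturbation presumably varies the flow-to-queue mapping between scheduler
+ * instances.
+ */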
+
+/*
+ * Enqueue a packet into an appropriate queue according to
+ * the FQ-PIE algorithm.
+ */
+static int
+fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
+ struct mbuf *m)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ struct dn_sch_fq_pie_parms *param;
+ struct dn_queue *mainq;
+ int idx, drop, i, maxidx;
+
+ mainq = (struct dn_queue *)(_si + 1);
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ /* classify a packet to queue number*/
+ idx = fq_pie_classify_flow(m, param->flows_cnt, si);
+
+ /* enqueue packet into appropriate queue using PIE AQM.
+	 * Note: the 'pie_enqueue' function returns 1 only when it is unable
+	 * to add a timestamp to the packet (no limit check) */
+ drop = pie_enqueue(&si->flows[idx], m, si);
+
+ /* pie unable to timestamp a packet */
+ if (drop)
+ return 1;
+
+	/* If the flow (sub-queue) is not active, then add it to the tail of
+	 * the new flows list, initialize and activate it.
+ */
+ if (!si->flows[idx].active) {
+ STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
+ si->flows[idx].deficit = param->quantum;
+ fq_activate_pie(&si->flows[idx]);
+ si->flows[idx].active = 1;
+ }
+
+ /* check the limit for all queues and remove a packet from the
+ * largest one
+ */
+ if (mainq->ni.length > schk->cfg.limit) {
+ /* find first active flow */
+ for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
+ if (si->flows[maxidx].active)
+ break;
+ if (maxidx < schk->cfg.flows_cnt) {
+			/* find the largest sub-queue */
+ for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
+ if (si->flows[i].active && si->flows[i].stats.length >
+ si->flows[maxidx].stats.length)
+ maxidx = i;
+ pie_drop_head(&si->flows[maxidx], si);
+ drop = 1;
+ }
+ }
+
+ return drop;
+}
+
+/*
+ * Dequeue a packet from an appropriate queue according to
+ * the FQ-PIE algorithm.
+ */
+static struct mbuf *
+fq_pie_dequeue(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ struct dn_sch_fq_pie_parms *param;
+ struct fq_pie_flow *f;
+ struct mbuf *mbuf;
+ struct fq_pie_list *fq_pie_flowlist;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ do {
+ /* select a list to start with */
+ if (STAILQ_EMPTY(&si->newflows))
+ fq_pie_flowlist = &si->oldflows;
+ else
+ fq_pie_flowlist = &si->newflows;
+
+ /* Both new and old queue lists are empty, return NULL */
+ if (STAILQ_EMPTY(fq_pie_flowlist))
+ return NULL;
+
+ f = STAILQ_FIRST(fq_pie_flowlist);
+ while (f != NULL) {
+			/* if the flow (sub-queue) has a negative deficit, increase
+			 * the deficit by quantum, move the flow to the tail of the
+			 * old flows list and try another flow.
+ * Otherwise, the flow will be used for dequeue.
+ */
+ if (f->deficit < 0) {
+ f->deficit += param->quantum;
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else
+ break;
+
+ f = STAILQ_FIRST(fq_pie_flowlist);
+ }
+
+ /* the new flows list is empty, try old flows list */
+ if (STAILQ_EMPTY(fq_pie_flowlist))
+ continue;
+
+ /* Dequeue a packet from the selected flow */
+ mbuf = pie_dequeue(f, si);
+
+ /* pie did not return a packet */
+ if (!mbuf) {
+			/* If the selected flow belongs to the new flows list, then move
+			 * it to the tail of the old flows list. Otherwise, deactivate it
+			 * and remove it from the old flows list.
+ */
+ if (fq_pie_flowlist == &si->newflows) {
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else {
+ f->active = 0;
+ fq_deactivate_pie(&f->pst);
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ }
+ /* start again */
+ continue;
+ }
+
+ /* we have a packet to return,
+ * update flow deficit and return the packet*/
+ f->deficit -= mbuf->m_pkthdr.len;
+ return mbuf;
+
+ } while (1);
+
+ /* unreachable point */
+ return NULL;
+}
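+
+/*
+ * Reading aid for the loop above: this is the deficit round robin used by
+ * the FlowQueue schedulers. New flows are served before old ones; a flow
+ * whose deficit has gone negative is recharged by 'quantum' and rotated to
+ * the old flows list, and a flow found empty is rotated to the old list (if
+ * it was new) or deactivated and removed (if it was already old).
+ */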
+
+/*
+ * Initialize fq_pie scheduler instance.
+ * Also, allocate memory for the flows array.
+ */
+static int
+fq_pie_new_sched(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct dn_queue *q;
+ struct fq_pie_schk *schk;
+ int i;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(_si->sched+1);
+
+ if(si->flows) {
+ D("si already configured!");
+ return 0;
+ }
+	/* we don't have to set an initial guess for Newton's method isqrt as
+	 * we initialize isqrt in the control_law function when count == 1 */
+ q = &si->main_q;
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = _si;
+ q->fs = _si->sched->fs;
+
+ /* allocate memory for flows array */
+ si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si->flows == NULL) {
+ D("cannot allocate memory for fq_pie configuration parameters");
+ return ENOMEM ;
+ }
+
+ /* init perturbation for this si */
+ si->perturbation = random();
+ si->nr_active_q = 0;
+
+ /* init the old and new flows lists */
+ STAILQ_INIT(&si->newflows);
+ STAILQ_INIT(&si->oldflows);
+
+ /* init the flows (sub-queues) */
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ si->flows[i].pst.parms = &schk->cfg.pcfg;
+ si->flows[i].psi = si;
+ pie_init(&si->flows[i]);
+ }
+
+ /* init mtx lock and callout function for free memory */
+ if (!fq_pie_desc.ref_count) {
+ mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF);
+ }
+
+ mtx_lock(&freemem_mtx);
+ fq_pie_desc.ref_count++;
+ mtx_unlock(&freemem_mtx);
+
+ return 0;
+}
+
+/*
+ * Callout function that frees FQ-PIE flows memory.
+ * This function is scheduled when one or more flows are still active while
+ * the scheduler is about to be destroyed, to prevent a memory leak.
+ */
+static void
+free_flows(void *_mem)
+{
+ struct mem_to_free *mem = _mem;
+
+ free(mem->mem_flows, M_DUMMYNET);
+ free(mem->mem_callout, M_DUMMYNET);
+ free(_mem, M_DUMMYNET);
+
+ fq_pie_desc.ref_count--;
+ if (!fq_pie_desc.ref_count) {
+ mtx_unlock(&freemem_mtx);
+ mtx_destroy(&freemem_mtx);
+ } else
+ mtx_unlock(&freemem_mtx);
+ //D("mem freed ok!");
+}
+
+/*
+ * Free fq_pie scheduler instance.
+ */
+static int
+fq_pie_free_sched(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ int i;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(_si->sched+1);
+
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ pie_cleanup(&si->flows[i]);
+ }
+
+	/* If some queues still have a callout about to fire, we cannot free
+	 * the flows memory. Doing so could cause a panic because the drop
+	 * probability callout function uses the flows memory.
+ */
+ if (!si->nr_active_q) {
+ /* free the flows array */
+ free(si->flows , M_DUMMYNET);
+ si->flows = NULL;
+ mtx_lock(&freemem_mtx);
+ fq_pie_desc.ref_count--;
+ if (!fq_pie_desc.ref_count) {
+ mtx_unlock(&freemem_mtx);
+ mtx_destroy(&freemem_mtx);
+ } else
+ mtx_unlock(&freemem_mtx);
+ //D("ok!");
+ return 0;
+ } else {
+		/* The flows memory would leak here, so we register a callout
+		 * function to free it later.
+ */
+ D("unable to stop all fq_pie sub-queues!");
+ mtx_lock(&freemem_mtx);
+
+ struct callout *mem_callout;
+ struct mem_to_free *mem;
+
+ mem = malloc(sizeof(*mem), M_DUMMYNET,
+ M_NOWAIT | M_ZERO);
+ mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET,
+ M_NOWAIT | M_ZERO);
+
+ callout_init_mtx(mem_callout, &freemem_mtx,
+ CALLOUT_RETURNUNLOCKED);
+
+ mem->mem_flows = si->flows;
+ mem->mem_callout = mem_callout;
+ callout_reset_sbt(mem_callout,
+ (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US,
+ 0, free_flows, mem, 0);
+
+ si->flows = NULL;
+ mtx_unlock(&freemem_mtx);
+
+ return EBUSY;
+ }
+}
+
+/*
+ * Configure FQ-PIE scheduler.
+ * The configuration for the scheduler is passed from ipfw userland.
+ */
+static int
+fq_pie_config(struct dn_schk *_schk)
+{
+ struct fq_pie_schk *schk;
+ struct dn_extra_parms *ep;
+ struct dn_sch_fq_pie_parms *fqp_cfg;
+
+ schk = (struct fq_pie_schk *)(_schk+1);
+ ep = (struct dn_extra_parms *) _schk->cfg;
+
+	/* the par array contains the fq_pie configuration as follows:
+ * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst
+ * 3- max_ecnth, 4- alpha, 5- beta, 6- flags
+ * FQ_PIE: 7- quantum, 8- limit, 9- flows
+ */
+ if (ep && ep->oid.len ==sizeof(*ep) &&
+ ep->oid.subtype == DN_SCH_PARAMS) {
+
+ fqp_cfg = &schk->cfg;
+ if (ep->par[0] < 0)
+ fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref;
+ else
+ fqp_cfg->pcfg.qdelay_ref = ep->par[0];
+ if (ep->par[1] < 0)
+ fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate;
+ else
+ fqp_cfg->pcfg.tupdate = ep->par[1];
+ if (ep->par[2] < 0)
+ fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst;
+ else
+ fqp_cfg->pcfg.max_burst = ep->par[2];
+ if (ep->par[3] < 0)
+ fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth;
+ else
+ fqp_cfg->pcfg.max_ecnth = ep->par[3];
+ if (ep->par[4] < 0)
+ fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha;
+ else
+ fqp_cfg->pcfg.alpha = ep->par[4];
+ if (ep->par[5] < 0)
+ fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta;
+ else
+ fqp_cfg->pcfg.beta = ep->par[5];
+ if (ep->par[6] < 0)
+ fqp_cfg->pcfg.flags = 0;
+ else
+ fqp_cfg->pcfg.flags = ep->par[6];
+
+ /* FQ configurations */
+ if (ep->par[7] < 0)
+ fqp_cfg->quantum = fq_pie_sysctl.quantum;
+ else
+ fqp_cfg->quantum = ep->par[7];
+ if (ep->par[8] < 0)
+ fqp_cfg->limit = fq_pie_sysctl.limit;
+ else
+ fqp_cfg->limit = ep->par[8];
+ if (ep->par[9] < 0)
+ fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt;
+ else
+ fqp_cfg->flows_cnt = ep->par[9];
+
+ /* Bound the configurations */
+ fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref,
+ 1, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate,
+ 1, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst,
+ 0, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth,
+ 0, PIE_SCALE);
+ fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE);
+ fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE);
+
+ fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000);
+ fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480);
+ fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536);
+ }
+ else {
+ D("Wrong parameters for fq_pie scheduler");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Return FQ-PIE scheduler configurations.
+ * The configuration for the scheduler is passed to userland.
+ */
+static int
+fq_pie_getconfig(struct dn_schk *_schk, struct dn_extra_parms *ep)
+{
+
+ struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1);
+ struct dn_sch_fq_pie_parms *fqp_cfg;
+
+ fqp_cfg = &schk->cfg;
+
+ strcpy(ep->name, fq_pie_desc.name);
+ ep->par[0] = fqp_cfg->pcfg.qdelay_ref;
+ ep->par[1] = fqp_cfg->pcfg.tupdate;
+ ep->par[2] = fqp_cfg->pcfg.max_burst;
+ ep->par[3] = fqp_cfg->pcfg.max_ecnth;
+ ep->par[4] = fqp_cfg->pcfg.alpha;
+ ep->par[5] = fqp_cfg->pcfg.beta;
+ ep->par[6] = fqp_cfg->pcfg.flags;
+
+ ep->par[7] = fqp_cfg->quantum;
+ ep->par[8] = fqp_cfg->limit;
+ ep->par[9] = fqp_cfg->flows_cnt;
+
+ return 0;
+}
+
+/*
+ * FQ-PIE scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fq_pie_desc = {
+ _SI( .type = ) DN_SCHED_FQ_PIE,
+ _SI( .name = ) "FQ_PIE",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) sizeof(struct fq_pie_schk),
+ _SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fq_pie_enqueue,
+ _SI( .dequeue = ) fq_pie_dequeue,
+ _SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/
+ _SI( .destroy = ) NULL, /*sched x delete */
+ _SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */
+ _SI( .free_sched = ) fq_pie_free_sched, /* delete schd instance */
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+ _SI( .getconfig = ) fq_pie_getconfig,
+ _SI( .ref_count = ) 0
+};
+
+DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_prio.c b/sys/netpfil/ipfw/dn_sched_prio.c
index b779515..915b4cb 100644
--- a/sys/netpfil/ipfw/dn_sched_prio.c
+++ b/sys/netpfil/ipfw/dn_sched_prio.c
@@ -41,6 +41,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -223,6 +226,9 @@ static struct dn_alg prio_desc = {
_SI( .new_queue = ) prio_new_queue,
_SI( .free_queue = ) prio_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
diff --git a/sys/netpfil/ipfw/dn_sched_qfq.c b/sys/netpfil/ipfw/dn_sched_qfq.c
index 5bbff8a..87502d1 100644
--- a/sys/netpfil/ipfw/dn_sched_qfq.c
+++ b/sys/netpfil/ipfw/dn_sched_qfq.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -824,6 +827,9 @@ static struct dn_alg qfq_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) qfq_new_queue,
_SI( .free_queue = ) qfq_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_rr.c b/sys/netpfil/ipfw/dn_sched_rr.c
index dd608d7..b3658a6 100644
--- a/sys/netpfil/ipfw/dn_sched_rr.c
+++ b/sys/netpfil/ipfw/dn_sched_rr.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -301,6 +304,9 @@ static struct dn_alg rr_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) rr_new_queue,
_SI( .free_queue = ) rr_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
diff --git a/sys/netpfil/ipfw/dn_sched_wf2q.c b/sys/netpfil/ipfw/dn_sched_wf2q.c
index a91c1ce..06f92a9 100644
--- a/sys/netpfil/ipfw/dn_sched_wf2q.c
+++ b/sys/netpfil/ipfw/dn_sched_wf2q.c
@@ -43,6 +43,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -367,6 +370,10 @@ static struct dn_alg wf2qp_desc = {
_SI( .new_queue = ) wf2qp_new_queue,
_SI( .free_queue = ) wf2qp_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
+
};
diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c
index 7d7e695..d7b04af 100644
--- a/sys/netpfil/ipfw/ip_dn_glue.c
+++ b/sys/netpfil/ipfw/ip_dn_glue.c
@@ -55,6 +55,9 @@
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/* FREEBSD7.2 ip_dummynet.h r191715*/
diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c
index 90e2ccf..b7213ce 100644
--- a/sys/netpfil/ipfw/ip_dn_io.c
+++ b/sys/netpfil/ipfw/ip_dn_io.c
@@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$");
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/*
@@ -83,8 +86,12 @@ static long tick_diff;
static unsigned long io_pkt;
static unsigned long io_pkt_fast;
-static unsigned long io_pkt_drop;
+#ifdef NEW_AQM
+unsigned long io_pkt_drop;
+#else
+static unsigned long io_pkt_drop;
+#endif
/*
* We use a heap to store entities for which we have pending timer events.
* The heap is checked at every tick and all entities with expired events
@@ -147,7 +154,11 @@ SYSBEGIN(f4)
SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
+#ifdef NEW_AQM
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+#else
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+#endif
/* wrapper to pass dn_cfg fields to SYSCTL_* */
//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
@@ -249,6 +260,14 @@ static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
struct m_tag *mtag = m_tag_first(m);
+#ifdef NEW_AQM
+ /* XXX: to skip ts m_tag. For Debugging only*/
+ if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) {
+ m_tag_delete(m,mtag);
+ mtag = m_tag_first(m);
+ D("skip TS tag");
+ }
+#endif
KASSERT(mtag != NULL &&
mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
mtag->m_tag_id == PACKET_TAG_DUMMYNET,
@@ -256,6 +275,7 @@ dn_tag_get(struct mbuf *m)
return (struct dn_pkt_tag *)(mtag+1);
}
+#ifndef NEW_AQM
static inline void
mq_append(struct mq *q, struct mbuf *m)
{
@@ -266,6 +286,7 @@ mq_append(struct mq *q, struct mbuf *m)
q->tail = m;
m->m_nextpkt = NULL;
}
+#endif
/*
* Dispose a list of packet. Use a functions so if we need to do
@@ -390,7 +411,10 @@ red_drops (struct dn_queue *q, int len)
/*
* ECN/ECT Processing (partially adopted from altq)
*/
-static int
+#ifndef NEW_AQM
+static
+#endif
+int
ecn_mark(struct mbuf* m)
{
struct ip *ip;
@@ -482,6 +506,11 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
goto drop;
if (f->plr && random() < f->plr)
goto drop;
+#ifdef NEW_AQM
+ /* Call AQM enqueue function */
+ if (q->fs->aqmfp)
+ return q->fs->aqmfp->enqueue(q ,m);
+#endif
if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) {
if (!(f->flags & DN_IS_ECN) || !ecn_mark(m))
goto drop;
@@ -864,6 +893,10 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
if (fs->sched->fp->enqueue(si, q, m)) {
/* packet was dropped by enqueue() */
m = *m0 = NULL;
+
+ /* dn_enqueue already increases io_pkt_drop */
+ io_pkt_drop--;
+
goto dropit;
}
diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h
index 159ddc9..b8b55e8 100644
--- a/sys/netpfil/ipfw/ip_dn_private.h
+++ b/sys/netpfil/ipfw/ip_dn_private.h
@@ -81,6 +81,10 @@ SLIST_HEAD(dn_fsk_head, dn_fsk);
SLIST_HEAD(dn_queue_head, dn_queue);
SLIST_HEAD(dn_alg_head, dn_alg);
+#ifdef NEW_AQM
+SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */
+#endif
+
struct mq { /* a basic queue of packets*/
struct mbuf *head, *tail;
};
@@ -135,6 +139,9 @@ struct dn_parms {
/* list of flowsets without a scheduler -- use sch_chain */
struct dn_fsk_head fsu; /* list of unlinked flowsets */
struct dn_alg_head schedlist; /* list of algorithms */
+#ifdef NEW_AQM
+ struct dn_aqm_head aqmlist; /* list of AQMs */
+#endif
/* Store the fs/sch to scan when draining. The value is the
* bucket number of the hash table. Expire can be disabled
@@ -231,6 +238,10 @@ struct dn_fsk { /* kernel side of a flowset */
int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
int avg_pkt_size ; /* medium packet size */
int max_pkt_size ; /* max packet size */
+#ifdef NEW_AQM
+ struct dn_aqm *aqmfp; /* Pointer to AQM functions */
+ void *aqmcfg; /* configuration parameters for AQM */
+#endif
};
/*
@@ -253,6 +264,9 @@ struct dn_queue {
int count; /* arrivals since last RED drop */
int random; /* random value (scaled) */
uint64_t q_time; /* start of queue idle time */
+#ifdef NEW_AQM
+ void *aqm_status; /* per-queue status variables*/
+#endif
};
@@ -400,4 +414,20 @@ int do_config(void *p, int l);
void dn_drain_scheduler(void);
void dn_drain_queue(void);
+#ifdef NEW_AQM
+int ecn_mark(struct mbuf* m);
+
+/* moved from ip_dn_io.c to here to be available for AQM modules */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+#endif /* NEW_AQM */
+
#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c
index 420b491..09fbe84 100644
--- a/sys/netpfil/ipfw/ip_dummynet.c
+++ b/sys/netpfil/ipfw/ip_dummynet.c
@@ -1,4 +1,11 @@
/*-
+ * Codel/FQ_Codel and PIE/FQ-PIE Code:
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
* Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
@@ -57,6 +64,9 @@ __FBSDID("$FreeBSD$");
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/* which objects to copy */
@@ -97,6 +107,21 @@ dn_reschedule(void)
}
/*----- end of callout hooks -----*/
+#ifdef NEW_AQM
+/* Return AQM descriptor for given type or name. */
+static struct dn_aqm *
+find_aqm_type(int type, char *name)
+{
+ struct dn_aqm *d;
+
+ SLIST_FOREACH(d, &dn_cfg.aqmlist, next) {
+ if (d->type == type || (name && !strcasecmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
+#endif
+
/* Return a scheduler descriptor given the type or name. */
static struct dn_alg *
find_sched_type(int type, char *name)
@@ -319,7 +344,15 @@ q_new(uintptr_t key, int flags, void *arg)
if (fs->sched->fp->new_queue)
fs->sched->fp->new_queue(q);
+
+#ifdef NEW_AQM
+ /* call AQM init function after creating a queue*/
+ if (fs->aqmfp && fs->aqmfp->init)
+ if(fs->aqmfp->init(q))
+ D("unable to init AQM for fs %d", fs->fs.fs_nr);
+#endif
dn_cfg.queue_count++;
+
return q;
}
@@ -333,6 +366,13 @@ dn_delete_queue(struct dn_queue *q, int flags)
{
struct dn_fsk *fs = q->fs;
+#ifdef NEW_AQM
+ /* clean up AQM status for queue 'q'
+	 * cleanup here is called only for MULTIQUEUE schedulers
+ */
+ if (fs && fs->aqmfp && fs->aqmfp->cleanup)
+ fs->aqmfp->cleanup(q);
+#endif
// D("fs %p si %p\n", fs, q->_si);
/* notify the parent scheduler that the queue is going away */
if (fs && fs->sched->fp->free_queue)
@@ -474,6 +514,16 @@ si_new(uintptr_t key, int flags, void *arg)
if (s->sch.flags & DN_HAVE_MASK)
si->ni.fid = *(struct ipfw_flow_id *)key;
+#ifdef NEW_AQM
+ /* init AQM status for !DN_MULTIQUEUE sched*/
+ if (!(s->fp->flags & DN_MULTIQUEUE))
+ if (s->fs->aqmfp && s->fs->aqmfp->init)
+ if(s->fs->aqmfp->init((struct dn_queue *)(si + 1))) {
+ D("unable to init AQM for fs %d", s->fs->fs.fs_nr);
+ goto error;
+ }
+#endif
+
dn_cfg.si_count++;
return si;
@@ -503,6 +553,20 @@ si_destroy(void *_si, void *arg)
dn_free_pkts(dl->mq.head); /* drain delay line */
if (si->kflags & DN_ACTIVE) /* remove si from event heap */
heap_extract(&dn_cfg.evheap, si);
+
+#ifdef NEW_AQM
+ /* clean up AQM status for !DN_MULTIQUEUE sched
+	 * Note that all queues belonging to the fs were cleaned up in fsk_detach.
+ * When drain_scheduler is called s->fs and q->fs are pointing
+ * to a correct fs, so we can use fs in this case.
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE)) {
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+ if (q->aqm_status && q->fs->aqmfp)
+ if (q->fs->aqmfp->cleanup)
+ q->fs->aqmfp->cleanup(q);
+ }
+#endif
if (s->fp->free_sched)
s->fp->free_sched(si);
bzero(si, sizeof(*si)); /* safety */
@@ -591,6 +655,67 @@ fsk_new(uintptr_t key, int flags, void *arg)
return fs;
}
+#ifdef NEW_AQM
+/* callback function for cleaning up AQM queue status belonging to a flowset
+ * connected to scheduler instance '_si' (for !DN_MULTIQUEUE only).
+ */
+static int
+si_cleanup_q(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+
+ if (!(si->sched->fp->flags & DN_MULTIQUEUE)) {
+ if (si->sched->fs->aqmfp && si->sched->fs->aqmfp->cleanup)
+ si->sched->fs->aqmfp->cleanup((struct dn_queue *) (si+1));
+ }
+ return 0;
+}
+
+/* callback to clean up queue AQM status.*/
+static int
+q_cleanup_q(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
+ q->fs->aqmfp->cleanup(q);
+ return 0;
+}
+
+/* Clean up the AQM status of all queues belonging to flowset 'fs' and then
+ * deconfigure AQM for flowset 'fs'
+ */
+static void
+aqm_cleanup_deconfig_fs(struct dn_fsk *fs)
+{
+ struct dn_sch_inst *si;
+
+ /* clean up AQM status for all queues for !DN_MULTIQUEUE sched*/
+ if (fs->fs.fs_nr > DN_MAX_ID) {
+ if (fs->sched && !(fs->sched->fp->flags & DN_MULTIQUEUE)) {
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(fs->sched->siht, si_cleanup_q, NULL);
+ else {
+ /* single si i.e. no sched mask */
+ si = (struct dn_sch_inst *) fs->sched->siht;
+ if (si && fs->aqmfp && fs->aqmfp->cleanup)
+ fs->aqmfp->cleanup((struct dn_queue *) (si+1));
+ }
+ }
+ }
+
+ /* clean up AQM status for all queues for DN_MULTIQUEUE sched*/
+ if (fs->sched && fs->sched->fp->flags & DN_MULTIQUEUE && fs->qht) {
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, q_cleanup_q, NULL);
+ else
+ fs->aqmfp->cleanup((struct dn_queue *)(fs->qht));
+ }
+
+ /* deconfig AQM */
+ if(fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig)
+ fs->aqmfp->deconfig(fs);
+}
+#endif
+
/*
* detach flowset from its current scheduler. Flags as follows:
* DN_DETACH removes from the fsk_list
@@ -619,6 +744,10 @@ fsk_detach(struct dn_fsk *fs, int flags)
free(fs->w_q_lookup, M_DUMMYNET);
fs->w_q_lookup = NULL;
qht_delete(fs, flags);
+#ifdef NEW_AQM
+ aqm_cleanup_deconfig_fs(fs);
+#endif
+
if (fs->sched && fs->sched->fp->free_fsk)
fs->sched->fp->free_fsk(fs);
fs->sched = NULL;
@@ -1190,6 +1319,183 @@ update_fs(struct dn_schk *s)
}
}
+#ifdef NEW_AQM
+/* Retrieve AQM configurations for ipfw userland
+ */
+static int
+get_aqm_parms(struct sockopt *sopt)
+{
+ struct dn_extra_parms *ep;
+ struct dn_fsk *fs;
+ size_t sopt_valsize;
+ int l, err = 0;
+
+ sopt_valsize = sopt->sopt_valsize;
+ l = sizeof(*ep);
+ if (sopt->sopt_valsize < l) {
+ D("bad len sopt->sopt_valsize %d len %d",
+ (int) sopt->sopt_valsize , l);
+ err = EINVAL;
+ return err;
+ }
+ ep = malloc(l, M_DUMMYNET, M_WAITOK);
+ if(!ep) {
+ err = ENOMEM ;
+ return err;
+ }
+ do {
+ err = sooptcopyin(sopt, ep, l, l);
+ if(err)
+ break;
+ sopt->sopt_valsize = sopt_valsize;
+ if (ep->oid.len < l) {
+ err = EINVAL;
+ break;
+ }
+
+ fs = dn_ht_find(dn_cfg.fshash, ep->nr, 0, NULL);
+ if (!fs) {
+ D("fs %d not found", ep->nr);
+ err = EINVAL;
+ break;
+ }
+
+ if (fs->aqmfp && fs->aqmfp->getconfig) {
+ if(fs->aqmfp->getconfig(fs, ep)) {
+ D("Error while trying to get AQM params");
+ err = EINVAL;
+ break;
+ }
+ ep->oid.len = l;
+ err = sooptcopyout(sopt, ep, l);
+ }
+ }while(0);
+
+ free(ep, M_DUMMYNET);
+ return err;
+}
+
+/* Retrieve scheduler configurations for ipfw userland
+ */
+static int
+get_sched_parms(struct sockopt *sopt)
+{
+ struct dn_extra_parms *ep;
+ struct dn_schk *schk;
+ size_t sopt_valsize;
+ int l, err = 0;
+
+ sopt_valsize = sopt->sopt_valsize;
+ l = sizeof(*ep);
+ if (sopt->sopt_valsize < l) {
+ D("bad len sopt->sopt_valsize %d len %d",
+ (int) sopt->sopt_valsize , l);
+ err = EINVAL;
+ return err;
+ }
+ ep = malloc(l, M_DUMMYNET, M_WAITOK);
+ if(!ep) {
+ err = ENOMEM ;
+ return err;
+ }
+ do {
+ err = sooptcopyin(sopt, ep, l, l);
+ if(err)
+ break;
+ sopt->sopt_valsize = sopt_valsize;
+ if (ep->oid.len < l) {
+ err = EINVAL;
+ break;
+ }
+
+ schk = locate_scheduler(ep->nr);
+ if (!schk) {
+ D("sched %d not found", ep->nr);
+ err = EINVAL;
+ break;
+ }
+
+ if (schk->fp && schk->fp->getconfig) {
+ if(schk->fp->getconfig(schk, ep)) {
+ D("Error while trying to get sched params");
+ err = EINVAL;
+ break;
+ }
+ ep->oid.len = l;
+ err = sooptcopyout(sopt, ep, l);
+ }
+ }while(0);
+ free(ep, M_DUMMYNET);
+
+ return err;
+}
+
+/* Configure AQM for flowset 'fs'.
+ * Extra parameters are passed in from userland.
+ */
+static int
+config_aqm(struct dn_fsk *fs, struct dn_extra_parms *ep, int busy)
+{
+ int err = 0;
+
+ do {
+ /* no configurations */
+ if (!ep) {
+ err = 0;
+ break;
+ }
+
+ /* no AQM for this flowset */
+ if (!strcmp(ep->name,"")) {
+ err = 0;
+ break;
+ }
+ if (ep->oid.len < sizeof(*ep)) {
+ D("short aqm len %d", ep->oid.len);
+ err = EINVAL;
+ break;
+ }
+
+ if (busy) {
+ D("Unable to configure flowset, flowset busy!");
+ err = EINVAL;
+ break;
+ }
+
+ /* deconfigure the old AQM if one exists */
+ if (fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) {
+ aqm_cleanup_deconfig_fs(fs);
+ }
+
+ if (!(fs->aqmfp = find_aqm_type(0, ep->name))) {
+ D("AQM functions not found for type %s!", ep->name);
+ fs->fs.flags &= ~DN_IS_AQM;
+ err = EINVAL;
+ break;
+ } else
+ fs->fs.flags |= DN_IS_AQM;
+
+ if (ep->oid.subtype != DN_AQM_PARAMS) {
+ D("Wrong subtype");
+ err = EINVAL;
+ break;
+ }
+
+ if (fs->aqmfp->config) {
+ err = fs->aqmfp->config(fs, ep, ep->oid.len);
+ if (err) {
+ D("Unable to configure AQM for FS %d", fs->fs.fs_nr );
+ fs->fs.flags &= ~DN_IS_AQM;
+ fs->aqmfp = NULL;
+ break;
+ }
+ }
+ } while(0);
+
+ return err;
+}
+#endif
+
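The three helpers above, get_aqm_parms(), get_sched_parms() and config_aqm(), all exchange a struct dn_extra_parms blob with ipfw userland through the dummynet sockopt interface (the DN_AQM_PARAMS / DN_SCH_PARAMS dispatch is added to dummynet_get() further below). A hedged userland sketch of that exchange follows; it is not part of this change, it assumes the dn_id/dn_extra_parms layout and the IP_DUMMYNET3 option from netinet/ip_dummynet.h, and 's' is assumed to be the raw control socket ipfw normally opens.

/*
 * Hypothetical sketch: query the AQM configuration of flowset 'fs_nr'.
 * Error handling is reduced to a pass/fail return value.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_dummynet.h>
#include <stdio.h>
#include <string.h>

static int
show_aqm_params(int s, uint32_t fs_nr)
{
	struct dn_extra_parms ep;
	socklen_t len = sizeof(ep);

	memset(&ep, 0, sizeof(ep));
	ep.oid.len = sizeof(ep);	/* checked against sizeof(*ep) in get_aqm_parms() */
	ep.oid.type = DN_CMD_GET;
	ep.oid.subtype = DN_AQM_PARAMS;	/* routes the request to get_aqm_parms() */
	ep.nr = fs_nr;			/* flowset to look up in dn_cfg.fshash */

	if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET3, &ep, &len) == -1)
		return (-1);
	printf("flowset %u uses AQM \"%s\"\n", fs_nr, ep.name);
	return (0);
}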
/*
* Configuration -- to preserve backward compatibility we use
* the following scheme (N is 65536)
@@ -1322,6 +1628,14 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
}
if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
ND("flowset %d unchanged", i);
+#ifdef NEW_AQM
+ /* Reconfigure AQM because its parameters may have changed.
+ * We consider the flowset busy if it has scheduler instance(s).
+ */
+ s = locate_scheduler(nfs->sched_nr);
+ config_aqm(fs, (struct dn_extra_parms *) arg,
+ s != NULL && s->siht != NULL);
+#endif
break; /* no change, nothing to do */
}
if (oldc != dn_cfg.fsk_count) /* new item */
@@ -1340,6 +1654,10 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
fsk_detach(fs, flags);
}
fs->fs = *nfs; /* copy configuration */
+#ifdef NEW_AQM
+ fs->aqmfp = NULL;
+ config_aqm(fs, (struct dn_extra_parms *) arg, s != NULL && s->siht != NULL);
+#endif
if (s != NULL)
fsk_attach(fs, s);
} while (0);
@@ -1865,6 +2183,19 @@ dummynet_get(struct sockopt *sopt, void **compat)
// cmd->id = sopt_valsize;
D("compatibility mode");
}
+
+#ifdef NEW_AQM
+ /* get AQM params */
+ if(cmd->subtype == DN_AQM_PARAMS) {
+ error = get_aqm_parms(sopt);
+ goto done;
+ /* get Scheduler params */
+ } else if (cmd->subtype == DN_SCH_PARAMS) {
+ error = get_sched_parms(sopt);
+ goto done;
+ }
+#endif
+
a.extra = (struct copy_range *)cmd;
if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
uint32_t *rp = (uint32_t *)(cmd + 1);
@@ -2316,4 +2647,98 @@ MODULE_VERSION(dummynet, 3);
*/
//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
+#ifdef NEW_AQM
+
+/* modevent helpers for the AQM modules */
+static int
+load_dn_aqm(struct dn_aqm *d)
+{
+ struct dn_aqm *aqm=NULL;
+
+ if (d == NULL)
+ return 1; /* error */
+ ip_dn_init(); /* just in case, we need the lock */
+
+ /* Check that the mandatory functions exist */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return 1;
+ }
+
+ /* Search if AQM already exists */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(aqm, &dn_cfg.aqmlist, next) {
+ if (strcmp(aqm->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* AQM already exists */
+ }
+ }
+ if (aqm == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.aqmlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_aqm %s %sloaded", d->name, aqm ? "not ":"");
+ return aqm ? 1 : 0;
+}
+
+
+/* Callback to clean up AQM status for queues connected to a flowset
+ * and then deconfigure the flowset.
+ * This function is called before an AQM module is unloaded
+ */
+static int
+fs_cleanup(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
+ uint32_t type = *(uint32_t *)arg;
+
+ if (fs->aqmfp && fs->aqmfp->type == type)
+ aqm_cleanup_deconfig_fs(fs);
+
+ return 0;
+}
+
+static int
+unload_dn_aqm(struct dn_aqm *aqm)
+{
+ struct dn_aqm *tmp, *r;
+ int err = EINVAL;
+ err = 0;
+ ND("called for %s", aqm->name);
+
+ DN_BH_WLOCK();
+
+ /* clean up AQM status and deconfig flowset */
+ dn_ht_scan(dn_cfg.fshash, fs_cleanup, &aqm->type);
+
+ SLIST_FOREACH_SAFE(r, &dn_cfg.aqmlist, next, tmp) {
+ if (strcmp(aqm->name, r->name) != 0)
+ continue;
+ ND("ref_count = %d", r->ref_count);
+ err = (r->ref_count != 0 || r->cfg_ref_count != 0) ? EBUSY : 0;
+ if (err == 0)
+ SLIST_REMOVE(&dn_cfg.aqmlist, r, dn_aqm, next);
+ break;
+ }
+ DN_BH_WUNLOCK();
+ D("%s %sunloaded", aqm->name, err ? "not ":"");
+ if (err)
+ D("ref_count=%d, cfg_ref_count=%d", r->ref_count, r->cfg_ref_count);
+ return err;
+}
+
+int
+dn_aqm_modevent(module_t mod, int cmd, void *arg)
+{
+ struct dn_aqm *aqm = arg;
+
+ if (cmd == MOD_LOAD)
+ return load_dn_aqm(aqm);
+ else if (cmd == MOD_UNLOAD)
+ return unload_dn_aqm(aqm);
+ else
+ return EINVAL;
+}
+#endif
+
/* end of file */
+
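dn_aqm_modevent() above gives every AQM algorithm a shared load/unload handler: load_dn_aqm() refuses descriptors without enqueue/dequeue callbacks and links the rest into dn_cfg.aqmlist, while unload_dn_aqm() first scrubs flowset state via fs_cleanup() and only then removes the entry if no references remain. Below is a hedged skeleton of how a module could plug into it. Everything named example_* is invented for illustration, the callback signatures and header paths are assumed to follow the AQM modules added elsewhere in this patch, and the SYSINIT subsystem/order values are conventional choices rather than requirements stated here.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mbuf.h>

#include <netpfil/ipfw/ip_dn_private.h>
#include <netpfil/ipfw/dn_sched.h>	/* assumed home of the dn_enqueue()/dn_dequeue() helpers */
#include <netpfil/ipfw/dn_aqm.h>

/* Degenerate callbacks: behave like a plain tail-drop queue. */
static int
example_enqueue(struct dn_queue *q, struct mbuf *m)
{
	return (dn_enqueue(q, m, 0));	/* 0: packet is not flagged as a drop */
}

static struct mbuf *
example_dequeue(struct dn_queue *q)
{
	return (dn_dequeue(q));
}

static struct dn_aqm example_desc = {
	.type = 1234,			/* hypothetical AQM type id */
	.name = "EXAMPLE",
	.enqueue = example_enqueue,	/* mandatory, checked by load_dn_aqm() */
	.dequeue = example_dequeue,	/* mandatory, checked by load_dn_aqm() */
};

static moduledata_t example_aqm_mod = {
	"dn_aqm_example",		/* module name */
	dn_aqm_modevent,		/* shared handler defined in this patch */
	&example_desc			/* handed to the handler as 'arg' */
};

DECLARE_MODULE(dn_aqm_example, example_aqm_mod,
    SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(dn_aqm_example, dummynet, 3, 3, 3);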
diff --git a/sys/rpc/svc.c b/sys/rpc/svc.c
index b436c18..a4cc484 100644
--- a/sys/rpc/svc.c
+++ b/sys/rpc/svc.c
@@ -847,9 +847,7 @@ svc_xprt_alloc()
SVCXPRT_EXT *ext;
xprt = mem_alloc(sizeof(SVCXPRT));
- memset(xprt, 0, sizeof(SVCXPRT));
ext = mem_alloc(sizeof(SVCXPRT_EXT));
- memset(ext, 0, sizeof(SVCXPRT_EXT));
xprt->xp_p3 = ext;
refcount_init(&xprt->xp_refs, 1);
diff --git a/sys/rpc/svc_vc.c b/sys/rpc/svc_vc.c
index be8e04e..92a926d 100644
--- a/sys/rpc/svc_vc.c
+++ b/sys/rpc/svc_vc.c
@@ -189,11 +189,11 @@ svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize,
SOCKBUF_UNLOCK(&so->so_rcv);
return (xprt);
+
cleanup_svc_vc_create:
- if (xprt) {
- sx_destroy(&xprt->xp_lock);
- svc_xprt_free(xprt);
- }
+ sx_destroy(&xprt->xp_lock);
+ svc_xprt_free(xprt);
+
return (NULL);
}
@@ -203,8 +203,8 @@ cleanup_svc_vc_create:
SVCXPRT *
svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr)
{
- SVCXPRT *xprt = NULL;
- struct cf_conn *cd = NULL;
+ SVCXPRT *xprt;
+ struct cf_conn *cd;
struct sockaddr* sa = NULL;
struct sockopt opt;
int one = 1;
@@ -279,12 +279,10 @@ svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr)
return (xprt);
cleanup_svc_vc_create:
- if (xprt) {
- sx_destroy(&xprt->xp_lock);
- svc_xprt_free(xprt);
- }
- if (cd)
- mem_free(cd, sizeof(*cd));
+ sx_destroy(&xprt->xp_lock);
+ svc_xprt_free(xprt);
+ mem_free(cd, sizeof(*cd));
+
return (NULL);
}
diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h
index 1729c7b..0b3ed26 100644
--- a/sys/sys/cdefs.h
+++ b/sys/sys/cdefs.h
@@ -273,7 +273,8 @@
#define _Alignof(x) __alignof(x)
#endif
-#if !__has_extension(c_atomic) && !__has_extension(cxx_atomic)
+#if !defined(__cplusplus) && !__has_extension(c_atomic) && \
+ !__has_extension(cxx_atomic)
/*
* No native support for _Atomic(). Place object in structure to prevent
* most forms of direct non-atomic access.
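For readers who have not seen the fallback this condition guards: when the compiler has no native _Atomic() support, cdefs.h wraps the object in a single-member struct (the definition sits just below this hunk and is not shown here) so that plain loads, stores and arithmetic on it fail to compile. A hedged illustration, assuming the wrapper's member is the volatile __val field used by the existing fallback:

/* With native support this is a real atomic int; with the fallback it
 * becomes roughly "struct { int volatile __val; } refs;", so a bare
 * "refs++" no longer compiles and callers must use the atomic accessors. */
_Atomic(int)	refs;

The hunk itself only adds !defined(__cplusplus) to the guard, so C++ translation units never take this fallback path.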
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index d2ad920..14bd867 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -183,7 +183,8 @@ static __inline
int
vm_paging_needed(void)
{
- return (cnt.v_free_count + cnt.v_cache_count < vm_pageout_wakeup_thresh);
+ return (cnt.v_free_count + cnt.v_cache_count <
+ (u_int)vm_pageout_wakeup_thresh);
}
#endif
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 512151b..c250c5d 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -253,11 +253,11 @@ vm_page_domain_init(struct vm_domain *vmd)
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
"vm inactive pagequeue";
- *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
+ *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
&cnt.v_inactive_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
"vm active pagequeue";
- *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
+ *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
&cnt.v_active_count;
vmd->vmd_page_count = 0;
vmd->vmd_free_count = 0;
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 7ecb6c7..3ab4c24 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -215,7 +215,7 @@ struct vm_pagequeue {
struct mtx pq_mutex;
struct pglist pq_pl;
int pq_cnt;
- int * const pq_vcnt;
+ u_int * const pq_vcnt;
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index a580f2a..e3c1571 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -284,7 +284,7 @@ lapic_init(vm_paddr_t addr)
}
#ifdef SMP
-#define LOOPS 1000000
+#define LOOPS 100000
/*
* Calibrate the busy loop waiting for IPI ack in xAPIC mode.
* lapic_ipi_wait_mult contains the number of iterations which
@@ -440,7 +440,7 @@ lapic_setup(int boot)
/* Program the CMCI LVT entry if present. */
if (maxlvt >= APIC_LVT_CMCI)
lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci);
-
+
intr_restore(saveintr);
}
@@ -1363,7 +1363,7 @@ static void
apic_setup_local(void *dummy __unused)
{
int retval;
-
+
if (best_enum == NULL)
return;