Diffstat (limited to 'sys')
66 files changed, 7552 insertions, 1944 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 855f7bc..8136745 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -6480,7 +6480,7 @@ static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
-	vm_paddr_t pa_start, pa_end;
+	vm_paddr_t pa_start, pa_end, pa_end1;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
@@ -6660,9 +6660,12 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 			tmpva += PAGE_SIZE;
 		}
 	}
-	if (error == 0 && pa_start != pa_end)
-		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
-		    pa_end - pa_start, mode);
+	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
+		pa_end1 = MIN(pa_end, dmaplimit);
+		if (pa_start != pa_end1)
+			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+			    pa_end1 - pa_start, mode);
+	}
 
 	/*
 	 * Flush CPU caches if required to make sure any data isn't cached that
diff --git a/sys/cddl/boot/zfs/lz4.c b/sys/cddl/boot/zfs/lz4.c
index c29f861..b12122c 100644
--- a/sys/cddl/boot/zfs/lz4.c
+++ b/sys/cddl/boot/zfs/lz4.c
@@ -34,6 +34,8 @@
  * $FreeBSD$
  */
 
+#include <arpa/inet.h>
+
 static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
     int isize, int maxOutputSize);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 226233e..f6d19fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -139,7 +139,6 @@
 #include <zfs_fletcher.h>
 #include <sys/sdt.h>
 
-#include <vm/vm_pageout.h>
 #include <machine/vmparam.h>
 
 #ifdef illumos
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 04e1342..2a15cdf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -75,7 +75,6 @@
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <vm/vm_param.h>
-#include <vm/vm_pageout.h>
 
 /*
  * Programming rules.
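The pmap.c hunk above fixes pmap_change_attr_locked() for physical ranges that extend past the direct map: pages at or above dmaplimit have no DMAP alias, so the recursive call may only cover the clamped part of [pa_start, pa_end). A minimal sketch of that clamping idea as a standalone helper (the helper name and shape are invented here; the commit open-codes the same check with MIN()):

    /*
     * Clamp a physical range to the part covered by the amd64 direct map.
     * Returns false when nothing in [start, end) lies below dmaplimit,
     * i.e. when there is no DMAP alias to operate on.
     */
    static __inline bool
    dmap_clamp(vm_paddr_t start, vm_paddr_t end, vm_paddr_t *end1)
    {

        if (start >= dmaplimit)
                return (false);
        *end1 = MIN(end, dmaplimit);
        return (start != *end1);
    }

With such a helper the recursion would read "if (error == 0 && dmap_clamp(pa_start, pa_end, &pa_end1)) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode);", which is what the hunk spells out inline.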
diff --git a/sys/conf/files b/sys/conf/files index 8d0453a..e8c8a3a 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3596,8 +3596,12 @@ netipx/spx_usrreq.c optional ipx netnatm/natm.c optional natm netnatm/natm_pcb.c optional natm netnatm/natm_proto.c optional natm +netpfil/ipfw/dn_aqm_codel.c optional inet dummynet +netpfil/ipfw/dn_aqm_pie.c optional inet dummynet netpfil/ipfw/dn_heap.c optional inet dummynet netpfil/ipfw/dn_sched_fifo.c optional inet dummynet +netpfil/ipfw/dn_sched_fq_codel.c optional inet dummynet +netpfil/ipfw/dn_sched_fq_pie.c optional inet dummynet netpfil/ipfw/dn_sched_prio.c optional inet dummynet netpfil/ipfw/dn_sched_qfq.c optional inet dummynet netpfil/ipfw/dn_sched_rr.c optional inet dummynet diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index f96b4f3..533b957 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -271,7 +271,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv +dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv +dev/hyperv/utilities/hv_shutdown.c optional hyperv +dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hv_channel.c optional hyperv dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index f79ed58..89b87e3 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -247,7 +247,10 @@ dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv +dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv +dev/hyperv/utilities/hv_shutdown.c optional hyperv +dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hv_channel.c optional hyperv dev/hyperv/vmbus/hv_channel_mgmt.c optional hyperv diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c index b8660a4..def33c3 100644 --- a/sys/dev/bge/if_bge.c +++ b/sys/dev/bge/if_bge.c @@ -170,6 +170,7 @@ static const struct bge_type { { BCOM_VENDORID, BCOM_DEVICEID_BCM5715 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5715S }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5717 }, + { BCOM_VENDORID, BCOM_DEVICEID_BCM5717C }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5718 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5719 }, { BCOM_VENDORID, BCOM_DEVICEID_BCM5720 }, @@ -310,6 +311,7 @@ static const struct bge_revision { { BGE_CHIPID_BCM5715_A3, "BCM5715 A3" }, { BGE_CHIPID_BCM5717_A0, "BCM5717 A0" }, { BGE_CHIPID_BCM5717_B0, "BCM5717 B0" }, + { BGE_CHIPID_BCM5717_C0, "BCM5717 C0" }, { BGE_CHIPID_BCM5719_A0, "BCM5719 A0" }, { BGE_CHIPID_BCM5720_A0, "BCM5720 A0" }, { BGE_CHIPID_BCM5755_A0, "BCM5755 A0" }, @@ -2689,6 +2691,10 @@ bge_chipid(device_t dev) * registers. */ switch (pci_get_device(dev)) { + case BCOM_DEVICEID_BCM5717C: + /* 5717 C0 seems to belong to 5720 line. 
*/ + id = BGE_CHIPID_BCM5720_A0; + break; case BCOM_DEVICEID_BCM5717: case BCOM_DEVICEID_BCM5718: case BCOM_DEVICEID_BCM5719: diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h index 37b0459..0cf9ca1 100644 --- a/sys/dev/bge/if_bgereg.h +++ b/sys/dev/bge/if_bgereg.h @@ -329,6 +329,7 @@ #define BGE_CHIPID_BCM57780_A1 0x57780001 #define BGE_CHIPID_BCM5717_A0 0x05717000 #define BGE_CHIPID_BCM5717_B0 0x05717100 +#define BGE_CHIPID_BCM5717_C0 0x05717200 #define BGE_CHIPID_BCM5719_A0 0x05719000 #define BGE_CHIPID_BCM5720_A0 0x05720000 #define BGE_CHIPID_BCM5762_A0 0x05762000 @@ -2452,6 +2453,7 @@ struct bge_status_block { #define BCOM_DEVICEID_BCM5715 0x1678 #define BCOM_DEVICEID_BCM5715S 0x1679 #define BCOM_DEVICEID_BCM5717 0x1655 +#define BCOM_DEVICEID_BCM5717C 0x1665 #define BCOM_DEVICEID_BCM5718 0x1656 #define BCOM_DEVICEID_BCM5719 0x1657 #define BCOM_DEVICEID_BCM5720_PP 0x1658 /* Not released to public. */ diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h index 1a45b7b..f45543b 100644 --- a/sys/dev/hyperv/include/hyperv.h +++ b/sys/dev/hyperv/include/hyperv.h @@ -755,6 +755,8 @@ typedef struct hv_vmbus_channel { struct mtx inbound_lock; + struct taskqueue * rxq; + struct task channel_task; hv_vmbus_pfn_channel_callback on_channel_callback; void* channel_callback_context; @@ -906,30 +908,6 @@ int hv_vmbus_channel_teardown_gpdal( struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); -/* - * Work abstraction defines - */ -typedef struct hv_work_queue { - struct taskqueue* queue; - struct proc* proc; - struct sema* work_sema; -} hv_work_queue; - -typedef struct hv_work_item { - struct task work; - void (*callback)(void *); - void* context; - hv_work_queue* wq; -} hv_work_item; - -struct hv_work_queue* hv_work_queue_create(char* name); - -void hv_work_queue_close(struct hv_work_queue* wq); - -int hv_queue_work_item( - hv_work_queue* wq, - void (*callback)(void *), - void* context); /** * @brief Get physical address from virtual */ @@ -941,35 +919,5 @@ hv_get_phys_addr(void *virt) return (ret); } - -/** - * KVP related structures - * - */ -typedef struct hv_vmbus_service { - hv_guid guid; /* Hyper-V GUID */ - char *name; /* name of service */ - boolean_t enabled; /* service enabled */ - hv_work_queue *work_queue; /* background work queue */ - - /* - * function to initialize service - */ - int (*init)(struct hv_vmbus_service *); - - /* - * function to process Hyper-V messages - */ - void (*callback)(void *); -} hv_vmbus_service; - -extern uint8_t* receive_buffer[]; -extern hv_vmbus_service service_table[]; extern uint32_t hv_vmbus_protocal_version; - -void hv_kvp_callback(void *context); -int hv_kvp_init(hv_vmbus_service *serv); -void hv_kvp_deinit(void); - #endif /* __HYPERV_H__ */ - diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c index 64e7578..9a89b62 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.c +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c @@ -73,10 +73,7 @@ hv_nv_alloc_net_device(struct hv_device *device) netvsc_dev *net_dev; hn_softc_t *sc = device_get_softc(device->device); - net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_NOWAIT | M_ZERO); - if (net_dev == NULL) { - return (NULL); - } + net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_WAITOK | M_ZERO); net_dev->dev = device; net_dev->destroy = FALSE; @@ -135,15 +132,15 @@ hv_nv_get_next_send_section(netvsc_dev *net_dev) int i; for (i = 0; i < bitsmap_words; i++) { - idx = ffs(~bitsmap[i]); + idx = ffsl(~bitsmap[i]); if (0 == 
idx) continue; idx--; - if (i * BITS_PER_LONG + idx >= net_dev->send_section_count) - return (ret); + KASSERT(i * BITS_PER_LONG + idx < net_dev->send_section_count, + ("invalid i %d and idx %lu", i, idx)); - if (synch_test_and_set_bit(idx, &bitsmap[i])) + if (atomic_testandset_long(&bitsmap[i], idx)) continue; ret = i * BITS_PER_LONG + idx; @@ -223,11 +220,7 @@ hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device) init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; net_dev->rx_sections = malloc(net_dev->rx_section_count * - sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_NOWAIT); - if (net_dev->rx_sections == NULL) { - ret = EINVAL; - goto cleanup; - } + sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_WAITOK); memcpy(net_dev->rx_sections, init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); @@ -325,11 +318,7 @@ hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device) BITS_PER_LONG); net_dev->send_section_bitsmap = malloc(net_dev->bitsmap_words * sizeof(long), M_NETVSC, - M_NOWAIT | M_ZERO); - if (NULL == net_dev->send_section_bitsmap) { - ret = ENOMEM; - goto cleanup; - } + M_WAITOK | M_ZERO); goto exit; @@ -788,8 +777,27 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, if (NULL != net_vsc_pkt) { if (net_vsc_pkt->send_buf_section_idx != NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - synch_change_bit(net_vsc_pkt->send_buf_section_idx, - net_dev->send_section_bitsmap); + u_long mask; + int idx; + + idx = net_vsc_pkt->send_buf_section_idx / + BITS_PER_LONG; + KASSERT(idx < net_dev->bitsmap_words, + ("invalid section index %u", + net_vsc_pkt->send_buf_section_idx)); + mask = 1UL << + (net_vsc_pkt->send_buf_section_idx % + BITS_PER_LONG); + + KASSERT(net_dev->send_section_bitsmap[idx] & + mask, + ("index bitmap 0x%lx, section index %u, " + "bitmap idx %d, bitmask 0x%lx", + net_dev->send_section_bitsmap[idx], + net_vsc_pkt->send_buf_section_idx, + idx, mask)); + atomic_clear_long( + &net_dev->send_section_bitsmap[idx], mask); } /* Notify the layer above us */ diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h index e684cc5..95dee17 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -39,9 +39,11 @@ #define __HV_NET_VSC_H__ #include <sys/param.h> +#include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/queue.h> +#include <sys/taskqueue.h> #include <sys/sx.h> #include <machine/bus.h> @@ -56,6 +58,8 @@ #include <dev/hyperv/include/hyperv.h> +#define HN_USE_TXDESC_BUFRING + MALLOC_DECLARE(M_NETVSC); #define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF) @@ -988,8 +992,67 @@ typedef struct { hv_bool_uint8_t link_state; } netvsc_device_info; +#ifndef HN_USE_TXDESC_BUFRING struct hn_txdesc; SLIST_HEAD(hn_txdesc_list, hn_txdesc); +#else +struct buf_ring; +#endif + +struct hn_rx_ring { + struct lro_ctrl hn_lro; + + /* Trust csum verification on host side */ + int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ + + u_long hn_csum_ip; + u_long hn_csum_tcp; + u_long hn_csum_udp; + u_long hn_csum_trusted; + u_long hn_lro_tried; + u_long hn_small_pkts; +} __aligned(CACHE_LINE_SIZE); + +#define HN_TRUST_HCSUM_IP 0x0001 +#define HN_TRUST_HCSUM_TCP 0x0002 +#define HN_TRUST_HCSUM_UDP 0x0004 + +struct hn_tx_ring { +#ifndef HN_USE_TXDESC_BUFRING + struct mtx hn_txlist_spin; + struct hn_txdesc_list hn_txlist; +#else + struct buf_ring *hn_txdesc_br; +#endif + int hn_txdesc_cnt; + int hn_txdesc_avail; + int hn_has_txeof; + + int hn_sched_tx; + void 
(*hn_txeof)(struct hn_tx_ring *); + struct taskqueue *hn_tx_taskq; + struct task hn_tx_task; + struct task hn_txeof_task; + + struct mtx hn_tx_lock; + struct hn_softc *hn_sc; + + int hn_direct_tx_size; + int hn_tx_chimney_size; + bus_dma_tag_t hn_tx_data_dtag; + uint64_t hn_csum_assist; + + u_long hn_no_txdescs; + u_long hn_send_failed; + u_long hn_txdma_failed; + u_long hn_tx_collapsed; + u_long hn_tx_chimney; + + /* Rarely used stuffs */ + struct hn_txdesc *hn_txdesc; + bus_dma_tag_t hn_tx_rndis_dtag; + struct sysctl_oid *hn_tx_sysctl_tree; +} __aligned(CACHE_LINE_SIZE); /* * Device-specific softc structure @@ -1009,44 +1072,22 @@ typedef struct hn_softc { struct hv_device *hn_dev_obj; netvsc_dev *net_dev; - int hn_txdesc_cnt; - struct hn_txdesc *hn_txdesc; - bus_dma_tag_t hn_tx_data_dtag; - bus_dma_tag_t hn_tx_rndis_dtag; - int hn_tx_chimney_size; - int hn_tx_chimney_max; + int hn_rx_ring_cnt; + struct hn_rx_ring *hn_rx_ring; - struct mtx hn_txlist_spin; - struct hn_txdesc_list hn_txlist; - int hn_txdesc_avail; - int hn_txeof; - - struct lro_ctrl hn_lro; - int hn_lro_hiwat; - - /* Trust tcp segments verification on host side */ - int hn_trust_hosttcp; - - u_long hn_csum_ip; - u_long hn_csum_tcp; - u_long hn_csum_trusted; - u_long hn_lro_tried; - u_long hn_small_pkts; - u_long hn_no_txdescs; - u_long hn_send_failed; - u_long hn_txdma_failed; - u_long hn_tx_collapsed; - u_long hn_tx_chimney; + int hn_tx_ring_cnt; + struct hn_tx_ring *hn_tx_ring; + int hn_tx_chimney_max; + struct taskqueue *hn_tx_taskq; + struct sysctl_oid *hn_tx_sysctl_tree; } hn_softc_t; - /* * Externs */ extern int hv_promisc_mode; void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status); -void netvsc_xmit_completion(void *context); void hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid, uint32_t status); netvsc_dev *hv_nv_on_device_add(struct hv_device *device, diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index b3360ea..0f4425e 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -66,10 +66,12 @@ __FBSDID("$FreeBSD$"); #include <sys/module.h> #include <sys/kernel.h> #include <sys/socket.h> +#include <sys/proc.h> #include <sys/queue.h> #include <sys/lock.h> #include <sys/sx.h> #include <sys/sysctl.h> +#include <sys/buf_ring.h> #include <net/if.h> #include <net/if_arp.h> @@ -132,6 +134,8 @@ __FBSDID("$FreeBSD$"); /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 +#define HN_LROENT_CNT_DEF 128 + #define HN_RNDIS_MSG_LEN \ (sizeof(rndis_msg) + \ RNDIS_VLAN_PPI_SIZE + \ @@ -146,10 +150,14 @@ __FBSDID("$FreeBSD$"); #define HN_TX_DATA_SEGCNT_MAX \ (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS) +#define HN_DIRECT_TX_SIZE_DEF 128 + struct hn_txdesc { +#ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; +#endif struct mbuf *m; - struct hn_softc *sc; + struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ netvsc_packet netvsc_pkt; /* XXX to be removed */ @@ -165,23 +173,18 @@ struct hn_txdesc { #define HN_TXD_FLAG_DMAMAP 0x2 /* - * A unified flag for all outbound check sum flags is useful, - * and it helps avoiding unnecessary check sum calculation in - * network forwarding scenario. + * Only enable UDP checksum offloading when it is on 2012R2 or + * later. UDP checksum offloading doesn't work on earlier + * Windows releases. 
*/ -#define HV_CSUM_FOR_OUTBOUND \ - (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO| \ - CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP| \ - CSUM_IP6_TSO|CSUM_IP6_ISCSI) - -/* XXX move to netinet/tcp_lro.h */ -#define HN_LRO_HIWAT_MAX 65535 -#define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX +#define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP) +#define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) + +#define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ -#define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu) -#define HN_LRO_HIWAT_ISVALID(sc, hiwat) \ - ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) || \ - (hiwat) <= HN_LRO_HIWAT_MAX) +#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) + +#define HN_LRO_ACKCNT_DEF 1 /* * Be aware that this sleepable mutex will exhibit WITNESS errors when @@ -205,19 +208,71 @@ struct hn_txdesc { int hv_promisc_mode = 0; /* normal mode by default */ +SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface"); + /* Trust tcp segements verification on host side. */ -static int hn_trust_hosttcp = 0; -TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp); +static int hn_trust_hosttcp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, + &hn_trust_hosttcp, 0, + "Trust tcp segement verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust udp datagrams verification on host side. */ +static int hn_trust_hostudp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, + &hn_trust_hostudp, 0, + "Trust udp datagram verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust ip packets verification on host side. */ +static int hn_trust_hostip = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, + &hn_trust_hostip, 0, + "Trust ip packet verification on host side, " + "when csum info is missing (global setting)"); #if __FreeBSD_version >= 1100045 /* Limit TSO burst size */ static int hn_tso_maxlen = 0; -TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen); +SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, + &hn_tso_maxlen, 0, "TSO burst limit"); #endif /* Limit chimney send size */ static int hn_tx_chimney_size = 0; -TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size); +SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, + &hn_tx_chimney_size, 0, "Chimney send packet size limit"); + +/* Limit the size of packet for direct transmission */ +static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, + &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); + +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 +static int hn_lro_entry_count = HN_LROENT_CNT_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, + &hn_lro_entry_count, 0, "LRO entry count"); +#endif +#endif + +static int hn_share_tx_taskq = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, + &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); + +static struct taskqueue *hn_tx_taskq; + +#ifndef HN_USE_TXDESC_BUFRING +static int hn_use_txdesc_bufring = 0; +#else +static int hn_use_txdesc_bufring = 1; +#endif +SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, + &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); + +static int hn_bind_tx_taskq = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, + &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); /* * Forward declarations 
@@ -226,82 +281,37 @@ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); -static void hn_start_locked(struct ifnet *ifp); +static int hn_start_locked(struct hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); +static void hn_start_txeof(struct hn_tx_ring *); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); -#ifdef HN_LRO_HIWAT -static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); +#if __FreeBSD_version >= 1100099 +static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif +static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); -static int hn_check_iplen(const struct mbuf *, int); -static int hn_create_tx_ring(struct hn_softc *sc); -static void hn_destroy_tx_ring(struct hn_softc *sc); - -static __inline void -hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) -{ - sc->hn_lro_hiwat = hiwat; -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif -} - -/* - * NetVsc get message transport protocol type - */ -static uint32_t get_transport_proto_type(struct mbuf *m_head) -{ - uint32_t ret_val = TRANSPORT_TYPE_NOT_IP; - uint16_t ether_type = 0; - int ether_len = 0; - struct ether_vlan_header *eh; -#ifdef INET - struct ip *iph; -#endif -#ifdef INET6 - struct ip6_hdr *ip6; -#endif - - eh = mtod(m_head, struct ether_vlan_header*); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { - ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - ether_type = eh->evl_proto; - } else { - ether_len = ETHER_HDR_LEN; - ether_type = eh->evl_encap_proto; - } - - switch (ntohs(ether_type)) { -#ifdef INET6 - case ETHERTYPE_IPV6: - ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len); - - if (IPPROTO_TCP == ip6->ip6_nxt) { - ret_val = TRANSPORT_TYPE_IPV6_TCP; - } else if (IPPROTO_UDP == ip6->ip6_nxt) { - ret_val = TRANSPORT_TYPE_IPV6_UDP; - } - break; +#if __FreeBSD_version < 1100095 +static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); +#else +static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif -#ifdef INET - case ETHERTYPE_IP: - iph = (struct ip *)(m_head->m_data + ether_len); - - if (IPPROTO_TCP == iph->ip_p) { - ret_val = TRANSPORT_TYPE_IPV4_TCP; - } else if (IPPROTO_UDP == iph->ip_p) { - ret_val = TRANSPORT_TYPE_IPV4_UDP; - } - break; -#endif - default: - ret_val = TRANSPORT_TYPE_NOT_IP; - break; - } - - return (ret_val); -} +static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_check_iplen(const struct mbuf *, int); +static int hn_create_tx_ring(struct hn_softc *, int); +static void hn_destroy_tx_ring(struct hn_tx_ring *); +static int hn_create_tx_data(struct hn_softc *); +static void hn_destroy_tx_data(struct hn_softc *); +static void hn_start_taskfunc(void *, int); +static void hn_start_txeof_taskfunc(void *, int); +static void hn_stop_tx_tasks(struct hn_softc *); +static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); +static void hn_create_rx_data(struct hn_softc *sc); +static void hn_destroy_rx_data(struct hn_softc *sc); +static void hn_set_tx_chimney_size(struct hn_softc *, int); static int hn_ifmedia_upd(struct ifnet *ifp __unused) @@ -353,6 +363,19 @@ netvsc_probe(device_t dev) return (ENXIO); } 
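The hv_net_vsc.c hunks above drop the synch_test_and_set_bit()/synch_change_bit() compatibility wrappers and manage the chimney send-buffer section bitmap with ffsl() plus the atomic(9) primitives, with KASSERTs guarding against out-of-range and double-freed sections. A small self-contained sketch of that allocate/release pattern (the helper names and the SKETCH_BITS_PER_LONG macro are invented for illustration; the driver uses BITS_PER_LONG and net_dev->send_section_bitsmap):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <machine/atomic.h>

    #define SKETCH_BITS_PER_LONG    (sizeof(u_long) * NBBY)

    /* Claim the first free section, lock-free; -1 when all are busy. */
    static int
    section_alloc(volatile u_long *bitmap, int words)
    {
        int i, idx;

        for (i = 0; i < words; i++) {
                idx = ffsl(~bitmap[i]);  /* first clear bit, 1-based */
                if (idx == 0)
                        continue;        /* word fully allocated */
                idx--;
                /* Lost the race for this bit; try the next word. */
                if (atomic_testandset_long(&bitmap[i], idx))
                        continue;
                return (i * SKETCH_BITS_PER_LONG + idx);
        }
        return (-1);
    }

    /* Release a previously claimed section. */
    static void
    section_free(volatile u_long *bitmap, int section)
    {

        atomic_clear_long(&bitmap[section / SKETCH_BITS_PER_LONG],
            1UL << (section % SKETCH_BITS_PER_LONG));
    }

Because a failed test-and-set simply moves on, allocation never spins or sleeps; the completion path (hv_nv_on_send_completion() above) clears the bit with atomic_clear_long() after asserting that it is still set.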
+static void +hn_cpuset_setthread_task(void *xmask, int pending __unused) +{ + cpuset_t *mask = xmask; + int error; + + error = cpuset_setthread(curthread->td_tid, mask); + if (error) { + panic("curthread=%ju: can't pin; error=%d", + (uintmax_t)curthread->td_tid, error); + } +} + /* * Standard attach entry point. * @@ -367,8 +390,6 @@ netvsc_attach(device_t dev) hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp = NULL; - struct sysctl_oid_list *child; - struct sysctl_ctx_list *ctx; int error; #if __FreeBSD_version >= 1100045 int tso_maxlen; @@ -382,13 +403,28 @@ netvsc_attach(device_t dev) bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; - sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; - sc->hn_trust_hosttcp = hn_trust_hosttcp; - - error = hn_create_tx_ring(sc); - if (error) - goto failed; + if (hn_tx_taskq == NULL) { + sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &sc->hn_tx_taskq); + taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", + device_get_nameunit(dev)); + if (hn_bind_tx_taskq >= 0) { + int cpu = hn_bind_tx_taskq; + struct task cpuset_task; + cpuset_t cpu_set; + + if (cpu > mp_ncpus - 1) + cpu = mp_ncpus - 1; + CPU_SETOF(cpu, &cpu_set); + TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, + &cpu_set); + taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task); + taskqueue_drain(sc->hn_tx_taskq, &cpuset_task); + } + } else { + sc->hn_tx_taskq = hn_tx_taskq; + } NV_LOCK_INIT(sc, "NetVSCLock"); sc->hn_dev_obj = device_ctx; @@ -396,6 +432,12 @@ netvsc_attach(device_t dev) ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; + error = hn_create_tx_data(sc); + if (error) + goto failed; + + hn_create_rx_data(sc); + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_dunit = unit; ifp->if_dname = NETVSC_DEVNAME; @@ -426,15 +468,7 @@ netvsc_attach(device_t dev) ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; - /* - * Only enable UDP checksum offloading when it is on 2012R2 or - * later. UDP checksum offloading doesn't work on earlier - * Windows releases. 
- */ - if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) - ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; - else - ifp->if_hwassist = CSUM_TCP | CSUM_TSO; + ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO; error = hv_rf_on_device_add(device_ctx, &device_info); if (error) @@ -444,15 +478,6 @@ netvsc_attach(device_t dev) sc->hn_carrier = 1; } -#if defined(INET) || defined(INET6) - tcp_lro_init(&sc->hn_lro); - /* Driver private LRO settings */ - sc->hn_lro.ifp = ifp; -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif -#endif /* INET || INET6 */ - #if __FreeBSD_version >= 1100045 tso_maxlen = hn_tso_maxlen; if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) @@ -472,87 +497,14 @@ netvsc_attach(device_t dev) #endif sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; + hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_tx_chimney_max) - sc->hn_tx_chimney_size = hn_tx_chimney_size; - - ctx = device_get_sysctl_ctx(dev); - child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); - - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued", - CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed", - CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried", - CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries"); -#ifdef HN_LRO_HIWAT - SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl, - "I", "LRO high watermark"); -#endif - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp", - CTLFLAG_RW, &sc->hn_trust_hosttcp, 0, - "Trust tcp segement verification on host side, " - "when csum info is missing"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip", - CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp", - CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted", - CTLFLAG_RW, &sc->hn_csum_trusted, - "# of TCP segements that we trust host's csum verification"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts", - CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs", - CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed", - CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed", - CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed", - CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney", - CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", - CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", - CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", - CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, - "Chimney send packet size upper boundary"); - SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, - "I", "Chimney send packet size limit"); - - if (unit == 0) { - struct sysctl_ctx_list *dc_ctx; - struct sysctl_oid_list *dc_child; - devclass_t dc; - - /* - * Add 
sysctl nodes for devclass - */ - dc = device_get_devclass(dev); - dc_ctx = devclass_get_sysctl_ctx(dc); - dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc)); - - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp", - CTLFLAG_RD, &hn_trust_hosttcp, 0, - "Trust tcp segement verification on host side, " - "when csum info is missing (global setting)"); - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size", - CTLFLAG_RD, &hn_tx_chimney_size, 0, - "Chimney send packet size limit"); -#if __FreeBSD_version >= 1100045 - SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen", - CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit"); -#endif - } + hn_set_tx_chimney_size(sc, hn_tx_chimney_size); return (0); failed: - hn_destroy_tx_ring(sc); + hn_destroy_tx_data(sc); if (ifp != NULL) if_free(ifp); return (error); @@ -583,11 +535,14 @@ netvsc_detach(device_t dev) hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); + hn_stop_tx_tasks(sc); + ifmedia_removeall(&sc->hn_media); -#if defined(INET) || defined(INET6) - tcp_lro_free(&sc->hn_lro); -#endif - hn_destroy_tx_ring(sc); + hn_destroy_rx_data(sc); + hn_destroy_tx_data(sc); + + if (sc->hn_tx_taskq != hn_tx_taskq) + taskqueue_free(sc->hn_tx_taskq); return (0); } @@ -602,13 +557,13 @@ netvsc_shutdown(device_t dev) } static __inline int -hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, +hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; - error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap, + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; @@ -618,13 +573,13 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, return ENOBUFS; else *m_head = m = m_new; - sc->hn_tx_collapsed++; + txr->hn_tx_collapsed++; - error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { - bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap, + bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } @@ -632,20 +587,20 @@ hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd, } static __inline void -hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd) +hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd) { if (txd->flags & HN_TXD_FLAG_DMAMAP) { - bus_dmamap_sync(sc->hn_tx_data_dtag, + bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->hn_tx_data_dtag, + bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } } static __inline int -hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) +hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, @@ -655,7 +610,7 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; - hn_txdesc_dmamap_unload(sc, txd); + hn_txdesc_dmamap_unload(txr, txd); if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; @@ -663,33 +618,45 @@ hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd) txd->flags |= HN_TXD_FLAG_ONLIST; - mtx_lock_spin(&sc->hn_txlist_spin); - KASSERT(sc->hn_txdesc_avail >= 0 && - sc->hn_txdesc_avail < sc->hn_txdesc_cnt, - ("txdesc_put: 
invalid txd avail %d", sc->hn_txdesc_avail)); - sc->hn_txdesc_avail++; - SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); - mtx_unlock_spin(&sc->hn_txlist_spin); +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + KASSERT(txr->hn_txdesc_avail >= 0 && + txr->hn_txdesc_avail < txr->hn_txdesc_cnt, + ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail++; + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + atomic_add_int(&txr->hn_txdesc_avail, 1); + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif return 1; } static __inline struct hn_txdesc * -hn_txdesc_get(struct hn_softc *sc) +hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; - mtx_lock_spin(&sc->hn_txlist_spin); - txd = SLIST_FIRST(&sc->hn_txlist); +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { - KASSERT(sc->hn_txdesc_avail > 0, - ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail)); - sc->hn_txdesc_avail--; - SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + KASSERT(txr->hn_txdesc_avail > 0, + ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail--; + SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } - mtx_unlock_spin(&sc->hn_txlist_spin); + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); +#endif if (txd != NULL) { +#ifdef HN_USE_TXDESC_BUFRING + atomic_subtract_int(&txr->hn_txdesc_avail, 1); +#endif KASSERT(txd->m == NULL && txd->refs == 0 && (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; @@ -707,213 +674,133 @@ hn_txdesc_hold(struct hn_txdesc *txd) atomic_add_int(&txd->refs, 1); } -/* - * Send completion processing - * - * Note: It looks like offset 0 of buf is reserved to hold the softc - * pointer. The sc pointer is not currently needed in this function, and - * it is not presently populated by the TX function. - */ -void -netvsc_xmit_completion(void *context) +static void +hn_tx_done(void *xpkt) { - netvsc_packet *packet = context; + netvsc_packet *packet = xpkt; struct hn_txdesc *txd; - struct hn_softc *sc; + struct hn_tx_ring *txr; txd = (struct hn_txdesc *)(uintptr_t) packet->compl.send.send_completion_tid; - sc = txd->sc; - sc->hn_txeof = 1; - hn_txdesc_put(sc, txd); + txr = txd->txr; + txr->hn_has_txeof = 1; + hn_txdesc_put(txr, txd); } void netvsc_channel_rollup(struct hv_device *device_ctx) { struct hn_softc *sc = device_get_softc(device_ctx->device); - struct ifnet *ifp; + struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */ +#if defined(INET) || defined(INET6) + struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ + struct lro_ctrl *lro = &rxr->hn_lro; + struct lro_entry *queued; + + while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); + } +#endif - if (!sc->hn_txeof) + if (!txr->hn_has_txeof) return; - sc->hn_txeof = 0; - ifp = sc->hn_ifp; - NV_LOCK(sc); - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - hn_start_locked(ifp); - NV_UNLOCK(sc); + txr->hn_has_txeof = 0; + txr->hn_txeof(txr); } /* - * Start a transmit of one or more packets + * NOTE: + * If this function fails, then both txd and m_head0 will be freed. 
*/ -static void -hn_start_locked(struct ifnet *ifp) +static int +hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { - hn_softc_t *sc = ifp->if_softc; - struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); - netvsc_dev *net_dev = sc->net_dev; + bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; + int error, nsegs, i; + struct mbuf *m_head = *m_head0; netvsc_packet *packet; - struct mbuf *m_head, *m; - struct ether_vlan_header *eh; rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; rndis_per_packet_info *rppi; - ndis_8021q_info *rppi_vlan_info; - rndis_tcp_ip_csum_info *csum_info; - rndis_tcp_tso_info *tso_info; - int ether_len; - uint32_t rndis_msg_size = 0; - uint32_t trans_proto_type; - uint32_t send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING) - return; - - while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { - bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; - int error, nsegs, i, send_failed = 0; - struct hn_txdesc *txd; + uint32_t rndis_msg_size; - IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); - if (m_head == NULL) - break; + packet = &txd->netvsc_pkt; + packet->is_data_pkt = TRUE; + packet->tot_data_buf_len = m_head->m_pkthdr.len; - txd = hn_txdesc_get(sc); - if (txd == NULL) { - sc->hn_no_txdescs++; - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - break; - } + /* + * extension points to the area reserved for the + * rndis_filter_packet, which is placed just after + * the netvsc_packet (and rppi struct, if present; + * length is updated later). + */ + rndis_mesg = txd->rndis_msg; + /* XXX not necessary */ + memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); + rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; - packet = &txd->netvsc_pkt; - /* XXX not necessary */ - memset(packet, 0, sizeof(*packet)); + rndis_pkt = &rndis_mesg->msg.packet; + rndis_pkt->data_offset = sizeof(rndis_packet); + rndis_pkt->data_length = packet->tot_data_buf_len; + rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); - packet->is_data_pkt = TRUE; + rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); - /* Initialize it from the mbuf */ - packet->tot_data_buf_len = m_head->m_pkthdr.len; + if (m_head->m_flags & M_VLANTAG) { + ndis_8021q_info *rppi_vlan_info; - /* - * extension points to the area reserved for the - * rndis_filter_packet, which is placed just after - * the netvsc_packet (and rppi struct, if present; - * length is updated later). - */ - rndis_mesg = txd->rndis_msg; - /* XXX not necessary */ - memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); - rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; + rndis_msg_size += RNDIS_VLAN_PPI_SIZE; + rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, + ieee_8021q_info); - rndis_pkt = &rndis_mesg->msg.packet; - rndis_pkt->data_offset = sizeof(rndis_packet); - rndis_pkt->data_length = packet->tot_data_buf_len; - rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); + rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi + + rppi->per_packet_info_offset); + rppi_vlan_info->u1.s1.vlan_id = + m_head->m_pkthdr.ether_vtag & 0xfff; + } - rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + rndis_tcp_tso_info *tso_info; + struct ether_vlan_header *eh; + int ether_len; /* - * If the Hyper-V infrastructure needs to embed a VLAN tag, - * initialize netvsc_packet and rppi struct values as needed. 
+ * XXX need m_pullup and use mtodo */ - if (m_head->m_flags & M_VLANTAG) { - /* - * set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag - * into the frame. - */ - rndis_msg_size += RNDIS_VLAN_PPI_SIZE; - - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, - ieee_8021q_info); - - /* VLAN info immediately follows rppi struct */ - rppi_vlan_info = (ndis_8021q_info *)((char*)rppi + - rppi->per_packet_info_offset); - /* FreeBSD does not support CFI or priority */ - rppi_vlan_info->u1.s1.vlan_id = - m_head->m_pkthdr.ether_vtag & 0xfff; - } - - /* Only check the flags for outbound and ignore the ones for inbound */ - if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) { - goto pre_send; - } - eh = mtod(m_head, struct ether_vlan_header*); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - } else { + else ether_len = ETHER_HDR_LEN; - } - - trans_proto_type = get_transport_proto_type(m_head); - if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) { - goto pre_send; - } - - /* - * TSO packet needless to setup the send side checksum - * offload. - */ - if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { - goto do_tso; - } - /* setup checksum offload */ - rndis_msg_size += RNDIS_CSUM_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, - tcpip_chksum_info); - csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi + - rppi->per_packet_info_offset); - - if (trans_proto_type & (TYPE_IPV4 << 16)) { - csum_info->xmit.is_ipv4 = 1; - } else { - csum_info->xmit.is_ipv6 = 1; - } - - if (trans_proto_type & TYPE_TCP) { - csum_info->xmit.tcp_csum = 1; - csum_info->xmit.tcp_header_offset = 0; - } else if (trans_proto_type & TYPE_UDP) { - csum_info->xmit.udp_csum = 1; - } - - goto pre_send; - -do_tso: - /* setup TCP segmentation offload */ rndis_msg_size += RNDIS_TSO_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE, tcp_large_send_info); - - tso_info = (rndis_tcp_tso_info *)((char *)rppi + + + tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi + rppi->per_packet_info_offset); tso_info->lso_v2_xmit.type = RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; - + #ifdef INET - if (trans_proto_type & (TYPE_IPV4 << 16)) { + if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); - + tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip->ip_len = 0; ip->ip_sum = 0; - + th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htons(IPPROTO_TCP)); + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) @@ -921,8 +808,8 @@ do_tso: #endif #ifdef INET6 { - struct ip6_hdr *ip6 = - (struct ip6_hdr *)(m_head->m_data + ether_len); + struct ip6_hdr *ip6 = (struct ip6_hdr *) + (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); tso_info->lso_v2_xmit.ip_version = @@ -933,146 +820,233 @@ do_tso: #endif tso_info->lso_v2_xmit.tcp_header_offset = 0; tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz; + } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { + rndis_tcp_ip_csum_info *csum_info; -pre_send: - rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; - packet->tot_data_buf_len = rndis_mesg->msg_len; - - /* send packet with send buffer */ - if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) { - send_buf_section_idx 
= - hv_nv_get_next_send_section(net_dev); - if (send_buf_section_idx != - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - char *dest = ((char *)net_dev->send_buf + - send_buf_section_idx * - net_dev->send_section_size); - - memcpy(dest, rndis_mesg, rndis_msg_size); - dest += rndis_msg_size; - for (m = m_head; m != NULL; m = m->m_next) { - if (m->m_len) { - memcpy(dest, - (void *)mtod(m, vm_offset_t), - m->m_len); - dest += m->m_len; - } - } + rndis_msg_size += RNDIS_CSUM_PPI_SIZE; + rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, + tcpip_chksum_info); + csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi + + rppi->per_packet_info_offset); - packet->send_buf_section_idx = - send_buf_section_idx; - packet->send_buf_section_size = - packet->tot_data_buf_len; - packet->page_buf_count = 0; - sc->hn_tx_chimney++; - goto do_send; - } - } + csum_info->xmit.is_ipv4 = 1; + if (m_head->m_pkthdr.csum_flags & CSUM_IP) + csum_info->xmit.ip_header_csum = 1; - error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs); - if (error) { - int freed; + if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { + csum_info->xmit.tcp_csum = 1; + csum_info->xmit.tcp_header_offset = 0; + } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { + csum_info->xmit.udp_csum = 1; + } + } - /* - * This mbuf is not linked w/ the txd yet, so free - * it now. - */ - m_freem(m_head); - freed = hn_txdesc_put(sc, txd); - KASSERT(freed != 0, - ("fail to free txd upon txdma error")); + rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; + packet->tot_data_buf_len = rndis_mesg->msg_len; - sc->hn_txdma_failed++; - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - continue; + /* + * Chimney send, if the packet could fit into one chimney buffer. + */ + if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) { + netvsc_dev *net_dev = txr->hn_sc->net_dev; + uint32_t send_buf_section_idx; + + send_buf_section_idx = + hv_nv_get_next_send_section(net_dev); + if (send_buf_section_idx != + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { + uint8_t *dest = ((uint8_t *)net_dev->send_buf + + (send_buf_section_idx * + net_dev->send_section_size)); + + memcpy(dest, rndis_mesg, rndis_msg_size); + dest += rndis_msg_size; + m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); + + packet->send_buf_section_idx = send_buf_section_idx; + packet->send_buf_section_size = + packet->tot_data_buf_len; + packet->page_buf_count = 0; + txr->hn_tx_chimney++; + goto done; } + } - packet->page_buf_count = nsegs + - HV_RF_NUM_TX_RESERVED_PAGE_BUFS; - - /* send packet with page buffer */ - packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr); - packet->page_buffers[0].offset = - txd->rndis_msg_paddr & PAGE_MASK; - packet->page_buffers[0].length = rndis_msg_size; + error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); + if (error) { + int freed; /* - * Fill the page buffers with mbuf info starting at index - * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. + * This mbuf is not linked w/ the txd yet, so free it now. 
*/ - for (i = 0; i < nsegs; ++i) { - hv_vmbus_page_buffer *pb = &packet->page_buffers[ - i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; + m_freem(m_head); + *m_head0 = NULL; - pb->pfn = atop(segs[i].ds_addr); - pb->offset = segs[i].ds_addr & PAGE_MASK; - pb->length = segs[i].ds_len; - } + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon txdma error")); + + txr->hn_txdma_failed++; + if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); + return error; + } + *m_head0 = m_head; + + packet->page_buf_count = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + + /* send packet with page buffer */ + packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr); + packet->page_buffers[0].offset = txd->rndis_msg_paddr & PAGE_MASK; + packet->page_buffers[0].length = rndis_msg_size; - packet->send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - packet->send_buf_section_size = 0; + /* + * Fill the page buffers with mbuf info starting at index + * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. + */ + for (i = 0; i < nsegs; ++i) { + hv_vmbus_page_buffer *pb = &packet->page_buffers[ + i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; -do_send: - txd->m = m_head; + pb->pfn = atop(segs[i].ds_addr); + pb->offset = segs[i].ds_addr & PAGE_MASK; + pb->length = segs[i].ds_len; + } + + packet->send_buf_section_idx = + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; + packet->send_buf_section_size = 0; +done: + txd->m = m_head; + + /* Set the completion routine */ + packet->compl.send.on_send_completion = hn_tx_done; + packet->compl.send.send_completion_context = packet; + packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd; + + return 0; +} - /* Set the completion routine */ - packet->compl.send.on_send_completion = netvsc_xmit_completion; - packet->compl.send.send_completion_context = packet; - packet->compl.send.send_completion_tid = - (uint64_t)(uintptr_t)txd; +/* + * NOTE: + * If this function fails, then txd will be freed, but the mbuf + * associated w/ the txd will _not_ be freed. + */ +static int +hn_send_pkt(struct ifnet *ifp, struct hv_device *device_ctx, + struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + int error, send_failed = 0; again: + /* + * Make sure that txd is not freed before ETHER_BPF_MTAP. + */ + hn_txdesc_hold(txd); + error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt); + if (!error) { + ETHER_BPF_MTAP(ifp, txd->m); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + } + hn_txdesc_put(txr, txd); + + if (__predict_false(error)) { + int freed; + /* - * Make sure that txd is not freed before ETHER_BPF_MTAP. + * This should "really rarely" happen. + * + * XXX Too many RX to be acked or too many sideband + * commands to run? Ask netvsc_channel_rollup() + * to kick start later. */ - hn_txdesc_hold(txd); - error = hv_nv_on_send(device_ctx, packet); - if (!error) { - ETHER_BPF_MTAP(ifp, m_head); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + txr->hn_has_txeof = 1; + if (!send_failed) { + txr->hn_send_failed++; + send_failed = 1; + /* + * Try sending again after set hn_has_txeof; + * in case that we missed the last + * netvsc_channel_rollup(). + */ + goto again; } - hn_txdesc_put(sc, txd); + if_printf(ifp, "send failed\n"); - if (__predict_false(error)) { - int freed; + /* + * Caller will perform further processing on the + * associated mbuf, so don't free it in hn_txdesc_put(); + * only unload it from the DMA map in hn_txdesc_put(), + * if it was loaded. 
+ */ + txd->m = NULL; + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon send error")); - /* - * This should "really rarely" happen. - * - * XXX Too many RX to be acked or too many sideband - * commands to run? Ask netvsc_channel_rollup() - * to kick start later. - */ - sc->hn_txeof = 1; - if (!send_failed) { - sc->hn_send_failed++; - send_failed = 1; - /* - * Try sending again after set hn_txeof; - * in case that we missed the last - * netvsc_channel_rollup(). - */ - goto again; - } - if_printf(ifp, "send failed\n"); + txr->hn_send_failed++; + } + return error; +} + +/* + * Start a transmit of one or more packets + */ +static int +hn_start_locked(struct hn_tx_ring *txr, int len) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + mtx_assert(&txr->hn_tx_lock, MA_OWNED); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return 0; + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + struct hn_txdesc *txd; + struct mbuf *m_head; + int error; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + if (len > 0 && m_head->m_pkthdr.len > len) { /* - * This mbuf will be prepended, don't free it - * in hn_txdesc_put(); only unload it from the - * DMA map in hn_txdesc_put(), if it was loaded. + * This sending could be time consuming; let callers + * dispatch this packet sending (and sending of any + * following up packets) to tx taskqueue. */ - txd->m = NULL; - freed = hn_txdesc_put(sc, txd); - KASSERT(freed != 0, - ("fail to free txd upon send error")); - - sc->hn_send_failed++; - IF_PREPEND(&ifp->if_snd, m_head); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + return 1; + } + + txd = hn_txdesc_get(txr); + if (txd == NULL) { + txr->hn_no_txdescs++; + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + break; + } + + error = hn_encap(txr, txd, &m_head); + if (error) { + /* Both txd and m_head are freed */ + continue; + } + + error = hn_send_pkt(ifp, device_ctx, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } + return 0; } /* @@ -1162,11 +1136,11 @@ int netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, rndis_tcp_ip_csum_info *csum_info) { - hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device); + struct hn_softc *sc = device_get_softc(device_ctx->device); + struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ struct mbuf *m_new; struct ifnet *ifp; - device_t dev = device_ctx->device; - int size, do_lro = 0; + int size, do_lro = 0, do_csum = 1; if (sc == NULL) { return (0); /* TODO: KYS how can this be! */ @@ -1192,7 +1166,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, memcpy(mtod(m_new, void *), packet->data, packet->tot_data_buf_len); m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; - sc->hn_small_pkts++; + rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. 
For packets 2K or less, @@ -1208,7 +1182,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { - device_printf(dev, "alloc mbuf failed.\n"); + if_printf(ifp, "alloc mbuf failed.\n"); return (0); } @@ -1216,21 +1190,28 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, } m_new->m_pkthdr.rcvif = ifp; + if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) + do_csum = 0; + /* receive side checksum offload */ - if (NULL != csum_info) { + if (csum_info != NULL) { /* IP csum offload */ - if (csum_info->receive.ip_csum_succeeded) { + if (csum_info->receive.ip_csum_succeeded && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); - sc->hn_csum_ip++; + rxr->hn_csum_ip++; } - /* TCP csum offload */ - if (csum_info->receive.tcp_csum_succeeded) { + /* TCP/UDP csum offload */ + if ((csum_info->receive.tcp_csum_succeeded || + csum_info->receive.udp_csum_succeeded) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; - sc->hn_csum_tcp++; + if (csum_info->receive.tcp_csum_succeeded) + rxr->hn_csum_tcp++; + else + rxr->hn_csum_udp++; } if (csum_info->receive.ip_csum_succeeded && @@ -1261,8 +1242,10 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { - if (sc->hn_trust_hosttcp) { - sc->hn_csum_trusted++; + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_TCP)) { + rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); @@ -1270,6 +1253,21 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, } /* Rely on SW csum verification though... */ do_lro = 1; + } else if (pr == IPPROTO_UDP) { + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_UDP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + } + } else if (pr != IPPROTO_DONE && do_csum && + (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } @@ -1289,10 +1287,10 @@ skip: if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) - struct lro_ctrl *lro = &sc->hn_lro; + struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { - sc->hn_lro_tried++; + rxr->hn_lro_tried++; if (tcp_lro_rx(lro, m_new, 0) == 0) { /* DONE! */ return 0; @@ -1308,18 +1306,8 @@ skip: } void -netvsc_recv_rollup(struct hv_device *device_ctx) +netvsc_recv_rollup(struct hv_device *device_ctx __unused) { -#if defined(INET) || defined(INET6) - hn_softc_t *sc = device_get_softc(device_ctx->device); - struct lro_ctrl *lro = &sc->hn_lro; - struct lro_entry *queued; - - while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { - SLIST_REMOVE_HEAD(&lro->lro_active, next); - tcp_lro_flush(lro, queued); - } -#endif } /* @@ -1377,12 +1365,23 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; + +#if __FreeBSD_version >= 1100099 /* - * Make sure that LRO high watermark is still valid, - * after MTU change (the 2*MTU limit). + * Make sure that LRO aggregation length limit is still + * valid, after the MTU change. 
*/ - if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat)) - hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp)); + NV_LOCK(sc); + if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < + HN_LRO_LENLIM_MIN(ifp)) { + int i; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + sc->hn_rx_ring[i].hn_lro.lro_length_lim = + HN_LRO_LENLIM_MIN(ifp); + } + } + NV_UNLOCK(sc); +#endif do { NV_LOCK(sc); @@ -1422,8 +1421,10 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max) - sc->hn_tx_chimney_size = sc->hn_tx_chimney_max; + if (sc->hn_tx_ring[0].hn_tx_chimney_size > + sc->hn_tx_chimney_max) + hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); + hn_ifinit_locked(sc); NV_LOCK(sc); @@ -1483,47 +1484,43 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = 0; break; case SIOCSIFCAP: + NV_LOCK(sc); + mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { - if (IFCAP_TXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_TXCSUM; - ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP); + ifp->if_capenable ^= IFCAP_TXCSUM; + if (ifp->if_capenable & IFCAP_TXCSUM) { + ifp->if_hwassist |= + sc->hn_tx_ring[0].hn_csum_assist; } else { - ifp->if_capenable |= IFCAP_TXCSUM; - /* - * Only enable UDP checksum offloading on - * Windows Server 2012R2 or later releases. - */ - if (hv_vmbus_protocal_version >= - HV_VMBUS_VERSION_WIN8_1) { - ifp->if_hwassist |= - (CSUM_TCP | CSUM_UDP); - } else { - ifp->if_hwassist |= CSUM_TCP; - } + ifp->if_hwassist &= + ~sc->hn_tx_ring[0].hn_csum_assist; } } - if (mask & IFCAP_RXCSUM) { - if (IFCAP_RXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_RXCSUM; - } else { - ifp->if_capenable |= IFCAP_RXCSUM; - } - } + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= IFCAP_RXCSUM; + if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; - ifp->if_hwassist ^= CSUM_IP_TSO; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= CSUM_IP_TSO; + else + ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; - ifp->if_hwassist ^= CSUM_IP6_TSO; + if (ifp->if_capenable & IFCAP_TSO6) + ifp->if_hwassist |= CSUM_IP6_TSO; + else + ifp->if_hwassist &= ~CSUM_IP6_TSO; } + NV_UNLOCK(sc); error = 0; break; case SIOCADDMULTI: @@ -1566,7 +1563,8 @@ hn_stop(hn_softc_t *sc) if (bootverbose) printf(" Closing Device ...\n"); - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + atomic_clear_int(&ifp->if_drv_flags, + (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); if_link_state_change(ifp, LINK_STATE_DOWN); sc->hn_initdone = 0; @@ -1579,16 +1577,56 @@ hn_stop(hn_softc_t *sc) static void hn_start(struct ifnet *ifp) { - hn_softc_t *sc; + struct hn_softc *sc = ifp->if_softc; + struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; - sc = ifp->if_softc; - NV_LOCK(sc); - if (sc->temp_unusable) { - NV_UNLOCK(sc); - return; + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (!sched) + return; + } +do_sched: + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); +} + +static void +hn_start_txeof(struct hn_tx_ring *txr) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + 
atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (sched) { + taskqueue_enqueue(txr->hn_tx_taskq, + &txr->hn_tx_task); + } + } else { +do_sched: + /* + * Release the OACTIVE earlier, with the hope, that + * others could catch up. The task will clear the + * flag again with the hn_tx_lock to avoid possible + * races. + */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } - hn_start_locked(ifp); - NV_UNLOCK(sc); } /* @@ -1615,8 +1653,8 @@ hn_ifinit_locked(hn_softc_t *sc) } else { sc->hn_initdone = 1; } - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); if_link_state_change(ifp, LINK_STATE_UP); } @@ -1659,26 +1697,90 @@ hn_watchdog(struct ifnet *ifp) } #endif -#ifdef HN_LRO_HIWAT +#if __FreeBSD_version >= 1100099 + +static int +hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + unsigned int lenlim; + int error, i; + + lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; + error = sysctl_handle_int(oidp, &lenlim, 0, req); + if (error || req->newptr == NULL) + return error; + + if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || + lenlim > TCP_LRO_LENGTH_MAX) + return EINVAL; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; + NV_UNLOCK(sc); + return 0; +} + static int -hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) +hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; - int hiwat, error; + int ackcnt, error, i; - hiwat = sc->hn_lro_hiwat; - error = sysctl_handle_int(oidp, &hiwat, 0, req); + /* + * lro_ackcnt_lim is append count limit, + * +1 to turn it into aggregation limit. + */ + ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; + error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; - if (!HN_LRO_HIWAT_ISVALID(sc, hiwat)) + if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; - if (sc->hn_lro_hiwat != hiwat) - hn_set_lro_hiwat(sc, hiwat); + /* + * Convert aggregation limit back to append + * count limit. 
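The hn_lro_ackcnt_sysctl handler in this hunk exposes the LRO ACK limit as the number of ACKs aggregated per LRO entry, while tcp_lro stores it as an append count, so the handler adds 1 when reporting the value and subtracts 1 before storing it. A minimal standalone sketch of that conversion follows; TCP_LRO_ACKCNT_MAX is replaced here by an assumed placeholder value for illustration only.

#include <errno.h>
#include <stdio.h>

#define TCP_LRO_ACKCNT_MAX	65535	/* assumed placeholder, not the kernel's definition */

/* append count kept by tcp_lro -> user-visible aggregation limit */
static int
ackcnt_to_user(int lro_ackcnt_lim)
{
	return (lro_ackcnt_lim + 1);
}

/* user-visible aggregation limit -> append count, same bounds as the handler */
static int
ackcnt_from_user(int aggr_lim, int *lro_ackcnt_lim)
{
	if (aggr_lim < 2 || aggr_lim > TCP_LRO_ACKCNT_MAX + 1)
		return (EINVAL);
	*lro_ackcnt_lim = aggr_lim - 1;
	return (0);
}

int
main(void)
{
	int lim;

	printf("stored 1 -> user sees %d\n", ackcnt_to_user(1));
	if (ackcnt_from_user(8, &lim) == 0)
		printf("user writes 8 -> stored %d\n", lim);
	return (0);
}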
+ */ + --ackcnt; + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; + NV_UNLOCK(sc); + return 0; +} + +#endif + +static int +hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int hcsum = arg2; + int on, error, i; + + on = 0; + if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) + on = 1; + + error = sysctl_handle_int(oidp, &on, 0, req); + if (error || req->newptr == NULL) + return error; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (on) + rxr->hn_trust_hcsum |= hcsum; + else + rxr->hn_trust_hcsum &= ~hcsum; + } + NV_UNLOCK(sc); return 0; } -#endif /* HN_LRO_HIWAT */ static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) @@ -1686,7 +1788,7 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) struct hn_softc *sc = arg1; int chimney_size, error; - chimney_size = sc->hn_tx_chimney_size; + chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size; error = sysctl_handle_int(oidp, &chimney_size, 0, req); if (error || req->newptr == NULL) return error; @@ -1694,8 +1796,138 @@ hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0) return EINVAL; - if (sc->hn_tx_chimney_size != chimney_size) - sc->hn_tx_chimney_size = chimney_size; + hn_set_tx_chimney_size(sc, chimney_size); + return 0; +} + +#if __FreeBSD_version < 1100095 +static int +hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((int *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((int *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} +#else +static int +hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((uint64_t *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +#endif + +static int +hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((u_long *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. 
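The hn_rx_stat_*_sysctl and hn_tx_stat_ulong_sysctl handlers in this hunk receive a byte offset into the per-ring structure (registered later with __offsetof), so one handler can sum an arbitrary counter across all rings and reset it when the sysctl is written. A small userland sketch of that offset-based aggregation, using invented structure and field names:

#include <stddef.h>
#include <stdio.h>

struct demo_ring {
	unsigned long csum_ip;
	unsigned long csum_tcp;
};

/* Sum one counter, selected by its byte offset, over every ring and clear it. */
static unsigned long
sum_and_clear(struct demo_ring *rings, int cnt, size_t ofs)
{
	unsigned long total = 0;

	for (int i = 0; i < cnt; ++i) {
		unsigned long *p = (unsigned long *)((char *)&rings[i] + ofs);

		total += *p;
		*p = 0;		/* the driver only does this when the sysctl is written */
	}
	return (total);
}

int
main(void)
{
	struct demo_ring r[2] = { { 3, 5 }, { 7, 11 } };

	printf("csum_tcp total: %lu\n",
	    sum_and_clear(r, 2, offsetof(struct demo_ring, csum_tcp)));
	return (0);
}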
*/ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((u_long *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_tx_ring *txr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + stat += *((u_long *)((uint8_t *)txr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((u_long *)((uint8_t *)txr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error, conf; + struct hn_tx_ring *txr; + + txr = &sc->hn_tx_ring[0]; + conf = *((int *)((uint8_t *)txr + ofs)); + + error = sysctl_handle_int(oidp, &conf, 0, req); + if (error || req->newptr == NULL) + return error; + + NV_LOCK(sc); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((int *)((uint8_t *)txr + ofs)) = conf; + } + NV_UNLOCK(sc); + return 0; } @@ -1786,17 +2018,191 @@ hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) *paddr = segs->ds_addr; } +static void +hn_create_rx_data(struct hn_softc *sc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev = sc->hn_dev; +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + int lroent_cnt; +#endif +#endif + int i; + + sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, + M_NETVSC, M_WAITOK | M_ZERO); + +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + lroent_cnt = hn_lro_entry_count; + if (lroent_cnt < TCP_LRO_ENTRIES) + lroent_cnt = TCP_LRO_ENTRIES; + device_printf(dev, "LRO: entry count %d\n", lroent_cnt); +#endif +#endif /* INET || INET6 */ + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (hn_trust_hosttcp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; + if (hn_trust_hostudp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; + if (hn_trust_hostip) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; + + /* + * Initialize LRO. 
+ */ +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 0); +#else + tcp_lro_init(&rxr->hn_lro); + rxr->hn_lro.ifp = sc->hn_ifp; +#endif +#if __FreeBSD_version >= 1100099 + rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; + rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; +#endif +#endif /* INET || INET6 */ + } + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", + CTLTYPE_U64 | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro.lro_queued), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO queued"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", + CTLTYPE_U64 | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO flushed"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_lro_tried), + hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); +#if __FreeBSD_version >= 1100099 + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", + CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU", + "Max # of data bytes to be aggregated by LRO"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I", + "Max # of ACKs to be aggregated by LRO"); +#endif + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP, + hn_trust_hcsum_sysctl, "I", + "Trust tcp segement verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP, + hn_trust_hcsum_sysctl, "I", + "Trust udp datagram verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", + CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP, + hn_trust_hcsum_sysctl, "I", + "Trust ip packet verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_ip), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_tcp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_udp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_csum_trusted), + hn_rx_stat_ulong_sysctl, "LU", + "# of packets that we trust host's csum verification"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_rx_ring, hn_small_pkts), + hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); +} + +static void +hn_destroy_rx_data(struct hn_softc *sc) +{ +#if defined(INET) || defined(INET6) + int i; +#endif + + if (sc->hn_rx_ring_cnt == 0) + return; + +#if defined(INET) || defined(INET6) + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + tcp_lro_free(&sc->hn_rx_ring[i].hn_lro); +#endif + free(sc->hn_rx_ring, M_NETVSC); + sc->hn_rx_ring = NULL; + + 
sc->hn_rx_ring_cnt = 0; +} + static int -hn_create_tx_ring(struct hn_softc *sc) +hn_create_tx_ring(struct hn_softc *sc, int id) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; bus_dma_tag_t parent_dtag; int error, i; - sc->hn_txdesc_cnt = HN_TX_DESC_CNT; - sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt, + txr->hn_sc = sc; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); +#endif + mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); + + txr->hn_txdesc_cnt = HN_TX_DESC_CNT; + txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK | M_ZERO); - SLIST_INIT(&sc->hn_txlist); - mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INIT(&txr->hn_txlist); +#else + txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC, + M_WAITOK, &txr->hn_tx_lock); +#endif + + txr->hn_tx_taskq = sc->hn_tx_taskq; + TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); + + txr->hn_direct_tx_size = hn_direct_tx_size; + if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) + txr->hn_csum_assist = HN_CSUM_ASSIST; + else + txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8; + + /* + * Always schedule transmission instead of trying to do direct + * transmission. This one gives the best performance so far. + */ + txr->hn_sched_tx = 1; + + txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */ parent_dtag = bus_get_dma_tag(sc->hn_dev); @@ -1813,7 +2219,7 @@ hn_create_tx_ring(struct hn_softc *sc) 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ - &sc->hn_tx_rndis_dtag); + &txr->hn_tx_rndis_dtag); if (error) { device_printf(sc->hn_dev, "failed to create rndis dmatag\n"); return error; @@ -1832,21 +2238,21 @@ hn_create_tx_ring(struct hn_softc *sc) 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ - &sc->hn_tx_data_dtag); + &txr->hn_tx_data_dtag); if (error) { device_printf(sc->hn_dev, "failed to create data dmatag\n"); return error; } - for (i = 0; i < sc->hn_txdesc_cnt; ++i) { - struct hn_txdesc *txd = &sc->hn_txdesc[i]; + for (i = 0; i < txr->hn_txdesc_cnt; ++i) { + struct hn_txdesc *txd = &txr->hn_txdesc[i]; - txd->sc = sc; + txd->txr = txr; /* * Allocate and load RNDIS messages. */ - error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag, + error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_msg, BUS_DMA_WAITOK | BUS_DMA_COHERENT, &txd->rndis_msg_dmap); @@ -1856,7 +2262,7 @@ hn_create_tx_ring(struct hn_softc *sc) return error; } - error = bus_dmamap_load(sc->hn_tx_rndis_dtag, + error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap, txd->rndis_msg, HN_RNDIS_MSG_LEN, hn_dma_map_paddr, &txd->rndis_msg_paddr, @@ -1864,59 +2270,277 @@ hn_create_tx_ring(struct hn_softc *sc) if (error) { device_printf(sc->hn_dev, "failed to load rndis_msg, %d\n", i); - bus_dmamem_free(sc->hn_tx_rndis_dtag, + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* DMA map for TX data. 
*/ - error = bus_dmamap_create(sc->hn_tx_data_dtag, 0, + error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(sc->hn_dev, "failed to allocate tx data dmamap\n"); - bus_dmamap_unload(sc->hn_tx_rndis_dtag, + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); - bus_dmamem_free(sc->hn_tx_rndis_dtag, + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; - SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link); +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); +#else + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif + } + txr->hn_txdesc_avail = txr->hn_txdesc_cnt; + + if (sc->hn_tx_sysctl_tree != NULL) { + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + char name[16]; + + /* + * Create per TX ring sysctl tree: + * dev.hn.UNIT.tx.RINGID + */ + ctx = device_get_sysctl_ctx(sc->hn_dev); + child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); + + snprintf(name, sizeof(name), "%d", id); + txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + name, CTLFLAG_RD, 0, ""); + + if (txr->hn_tx_sysctl_tree != NULL) { + child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", + CTLFLAG_RD, &txr->hn_txdesc_avail, 0, + "# of available TX descs"); + } } - sc->hn_txdesc_avail = sc->hn_txdesc_cnt; return 0; } static void -hn_destroy_tx_ring(struct hn_softc *sc) +hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) +{ + struct hn_tx_ring *txr = txd->txr; + + KASSERT(txd->m == NULL, ("still has mbuf installed")); + KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); + + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, + txd->rndis_msg_dmap); + bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); +} + +static void +hn_destroy_tx_ring(struct hn_tx_ring *txr) { struct hn_txdesc *txd; - while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) { - KASSERT(txd->m == NULL, ("still has mbuf installed")); - KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, - ("still dma mapped")); - SLIST_REMOVE_HEAD(&sc->hn_txlist, link); + if (txr->hn_txdesc == NULL) + return; + +#ifndef HN_USE_TXDESC_BUFRING + while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { + SLIST_REMOVE_HEAD(&txr->hn_txlist, link); + hn_txdesc_dmamap_destroy(txd); + } +#else + while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) + hn_txdesc_dmamap_destroy(txd); +#endif + + if (txr->hn_tx_data_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_data_dtag); + if (txr->hn_tx_rndis_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); + +#ifdef HN_USE_TXDESC_BUFRING + buf_ring_free(txr->hn_txdesc_br, M_NETVSC); +#endif + + free(txr->hn_txdesc, M_NETVSC); + txr->hn_txdesc = NULL; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_destroy(&txr->hn_txlist_spin); +#endif + mtx_destroy(&txr->hn_tx_lock); +} + +static int +hn_create_tx_data(struct hn_softc *sc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + int i; + + sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, + M_NETVSC, M_WAITOK | M_ZERO); + + ctx = device_get_sysctl_ctx(sc->hn_dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); + + /* Create dev.hn.UNIT.tx sysctl tree */ + sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", + CTLFLAG_RD, 0, ""); + + for (i = 0; i < 
sc->hn_tx_ring_cnt; ++i) { + int error; + + error = hn_create_tx_ring(sc, i); + if (error) + return error; + } + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_no_txdescs), + hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_send_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_txdma_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_tx_collapsed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", + CTLTYPE_ULONG | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_tx_chimney), + hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", + CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, + "# of total TX descs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", + CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, + "Chimney send packet size upper boundary"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, + "I", "Chimney send packet size limit"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", + CTLTYPE_INT | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_direct_tx_size), + hn_tx_conf_int_sysctl, "I", + "Size of the packet for direct transmission"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", + CTLTYPE_INT | CTLFLAG_RW, sc, + __offsetof(struct hn_tx_ring, hn_sched_tx), + hn_tx_conf_int_sysctl, "I", + "Always schedule transmission " + "instead of doing direct transmission"); + + return 0; +} + +static void +hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size) +{ + int i; - bus_dmamap_unload(sc->hn_tx_rndis_dtag, - txd->rndis_msg_dmap); - bus_dmamem_free(sc->hn_tx_rndis_dtag, - txd->rndis_msg, txd->rndis_msg_dmap); + NV_LOCK(sc); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size; + NV_UNLOCK(sc); +} + +static void +hn_destroy_tx_data(struct hn_softc *sc) +{ + int i; + + if (sc->hn_tx_ring_cnt == 0) + return; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + hn_destroy_tx_ring(&sc->hn_tx_ring[i]); + + free(sc->hn_tx_ring, M_NETVSC); + sc->hn_tx_ring = NULL; + + sc->hn_tx_ring_cnt = 0; +} + +static void +hn_start_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_start_txeof_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_stop_tx_tasks(struct hn_softc *sc) +{ + int i; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} - bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap); +static void +hn_tx_taskq_create(void *arg __unused) +{ + if 
(!hn_share_tx_taskq) + return; + + hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &hn_tx_taskq); + taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); + if (hn_bind_tx_taskq >= 0) { + int cpu = hn_bind_tx_taskq; + struct task cpuset_task; + cpuset_t cpu_set; + + if (cpu > mp_ncpus - 1) + cpu = mp_ncpus - 1; + CPU_SETOF(cpu, &cpu_set); + TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set); + taskqueue_enqueue(hn_tx_taskq, &cpuset_task); + taskqueue_drain(hn_tx_taskq, &cpuset_task); } +} +SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST, + hn_tx_taskq_create, NULL); - if (sc->hn_tx_data_dtag != NULL) - bus_dma_tag_destroy(sc->hn_tx_data_dtag); - if (sc->hn_tx_rndis_dtag != NULL) - bus_dma_tag_destroy(sc->hn_tx_rndis_dtag); - free(sc->hn_txdesc, M_NETVSC); - mtx_destroy(&sc->hn_txlist_spin); +static void +hn_tx_taskq_destroy(void *arg __unused) +{ + if (hn_tx_taskq != NULL) + taskqueue_free(hn_tx_taskq); } +SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST, + hn_tx_taskq_destroy, NULL); static device_method_t netvsc_methods[] = { /* Device interface */ diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 29d8c8f..31ddbc0 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -136,12 +136,9 @@ hv_get_rndis_device(void) { rndis_device *device; - device = malloc(sizeof(rndis_device), M_NETVSC, M_NOWAIT | M_ZERO); - if (device == NULL) { - return (NULL); - } + device = malloc(sizeof(rndis_device), M_NETVSC, M_WAITOK | M_ZERO); - mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_DEF); /* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */ STAILQ_INIT(&device->myrequest_list); @@ -172,10 +169,7 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, rndis_msg *rndis_mesg; rndis_set_request *set; - request = malloc(sizeof(rndis_request), M_NETVSC, M_NOWAIT | M_ZERO); - if (request == NULL) { - return (NULL); - } + request = malloc(sizeof(rndis_request), M_NETVSC, M_WAITOK | M_ZERO); sema_init(&request->wait_sema, 0, "rndis sema"); @@ -194,9 +188,9 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, set->request_id += 1; /* Add to the request list */ - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry); - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); return (request); } @@ -207,14 +201,14 @@ hv_rndis_request(rndis_device *device, uint32_t message_type, static inline void hv_put_rndis_request(rndis_device *device, rndis_request *request) { - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); /* Fixme: Has O(n) performance */ /* * XXXKYS: Use Doubly linked lists. 
*/ STAILQ_REMOVE(&device->myrequest_list, request, rndis_request_, mylist_entry); - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); sema_destroy(&request->wait_sema); free(request, M_NETVSC); @@ -271,7 +265,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response) rndis_request *next_request; boolean_t found = FALSE; - mtx_lock_spin(&device->req_lock); + mtx_lock(&device->req_lock); request = STAILQ_FIRST(&device->myrequest_list); while (request != NULL) { /* @@ -286,7 +280,7 @@ hv_rf_receive_response(rndis_device *device, rndis_msg *response) next_request = STAILQ_NEXT(request, mylist_entry); request = next_request; } - mtx_unlock_spin(&device->req_lock); + mtx_unlock(&device->req_lock); if (found) { if (response->msg_len <= sizeof(rndis_msg)) { diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index a780f9e..27fb3fd 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -856,8 +856,8 @@ hv_storvsc_rescan_target(struct storvsc_softc *sc) if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { - printf("unable to create path for rescan, pathid: %d," - "targetid: %d\n", pathid, targetid); + printf("unable to create path for rescan, pathid: %u," + "targetid: %u\n", pathid, targetid); xpt_free_ccb(ccb); return; } @@ -1561,13 +1561,12 @@ static void storvsc_destroy_bounce_buffer(struct sglist *sgl) { struct hv_sgl_node *sgl_node = NULL; - - sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); - LIST_REMOVE(sgl_node, link); - if (NULL == sgl_node) { + if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) { printf("storvsc error: not enough in use sgl\n"); return; } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_REMOVE(sgl_node, link); sgl_node->sgl_data = sgl; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } @@ -1593,12 +1592,12 @@ storvsc_create_bounce_buffer(uint16_t seg_count, int write) struct hv_sgl_node *sgl_node = NULL; /* get struct sglist from free_sgl_list */ - sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); - LIST_REMOVE(sgl_node, link); - if (NULL == sgl_node) { + if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { printf("storvsc error: not enough free sgl\n"); return NULL; } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); bounce_sgl = sgl_node->sgl_data; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c new file mode 100644 index 0000000..c1b6da5 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_heartbeat.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
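The storvsc changes above move the empty-list check ahead of LIST_FIRST()/LIST_REMOVE(); the previous code removed the head node before verifying that it existed. A runnable sketch of the corrected ordering, assuming the 4.4BSD list macros from <sys/queue.h> and using invented names:

#include <sys/queue.h>
#include <stdio.h>

struct node {
	LIST_ENTRY(node) link;
	int id;
};
LIST_HEAD(node_list, node);

/* Detach the first node, checking for an empty list first as the fix does. */
static struct node *
take_first(struct node_list *head)
{
	struct node *n;

	if (LIST_EMPTY(head)) {
		fprintf(stderr, "list unexpectedly empty\n");
		return (NULL);
	}
	n = LIST_FIRST(head);
	LIST_REMOVE(n, link);
	return (n);
}

int
main(void)
{
	struct node_list head = LIST_HEAD_INITIALIZER(head);
	struct node a = { .id = 42 };
	struct node *n;

	LIST_INSERT_HEAD(&head, &a, link);
	n = take_first(&head);
	printf("%d\n", n != NULL ? n->id : -1);			/* 42 */
	printf("%d\n", take_first(&head) != NULL ? 0 : -1);	/* -1, list is now empty */
	return (0);
}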
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +/* Heartbeat Service */ +static hv_guid service_guid = { .data = + {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, + 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} }; + +/** + * Process heartbeat message + */ +static void +hv_heartbeat_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel; + uint32_t recvlen; + uint64_t requestid; + int ret; + + struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; + struct hv_vmbus_icmsg_hdr* icmsghdrp; + hv_util_sc *softc; + + softc = (hv_util_sc*)context; + buf = softc->receive_buffer;; + channel = softc->hv_dev->channel; + + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen, + &requestid); + + if ((ret == 0) && recvlen > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + heartbeat_msg = + (struct hv_vmbus_heartbeat_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + heartbeat_msg->seq_num += 1; + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + +static int +hv_heartbeat_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Heartbeat Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int +hv_heartbeat_attach(device_t dev) +{ + hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); + + softc->callback = hv_heartbeat_cb; + + return hv_util_attach(dev); +} + +static device_method_t heartbeat_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_heartbeat_probe), + DEVMETHOD(device_attach, hv_heartbeat_attach), + DEVMETHOD(device_detach, hv_util_detach), + { 0, 0 } +}; + +static driver_t heartbeat_driver = { "hvheartbeat", heartbeat_methods, sizeof(hv_util_sc)}; + +static devclass_t heartbeat_devclass; + +DRIVER_MODULE(hv_heartbeat, vmbus, heartbeat_driver, heartbeat_devclass, NULL, NULL); +MODULE_VERSION(hv_heartbeat, 1); +MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index 58d565c4..8517918 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include <dev/hyperv/include/hyperv.h> #include 
<dev/hyperv/netvsc/hv_net_vsc.h> +#include "hv_util.h" #include "unicode.h" #include "hv_kvp.h" @@ -74,8 +75,6 @@ __FBSDID("$FreeBSD$"); /* hv_kvp debug control */ static int hv_kvp_log = 0; -SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0, - "hv_kvp log"); #define hv_kvp_log_error(...) do { \ if (hv_kvp_log > 0) \ @@ -87,6 +86,10 @@ SYSCTL_INT(_dev, OID_AUTO, hv_kvp_log, CTLFLAG_RW, &hv_kvp_log, 0, log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ } while (0) +static hv_guid service_guid = { .data = + {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, + 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6} }; + /* character device prototypes */ static d_open_t hv_kvp_dev_open; static d_close_t hv_kvp_dev_close; @@ -94,12 +97,6 @@ static d_read_t hv_kvp_dev_daemon_read; static d_write_t hv_kvp_dev_daemon_write; static d_poll_t hv_kvp_dev_daemon_poll; -/* hv_kvp prototypes */ -static int hv_kvp_req_in_progress(void); -static void hv_kvp_transaction_init(uint32_t, hv_vmbus_channel *, uint64_t, uint8_t *); -static void hv_kvp_send_msg_to_daemon(void); -static void hv_kvp_process_request(void *context); - /* hv_kvp character device structure */ static struct cdevsw hv_kvp_cdevsw = { @@ -111,70 +108,67 @@ static struct cdevsw hv_kvp_cdevsw = .d_poll = hv_kvp_dev_daemon_poll, .d_name = "hv_kvp_dev", }; -static struct cdev *hv_kvp_dev; -static struct hv_kvp_msg *hv_kvp_dev_buf; -struct proc *daemon_task; -static struct selinfo hv_kvp_selinfo; /* * Global state to track and synchronize multiple * KVP transaction requests from the host. */ -static struct { - - /* Pre-allocated work item for queue */ - hv_work_item work_item; +typedef struct hv_kvp_sc { + struct hv_util_sc util_sc; - /* Unless specified the pending mutex should be + /* Unless specified the pending mutex should be * used to alter the values of the following paramters: * 1. req_in_progress * 2. req_timed_out - * 3. pending_reqs. */ - struct mtx pending_mutex; - + struct mtx pending_mutex; + + struct task task; + /* To track if transaction is active or not */ - boolean_t req_in_progress; + boolean_t req_in_progress; /* Tracks if daemon did not reply back in time */ - boolean_t req_timed_out; + boolean_t req_timed_out; /* Tracks if daemon is serving a request currently */ boolean_t daemon_busy; - /* Count of KVP requests from Hyper-V. 
*/ - uint64_t pending_reqs; - - - /* Length of host message */ - uint32_t host_msg_len; - /* Pointer to channel */ - hv_vmbus_channel *channelp; + /* Length of host message */ + uint32_t host_msg_len; /* Host message id */ - uint64_t host_msg_id; - + uint64_t host_msg_id; + /* Current kvp message from the host */ - struct hv_kvp_msg *host_kvp_msg; - + struct hv_kvp_msg *host_kvp_msg; + /* Current kvp message for daemon */ - struct hv_kvp_msg daemon_kvp_msg; - + struct hv_kvp_msg daemon_kvp_msg; + /* Rcv buffer for communicating with the host*/ - uint8_t *rcv_buf; - + uint8_t *rcv_buf; + /* Device semaphore to control communication */ - struct sema dev_sema; - + struct sema dev_sema; + /* Indicates if daemon registered with driver */ - boolean_t register_done; - + boolean_t register_done; + /* Character device status */ - boolean_t dev_accessed; -} kvp_globals; + boolean_t dev_accessed; + + struct cdev *hv_kvp_dev; + + struct proc *daemon_task; -/* global vars */ -MALLOC_DECLARE(M_HV_KVP_DEV_BUF); -MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev module"); + struct selinfo hv_kvp_selinfo; +} hv_kvp_sc; + +/* hv_kvp prototypes */ +static int hv_kvp_req_in_progress(hv_kvp_sc *sc); +static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *); +static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc); +static void hv_kvp_process_request(void *context, int pending); /* * hv_kvp low level functions @@ -184,10 +178,10 @@ MALLOC_DEFINE(M_HV_KVP_DEV_BUF, "hv_kvp_dev buffer", "buffer for hv_kvp_dev modu * Check if kvp transaction is in progres */ static int -hv_kvp_req_in_progress(void) +hv_kvp_req_in_progress(hv_kvp_sc *sc) { - return (kvp_globals.req_in_progress); + return (sc->req_in_progress); } @@ -195,18 +189,17 @@ hv_kvp_req_in_progress(void) * This routine is called whenever a message is received from the host */ static void -hv_kvp_transaction_init(uint32_t rcv_len, hv_vmbus_channel *rcv_channel, +hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, uint64_t request_id, uint8_t *rcv_buf) { - + /* Store all the relevant message details in the global structure */ /* Do not need to use mutex for req_in_progress here */ - kvp_globals.req_in_progress = true; - kvp_globals.host_msg_len = rcv_len; - kvp_globals.channelp = rcv_channel; - kvp_globals.host_msg_id = request_id; - kvp_globals.rcv_buf = rcv_buf; - kvp_globals.host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ + sc->req_in_progress = true; + sc->host_msg_len = rcv_len; + sc->host_msg_id = request_id; + sc->rcv_buf = rcv_buf; + sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ sizeof(struct hv_vmbus_pipe_hdr) + sizeof(struct hv_vmbus_icmsg_hdr)]; } @@ -258,12 +251,12 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, * Convert ip related info in umsg from utf8 to utf16 and store in hmsg */ static int -hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, +hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, struct hv_kvp_ip_msg *host_ip_msg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; - + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.ip_addr, @@ -294,7 +287,7 @@ hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, strlen((char *)umsg->body.kvp_ip_val.adapter_id), UNUSED_FLAG, &err_adap); - + host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled; host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family; @@ -389,7 +382,7 @@ 
hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_subnet); - + utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, @@ -411,16 +404,13 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, * Ensure utf16_utf8 takes care of the additional string terminating char!! */ static void -hv_kvp_convert_hostmsg_to_usermsg(void) +hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg) { int utf_err = 0; uint32_t value_type; - struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *) - kvp_globals.host_kvp_msg; - - struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg; - struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg; + struct hv_kvp_ip_msg *host_ip_msg; + host_ip_msg = (struct hv_kvp_ip_msg*)hmsg; memset(umsg, 0, sizeof(struct hv_kvp_msg)); umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation; @@ -525,14 +515,12 @@ hv_kvp_convert_hostmsg_to_usermsg(void) * Prepare a host kvp msg based on user kvp msg (utf8 to utf16) */ static int -hv_kvp_convert_usermsg_to_hostmsg(void) +hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg) { int hkey_len = 0, hvalue_len = 0, utf_err = 0; struct hv_kvp_exchg_msg_value *host_exchg_data; char *key_name, *value; - struct hv_kvp_msg *umsg = &kvp_globals.daemon_kvp_msg; - struct hv_kvp_msg *hmsg = kvp_globals.host_kvp_msg; struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg; switch (hmsg->kvp_hdr.operation) { @@ -564,7 +552,7 @@ hv_kvp_convert_usermsg_to_hostmsg(void) if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); - + return (KVP_SUCCESS); case HV_KVP_OP_GET: @@ -580,9 +568,9 @@ hv_kvp_convert_usermsg_to_hostmsg(void) /* Use values by string */ host_exchg_data->value_type = HV_REG_SZ; - if ((hkey_len < 0) || (hvalue_len < 0)) + if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); - + return (KVP_SUCCESS); default: @@ -595,22 +583,22 @@ hv_kvp_convert_usermsg_to_hostmsg(void) * Send the response back to the host. 
*/ static void -hv_kvp_respond_host(int error) +hv_kvp_respond_host(hv_kvp_sc *sc, int error) { struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp; hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *) - &kvp_globals.rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; if (error) error = HV_KVP_E_FAIL; hv_icmsg_hdrp->status = error; hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE; - - error = hv_vmbus_channel_send_packet(kvp_globals.channelp, - kvp_globals.rcv_buf, - kvp_globals.host_msg_len, kvp_globals.host_msg_id, + + error = hv_vmbus_channel_send_packet(sc->util_sc.hv_dev->channel, + sc->rcv_buf, + sc->host_msg_len, sc->host_msg_id, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); if (error) @@ -624,16 +612,19 @@ hv_kvp_respond_host(int error) * and the host */ static void -hv_kvp_send_msg_to_daemon(void) +hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc) { + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; + /* Prepare kvp_msg to be sent to user */ - hv_kvp_convert_hostmsg_to_usermsg(); + hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg); /* Send the msg to user via function deamon_read - setting sema */ - sema_post(&kvp_globals.dev_sema); + sema_post(&sc->dev_sema); /* We should wake up the daemon, in case it's doing poll() */ - selwakeup(&hv_kvp_selinfo); + selwakeup(&sc->hv_kvp_selinfo); } @@ -642,98 +633,83 @@ hv_kvp_send_msg_to_daemon(void) * and interact with daemon */ static void -hv_kvp_process_request(void *context) +hv_kvp_process_request(void *context, int pending) { uint8_t *kvp_buf; - hv_vmbus_channel *channel = context; + hv_vmbus_channel *channel; uint32_t recvlen = 0; uint64_t requestid; struct hv_vmbus_icmsg_hdr *icmsghdrp; int ret = 0; - uint64_t pending_cnt = 1; - + hv_kvp_sc *sc; + hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__); - kvp_buf = receive_buffer[HV_KVP]; + + sc = (hv_kvp_sc*)context; + kvp_buf = sc->util_sc.receive_buffer;; + channel = sc->util_sc.hv_dev->channel; + ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - /* - * We start counting only after the daemon registers - * and therefore there could be requests pending in - * the VMBus that are not reflected in pending_cnt. - * Therefore we continue reading as long as either of - * the below conditions is true. - */ + while ((ret == 0) && (recvlen > 0)) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); + hv_kvp_respond_host(sc, ret); + + /* + * It is ok to not acquire the mutex before setting + * req_in_progress here because negotiation is the + * first thing that happens and hence there is no + * chance of a race condition. 
+ */ + + sc->req_in_progress = false; + hv_kvp_log_info("%s :version negotiated\n", __func__); + + } else { + if (!sc->daemon_busy) { + + hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); + mtx_lock(&sc->pending_mutex); + sc->req_timed_out = false; + sc->daemon_busy = true; + mtx_unlock(&sc->pending_mutex); - while ((pending_cnt>0) || ((ret == 0) && (recvlen > 0))) { - - if ((ret == 0) && (recvlen>0)) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - hv_kvp_transaction_init(recvlen, channel, requestid, kvp_buf); - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); - hv_kvp_respond_host(ret); - - /* - * It is ok to not acquire the mutex before setting - * req_in_progress here because negotiation is the - * first thing that happens and hence there is no - * chance of a race condition. - */ - - kvp_globals.req_in_progress = false; - hv_kvp_log_info("%s :version negotiated\n", __func__); - - } else { - if (!kvp_globals.daemon_busy) { - - hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); - mtx_lock(&kvp_globals.pending_mutex); - kvp_globals.req_timed_out = false; - kvp_globals.daemon_busy = true; - mtx_unlock(&kvp_globals.pending_mutex); - - hv_kvp_send_msg_to_daemon(); - hv_kvp_log_info("%s: waiting for daemon\n", __func__); - } - - /* Wait 5 seconds for daemon to respond back */ - tsleep(&kvp_globals, 0, "kvpworkitem", 5 * hz); - hv_kvp_log_info("%s: came out of wait\n", __func__); + hv_kvp_send_msg_to_daemon(sc); + hv_kvp_log_info("%s: waiting for daemon\n", __func__); } + + /* Wait 5 seconds for daemon to respond back */ + tsleep(sc, 0, "kvpworkitem", 5 * hz); + hv_kvp_log_info("%s: came out of wait\n", __func__); } - mtx_lock(&kvp_globals.pending_mutex); - + mtx_lock(&sc->pending_mutex); + /* Notice that once req_timed_out is set to true * it will remain true until the next request is * sent to the daemon. The response from daemon - * is forwarded to host only when this flag is - * false. + * is forwarded to host only when this flag is + * false. */ - kvp_globals.req_timed_out = true; + sc->req_timed_out = true; /* * Cancel request if so need be. 
*/ - if (hv_kvp_req_in_progress()) { + if (hv_kvp_req_in_progress(sc)) { hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__); - hv_kvp_respond_host(HV_KVP_E_FAIL); - kvp_globals.req_in_progress = false; - } - - /* - * Decrement pending request count and - */ - if (kvp_globals.pending_reqs>0) { - kvp_globals.pending_reqs = kvp_globals.pending_reqs - 1; + hv_kvp_respond_host(sc, HV_KVP_E_FAIL); + sc->req_in_progress = false; } - pending_cnt = kvp_globals.pending_reqs; - - mtx_unlock(&kvp_globals.pending_mutex); + + mtx_unlock(&sc->pending_mutex); /* * Try reading next buffer @@ -741,109 +717,43 @@ hv_kvp_process_request(void *context) recvlen = 0; ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", - __func__, context, (unsigned long long)pending_cnt, ret, recvlen); - } + hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", + __func__, context, ret, recvlen); + } } /* * Callback routine that gets called whenever there is a message from host */ -void +static void hv_kvp_callback(void *context) { - uint64_t pending_cnt = 0; - - if (kvp_globals.register_done == false) { - - kvp_globals.channelp = context; - } else { - - mtx_lock(&kvp_globals.pending_mutex); - kvp_globals.pending_reqs = kvp_globals.pending_reqs + 1; - pending_cnt = kvp_globals.pending_reqs; - mtx_unlock(&kvp_globals.pending_mutex); - if (pending_cnt == 1) { - hv_kvp_log_info("%s: Queuing work item\n", __func__); - hv_queue_work_item( - service_table[HV_KVP].work_queue, - hv_kvp_process_request, - context - ); - } - } -} - - -/* - * This function is called by the hv_kvp_init - - * creates character device hv_kvp_dev - * allocates memory to hv_kvp_dev_buf - * - */ -static int -hv_kvp_dev_init(void) -{ - int error = 0; - - /* initialize semaphore */ - sema_init(&kvp_globals.dev_sema, 0, "hv_kvp device semaphore"); - /* create character device */ - error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, - &hv_kvp_dev, - &hv_kvp_cdevsw, - 0, - UID_ROOT, - GID_WHEEL, - 0640, - "hv_kvp_dev"); - - if (error != 0) - return (error); - + hv_kvp_sc *sc = (hv_kvp_sc*)context; /* - * Malloc with M_WAITOK flag will never fail. - */ - hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_HV_KVP_DEV_BUF, M_WAITOK | - M_ZERO); - - return (0); -} - - -/* - * This function is called by the hv_kvp_deinit - - * destroy character device - */ -static void -hv_kvp_dev_destroy(void) -{ - - if (daemon_task != NULL) { - PROC_LOCK(daemon_task); - kern_psignal(daemon_task, SIGKILL); - PROC_UNLOCK(daemon_task); + The first request from host will not be handled until daemon is registered. + when callback is triggered without a registered daemon, callback just return. + When a new daemon gets regsitered, this callbcak is trigged from _write op. 
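The KVP task in this hunk wakes the user-level daemon and then waits at most five seconds (tsleep(sc, 0, "kvpworkitem", 5 * hz)); if the daemon misses that window, req_timed_out is set under pending_mutex so the write path drops the late reply instead of answering the host twice. A rough userland analogue of that timed handshake, assuming POSIX threads in place of the kernel sleep/wakeup primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool answered, timed_out;

/* Driver side: wait up to 5 seconds, then mark the request as timed out. */
static void
wait_for_daemon(void)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 5;

	pthread_mutex_lock(&lock);
	while (!answered && pthread_cond_timedwait(&cv, &lock, &deadline) == 0)
		;
	if (!answered)
		timed_out = true;	/* late replies are ignored from now on */
	pthread_mutex_unlock(&lock);
}

/* Daemon side: only deliver the answer if the request has not timed out. */
static void *
daemon_reply(void *arg)
{
	(void)arg;
	sleep(1);			/* the daemon takes a moment to respond */
	pthread_mutex_lock(&lock);
	if (!timed_out) {		/* mirrors the !sc->req_timed_out check */
		answered = true;
		pthread_cond_signal(&cv);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, daemon_reply, NULL);
	wait_for_daemon();
	pthread_join(t, NULL);
	printf("%s\n", answered ? "reply accepted" : "timed out, reply dropped");
	return (0);
}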
+ */ + if (sc->register_done) { + hv_kvp_log_info("%s: Queuing work item\n", __func__); + taskqueue_enqueue(taskqueue_thread, &sc->task); } - - destroy_dev(hv_kvp_dev); - free(hv_kvp_dev_buf, M_HV_KVP_DEV_BUF); - return; } - static int hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { - + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__); - if (kvp_globals.dev_accessed) + if (sc->dev_accessed) return (-EBUSY); - - daemon_task = curproc; - kvp_globals.dev_accessed = true; - kvp_globals.daemon_busy = false; + + sc->daemon_task = curproc; + sc->dev_accessed = true; + sc->daemon_busy = false; return (0); } @@ -852,10 +762,11 @@ static int hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, struct thread *td __unused) { + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__); - kvp_globals.dev_accessed = false; - kvp_globals.register_done = false; + sc->dev_accessed = false; + sc->register_done = false; return (0); } @@ -865,18 +776,21 @@ hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __un * acts as a send to daemon */ static int -hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __unused) +hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; /* Check hv_kvp daemon registration status*/ - if (!kvp_globals.register_done) + if (!sc->register_done) return (KVP_ERROR); - sema_wait(&kvp_globals.dev_sema); + sema_wait(&sc->dev_sema); - memcpy(hv_kvp_dev_buf, &kvp_globals.daemon_kvp_msg, sizeof(struct hv_kvp_msg)); + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); + memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg)); amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 
0 : BUFFERSIZE + 1 - uio->uio_offset); @@ -884,6 +798,7 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __ if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0) hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__); + free(hv_kvp_dev_buf, M_TEMP); return (error); } @@ -893,29 +808,30 @@ hv_kvp_dev_daemon_read(struct cdev *dev __unused, struct uio *uio, int ioflag __ * acts as a recieve from daemon */ static int -hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag __unused) +hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; uio->uio_offset = 0; + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); amt = MIN(uio->uio_resid, BUFFERSIZE); error = uiomove(hv_kvp_dev_buf, amt, uio); - if (error != 0) + if (error != 0) { + free(hv_kvp_dev_buf, M_TEMP); return (error); + } + memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); - memcpy(&kvp_globals.daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); - - if (kvp_globals.register_done == false) { - if (kvp_globals.daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { - - kvp_globals.register_done = true; - if (kvp_globals.channelp) { - - hv_kvp_callback(kvp_globals.channelp); - } + free(hv_kvp_dev_buf, M_TEMP); + if (sc->register_done == false) { + if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { + sc->register_done = true; + hv_kvp_callback(dev->si_drv1); } else { hv_kvp_log_info("%s, KVP Registration Failed\n", __func__); @@ -923,18 +839,20 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _ } } else { - mtx_lock(&kvp_globals.pending_mutex); + mtx_lock(&sc->pending_mutex); - if(!kvp_globals.req_timed_out) { + if(!sc->req_timed_out) { + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; - hv_kvp_convert_usermsg_to_hostmsg(); - hv_kvp_respond_host(KVP_SUCCESS); - wakeup(&kvp_globals); - kvp_globals.req_in_progress = false; + hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg); + hv_kvp_respond_host(sc, KVP_SUCCESS); + wakeup(sc); + sc->req_in_progress = false; } - kvp_globals.daemon_busy = false; - mtx_unlock(&kvp_globals.pending_mutex); + sc->daemon_busy = false; + mtx_unlock(&sc->pending_mutex); } return (error); @@ -946,66 +864,106 @@ hv_kvp_dev_daemon_write(struct cdev *dev __unused, struct uio *uio, int ioflag _ * for daemon to read. */ static int -hv_kvp_dev_daemon_poll(struct cdev *dev __unused, int events, struct thread *td) +hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) { int revents = 0; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; - mtx_lock(&kvp_globals.pending_mutex); + mtx_lock(&sc->pending_mutex); /* * We check global flag daemon_busy for the data availiability for * userland to read. Deamon_busy is set to true before driver has data * for daemon to read. It is set to false after daemon sends * then response back to driver. */ - if (kvp_globals.daemon_busy == true) + if (sc->daemon_busy == true) revents = POLLIN; else - selrecord(td, &hv_kvp_selinfo); + selrecord(td, &sc->hv_kvp_selinfo); - mtx_unlock(&kvp_globals.pending_mutex); + mtx_unlock(&sc->pending_mutex); return (revents); } - -/* - * hv_kvp initialization function - * called from hv_util service. 
- * - */ -int -hv_kvp_init(hv_vmbus_service *srv) +static int +hv_kvp_probe(device_t dev) { - int error = 0; - hv_work_queue *work_queue = NULL; - - memset(&kvp_globals, 0, sizeof(kvp_globals)); - - work_queue = hv_work_queue_create("KVP Service"); - if (work_queue == NULL) { - hv_kvp_log_info("%s: Work queue alloc failed\n", __func__); - error = ENOMEM; - hv_kvp_log_error("%s: ENOMEM\n", __func__); - goto Finish; + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V KVP Service"); + return BUS_PROBE_DEFAULT; } - srv->work_queue = work_queue; - error = hv_kvp_dev_init(); - mtx_init(&kvp_globals.pending_mutex, "hv-kvp pending mutex", - NULL, MTX_DEF); - kvp_globals.pending_reqs = 0; + return ENXIO; +} + +static int +hv_kvp_attach(device_t dev) +{ + int error; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); + sc->util_sc.callback = hv_kvp_callback; + sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); + mtx_init(&sc->pending_mutex, "hv-kvp pending mutex", + NULL, MTX_DEF); -Finish: - return (error); -} + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log", + CTLFLAG_RW, &hv_kvp_log, 0, "Hyperv KVP service log level"); -void -hv_kvp_deinit(void) + TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc); + + /* create character device */ + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, + &sc->hv_kvp_dev, + &hv_kvp_cdevsw, + 0, + UID_ROOT, + GID_WHEEL, + 0640, + "hv_kvp_dev"); + + if (error != 0) + return (error); + sc->hv_kvp_dev->si_drv1 = sc; + + return hv_util_attach(dev); +} + +static int +hv_kvp_detach(device_t dev) { - hv_kvp_dev_destroy(); - mtx_destroy(&kvp_globals.pending_mutex); + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); - return; + if (sc->daemon_task != NULL) { + PROC_LOCK(sc->daemon_task); + kern_psignal(sc->daemon_task, SIGKILL); + PROC_UNLOCK(sc->daemon_task); + } + + destroy_dev(sc->hv_kvp_dev); + return hv_util_detach(dev); } + +static device_method_t kvp_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_kvp_probe), + DEVMETHOD(device_attach, hv_kvp_attach), + DEVMETHOD(device_detach, hv_kvp_detach), + { 0, 0 } +}; + +static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)}; + +static devclass_t kvp_devclass; + +DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL); +MODULE_VERSION(hv_kvp, 1); +MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h index b67373fa..b62149e 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.h +++ b/sys/dev/hyperv/utilities/hv_kvp.h @@ -238,17 +238,4 @@ struct hv_kvp_ip_msg { struct hv_kvp_ipaddr_value kvp_ip_val; } __attribute__((packed)); - -#define HV_SHUT_DOWN 0 -#define HV_TIME_SYNCH 1 -#define HV_HEART_BEAT 2 -#define HV_KVP 3 -#define HV_MAX_UTIL_SERVICES 4 - -#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ -#define HV_ICTIMESYNCFLAG_PROBE 0 -#define HV_ICTIMESYNCFLAG_SYNC 1 -#define HV_ICTIMESYNCFLAG_SAMPLE 2 -#define HV_NANO_SEC_PER_SEC 1000000000 - #endif /* _KVP_H */ diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c new file mode 100644 index 0000000..20bc65e --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_shutdown.c @@ -0,0 +1,151 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * A common driver for all hyper-V util services. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +static hv_guid service_guid = { .data = + {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49, + 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB} }; + +/** + * Shutdown + */ +static void +hv_shutdown_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel; + uint8_t execute_shutdown = 0; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recv_len; + uint64_t request_id; + int ret; + hv_vmbus_shutdown_msg_data* shutdown_msg; + hv_util_sc *softc; + + softc = (hv_util_sc*)context; + buf = softc->receive_buffer;; + channel = softc->hv_dev->channel; + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, + &recv_len, &request_id); + + if ((ret == 0) && recv_len > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + shutdown_msg = + (struct hv_vmbus_shutdown_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + switch (shutdown_msg->flags) { + case 0: + case 1: + icmsghdrp->status = HV_S_OK; + execute_shutdown = 1; + if(bootverbose) + printf("Shutdown request received -" + " graceful shutdown initiated\n"); + break; + default: + icmsghdrp->status = HV_E_FAIL; + execute_shutdown = 0; + printf("Shutdown request received -" + " Invalid request\n"); + break; + } + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, + recv_len, request_id, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } + + if (execute_shutdown) + shutdown_nice(RB_POWEROFF); +} + +static int +hv_shutdown_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Shutdown Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int 
+hv_shutdown_attach(device_t dev) +{ + hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); + + softc->callback = hv_shutdown_cb; + + return hv_util_attach(dev); +} + +static device_method_t shutdown_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_shutdown_probe), + DEVMETHOD(device_attach, hv_shutdown_attach), + DEVMETHOD(device_detach, hv_util_detach), + { 0, 0 } +}; + +static driver_t shutdown_driver = { "hvshutdown", shutdown_methods, sizeof(hv_util_sc)}; + +static devclass_t shutdown_devclass; + +DRIVER_MODULE(hv_shutdown, vmbus, shutdown_driver, shutdown_devclass, NULL, NULL); +MODULE_VERSION(hv_shutdown, 1); +MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c new file mode 100644 index 0000000..d1ea904 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_timesync.c @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 2014 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * A common driver for all hyper-V util services. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/timetc.h> +#include <sys/syscallsubr.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_util.h" + +#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ +#define HV_ICTIMESYNCFLAG_PROBE 0 +#define HV_ICTIMESYNCFLAG_SYNC 1 +#define HV_ICTIMESYNCFLAG_SAMPLE 2 +#define HV_NANO_SEC_PER_SEC 1000000000 + +/* Time Sync data */ +typedef struct { + uint64_t data; +} time_sync_data; + + /* Time Synch Service */ +static hv_guid service_guid = {.data = + {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, + 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }; + +struct hv_ictimesync_data { + uint64_t parenttime; + uint64_t childtime; + uint64_t roundtriptime; + uint8_t flags; +} __packed; + +typedef struct hv_timesync_sc { + hv_util_sc util_sc; + struct task task; + time_sync_data time_msg; +} hv_timesync_sc; + +/** + * Set host time based on time sync message from host + */ +static void +hv_set_host_time(void *context, int pending) +{ + hv_timesync_sc *softc = (hv_timesync_sc*)context; + uint64_t hosttime = softc->time_msg.data; + struct timespec guest_ts, host_ts; + uint64_t host_tns; + int64_t diff; + int error; + + host_tns = (hosttime - HV_WLTIMEDELTA) * 100; + host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); + host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); + + nanotime(&guest_ts); + + diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec; + + /* + * If host differs by 5 seconds then make the guest catch up + */ + if (diff > 5 || diff < -5) { + error = kern_clock_settime(curthread, CLOCK_REALTIME, + &host_ts); + } +} + +/** + * @brief Synchronize time with host after reboot, restore, etc. + * + * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. + * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time + * message after the timesync channel is opened. Since the hv_utils module is + * loaded after hv_vmbus, the first message is usually missed. The other + * thing is, systime is automatically set to emulated hardware clock which may + * not be UTC time or in the same time zone. So, to override these effects, we + * use the first 50 time samples for initial system time setting. 
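For reference, a minimal standalone sketch of the conversion hv_set_host_time() performs above, assuming only the HV_WLTIMEDELTA and HV_NANO_SEC_PER_SEC constants defined in this file: the host reports its clock as 100 ns ticks since 1601-01-01, and the helper rebases that onto the Unix epoch as a struct timespec. This is illustrative and not part of the patch.

/*
 * Illustrative only; mirrors the arithmetic in hv_set_host_time().
 */
#include <stdint.h>
#include <time.h>

#define HV_WLTIMEDELTA      116444736000000000L /* 1601 -> 1970, in 100 ns units */
#define HV_NANO_SEC_PER_SEC 1000000000

static struct timespec
hv_time_to_timespec(uint64_t hosttime)
{
        struct timespec ts;
        uint64_t host_tns;

        /* 100 ns ticks since 1601 -> nanoseconds since the Unix epoch */
        host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
        ts.tv_sec = (time_t)(host_tns / HV_NANO_SEC_PER_SEC);
        ts.tv_nsec = (long)(host_tns % HV_NANO_SEC_PER_SEC);
        return (ts);
}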
+ */ +static inline +void hv_adj_guesttime(hv_timesync_sc *sc, uint64_t hosttime, uint8_t flags) +{ + sc->time_msg.data = hosttime; + + if (((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) || + ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0)) { + taskqueue_enqueue(taskqueue_thread, &sc->task); + } +} + +/** + * Time Sync Channel message handler + */ +static void +hv_timesync_cb(void *context) +{ + hv_vmbus_channel* channel; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recvlen; + uint64_t requestId; + int ret; + uint8_t* time_buf; + struct hv_ictimesync_data* timedatap; + hv_timesync_sc *softc; + + softc = (hv_timesync_sc*)context; + channel = softc->util_sc.hv_dev->channel; + time_buf = softc->util_sc.receive_buffer; + + ret = hv_vmbus_channel_recv_packet(channel, time_buf, + PAGE_SIZE, &recvlen, &requestId); + + if ((ret == 0) && recvlen > 0) { + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, time_buf); + } else { + timedatap = (struct hv_ictimesync_data *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + hv_adj_guesttime(softc, timedatap->parenttime, timedatap->flags); + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION + | HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, time_buf, + recvlen, requestId, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + +static int +hv_timesync_probe(device_t dev) +{ + const char *p = vmbus_get_type(dev); + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { + device_set_desc(dev, "Hyper-V Time Synch Service"); + return BUS_PROBE_DEFAULT; + } + + return ENXIO; +} + +static int +hv_timesync_attach(device_t dev) +{ + hv_timesync_sc *softc = device_get_softc(dev); + + softc->util_sc.callback = hv_timesync_cb; + TASK_INIT(&softc->task, 1, hv_set_host_time, softc); + + return hv_util_attach(dev); +} + +static int +hv_timesync_detach(device_t dev) +{ + hv_timesync_sc *softc = device_get_softc(dev); + taskqueue_drain(taskqueue_thread, &softc->task); + + return hv_util_detach(dev); +} + +static device_method_t timesync_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_timesync_probe), + DEVMETHOD(device_attach, hv_timesync_attach), + DEVMETHOD(device_detach, hv_timesync_detach), + { 0, 0 } +}; + +static driver_t timesync_driver = { "hvtimesync", timesync_methods, sizeof(hv_timesync_sc)}; + +static devclass_t timesync_devclass; + +DRIVER_MODULE(hv_timesync, vmbus, timesync_driver, timesync_devclass, NULL, NULL); +MODULE_VERSION(hv_timesync, 1); +MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index dc4b1e2..7d19b3f 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ b/sys/dev/hyperv/utilities/hv_util.c @@ -40,85 +40,9 @@ #include <sys/syscallsubr.h> #include <dev/hyperv/include/hyperv.h> -#include "hv_kvp.h" +#include "hv_util.h" -/* Time Sync data */ -typedef struct { - uint64_t data; -} time_sync_data; - -static void hv_shutdown_cb(void *context); -static void hv_heartbeat_cb(void *context); -static void hv_timesync_cb(void *context); - -static int hv_timesync_init(hv_vmbus_service *serv); - -/* - * Note: GUID codes below are predefined by the host hypervisor - * (Hyper-V and Azure)interface and required for correct operation. 
- */ -hv_vmbus_service service_table[] = { - /* Shutdown Service */ - { .guid.data = {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49, - 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB}, - .name = "Hyper-V Shutdown Service\n", - .enabled = TRUE, - .callback = hv_shutdown_cb, - }, - - /* Time Synch Service */ - { .guid.data = {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, - 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf}, - .name = "Hyper-V Time Synch Service\n", - .enabled = TRUE, - .init = hv_timesync_init, - .callback = hv_timesync_cb, - }, - - /* Heartbeat Service */ - { .guid.data = {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, - 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d}, - .name = "Hyper-V Heartbeat Service\n", - .enabled = TRUE, - .callback = hv_heartbeat_cb, - }, - - /* KVP (Key Value Pair) Service */ - { .guid.data = {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, - 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6}, - .name = "Hyper-V KVP Service\n", - .enabled = TRUE, - .init = hv_kvp_init, - .callback = hv_kvp_callback, - }, -}; - -/* - * Receive buffer pointers. There is one buffer per utility service. The - * buffer is allocated during attach(). - */ -uint8_t *receive_buffer[HV_MAX_UTIL_SERVICES]; - -static boolean_t destroyed_kvp = FALSE; - -struct hv_ictimesync_data { - uint64_t parenttime; - uint64_t childtime; - uint64_t roundtriptime; - uint8_t flags; -} __packed; - -static int -hv_timesync_init(hv_vmbus_service *serv) -{ - - serv->work_queue = hv_work_queue_create("Time Sync"); - if (serv->work_queue == NULL) - return (ENOMEM); - return (0); -} - -static void +void hv_negotiate_version( struct hv_vmbus_icmsg_hdr* icmsghdrp, struct hv_vmbus_icmsg_negotiate* negop, @@ -147,267 +71,19 @@ hv_negotiate_version( negop->icmsg_vercnt = 1; } - -/** - * Set host time based on time sync message from host - */ -static void -hv_set_host_time(void *context) -{ - time_sync_data* time_msg = (time_sync_data*) context; - uint64_t hosttime = time_msg->data; - struct timespec guest_ts, host_ts; - uint64_t host_tns; - int64_t diff; - int error; - - host_tns = (hosttime - HV_WLTIMEDELTA) * 100; - host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); - host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); - - nanotime(&guest_ts); - - diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec; - - /* - * If host differs by 5 seconds then make the guest catch up - */ - if (diff > 5 || diff < -5) { - error = kern_clock_settime(curthread, CLOCK_REALTIME, - &host_ts); - } - - /* - * Free the hosttime that was allocated in hv_adj_guesttime() - */ - free(time_msg, M_DEVBUF); -} - -/** - * @brief Synchronize time with host after reboot, restore, etc. - * - * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. - * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time - * message after the timesync channel is opened. Since the hv_utils module is - * loaded after hv_vmbus, the first message is usually missed. The other - * thing is, systime is automatically set to emulated hardware clock which may - * not be UTC time or in the same time zone. So, to override these effects, we - * use the first 50 time samples for initial system time setting. 
- */ -static inline -void hv_adj_guesttime(uint64_t hosttime, uint8_t flags) -{ - time_sync_data* time_msg; - - time_msg = malloc(sizeof(time_sync_data), M_DEVBUF, M_NOWAIT); - - if (time_msg == NULL) - return; - - time_msg->data = hosttime; - - if ((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) { - hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, - hv_set_host_time, time_msg); - } else if ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0) { - hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, - hv_set_host_time, time_msg); - } else { - free(time_msg, M_DEVBUF); - } -} - -/** - * Time Sync Channel message handler - */ -static void -hv_timesync_cb(void *context) -{ - hv_vmbus_channel* channel = context; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recvlen; - uint64_t requestId; - int ret; - uint8_t* time_buf; - struct hv_ictimesync_data* timedatap; - - time_buf = receive_buffer[HV_TIME_SYNCH]; - - ret = hv_vmbus_channel_recv_packet(channel, time_buf, - PAGE_SIZE, &recvlen, &requestId); - - if ((ret == 0) && recvlen > 0) { - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, time_buf); - } else { - timedatap = (struct hv_ictimesync_data *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - hv_adj_guesttime(timedatap->parenttime, timedatap->flags); - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION - | HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, time_buf, - recvlen, requestId, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } -} - -/** - * Shutdown - */ -static void -hv_shutdown_cb(void *context) -{ - uint8_t* buf; - hv_vmbus_channel* channel = context; - uint8_t execute_shutdown = 0; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recv_len; - uint64_t request_id; - int ret; - hv_vmbus_shutdown_msg_data* shutdown_msg; - - buf = receive_buffer[HV_SHUT_DOWN]; - - ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, - &recv_len, &request_id); - - if ((ret == 0) && recv_len > 0) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - shutdown_msg = - (struct hv_vmbus_shutdown_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - switch (shutdown_msg->flags) { - case 0: - case 1: - icmsghdrp->status = HV_S_OK; - execute_shutdown = 1; - if(bootverbose) - printf("Shutdown request received -" - " graceful shutdown initiated\n"); - break; - default: - icmsghdrp->status = HV_E_FAIL; - execute_shutdown = 0; - printf("Shutdown request received -" - " Invalid request\n"); - break; - } - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, buf, - recv_len, request_id, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } - - if (execute_shutdown) - shutdown_nice(RB_POWEROFF); -} - -/** - * Process heartbeat message - */ -static void -hv_heartbeat_cb(void *context) -{ - uint8_t* buf; - hv_vmbus_channel* channel = context; - uint32_t recvlen; - uint64_t requestid; - int ret; - - struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; - struct hv_vmbus_icmsg_hdr* icmsghdrp; - - buf = receive_buffer[HV_HEART_BEAT]; - - ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen, - &requestid); - - if ((ret == 0) && recvlen > 0) { - - icmsghdrp = 
(struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - heartbeat_msg = - (struct hv_vmbus_heartbeat_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - heartbeat_msg->seq_num += 1; - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid, - HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); - } -} - - -static int -hv_util_probe(device_t dev) -{ - int i; - int rtn_value = ENXIO; - - for (i = 0; i < HV_MAX_UTIL_SERVICES; i++) { - const char *p = vmbus_get_type(dev); - if (service_table[i].enabled && !memcmp(p, &service_table[i].guid, sizeof(hv_guid))) { - device_set_softc(dev, (void *) (&service_table[i])); - rtn_value = BUS_PROBE_DEFAULT; - } - } - - return rtn_value; -} - -static int +int hv_util_attach(device_t dev) { - struct hv_device* hv_dev; - struct hv_vmbus_service* service; - int ret; - size_t receive_buffer_offset; + struct hv_device* hv_dev; + struct hv_util_sc* softc; + int ret; hv_dev = vmbus_get_devctx(dev); - service = device_get_softc(dev); - receive_buffer_offset = service - &service_table[0]; - device_printf(dev, "Hyper-V Service attaching: %s\n", service->name); - receive_buffer[receive_buffer_offset] = + softc = device_get_softc(dev); + softc->hv_dev = hv_dev; + softc->receive_buffer = malloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); - if (service->init != NULL) { - ret = service->init(service); - if (ret) { - ret = ENODEV; - goto error0; - } - } - /* * These services are not performance critical and do not need * batched reading. Furthermore, some services such as KVP can @@ -418,83 +94,30 @@ hv_util_attach(device_t dev) hv_set_channel_read_state(hv_dev->channel, FALSE); ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, - 4 * PAGE_SIZE, NULL, 0, - service->callback, hv_dev->channel); + 4 * PAGE_SIZE, NULL, 0, + softc->callback, softc); if (ret) - goto error0; + goto error0; return (0); - error0: - - free(receive_buffer[receive_buffer_offset], M_DEVBUF); - receive_buffer[receive_buffer_offset] = NULL; - +error0: + free(softc->receive_buffer, M_DEVBUF); return (ret); } -static int +int hv_util_detach(device_t dev) { - struct hv_device* hv_dev; - struct hv_vmbus_service* service; - size_t receive_buffer_offset; - - if (!destroyed_kvp) { - hv_kvp_deinit(); - destroyed_kvp = TRUE; - } + struct hv_device* hv_dev; + struct hv_util_sc* softc; hv_dev = vmbus_get_devctx(dev); hv_vmbus_channel_close(hv_dev->channel); - service = device_get_softc(dev); - receive_buffer_offset = service - &service_table[0]; + softc = device_get_softc(dev); - if (service->work_queue != NULL) - hv_work_queue_close(service->work_queue); - - free(receive_buffer[receive_buffer_offset], M_DEVBUF); - receive_buffer[receive_buffer_offset] = NULL; + free(softc->receive_buffer, M_DEVBUF); return (0); } - -static void -hv_util_init(void) -{ -} - -static int -hv_util_modevent(module_t mod, int event, void *arg) -{ - switch (event) { - case MOD_LOAD: - break; - case MOD_UNLOAD: - break; - default: - break; - } - return (0); -} - -static device_method_t util_methods[] = { - /* Device interface */ - DEVMETHOD(device_probe, hv_util_probe), - DEVMETHOD(device_attach, hv_util_attach), - DEVMETHOD(device_detach, hv_util_detach), - DEVMETHOD(device_shutdown, bus_generic_shutdown), - { 0, 0 } } -; - -static driver_t util_driver = { "hyperv-utils", 
util_methods, 0 }; - -static devclass_t util_devclass; - -DRIVER_MODULE(hv_utils, vmbus, util_driver, util_devclass, hv_util_modevent, 0); -MODULE_VERSION(hv_utils, 1); -MODULE_DEPEND(hv_utils, vmbus, 1, 1, 1); - -SYSINIT(hv_util_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, - hv_util_init, NULL); diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h new file mode 100644 index 0000000..708dca8 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_util.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HVUTIL_H_ +#define _HVUTIL_H_ + +/** + * hv_util related structures + * + */ +typedef struct hv_util_sc { + /* + * function to process Hyper-V messages + */ + void (*callback)(void *); + + struct hv_device* hv_dev; + uint8_t *receive_buffer; +} hv_util_sc; + +void hv_negotiate_version( + struct hv_vmbus_icmsg_hdr* icmsghdrp, + struct hv_vmbus_icmsg_negotiate* negop, + uint8_t* buf); + +int hv_util_attach(device_t dev); +int hv_util_detach(device_t dev); +#endif diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index 7037768..bb777cc 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -52,6 +52,7 @@ static int vmbus_channel_create_gpadl_header( uint32_t* message_count); static void vmbus_channel_set_event(hv_vmbus_channel* channel); +static void VmbusProcessChannelEvent(void* channel, int pending); /** * @brief Trigger an event notification on the specified channel @@ -68,9 +69,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) + ((channel->offer_msg.child_rel_id >> 5)))); monitor_page = (hv_vmbus_monitor_page *) - hv_vmbus_g_connection.monitor_pages; - - monitor_page++; /* Get the child to parent monitor page */ + hv_vmbus_g_connection.monitor_page_2; synch_set_bit(channel->monitor_bit, (uint32_t *)&monitor_page-> @@ -115,6 +114,9 @@ hv_vmbus_channel_open( new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; + new_channel->rxq = hv_vmbus_g_context.hv_event_queue[new_channel->target_cpu]; + TASK_INIT(&new_channel->channel_task, 0, VmbusProcessChannelEvent, new_channel); + /* Allocate the ring buffer */ out = contigmalloc((send_ring_buffer_size + recv_ring_buffer_size), M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); @@ -518,6 +520,7 @@ static void hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; + struct taskqueue *rxq = channel->rxq; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; @@ -525,6 +528,11 @@ hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) channel->sc_creation_callback = NULL; /* + * set rxq to NULL to avoid more requests be scheduled + */ + channel->rxq = NULL; + taskqueue_drain(rxq, &channel->channel_task); + /* * Grab the lock to prevent race condition when a packet received * and unloading driver is in the process. 
*/ @@ -666,11 +674,11 @@ hv_vmbus_channel_send_packet_pagebuffer( { int ret = 0; - int i = 0; boolean_t need_sig; uint32_t packet_len; + uint32_t page_buflen; uint32_t packetLen_aligned; - hv_vmbus_sg_buffer_list buffer_list[3]; + hv_vmbus_sg_buffer_list buffer_list[4]; hv_vmbus_channel_packet_page_buffer desc; uint32_t descSize; uint64_t alignedData = 0; @@ -682,36 +690,33 @@ hv_vmbus_channel_send_packet_pagebuffer( * Adjust the size down since hv_vmbus_channel_packet_page_buffer * is the largest size we support */ - descSize = sizeof(hv_vmbus_channel_packet_page_buffer) - - ((HV_MAX_PAGE_BUFFER_COUNT - page_count) * - sizeof(hv_vmbus_page_buffer)); - packet_len = descSize + buffer_len; + descSize = __offsetof(hv_vmbus_channel_packet_page_buffer, range); + page_buflen = sizeof(hv_vmbus_page_buffer) * page_count; + packet_len = descSize + page_buflen + buffer_len; packetLen_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); /* Setup the descriptor */ desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT; desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.data_offset8 = descSize >> 3; /* in 8-bytes granularity */ + /* in 8-bytes granularity */ + desc.data_offset8 = (descSize + page_buflen) >> 3; desc.length8 = (uint16_t) (packetLen_aligned >> 3); desc.transaction_id = request_id; desc.range_count = page_count; - for (i = 0; i < page_count; i++) { - desc.range[i].length = page_buffers[i].length; - desc.range[i].offset = page_buffers[i].offset; - desc.range[i].pfn = page_buffers[i].pfn; - } - buffer_list[0].data = &desc; buffer_list[0].length = descSize; - buffer_list[1].data = buffer; - buffer_list[1].length = buffer_len; + buffer_list[1].data = page_buffers; + buffer_list[1].length = page_buflen; - buffer_list[2].data = &alignedData; - buffer_list[2].length = packetLen_aligned - packet_len; + buffer_list[2].data = buffer; + buffer_list[2].length = buffer_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + buffer_list[3].data = &alignedData; + buffer_list[3].length = packetLen_aligned - packet_len; + + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 4, &need_sig); /* TODO: We should determine if this is optional */ @@ -880,3 +885,67 @@ hv_vmbus_channel_recv_packet_raw( return (0); } + + +/** + * Process a channel event notification + */ +static void +VmbusProcessChannelEvent(void* context, int pending) +{ + void* arg; + uint32_t bytes_to_read; + hv_vmbus_channel* channel = (hv_vmbus_channel*)context; + boolean_t is_batched_reading; + + /** + * Find the channel based on this relid and invokes + * the channel callback to process the event + */ + + if (channel == NULL) { + return; + } + /** + * To deal with the race condition where we might + * receive a packet while the relevant driver is + * being unloaded, dispatch the callback while + * holding the channel lock. The unloading driver + * will acquire the same channel lock to set the + * callback to NULL. This closes the window. + */ + + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. + * -- whu + */ + // mtx_lock(&channel->inbound_lock); + if (channel->on_channel_callback != NULL) { + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. 
Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); + } + // mtx_unlock(&channel->inbound_lock); +} diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index 4ccb647..ab6e8ad 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -39,8 +39,10 @@ __FBSDID("$FreeBSD$"); */ static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr); +static void vmbus_channel_on_offer_internal(void* context); static void vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr); +static void vmbus_channel_on_offer_rescind_internal(void* context); static void vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); @@ -52,41 +54,46 @@ static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); hv_vmbus_channel_msg_table_entry g_channel_message_table[HV_CHANNEL_MESSAGE_COUNT] = { { HV_CHANNEL_MESSAGE_INVALID, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_OFFER_CHANNEL, - 0, vmbus_channel_on_offer }, + vmbus_channel_on_offer }, { HV_CHANNEL_MESSAGE_RESCIND_CHANNEL_OFFER, - 0, vmbus_channel_on_offer_rescind }, + vmbus_channel_on_offer_rescind }, { HV_CHANNEL_MESSAGE_REQUEST_OFFERS, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_ALL_OFFERS_DELIVERED, - 1, vmbus_channel_on_offers_delivered }, + vmbus_channel_on_offers_delivered }, { HV_CHANNEL_MESSAGE_OPEN_CHANNEL, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_OPEN_CHANNEL_RESULT, - 1, vmbus_channel_on_open_result }, + vmbus_channel_on_open_result }, { HV_CHANNEL_MESSAGE_CLOSE_CHANNEL, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGEL_GPADL_HEADER, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_BODY, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_CREATED, - 1, vmbus_channel_on_gpadl_created }, + vmbus_channel_on_gpadl_created }, { HV_CHANNEL_MESSAGE_GPADL_TEARDOWN, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_GPADL_TORNDOWN, - 1, vmbus_channel_on_gpadl_torndown }, + vmbus_channel_on_gpadl_torndown }, { HV_CHANNEL_MESSAGE_REL_ID_RELEASED, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_INITIATED_CONTACT, - 0, NULL }, + NULL }, { HV_CHANNEL_MESSAGE_VERSION_RESPONSE, - 1, vmbus_channel_on_version_response }, + vmbus_channel_on_version_response }, { HV_CHANNEL_MESSAGE_UNLOAD, - 0, NULL } + NULL } }; +typedef struct hv_work_item { + struct task work; + void (*callback)(void *); + void* context; +} hv_work_item; /** * Implementation of the work abstraction. @@ -96,120 +103,30 @@ work_item_callback(void *work, int pending) { struct hv_work_item *w = (struct hv_work_item *)work; - /* - * Serialize work execution. 
- */ - if (w->wq->work_sema != NULL) { - sema_wait(w->wq->work_sema); - } - w->callback(w->context); - if (w->wq->work_sema != NULL) { - sema_post(w->wq->work_sema); - } - free(w, M_DEVBUF); } -struct hv_work_queue* -hv_work_queue_create(char* name) -{ - static unsigned int qid = 0; - char qname[64]; - int pri; - struct hv_work_queue* wq; - - wq = malloc(sizeof(struct hv_work_queue), M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(wq != NULL, ("Error VMBUS: Failed to allocate work_queue\n")); - if (wq == NULL) - return (NULL); - - /* - * We use work abstraction to handle messages - * coming from the host and these are typically offers. - * Some FreeBsd drivers appear to have a concurrency issue - * where probe/attach needs to be serialized. We ensure that - * by having only one thread process work elements in a - * specific queue by serializing work execution. - * - */ - if (strcmp(name, "vmbusQ") == 0) { - pri = PI_DISK; - } else { /* control */ - pri = PI_NET; - /* - * Initialize semaphore for this queue by pointing - * to the globale semaphore used for synchronizing all - * control messages. - */ - wq->work_sema = &hv_vmbus_g_connection.control_sema; - } - - sprintf(qname, "hv_%s_%u", name, qid); - - /* - * Fixme: FreeBSD 8.2 has a different prototype for - * taskqueue_create(), and for certain other taskqueue functions. - * We need to research the implications of these changes. - * Fixme: Not sure when the changes were introduced. - */ - wq->queue = taskqueue_create(qname, M_NOWAIT, taskqueue_thread_enqueue, - &wq->queue - #if __FreeBSD_version < 800000 - , &wq->proc - #endif - ); - - if (wq->queue == NULL) { - free(wq, M_DEVBUF); - return (NULL); - } - - if (taskqueue_start_threads(&wq->queue, 1, pri, "%s taskq", qname)) { - taskqueue_free(wq->queue); - free(wq, M_DEVBUF); - return (NULL); - } - - qid++; - - return (wq); -} - -void -hv_work_queue_close(struct hv_work_queue *wq) -{ - /* - * KYS: Need to drain the taskqueue - * before we close the hv_work_queue. 
- */ - /*KYS: taskqueue_drain(wq->tq, ); */ - taskqueue_free(wq->queue); - free(wq, M_DEVBUF); -} - /** * @brief Create work item */ -int +static int hv_queue_work_item( - struct hv_work_queue *wq, void (*callback)(void *), void *context) { struct hv_work_item *w = malloc(sizeof(struct hv_work_item), - M_DEVBUF, M_NOWAIT | M_ZERO); + M_DEVBUF, M_NOWAIT); KASSERT(w != NULL, ("Error VMBUS: Failed to allocate WorkItem\n")); if (w == NULL) return (ENOMEM); w->callback = callback; w->context = context; - w->wq = wq; TASK_INIT(&w->work, 0, work_item_callback, w); - return (taskqueue_enqueue(wq->queue, &w->work)); + return (taskqueue_enqueue(taskqueue_thread, &w->work)); } @@ -224,10 +141,7 @@ hv_vmbus_allocate_channel(void) channel = (hv_vmbus_channel*) malloc( sizeof(hv_vmbus_channel), M_DEVBUF, - M_NOWAIT | M_ZERO); - KASSERT(channel != NULL, ("Error VMBUS: Failed to allocate channel!")); - if (channel == NULL) - return (NULL); + M_WAITOK | M_ZERO); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); @@ -238,16 +152,6 @@ hv_vmbus_allocate_channel(void) } /** - * @brief Release the vmbus channel object itself - */ -static inline void -ReleaseVmbusChannel(void *context) -{ - hv_vmbus_channel* channel = (hv_vmbus_channel*) context; - free(channel, M_DEVBUF); -} - -/** * @brief Release the resources used by the vmbus channel object */ void @@ -255,13 +159,8 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); - /* - * We have to release the channel's workqueue/thread in - * the vmbus's workqueue/thread context - * ie we can't destroy ourselves - */ - hv_queue_work_item(hv_vmbus_g_connection.work_queue, - ReleaseVmbusChannel, (void *) channel); + + free(channel, M_DEVBUF); } /** @@ -459,7 +358,7 @@ static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) { hv_vmbus_channel_offer_channel* offer; - hv_vmbus_channel* new_channel; + hv_vmbus_channel_offer_channel* copied; offer = (hv_vmbus_channel_offer_channel*) hdr; @@ -469,10 +368,25 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) guidType = &offer->offer.interface_type; guidInstance = &offer->offer.interface_instance; + // copy offer data + copied = malloc(sizeof(*copied), M_DEVBUF, M_NOWAIT); + if (copied == NULL) { + printf("fail to allocate memory\n"); + return; + } + + memcpy(copied, hdr, sizeof(*copied)); + hv_queue_work_item(vmbus_channel_on_offer_internal, copied); +} + +static void +vmbus_channel_on_offer_internal(void* context) +{ + hv_vmbus_channel* new_channel; + + hv_vmbus_channel_offer_channel* offer = (hv_vmbus_channel_offer_channel*)context; /* Allocate the channel object and save this offer */ new_channel = hv_vmbus_allocate_channel(); - if (new_channel == NULL) - return; /* * By default we setup state to enable batched @@ -512,6 +426,8 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32; vmbus_channel_process_offer(new_channel); + + free(offer, M_DEVBUF); } /** @@ -529,13 +445,20 @@ vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr) rescind = (hv_vmbus_channel_rescind_offer*) hdr; channel = hv_vmbus_g_connection.channels[rescind->child_rel_id]; - if (channel == NULL) + if (channel == NULL) return; - hv_vmbus_child_device_unregister(channel->device); - mtx_lock(&hv_vmbus_g_connection.channel_lock); + hv_queue_work_item(vmbus_channel_on_offer_rescind_internal, channel); 
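With the private work queues removed, deferred work such as the offer and rescind handling above now rides the system-wide taskqueue_thread queue. A minimal sketch of that pattern, using only taskqueue(9) and malloc(9) calls that appear in this hunk (the my_* names are placeholders, not part of the change):

/*
 * Sketch of the deferred-work pattern used by hv_queue_work_item() above:
 * wrap a callback and its argument in a malloc'ed task, enqueue it on the
 * system taskqueue, and free the wrapper after the callback has run.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/taskqueue.h>

struct my_work {
        struct task     task;
        void            (*fn)(void *);
        void            *arg;
};

static void
my_work_run(void *context, int pending __unused)
{
        struct my_work *w = context;

        w->fn(w->arg);                  /* run the deferred callback */
        free(w, M_DEVBUF);              /* wrapper frees itself, as in work_item_callback() */
}

static int
my_defer(void (*fn)(void *), void *arg)
{
        struct my_work *w;

        w = malloc(sizeof(*w), M_DEVBUF, M_NOWAIT);
        if (w == NULL)
                return (ENOMEM);
        w->fn = fn;
        w->arg = arg;
        TASK_INIT(&w->task, 0, my_work_run, w);
        return (taskqueue_enqueue(taskqueue_thread, &w->task));
}

Because the wrapper frees itself once the callback has run, the enqueueing side never touches it again, which is the same ownership rule the new hv_queue_work_item() follows.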
hv_vmbus_g_connection.channels[rescind->child_rel_id] = NULL; - mtx_unlock(&hv_vmbus_g_connection.channel_lock); +} + +static void +vmbus_channel_on_offer_rescind_internal(void *context) +{ + hv_vmbus_channel* channel; + + channel = (hv_vmbus_channel*)context; + hv_vmbus_child_device_unregister(channel->device); } /** @@ -712,35 +635,6 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr) } /** - * @brief Handler for channel protocol messages. - * - * This is invoked in the vmbus worker thread context. - */ -void -hv_vmbus_on_channel_message(void *context) -{ - hv_vmbus_message* msg; - hv_vmbus_channel_msg_header* hdr; - int size; - - msg = (hv_vmbus_message*) context; - hdr = (hv_vmbus_channel_msg_header*) msg->u.payload; - size = msg->header.payload_size; - - if (hdr->message_type >= HV_CHANNEL_MESSAGE_COUNT) { - free(msg, M_DEVBUF); - return; - } - - if (g_channel_message_table[hdr->message_type].messageHandler) { - g_channel_message_table[hdr->message_type].messageHandler(hdr); - } - - /* Free the msg that was allocated in VmbusOnMsgDPC() */ - free(msg, M_DEVBUF); -} - -/** * @brief Send a request to get all our pending offers. */ int @@ -765,8 +659,7 @@ hv_vmbus_request_channel_offers(void) ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_msg_header)); - if (msg_info) - free(msg_info, M_DEVBUF); + free(msg_info, M_DEVBUF); return (ret); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index cfdc9bb..fb1879d 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -90,12 +90,10 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, hv_vmbus_g_connection.interrupt_page); msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); + hv_vmbus_g_connection.monitor_page_1); - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); + msg->monitor_page_2 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_page_2); /** * Add to list before we send the request since we may receive the @@ -168,8 +166,6 @@ hv_vmbus_connect(void) { * Initialize the vmbus connection */ hv_vmbus_g_connection.connect_state = HV_CONNECTING; - hv_vmbus_g_connection.work_queue = hv_work_queue_create("vmbusQ"); - sema_init(&hv_vmbus_g_connection.control_sema, 1, "control_sema"); TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor); mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg", @@ -183,18 +179,9 @@ hv_vmbus_connect(void) { * Setup the vmbus event connection for channel interrupt abstraction * stuff */ - hv_vmbus_g_connection.interrupt_page = contigmalloc( + hv_vmbus_g_connection.interrupt_page = malloc( PAGE_SIZE, M_DEVBUF, - M_NOWAIT | M_ZERO, 0UL, - BUS_SPACE_MAXADDR, - PAGE_SIZE, 0); - KASSERT(hv_vmbus_g_connection.interrupt_page != NULL, - ("Error VMBUS: malloc failed to allocate Channel" - " Request Event message!")); - if (hv_vmbus_g_connection.interrupt_page == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_WAITOK | M_ZERO); hv_vmbus_g_connection.recv_interrupt_page = hv_vmbus_g_connection.interrupt_page; @@ -207,31 +194,19 @@ hv_vmbus_connect(void) { * Set up the monitor notification facility. 
The 1st page for * parent->child and the 2nd page for child->parent */ - hv_vmbus_g_connection.monitor_pages = contigmalloc( - 2 * PAGE_SIZE, + hv_vmbus_g_connection.monitor_page_1 = malloc( + PAGE_SIZE, M_DEVBUF, - M_NOWAIT | M_ZERO, - 0UL, - BUS_SPACE_MAXADDR, + M_WAITOK | M_ZERO); + hv_vmbus_g_connection.monitor_page_2 = malloc( PAGE_SIZE, - 0); - KASSERT(hv_vmbus_g_connection.monitor_pages != NULL, - ("Error VMBUS: malloc failed to allocate Monitor Pages!")); - if (hv_vmbus_g_connection.monitor_pages == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_DEVBUF, + M_WAITOK | M_ZERO); msg_info = (hv_vmbus_channel_msg_info*) malloc(sizeof(hv_vmbus_channel_msg_info) + sizeof(hv_vmbus_channel_initiate_contact), - M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(msg_info != NULL, - ("Error VMBUS: malloc failed for Initiate Contact message!")); - if (msg_info == NULL) { - ret = ENOMEM; - goto cleanup; - } + M_DEVBUF, M_WAITOK | M_ZERO); hv_vmbus_g_connection.channels = malloc(sizeof(hv_vmbus_channel*) * HV_CHANNEL_MAX_COUNT, @@ -273,26 +248,16 @@ hv_vmbus_connect(void) { hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; - hv_work_queue_close(hv_vmbus_g_connection.work_queue); - sema_destroy(&hv_vmbus_g_connection.control_sema); mtx_destroy(&hv_vmbus_g_connection.channel_lock); mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); if (hv_vmbus_g_connection.interrupt_page != NULL) { - contigfree( - hv_vmbus_g_connection.interrupt_page, - PAGE_SIZE, - M_DEVBUF); + free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF); hv_vmbus_g_connection.interrupt_page = NULL; } - if (hv_vmbus_g_connection.monitor_pages != NULL) { - contigfree( - hv_vmbus_g_connection.monitor_pages, - 2 * PAGE_SIZE, - M_DEVBUF); - hv_vmbus_g_connection.monitor_pages = NULL; - } + free(hv_vmbus_g_connection.monitor_page_1, M_DEVBUF); + free(hv_vmbus_g_connection.monitor_page_2, M_DEVBUF); if (msg_info) { sema_destroy(&msg_info->wait_sema); @@ -309,108 +274,29 @@ hv_vmbus_connect(void) { int hv_vmbus_disconnect(void) { int ret = 0; - hv_vmbus_channel_unload* msg; - - msg = malloc(sizeof(hv_vmbus_channel_unload), - M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(msg != NULL, - ("Error VMBUS: malloc failed to allocate Channel Unload Msg!")); - if (msg == NULL) - return (ENOMEM); - - msg->message_type = HV_CHANNEL_MESSAGE_UNLOAD; + hv_vmbus_channel_unload msg; - ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_unload)); + msg.message_type = HV_CHANNEL_MESSAGE_UNLOAD; + ret = hv_vmbus_post_message(&msg, sizeof(hv_vmbus_channel_unload)); - contigfree(hv_vmbus_g_connection.interrupt_page, PAGE_SIZE, M_DEVBUF); + free(hv_vmbus_g_connection.interrupt_page, M_DEVBUF); mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); - hv_work_queue_close(hv_vmbus_g_connection.work_queue); - sema_destroy(&hv_vmbus_g_connection.control_sema); - free(hv_vmbus_g_connection.channels, M_DEVBUF); hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; - free(msg, M_DEVBUF); - return (ret); } /** - * Process a channel event notification - */ -static void -VmbusProcessChannelEvent(uint32_t relid) -{ - void* arg; - uint32_t bytes_to_read; - hv_vmbus_channel* channel; - boolean_t is_batched_reading; - - /** - * Find the channel based on this relid and invokes - * the channel callback to process the event - */ - - channel = hv_vmbus_g_connection.channels[relid]; - - if (channel == NULL) { - return; - } - /** - * To deal with the race condition where we might - * receive a packet while the relevant driver is - * being unloaded, dispatch the callback while - * holding the 
channel lock. The unloading driver - * will acquire the same channel lock to set the - * callback to NULL. This closes the window. - */ - - /* - * Disable the lock due to newly added WITNESS check in r277723. - * Will seek other way to avoid race condition. - * -- whu - */ - // mtx_lock(&channel->inbound_lock); - if (channel->on_channel_callback != NULL) { - arg = channel->channel_callback_context; - is_batched_reading = channel->batched_reading; - /* - * Optimize host to guest signaling by ensuring: - * 1. While reading the channel, we disable interrupts from - * host. - * 2. Ensure that we process all posted messages from the host - * before returning from this callback. - * 3. Once we return, enable signaling from the host. Once this - * state is set we check to see if additional packets are - * available to read. In this case we repeat the process. - */ - do { - if (is_batched_reading) - hv_ring_buffer_read_begin(&channel->inbound); - - channel->on_channel_callback(arg); - - if (is_batched_reading) - bytes_to_read = - hv_ring_buffer_read_end(&channel->inbound); - else - bytes_to_read = 0; - } while (is_batched_reading && (bytes_to_read != 0)); - } - // mtx_unlock(&channel->inbound_lock); -} - -/** * Handler for events */ void -hv_vmbus_on_events(void *arg) +hv_vmbus_on_events(int cpu) { int bit; - int cpu; int dword; void *page_addr; uint32_t* recv_interrupt_page = NULL; @@ -419,7 +305,6 @@ hv_vmbus_on_events(void *arg) hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - cpu = (int)(long)arg; KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " "cpu out of range!")); @@ -461,8 +346,14 @@ hv_vmbus_on_events(void *arg) */ continue; } else { - VmbusProcessChannelEvent(rel_id); - + hv_vmbus_channel * channel = hv_vmbus_g_connection.channels[rel_id]; + /* if channel is closed or closing */ + if (channel == NULL || channel->rxq == NULL) + continue; + + if (channel->batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + taskqueue_enqueue_fast(channel->rxq, &channel->channel_task); } } } diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index ca5641f..6afc2b8 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -189,11 +189,7 @@ hv_vmbus_init(void) * See if the hypercall page is already set */ hypercall_msr.as_uint64_t = rdmsr(HV_X64_MSR_HYPERCALL); - virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - KASSERT(virt_addr != NULL, - ("Error VMBUS: malloc failed to allocate page during init!")); - if (virt_addr == NULL) - goto cleanup; + virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); hypercall_msr.u.enable = 1; hypercall_msr.u.guest_physical_address = diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index 66a3f39..c8d6894 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -83,8 +83,6 @@ vmbus_msg_swintr(void *arg) hv_vmbus_channel_msg_table_entry *entry; hv_vmbus_channel_msg_type msg_type; hv_vmbus_message* msg; - hv_vmbus_message* copied; - static bool warned = false; cpu = (int)(long)arg; KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " @@ -100,31 +98,15 @@ vmbus_msg_swintr(void *arg) hdr = (hv_vmbus_channel_msg_header *)msg->u.payload; msg_type = hdr->message_type; - if (msg_type >= HV_CHANNEL_MESSAGE_COUNT && !warned) { - warned = true; + if (msg_type >= HV_CHANNEL_MESSAGE_COUNT) { printf("VMBUS: unknown message type = %d\n", msg_type); goto handled; } entry = 
&g_channel_message_table[msg_type]; - if (entry->handler_no_sleep) + if (entry->messageHandler) entry->messageHandler(hdr); - else { - - copied = malloc(sizeof(hv_vmbus_message), - M_DEVBUF, M_NOWAIT); - KASSERT(copied != NULL, - ("Error VMBUS: malloc failed to allocate" - " hv_vmbus_message!")); - if (copied == NULL) - continue; - - memcpy(copied, msg, sizeof(hv_vmbus_message)); - hv_queue_work_item(hv_vmbus_g_connection.work_queue, - hv_vmbus_on_channel_message, - copied); - } handled: msg->header.message_type = HV_MESSAGE_TYPE_NONE; @@ -177,7 +159,7 @@ hv_vmbus_isr(struct trapframe *frame) (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { /* Since we are a child, we only need to check bit 0 */ if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + hv_vmbus_on_events(cpu); } } else { /* @@ -187,7 +169,7 @@ hv_vmbus_isr(struct trapframe *frame) * Directly schedule the event software interrupt on * current cpu. */ - swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + hv_vmbus_on_events(cpu); } /* Check if there are actual msgs to be process */ @@ -225,7 +207,6 @@ hv_vmbus_isr(struct trapframe *frame) return (FILTER_HANDLED); } -uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; u_long *hv_vmbus_intr_cpu[MAXCPU]; void @@ -310,12 +291,7 @@ hv_vmbus_child_device_create( * Allocate the new child device */ child_dev = malloc(sizeof(hv_device), M_DEVBUF, - M_NOWAIT | M_ZERO); - KASSERT(child_dev != NULL, - ("Error VMBUS: malloc failed to allocate hv_device!")); - - if (child_dev == NULL) - return (NULL); + M_WAITOK | M_ZERO); child_dev->channel = channel; memcpy(&child_dev->class_id, &type, sizeof(hv_guid)); @@ -455,6 +431,19 @@ vmbus_vector_free(int vector) #endif /* HYPERV */ +static void +vmbus_cpuset_setthread_task(void *xmask, int pending __unused) +{ + cpuset_t *mask = xmask; + int error; + + error = cpuset_setthread(curthread->td_tid, mask); + if (error) { + panic("curthread=%ju: can't pin; error=%d", + (uintmax_t)curthread->td_tid, error); + } +} + /** * @brief Main vmbus driver initialization routine. * @@ -472,6 +461,7 @@ vmbus_bus_init(void) { int i, j, n, ret; char buf[MAXCOMLEN + 1]; + cpuset_t cpu_mask; if (vmbus_inited) return (0); @@ -508,10 +498,7 @@ vmbus_bus_init(void) setup_args.vector = hv_vmbus_g_context.hv_cb_vector; CPU_FOREACH(j) { - hv_vmbus_swintr_event_cpu[j] = 0; - hv_vmbus_g_context.hv_event_intr_event[j] = NULL; hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; - hv_vmbus_g_context.event_swintr[j] = NULL; hv_vmbus_g_context.msg_swintr[j] = NULL; snprintf(buf, sizeof(buf), "cpu%d:hyperv", j); @@ -525,6 +512,26 @@ vmbus_bus_init(void) * Per cpu setup. */ CPU_FOREACH(j) { + struct task cpuset_task; + + /* + * Setup taskqueue to handle events + */ + hv_vmbus_g_context.hv_event_queue[j] = taskqueue_create_fast("hyperv event", M_WAITOK, + taskqueue_thread_enqueue, &hv_vmbus_g_context.hv_event_queue[j]); + if (hv_vmbus_g_context.hv_event_queue[j] == NULL) { + if (bootverbose) + printf("VMBUS: failed to setup taskqueue\n"); + goto cleanup1; + } + taskqueue_start_threads(&hv_vmbus_g_context.hv_event_queue[j], 1, PI_NET, + "hvevent%d", j); + + CPU_SETOF(j, &cpu_mask); + TASK_INIT(&cpuset_task, 0, vmbus_cpuset_setthread_task, &cpu_mask); + taskqueue_enqueue(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task); + taskqueue_drain(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task); + /* * Setup software interrupt thread and handler for msg handling. 
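The per-CPU event taskqueues created in vmbus_bus_init() above are pinned by enqueueing a one-shot task that calls cpuset_setthread() from the queue's own thread and then draining it. A compact sketch of that trick for a single illustrative queue (the example_* names are placeholders; the taskqueue(9) and cpuset calls are the ones used in the hunk):

/*
 * Sketch: create a single-threaded taskqueue and pin its thread to one CPU.
 * The mask has to be applied from the taskqueue thread itself, so a
 * throw-away task is enqueued and drained to guarantee it has run.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/malloc.h>         /* M_WAITOK */
#include <sys/priority.h>       /* PI_NET */
#include <sys/proc.h>
#include <sys/taskqueue.h>

static struct taskqueue *example_tq;

static void
pin_self_task(void *xmask, int pending __unused)
{
        cpuset_t *mask = xmask;

        /* Runs on the taskqueue thread; bind that thread to the mask. */
        if (cpuset_setthread(curthread->td_tid, mask) != 0)
                panic("failed to pin taskqueue thread");
}

static void
example_tq_create_pinned(int cpu)
{
        struct task pin_task;
        cpuset_t mask;

        example_tq = taskqueue_create_fast("example", M_WAITOK,
            taskqueue_thread_enqueue, &example_tq);
        taskqueue_start_threads(&example_tq, 1, PI_NET, "example%d", cpu);

        CPU_SETOF(cpu, &mask);
        TASK_INIT(&pin_task, 0, pin_self_task, &mask);
        taskqueue_enqueue(example_tq, &pin_task);
        taskqueue_drain(example_tq, &pin_task); /* wait for the pin to land */
}

Draining before returning is what makes the stack-allocated task and mask safe, matching what the CPU_FOREACH loop in vmbus_bus_init() does with its local cpuset_task.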
*/ @@ -543,7 +550,7 @@ vmbus_bus_init(void) */ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], j); - if (ret) { + if (ret) { if(bootverbose) printf("VMBUS: failed to bind msg swi thread " "to cpu %d\n", j); @@ -551,30 +558,11 @@ vmbus_bus_init(void) } /* - * Setup software interrupt thread and handler for - * event handling. - */ - ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], - "hv_event", hv_vmbus_on_events, (void *)(long)j, - SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); - if (ret) { - if(bootverbose) - printf("VMBUS: failed to setup event swi for " - "cpu %d\n", j); - goto cleanup1; - } - - /* * Prepare the per cpu msg and event pages to be called on each cpu. */ for(i = 0; i < 2; i++) { setup_args.page_buffers[2 * j + i] = - malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[2 * j + i] == NULL) { - KASSERT(setup_args.page_buffers[2 * j + i] != NULL, - ("Error VMBUS: malloc failed!")); - goto cleanup1; - } + malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); } } @@ -607,12 +595,11 @@ vmbus_bus_init(void) * remove swi and vmbus callback vector; */ CPU_FOREACH(j) { + if (hv_vmbus_g_context.hv_event_queue[j] != NULL) + taskqueue_free(hv_vmbus_g_context.hv_event_queue[j]); if (hv_vmbus_g_context.msg_swintr[j] != NULL) swi_remove(hv_vmbus_g_context.msg_swintr[j]); - if (hv_vmbus_g_context.event_swintr[j] != NULL) - swi_remove(hv_vmbus_g_context.event_swintr[j]); hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; - hv_vmbus_g_context.hv_event_intr_event[j] = NULL; } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); @@ -677,12 +664,11 @@ vmbus_bus_exit(void) /* remove swi */ CPU_FOREACH(i) { + if (hv_vmbus_g_context.hv_event_queue[i] != NULL) + taskqueue_free(hv_vmbus_g_context.hv_event_queue[i]); if (hv_vmbus_g_context.msg_swintr[i] != NULL) swi_remove(hv_vmbus_g_context.msg_swintr[i]); - if (hv_vmbus_g_context.event_swintr[i] != NULL) - swi_remove(hv_vmbus_g_context.event_swintr[i]); hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; - hv_vmbus_g_context.hv_event_intr_event[i] = NULL; } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 13a35c4..5f62072 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -202,9 +202,8 @@ typedef struct { * Each cpu has its own software interrupt handler for channel * event and msg handling. */ - struct intr_event *hv_event_intr_event[MAXCPU]; + struct taskqueue *hv_event_queue[MAXCPU]; struct intr_event *hv_msg_intr_event[MAXCPU]; - void *event_swintr[MAXCPU]; void *msg_swintr[MAXCPU]; /* * Host use this vector to intrrupt guest for vmbus channel @@ -351,7 +350,8 @@ typedef struct { * notification and 2nd is child->parent * notification */ - void *monitor_pages; + void *monitor_page_1; + void *monitor_page_2; TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** @@ -363,10 +363,8 @@ typedef struct { /** * channel table for fast lookup through id. 
- */ + */ hv_vmbus_channel **channels; - hv_vmbus_handle work_queue; - struct sema control_sema; } hv_vmbus_connection; typedef union { @@ -633,7 +631,6 @@ typedef void (*vmbus_msg_handler)(hv_vmbus_channel_msg_header *msg); typedef struct hv_vmbus_channel_msg_table_entry { hv_vmbus_channel_msg_type messageType; - bool handler_no_sleep; /* true: the handler doesn't sleep */ vmbus_msg_handler messageHandler; } hv_vmbus_channel_msg_table_entry; @@ -683,7 +680,6 @@ uint32_t hv_ring_buffer_read_end( hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); -void hv_vmbus_on_channel_message(void *context); int hv_vmbus_request_channel_offers(void); void hv_vmbus_release_unattached_channels(void); int hv_vmbus_init(void); @@ -717,7 +713,7 @@ int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); int hv_vmbus_set_event(hv_vmbus_channel *channel); -void hv_vmbus_on_events(void *); +void hv_vmbus_on_events(int cpu); /** * Event Timer interfaces diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c index aff048a..cf48c25 100644 --- a/sys/dev/ioat/ioat.c +++ b/sys/dev/ioat/ioat.c @@ -152,8 +152,8 @@ MODULE_VERSION(ioat, 1); * Private data structures */ static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS]; -static int ioat_channel_index = 0; -SYSCTL_INT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, +static unsigned ioat_channel_index = 0; +SYSCTL_UINT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, "Number of IOAT channels attached"); static struct _pcsid @@ -407,7 +407,7 @@ ioat3_attach(device_t device) ioat = DEVICE2SOFTC(device); ioat->capabilities = ioat_read_dmacapability(ioat); - ioat_log_message(1, "Capabilities: %b\n", (int)ioat->capabilities, + ioat_log_message(0, "Capabilities: %b\n", (int)ioat->capabilities, IOAT_DMACAP_STR); xfercap = ioat_read_xfercap(ioat); @@ -742,6 +742,13 @@ ioat_reset_hw_task(void *ctx, int pending __unused) /* * User API functions */ +unsigned +ioat_get_nchannels(void) +{ + + return (ioat_channel_index); +} + bus_dmaengine_t ioat_get_dmaengine(uint32_t index, int flags) { diff --git a/sys/dev/ioat/ioat.h b/sys/dev/ioat/ioat.h index 2e10124..9a0c3e3b 100644 --- a/sys/dev/ioat/ioat.h +++ b/sys/dev/ioat/ioat.h @@ -85,6 +85,8 @@ typedef void *bus_dmaengine_t; struct bus_dmadesc; typedef void (*bus_dmaengine_callback_t)(void *arg, int error); +unsigned ioat_get_nchannels(void); + /* * Called first to acquire a reference to the DMA channel * diff --git a/sys/dev/ioat/ioat_internal.h b/sys/dev/ioat/ioat_internal.h index 322671c..9d0708d 100644 --- a/sys/dev/ioat/ioat_internal.h +++ b/sys/dev/ioat/ioat_internal.h @@ -455,7 +455,7 @@ struct ioat_softc { }) int version; - int chan_idx; + unsigned chan_idx; struct mtx submit_lock; device_t device; diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.c b/sys/dev/ntb/ntb_hw/ntb_hw.c index a4c460e..1ef9749 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw.c +++ b/sys/dev/ntb/ntb_hw/ntb_hw.c @@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/malloc.h> #include <sys/module.h> +#include <sys/mutex.h> +#include <sys/pciio.h> #include <sys/queue.h> #include <sys/rman.h> #include <sys/sbuf.h> @@ -42,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm.h> #include <vm/pmap.h> #include <machine/bus.h> +#include <machine/intr_machdep.h> #include <machine/pmap.h> #include <machine/resource.h> #include <dev/pci/pcireg.h> @@ -70,6 +73,19 @@ __FBSDID("$FreeBSD$"); #define 
DEVICE2SOFTC(dev) ((struct ntb_softc *) device_get_softc(dev)) +#define NTB_MSIX_VER_GUARD 0xaabbccdd +#define NTB_MSIX_RECEIVED 0xe0f0e0f0 +#define ONE_MB (1024u * 1024) + +/* + * PCI constants could be somewhere more generic, but aren't defined/used in + * pci.c. + */ +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR 4 +#define PCI_MSIX_ENTRY_DATA 8 + enum ntb_device_type { NTB_XEON, NTB_ATOM @@ -95,6 +111,18 @@ enum ntb_bar { NTB_MAX_BARS }; +enum { + NTB_MSIX_GUARD = 0, + NTB_MSIX_DATA0, + NTB_MSIX_DATA1, + NTB_MSIX_DATA2, + NTB_MSIX_OFS0, + NTB_MSIX_OFS1, + NTB_MSIX_OFS2, + NTB_MSIX_DONE, + NTB_MAX_MSIX_SPAD +}; + /* Device features and workarounds */ #define HAS_FEATURE(feature) \ ((ntb->features & (feature)) != 0) @@ -131,6 +159,7 @@ struct ntb_int_info { struct ntb_vec { struct ntb_softc *ntb; uint32_t num; + unsigned masked; }; struct ntb_reg { @@ -169,6 +198,11 @@ struct ntb_b2b_addr { uint64_t bar5_addr32; }; +struct ntb_msix_data { + uint32_t nmd_ofs; + uint32_t nmd_data; +}; + struct ntb_softc { device_t device; enum ntb_device_type type; @@ -178,6 +212,13 @@ struct ntb_softc { struct ntb_int_info int_info[MAX_MSIX_INTERRUPTS]; uint32_t allocated_interrupts; + struct ntb_msix_data peer_msix_data[XEON_NONLINK_DB_MSIX_BITS]; + struct ntb_msix_data msix_data[XEON_NONLINK_DB_MSIX_BITS]; + bool peer_msix_good; + bool peer_msix_done; + struct ntb_pci_bar_info *peer_lapic_bar; + struct callout peer_msix_work; + struct callout heartbeat_timer; struct callout lr_timer; @@ -198,6 +239,7 @@ struct ntb_softc { /* Memory window used to access peer bar0 */ #define B2B_MW_DISABLED UINT8_MAX uint8_t b2b_mw_idx; + uint8_t msix_mw_idx; uint8_t mw_count; uint8_t spad_count; @@ -292,6 +334,8 @@ static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t); static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t); static int ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors); static void ntb_free_msix_vec(struct ntb_softc *ntb); +static void ntb_get_msix_info(struct ntb_softc *ntb); +static void ntb_exchange_msix(void *); static struct ntb_hw_info *ntb_get_device_info(uint32_t device_id); static void ntb_detect_max_mw(struct ntb_softc *ntb); static int ntb_detect_xeon(struct ntb_softc *ntb); @@ -308,7 +352,9 @@ static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr, enum ntb_bar idx); static int xeon_setup_b2b_mw(struct ntb_softc *, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr); +static int xeon_setup_msix_bar(struct ntb_softc *); static inline bool link_is_up(struct ntb_softc *ntb); +static inline bool _xeon_link_is_up(struct ntb_softc *ntb); static inline bool atom_link_is_err(struct ntb_softc *ntb); static inline enum ntb_speed ntb_link_sta_speed(struct ntb_softc *); static inline enum ntb_width ntb_link_sta_width(struct ntb_softc *); @@ -319,6 +365,8 @@ static bool ntb_poll_link(struct ntb_softc *ntb); static void save_bar_parameters(struct ntb_pci_bar_info *bar); static void ntb_sysctl_init(struct ntb_softc *); static int sysctl_handle_features(SYSCTL_HANDLER_ARGS); +static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS); +static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS); static int sysctl_handle_link_status(SYSCTL_HANDLER_ARGS); static int sysctl_handle_register(SYSCTL_HANDLER_ARGS); @@ -397,6 +445,13 @@ ntb_vm_memattr_to_str(vm_memattr_t pat) } } +static int g_ntb_msix_idx = 0; +SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, 
CTLFLAG_RDTUN, &g_ntb_msix_idx, + 0, "Use this memory window to access the peer MSIX message complex on " + "certain Xeon-based NTB systems, as a workaround for a hardware errata. " + "Like b2b_mw_idx, negative values index from the last available memory " + "window. (Applies on Xeon platforms with SB01BASE_LOCKUP errata.)"); + static int g_ntb_mw_idx = -1; TUNABLE_INT("hw.ntb.b2b_mw_idx", &g_ntb_mw_idx); SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx, @@ -604,10 +659,12 @@ ntb_attach(device_t device) ntb->type = p->type; ntb->features = p->features; ntb->b2b_mw_idx = B2B_MW_DISABLED; + ntb->msix_mw_idx = B2B_MW_DISABLED; /* Heartbeat timer for NTB_ATOM since there is no link interrupt */ callout_init(&ntb->heartbeat_timer, CALLOUT_MPSAFE); callout_init(&ntb->lr_timer, CALLOUT_MPSAFE); + callout_init(&ntb->peer_msix_work, 1); mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN); mtx_init(&ntb->ctx_lock, "ntb ctx", NULL, MTX_DEF); @@ -632,6 +689,8 @@ ntb_attach(device_t device) if (error != 0) goto out; + ntb_spad_clear(ntb); + ntb_poll_link(ntb); ntb_sysctl_init(ntb); @@ -649,10 +708,14 @@ ntb_detach(device_t device) ntb = DEVICE2SOFTC(device); - if (ntb->self_reg != NULL) - ntb_db_set_mask(ntb, ntb->db_valid_mask); + if (ntb->self_reg != NULL) { + DB_MASK_LOCK(ntb); + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_valid_mask); + DB_MASK_UNLOCK(ntb); + } callout_drain(&ntb->heartbeat_timer); callout_drain(&ntb->lr_timer); + callout_drain(&ntb->peer_msix_work); pci_disable_busmaster(ntb->device); if (ntb->type == NTB_XEON) ntb_teardown_xeon(ntb); @@ -978,9 +1041,12 @@ ntb_init_isr(struct ntb_softc *ntb) ntb->last_ts = ticks; /* - * Mask all doorbell interrupts. + * Mask all doorbell interrupts. (Except link events!) */ - ntb_db_set_mask(ntb, ntb->db_valid_mask); + DB_MASK_LOCK(ntb); + ntb->db_mask = ntb->db_valid_mask; + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + DB_MASK_UNLOCK(ntb); num_vectors = desired_vectors = MIN(pci_msix_count(ntb->device), ntb->db_count); @@ -1005,12 +1071,28 @@ ntb_init_isr(struct ntb_softc *ntb) num_vectors = 1; if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + device_printf(ntb->device, + "Errata workaround does not support MSI or INTX\n"); + return (EINVAL); + } + ntb->db_vec_count = 1; ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT; rc = ntb_setup_legacy_interrupt(ntb); } else { + if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS && + HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + device_printf(ntb->device, + "Errata workaround expects %d doorbell bits\n", + XEON_NONLINK_DB_MSIX_BITS); + return (EINVAL); + } + ntb_create_msix_vec(ntb, num_vectors); rc = ntb_setup_msix(ntb, num_vectors); + if (rc == 0 && HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + ntb_get_msix_info(ntb); } if (rc != 0) { device_printf(ntb->device, @@ -1116,6 +1198,9 @@ void ntb_db_set_mask(struct ntb_softc *ntb, uint64_t bits) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return; + DB_MASK_LOCK(ntb); ntb->db_mask |= bits; db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); @@ -1131,6 +1216,9 @@ ntb_db_clear_mask(struct ntb_softc *ntb, uint64_t bits) (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return; + DB_MASK_LOCK(ntb); ntb->db_mask &= ~bits; db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); @@ -1141,6 +1229,18 @@ uint64_t ntb_db_read(struct ntb_softc *ntb) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + uint64_t res; + unsigned i; + + res = 
0; + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if (ntb->msix_vec[i].masked != 0) + res |= ntb_db_vector_mask(ntb, i); + } + return (res); + } + return (db_ioread(ntb, ntb->self_reg->db_bell)); } @@ -1153,6 +1253,25 @@ ntb_db_clear(struct ntb_softc *ntb, uint64_t bits) (uintmax_t)(bits & ~ntb->db_valid_mask), (uintmax_t)ntb->db_valid_mask)); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + unsigned i; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if ((bits & ntb_db_vector_mask(ntb, i)) != 0) { + DB_MASK_LOCK(ntb); + if (ntb->msix_vec[i].masked != 0) { + /* XXX These need a public API. */ +#if 0 + pci_unmask_msix(ntb->device, i); +#endif + ntb->msix_vec[i].masked = 0; + } + DB_MASK_UNLOCK(ntb); + } + } + return; + } + db_iowrite(ntb, ntb->self_reg->db_bell, bits); } @@ -1179,6 +1298,19 @@ ntb_interrupt(struct ntb_softc *ntb, uint32_t vec) ntb_link_event(ntb); } + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + (vec_mask & ntb->db_link_mask) == 0) { + DB_MASK_LOCK(ntb); + if (ntb->msix_vec[vec].masked == 0) { + /* XXX These need a public API. */ +#if 0 + pci_mask_msix(ntb->device, vec); +#endif + ntb->msix_vec[vec].masked = 1; + } + DB_MASK_UNLOCK(ntb); + } + if ((vec_mask & ntb->db_valid_mask) != 0) ntb_db_event(ntb, vec); } @@ -1224,6 +1356,40 @@ ntb_free_msix_vec(struct ntb_softc *ntb) ntb->msix_vec = NULL; } +static void +ntb_get_msix_info(struct ntb_softc *ntb) +{ + struct pci_devinfo *dinfo; + struct pcicfg_msix *msix; + uint32_t laddr, data, i, offset; + + dinfo = device_get_ivars(ntb->device); + msix = &dinfo->cfg.msix; + + laddr = data = 0; + + CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data)); + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + offset = msix->msix_table_offset + i * PCI_MSIX_ENTRY_SIZE; + + laddr = bus_read_4(msix->msix_table_res, offset + + PCI_MSIX_ENTRY_LOWER_ADDR); + ntb_printf(2, "local lower MSIX addr(%u): 0x%x\n", i, laddr); + + KASSERT((laddr & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE, + ("local MSIX addr 0x%x not in MSI base 0x%x", laddr, + MSI_INTEL_ADDR_BASE)); + ntb->msix_data[i].nmd_ofs = laddr & ~MSI_INTEL_ADDR_BASE; + + data = bus_read_4(msix->msix_table_res, offset + + PCI_MSIX_ENTRY_DATA); + ntb_printf(2, "local MSIX data(%u): 0x%x\n", i, data); + + ntb->msix_data[i].nmd_data = data; + } +} + static struct ntb_hw_info * ntb_get_device_info(uint32_t device_id) { @@ -1276,9 +1442,12 @@ ntb_detect_xeon(struct ntb_softc *ntb) if ((ppd & XEON_PPD_SPLIT_BAR) != 0) ntb->features |= NTB_SPLIT_BAR; - /* SB01BASE_LOCKUP errata is a superset of SDOORBELL errata */ + /* + * SDOORBELL errata workaround gets in the way of SB01BASE_LOCKUP + * errata workaround; only do one at a time. + */ if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) - ntb->features |= NTB_SDOORBELL_LOCKUP; + ntb->features &= ~NTB_SDOORBELL_LOCKUP; conn_type = ppd & XEON_PPD_CONN_TYPE; switch (conn_type) { @@ -1342,19 +1511,28 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) ntb->peer_reg = &xeon_b2b_reg; ntb->xlat_reg = &xeon_sec_xlat; - /* - * There is a Xeon hardware errata related to writes to SDOORBELL or - * B2BDOORBELL in conjunction with inbound access to NTB MMIO space, - * which may hang the system. To workaround this, use a memory - * window to access the interrupt and scratch pad registers on the - * remote system. 
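ntb_get_msix_info() above walks the device's MSI-X table through the mapped table BAR and records the address/data pair programmed for each vector; the PCI_MSIX_ENTRY_* constants added near the top of the file describe the 16-byte entry layout. The offset arithmetic can be sketched in plain userspace C (the table contents and helper names below are fabricated for illustration; the driver reads the real table with bus_read_4()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PCI_MSIX_ENTRY_SIZE        16
#define PCI_MSIX_ENTRY_LOWER_ADDR  0
#define PCI_MSIX_ENTRY_DATA        8

/* Stand-in for the mapped MSI-X table; the driver reads it via bus_read_4(). */
static uint8_t msix_table[3 * PCI_MSIX_ENTRY_SIZE];

static uint32_t
table_read_4(unsigned off)
{
    uint32_t v;

    memcpy(&v, &msix_table[off], sizeof(v));
    return (v);
}

static void
table_write_4(unsigned off, uint32_t v)
{
    memcpy(&msix_table[off], &v, sizeof(v));
}

int
main(void)
{
    unsigned i, off;

    /* Fabricated entries, only so the read-back prints something. */
    for (i = 0; i < 3; i++) {
        off = i * PCI_MSIX_ENTRY_SIZE;
        table_write_4(off + PCI_MSIX_ENTRY_LOWER_ADDR, 0xfee00000u | (i << 12));
        table_write_4(off + PCI_MSIX_ENTRY_DATA, 0x4020u + i);
    }
    for (i = 0; i < 3; i++) {
        off = i * PCI_MSIX_ENTRY_SIZE;
        printf("vec %u: addr_lo=0x%08x data=0x%08x\n", i,
            table_read_4(off + PCI_MSIX_ENTRY_LOWER_ADDR),
            table_read_4(off + PCI_MSIX_ENTRY_DATA));
    }
    return (0);
}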
- */ - if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) % + ntb->mw_count; + ntb_printf(2, "Setting up MSIX mw idx %d means %u\n", + g_ntb_msix_idx, ntb->msix_mw_idx); + rc = ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx, + VM_MEMATTR_UNCACHEABLE); + KASSERT(rc == 0, ("shouldn't fail")); + } else if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { + /* + * There is a Xeon hardware errata related to writes to SDOORBELL or + * B2BDOORBELL in conjunction with inbound access to NTB MMIO space, + * which may hang the system. To workaround this, use a memory + * window to access the interrupt and scratch pad registers on the + * remote system. + */ ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) % ntb->mw_count; ntb_printf(2, "Setting up b2b mw idx %d means %u\n", g_ntb_mw_idx, ntb->b2b_mw_idx); - rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, VM_MEMATTR_UNCACHEABLE); + rc = ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx, + VM_MEMATTR_UNCACHEABLE); KASSERT(rc == 0, ("shouldn't fail")); } else if (HAS_FEATURE(NTB_B2BDOORBELL_BIT14)) /* @@ -1385,7 +1563,14 @@ ntb_xeon_init_dev(struct ntb_softc *ntb) /* * Mask all doorbell interrupts. */ - ntb_db_set_mask(ntb, ntb->db_valid_mask); + DB_MASK_LOCK(ntb); + ntb->db_mask = ntb->db_valid_mask; + db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask); + DB_MASK_UNLOCK(ntb); + + rc = xeon_setup_msix_bar(ntb); + if (rc != 0) + return (rc); rc = ntb_init_isr(ntb); return (rc); @@ -1489,6 +1674,15 @@ xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx, bar_sz--; else bar_sz = 0; + } else if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx) { + /* Restrict LAPIC BAR to 1MB */ + pci_write_config(ntb->device, bar->psz_off, 20, 1); + pci_write_config(ntb->device, bar->ssz_off, 20, 1); + bar_sz = pci_read_config(ntb->device, bar->psz_off, 1); + bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1); + (void)bar_sz; + return; } pci_write_config(ntb->device, bar->ssz_off, bar_sz, 1); bar_sz = pci_read_config(ntb->device, bar->ssz_off, 1); @@ -1499,28 +1693,37 @@ static void xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr, enum ntb_bar idx, enum ntb_bar regbar) { - uint64_t reg_val; + uint64_t reg_val, lmt_addr; uint32_t base_reg, lmt_reg; bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg); if (idx == regbar) bar_addr += ntb->b2b_off; + lmt_addr = bar_addr; + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP) && + ntb_mw_to_bar(ntb, ntb->msix_mw_idx) == idx) + lmt_addr += ONE_MB; + + /* + * Set limit registers first to avoid an errata where setting the base + * registers locks the limit registers. 
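Both the existing hw.ntb.b2b_mw_idx tunable and the new hw.ntb.msix_mw_idx tunable accept negative values that count back from the last memory window, and the driver normalizes them with the same (mw_count + idx) % mw_count expression used above. A minimal standalone sketch of that mapping (the window count and sample values are made up):

#include <stdio.h>

/* The same normalization the driver applies to the *_mw_idx tunables. */
static int
resolve_mw_idx(int tunable, int mw_count)
{
    return ((mw_count + tunable) % mw_count);
}

int
main(void)
{
    int samples[] = { 0, 1, -1, -2 };
    int i;

    for (i = 0; i < 4; i++)
        printf("tunable %2d -> window %d of 3\n", samples[i],
            resolve_mw_idx(samples[i], 3));
    /* -1 picks the last window (2); -2 the one before it (1). */
    return (0);
}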
+ */ if (!bar_is_64bit(ntb, idx)) { - ntb_reg_write(4, base_reg, bar_addr); - reg_val = ntb_reg_read(4, base_reg); + ntb_reg_write(4, lmt_reg, lmt_addr); + reg_val = ntb_reg_read(4, lmt_reg); (void)reg_val; - ntb_reg_write(4, lmt_reg, bar_addr); - reg_val = ntb_reg_read(4, lmt_reg); + ntb_reg_write(4, base_reg, bar_addr); + reg_val = ntb_reg_read(4, base_reg); (void)reg_val; } else { - ntb_reg_write(8, base_reg, bar_addr); - reg_val = ntb_reg_read(8, base_reg); + ntb_reg_write(8, lmt_reg, lmt_addr); + reg_val = ntb_reg_read(8, lmt_reg); (void)reg_val; - ntb_reg_write(8, lmt_reg, bar_addr); - reg_val = ntb_reg_read(8, lmt_reg); + ntb_reg_write(8, base_reg, bar_addr); + reg_val = ntb_reg_read(8, base_reg); (void)reg_val; } } @@ -1542,6 +1745,37 @@ xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx) } static int +xeon_setup_msix_bar(struct ntb_softc *ntb) +{ + struct ntb_pci_bar_info *lapic_bar; + enum ntb_bar bar_num; + int rc; + + if (!HAS_FEATURE(NTB_SB01BASE_LOCKUP)) + return (0); + + bar_num = ntb_mw_to_bar(ntb, ntb->msix_mw_idx); + lapic_bar = &ntb->bar_info[bar_num]; + + /* Restrict LAPIC BAR to 1MB */ + if (lapic_bar->size > ONE_MB) { + rc = bus_adjust_resource(ntb->device, SYS_RES_MEMORY, + lapic_bar->pci_resource, lapic_bar->pbase, + lapic_bar->pbase + ONE_MB - 1); + if (rc == 0) + lapic_bar->size = ONE_MB; + else { + ntb_printf(0, "Failed to shrink LAPIC BAR resource to " + "1 MB: %d\n", rc); + /* Ignore error */ + } + } + + ntb->peer_lapic_bar = lapic_bar; + return (0); +} + +static int xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr) { @@ -1619,6 +1853,43 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0); ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0); + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + size_t size, xlatoffset; + + switch (ntb_mw_to_bar(ntb, ntb->msix_mw_idx)) { + case NTB_B2B_BAR_1: + size = 8; + xlatoffset = XEON_SBAR2XLAT_OFFSET; + break; + case NTB_B2B_BAR_2: + xlatoffset = XEON_SBAR4XLAT_OFFSET; + if (HAS_FEATURE(NTB_SPLIT_BAR)) + size = 4; + else + size = 8; + break; + case NTB_B2B_BAR_3: + xlatoffset = XEON_SBAR5XLAT_OFFSET; + size = 4; + break; + default: + KASSERT(false, ("Bogus msix mw idx: %u", + ntb->msix_mw_idx)); + return (EINVAL); + } + + /* + * We point the chosen MSIX MW BAR xlat to remote LAPIC for + * workaround + */ + if (size == 4) + ntb_reg_write(4, xlatoffset, MSI_INTEL_ADDR_BASE); + else + ntb_reg_write(8, xlatoffset, MSI_INTEL_ADDR_BASE); + } + (void)ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET); + (void)ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET); + /* Zero outgoing translation limits (whole bar size windows) */ ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0); ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0); @@ -1656,14 +1927,21 @@ xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr, } static inline bool +_xeon_link_is_up(struct ntb_softc *ntb) +{ + + if (ntb->conn_type == NTB_CONN_TRANSPARENT) + return (true); + return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0); +} + +static inline bool link_is_up(struct ntb_softc *ntb) { - if (ntb->type == NTB_XEON) { - if (ntb->conn_type == NTB_CONN_TRANSPARENT) - return (true); - return ((ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0); - } + if (ntb->type == NTB_XEON) + return (_xeon_link_is_up(ntb) && (ntb->peer_msix_good || + !HAS_FEATURE(NTB_SB01BASE_LOCKUP))); KASSERT(ntb->type == NTB_ATOM, ("ntb type")); return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) 
== 0); @@ -1881,6 +2159,8 @@ ntb_link_enable(struct ntb_softc *ntb, enum ntb_speed s __unused, { uint32_t cntl; + ntb_printf(2, "%s\n", __func__); + if (ntb->type == NTB_ATOM) { pci_write_config(ntb->device, NTB_PPD_OFFSET, ntb->ppd | ATOM_PPD_INIT_LINK, 4); @@ -1919,6 +2199,8 @@ ntb_link_disable(struct ntb_softc *ntb) { uint32_t cntl; + ntb_printf(2, "%s\n", __func__); + if (ntb->conn_type == NTB_CONN_TRANSPARENT) { ntb_link_event(ntb); return (0); @@ -1934,6 +2216,23 @@ ntb_link_disable(struct ntb_softc *ntb) return (0); } +bool +ntb_link_enabled(struct ntb_softc *ntb) +{ + uint32_t cntl; + + if (ntb->type == NTB_ATOM) { + cntl = pci_read_config(ntb->device, NTB_PPD_OFFSET, 4); + return ((cntl & ATOM_PPD_INIT_LINK) != 0); + } + + if (ntb->conn_type == NTB_CONN_TRANSPARENT) + return (true); + + cntl = ntb_reg_read(4, ntb->reg->ntb_ctl); + return ((cntl & NTB_CNTL_LINK_DISABLE) == 0); +} + static void recover_atom_link(void *arg) { @@ -2002,6 +2301,19 @@ ntb_poll_link(struct ntb_softc *ntb) return (false); ntb->lnk_sta = reg_val; + + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + if (_xeon_link_is_up(ntb)) { + if (!ntb->peer_msix_good) { + callout_reset(&ntb->peer_msix_work, 0, + ntb_exchange_msix, ntb); + return (false); + } + } else { + ntb->peer_msix_good = false; + ntb->peer_msix_done = false; + } + } } return (true); } @@ -2040,16 +2352,26 @@ SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0, static void ntb_sysctl_init(struct ntb_softc *ntb) { - struct sysctl_oid_list *tree_par, *regpar, *statpar, *errpar; + struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree, *tmptree; ctx = device_get_sysctl_ctx(ntb->device); - - tree = SYSCTL_ADD_NODE(ctx, - SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device)), OID_AUTO, - "debug_info", CTLFLAG_RD, NULL, - "Driver state, statistics, and HW registers"); + globals = SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device)); + + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "link_status", + CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, + sysctl_handle_link_status_human, "A", + "Link status (human readable)"); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "active", + CTLFLAG_RD | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_status, + "IU", "Link status (1=active, 0=inactive)"); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "admin_up", + CTLFLAG_RW | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_admin, + "IU", "Set/get interface status (1=UP, 0=DOWN)"); + + tree = SYSCTL_ADD_NODE(ctx, globals, OID_AUTO, "debug_info", + CTLFLAG_RD, NULL, "Driver state, statistics, and HW registers"); tree_par = SYSCTL_CHILDREN(tree); SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "conn_type", CTLFLAG_RD, @@ -2081,10 +2403,6 @@ ntb_sysctl_init(struct ntb_softc *ntb) __DEVOLATILE(uint32_t *, &ntb->lnk_sta), 0, "LNK STA register (cached)"); - SYSCTL_ADD_PROC(ctx, tree_par, OID_AUTO, "link_status", - CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, sysctl_handle_link_status, - "A", "Link status"); - #ifdef notyet SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "mw_count", CTLFLAG_RD, &ntb->mw_count, 0, "MW count"); @@ -2332,7 +2650,37 @@ sysctl_handle_features(SYSCTL_HANDLER_ARGS) } static int -sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) +sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS) +{ + struct ntb_softc *ntb; + unsigned old, new; + int error; + + error = 0; + ntb = arg1; + + old = ntb_link_enabled(ntb); + + error = SYSCTL_OUT(req, &old, sizeof(old)); + if (error != 0 || req->newptr == NULL) + return (error); + + error = SYSCTL_IN(req, &new, sizeof(new)); + 
if (error != 0) + return (error); + + ntb_printf(0, "Admin set interface state to '%sabled'\n", + (new != 0)? "en" : "dis"); + + if (new != 0) + error = ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + else + error = ntb_link_disable(ntb); + return (error); +} + +static int +sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS) { struct ntb_softc *ntb; struct sbuf sb; @@ -2360,6 +2708,24 @@ sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) } static int +sysctl_handle_link_status(SYSCTL_HANDLER_ARGS) +{ + struct ntb_softc *ntb; + unsigned res; + int error; + + error = 0; + ntb = arg1; + + res = ntb_link_is_up(ntb, NULL, NULL); + + error = SYSCTL_OUT(req, &res, sizeof(res)); + if (error || !req->newptr) + return (error); + return (EINVAL); +} + +static int sysctl_handle_register(SYSCTL_HANDLER_ARGS) { struct ntb_softc *ntb; @@ -2434,12 +2800,70 @@ static unsigned ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx) { - if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && - uidx >= ntb->b2b_mw_idx) - return (uidx + 1); + if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && + uidx >= ntb->b2b_mw_idx) || + (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx)) + uidx++; + if ((ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 && + uidx >= ntb->b2b_mw_idx) && + (ntb->msix_mw_idx != B2B_MW_DISABLED && uidx >= ntb->msix_mw_idx)) + uidx++; return (uidx); } +static void +ntb_exchange_msix(void *ctx) +{ + struct ntb_softc *ntb; + uint32_t val; + unsigned i; + + ntb = ctx; + + if (ntb->peer_msix_done) + goto msix_done; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + ntb_peer_spad_write(ntb, NTB_MSIX_DATA0 + i, + ntb->msix_data[i].nmd_data); + ntb_peer_spad_write(ntb, NTB_MSIX_OFS0 + i, + ntb->msix_data[i].nmd_ofs); + } + ntb_peer_spad_write(ntb, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD); + + ntb_spad_read(ntb, NTB_MSIX_GUARD, &val); + if (val != NTB_MSIX_VER_GUARD) + goto reschedule; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + ntb_spad_read(ntb, NTB_MSIX_DATA0 + i, &val); + ntb->peer_msix_data[i].nmd_data = val; + ntb_spad_read(ntb, NTB_MSIX_OFS0 + i, &val); + ntb->peer_msix_data[i].nmd_ofs = val; + } + + ntb->peer_msix_done = true; + +msix_done: + ntb_peer_spad_write(ntb, NTB_MSIX_DONE, NTB_MSIX_RECEIVED); + ntb_spad_read(ntb, NTB_MSIX_DONE, &val); + if (val != NTB_MSIX_RECEIVED) + goto reschedule; + + ntb->peer_msix_good = true; + + ntb_poll_link(ntb); + ntb_link_event(ntb); + return; + +reschedule: + ntb->lnk_sta = pci_read_config(ntb->device, ntb->reg->lnk_sta, 2); + if (_xeon_link_is_up(ntb)) + callout_reset(&ntb->peer_msix_work, hz / 100, ntb_exchange_msix, ntb); + else + ntb_spad_clear(ntb); +} + /* * Public API to the rest of the OS */ @@ -2469,10 +2893,14 @@ ntb_get_max_spads(struct ntb_softc *ntb) uint8_t ntb_mw_count(struct ntb_softc *ntb) { + uint8_t res; + res = ntb->mw_count; if (ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0) - return (ntb->mw_count - 1); - return (ntb->mw_count); + res--; + if (ntb->msix_mw_idx != B2B_MW_DISABLED) + res--; + return (res); } /** @@ -2498,6 +2926,18 @@ ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val) return (0); } +/* + * Zeros the local scratchpad. 
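ntb_exchange_msix() above is a scratchpad handshake: each side publishes its MSI-X data/offset pairs plus a guard word into the peer's scratchpads, waits until the peer's guard shows up locally, copies the peer's values, and finally acknowledges with a DONE word; anything missing causes the callout to reschedule. The shape of that protocol can be sketched in plain C with two in-memory scratchpad arrays standing in for the hardware registers (purely illustrative; none of these names are the driver's API, and the driver retries via the peer_msix_work callout rather than a loop):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GUARD 0xaabbccdd    /* mirrors NTB_MSIX_VER_GUARD */
#define DONE  0xe0f0e0f0    /* mirrors NTB_MSIX_RECEIVED */
enum { SPAD_GUARD, SPAD_DATA0, SPAD_OFS0, SPAD_DONE, SPAD_MAX };

struct side {
    uint32_t spad[SPAD_MAX];    /* local scratchpad, written by the peer */
    struct side *peer;
    bool done, good;
};

/* One pass of the handshake; returns true once this side is synchronized. */
static bool
exchange(struct side *s, uint32_t my_data, uint32_t my_ofs)
{
    if (!s->done) {
        s->peer->spad[SPAD_DATA0] = my_data;
        s->peer->spad[SPAD_OFS0] = my_ofs;
        s->peer->spad[SPAD_GUARD] = GUARD;
        if (s->spad[SPAD_GUARD] != GUARD)
            return (false);       /* peer hasn't published yet */
        s->done = true;
    }
    s->peer->spad[SPAD_DONE] = DONE;
    if (s->spad[SPAD_DONE] != DONE)
        return (false);
    s->good = true;
    return (true);
}

int
main(void)
{
    struct side a = { .peer = NULL }, b = { .peer = NULL };
    int round = 0;

    a.peer = &b;
    b.peer = &a;
    while (!(a.good && b.good) && round < 5) {
        exchange(&a, 0x4021, 0x10);
        exchange(&b, 0x4022, 0x20);
        round++;
    }
    printf("synchronized after %d rounds\n", round);
    return (0);
}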
+ */ +void +ntb_spad_clear(struct ntb_softc *ntb) +{ + unsigned i; + + for (i = 0; i < ntb->spad_count; i++) + ntb_spad_write(ntb, i, 0); +} + /** * ntb_spad_read() - read from the primary scratchpad register * @ntb: pointer to ntb_softc instance @@ -2826,6 +3266,22 @@ void ntb_peer_db_set(struct ntb_softc *ntb, uint64_t bit) { + if (HAS_FEATURE(NTB_SB01BASE_LOCKUP)) { + struct ntb_pci_bar_info *lapic; + unsigned i; + + lapic = ntb->peer_lapic_bar; + + for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) { + if ((bit & ntb_db_vector_mask(ntb, i)) != 0) + bus_space_write_4(lapic->pci_bus_tag, + lapic->pci_bus_handle, + ntb->peer_msix_data[i].nmd_ofs, + ntb->peer_msix_data[i].nmd_data); + } + return; + } + if (HAS_FEATURE(NTB_SDOORBELL_LOCKUP)) { ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit); return; diff --git a/sys/dev/ntb/ntb_hw/ntb_hw.h b/sys/dev/ntb/ntb_hw/ntb_hw.h index c35166c..f05acda 100644 --- a/sys/dev/ntb/ntb_hw/ntb_hw.h +++ b/sys/dev/ntb/ntb_hw/ntb_hw.h @@ -70,6 +70,7 @@ bool ntb_link_is_up(struct ntb_softc *, enum ntb_speed *, enum ntb_width *); void ntb_link_event(struct ntb_softc *); int ntb_link_enable(struct ntb_softc *, enum ntb_speed, enum ntb_width); int ntb_link_disable(struct ntb_softc *); +bool ntb_link_enabled(struct ntb_softc *); int ntb_set_ctx(struct ntb_softc *, void *, const struct ntb_ctx_ops *); void *ntb_get_ctx(struct ntb_softc *, const struct ntb_ctx_ops **); @@ -86,6 +87,7 @@ int ntb_mw_get_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t *mode); int ntb_mw_set_wc(struct ntb_softc *, unsigned mw_idx, vm_memattr_t mode); uint8_t ntb_get_max_spads(struct ntb_softc *ntb); +void ntb_spad_clear(struct ntb_softc *ntb); int ntb_spad_write(struct ntb_softc *ntb, unsigned int idx, uint32_t val); int ntb_spad_read(struct ntb_softc *ntb, unsigned int idx, uint32_t *val); int ntb_peer_spad_write(struct ntb_softc *ntb, unsigned int idx, diff --git a/sys/dev/ntb/ntb_hw/ntb_regs.h b/sys/dev/ntb/ntb_hw/ntb_regs.h index f50fd93..fb445d7 100644 --- a/sys/dev/ntb/ntb_hw/ntb_regs.h +++ b/sys/dev/ntb/ntb_hw/ntb_regs.h @@ -44,6 +44,7 @@ #define XEON_DB_MSIX_VECTOR_COUNT 4 #define XEON_DB_MSIX_VECTOR_SHIFT 5 #define XEON_DB_LINK_BIT (1 << XEON_DB_LINK) +#define XEON_NONLINK_DB_MSIX_BITS 3 #define XEON_SPCICMD_OFFSET 0x0504 #define XEON_DEVCTRL_OFFSET 0x0598 diff --git a/sys/dev/sound/pci/hda/hdaa.c b/sys/dev/sound/pci/hda/hdaa.c index fe45343..14aee62 100644 --- a/sys/dev/sound/pci/hda/hdaa.c +++ b/sys/dev/sound/pci/hda/hdaa.c @@ -1553,20 +1553,20 @@ hdaa_widget_parse(struct hdaa_widget *w) SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - w, sizeof(w), hdaa_sysctl_caps, "A", "Node capabilities"); + w, 0, hdaa_sysctl_caps, "A", "Node capabilities"); if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { snprintf(buf, sizeof(buf), "nid%d_config", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - &w->wclass.pin.newconf, sizeof(&w->wclass.pin.newconf), - hdaa_sysctl_config, "A", "Current pin configuration"); + &w->wclass.pin.newconf, 0, hdaa_sysctl_config, "A", + "Current pin configuration"); snprintf(buf, sizeof(buf), "nid%d_original", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - &w->wclass.pin.original, sizeof(&w->wclass.pin.original), - 
hdaa_sysctl_config, "A", "Original pin configuration"); + &w->wclass.pin.original, 0, hdaa_sysctl_config, "A", + "Original pin configuration"); } hdaa_lock(w->devinfo); } @@ -6641,38 +6641,32 @@ hdaa_attach(device_t dev) SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - &devinfo->newquirks, sizeof(&devinfo->newquirks), - hdaa_sysctl_quirks, "A", "Configuration options"); + &devinfo->newquirks, 0, hdaa_sysctl_quirks, "A", + "Configuration options"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpi_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpi_state, "A", "GPI state"); + devinfo, 0, hdaa_sysctl_gpi_state, "A", "GPI state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpio_state, "A", "GPIO state"); + devinfo, 0, hdaa_sysctl_gpio_state, "A", "GPIO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpio_config, "A", "GPIO configuration"); + devinfo, 0, hdaa_sysctl_gpio_config, "A", "GPIO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpo_state, "A", "GPO state"); + devinfo, 0, hdaa_sysctl_gpo_state, "A", "GPO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, - devinfo, sizeof(devinfo), - hdaa_sysctl_gpo_config, "A", "GPO configuration"); + devinfo, 0, hdaa_sysctl_gpo_config, "A", "GPO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "reconfig", CTLTYPE_INT | CTLFLAG_RW, - dev, sizeof(dev), - hdaa_sysctl_reconfig, "I", "Reprocess configuration"); + dev, 0, hdaa_sysctl_reconfig, "I", "Reprocess configuration"); bus_generic_attach(dev); return (0); } diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 80940be..2ec45c3 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -1046,10 +1046,8 @@ vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, - flags) == NULL) { - free(vm, M_VMEM); + flags) == NULL) return (NULL); - } return (vm); } diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile index dfddbce..98e685e 100644 --- a/sys/modules/dummynet/Makefile +++ b/sys/modules/dummynet/Makefile @@ -6,8 +6,9 @@ KMOD= dummynet SRCS= ip_dummynet.c SRCS+= ip_dn_glue.c ip_dn_io.c +SRCS+= dn_aqm_codel.c dn_aqm_pie.c SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c -SRCS+= dn_sched_prio.c +SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c SRCS+= opt_inet6.h .if !defined(KERNBUILDDIR) diff --git a/sys/modules/hyperv/utilities/Makefile b/sys/modules/hyperv/utilities/Makefile index f94e441..c1b6d4f 100644 --- a/sys/modules/hyperv/utilities/Makefile +++ b/sys/modules/hyperv/utilities/Makefile @@ -3,7 +3,7 @@ .PATH: ${.CURDIR}/../../../dev/hyperv/utilities 
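The subr_vmem.c hunk above removes the extra free() in vmem_create()'s failure path; the change only makes sense if vmem_init() already releases the arena itself when it fails, in which case freeing again in the caller would be a double free. A generic sketch of that ownership convention, with made-up names rather than the vmem API:

#include <stdio.h>
#include <stdlib.h>

struct arena { int dummy; };

/* On failure the callee releases 'a' itself and returns NULL. */
static struct arena *
arena_init(struct arena *a, int fail)
{
    if (fail) {
        free(a);
        return (NULL);
    }
    return (a);
}

static struct arena *
arena_create(int fail)
{
    struct arena *a;

    a = malloc(sizeof(*a));
    if (a == NULL)
        return (NULL);
    /* Do NOT free 'a' here on failure: arena_init() already did. */
    if (arena_init(a, fail) == NULL)
        return (NULL);
    return (a);
}

int
main(void)
{
    struct arena *a = arena_create(0);

    printf("create %s\n", a != NULL ? "succeeded" : "failed");
    free(a);
    return (0);
}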
KMOD= hv_utils -SRCS= hv_util.c hv_kvp.c +SRCS= hv_util.c hv_kvp.c hv_timesync.c hv_shutdown.c hv_heartbeat.c SRCS+= bus_if.h device_if.h CFLAGS+= -I${.CURDIR}/../../../dev/hyperv/include \ diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index 202f1e2..377b5b0 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -29,7 +29,7 @@ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H - +#define NEW_AQM /* * Definition of the kernel-userland API for dummynet. * @@ -85,7 +85,13 @@ enum { /* special commands for emulation of sysctl variables */ DN_SYSCTL_GET, DN_SYSCTL_SET, - +#ifdef NEW_AQM + /* subtypes used for setting/getting extra parameters. + * these subtypes used with IP_DUMMYNET3 command (get) + * and DN_TEXT (set). */ + DN_AQM_PARAMS, /* AQM extra params */ + DN_SCH_PARAMS, /* scheduler extra params */ +#endif DN_LAST, }; @@ -105,6 +111,9 @@ enum { /* user flags */ DN_IS_RED = 0x0020, DN_IS_GENTLE_RED= 0x0040, DN_IS_ECN = 0x0080, + #ifdef NEW_AQM + DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */ + #endif DN_PIPE_CMD = 0x1000, /* pipe config... */ }; @@ -210,7 +219,19 @@ struct dn_profile { int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; - +#ifdef NEW_AQM +/* Extra parameters for AQM and scheduler. + * This struct is used to pass and retrieve parameters (configurations) + * to/from AQM and Scheduler. + */ +struct dn_extra_parms { + struct dn_id oid; + char name[16]; + uint32_t nr; +#define DN_MAX_EXTRA_PARM 10 + int64_t par[DN_MAX_EXTRA_PARM]; +}; +#endif /* * Overall structure of dummynet diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 26b1788..f5b0fee 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -350,7 +350,7 @@ do { \ if ((head) != (sav)) { \ ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ (name), (head), (sav))); \ - continue; \ + break; \ } \ } while (0) diff --git a/sys/netpfil/ipfw/dn_aqm.h b/sys/netpfil/ipfw/dn_aqm.h new file mode 100644 index 0000000..d01e98e --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * API for writing an Active Queue Management algorithm for Dummynet + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_AQM_H +#define _IP_DN_AQM_H + + +/* NOW is the current time in millisecond*/ +#define NOW ((dn_cfg.curr_time * tick) / 1000) + +#define AQM_UNOW (dn_cfg.curr_time * tick) +#define AQM_TIME_1US ((aqm_time_t)(1)) +#define AQM_TIME_1MS ((aqm_time_t)(1000)) +#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000)) + +/* aqm time allows to store up to 4294 seconds */ +typedef uint32_t aqm_time_t; +typedef int32_t aqm_stime_t; + +#define DN_AQM_MTAG_TS 55345 + +/* Macro for variable bounding */ +#define BOUND_VAR(x,l,h) ((x) > (h)? (h) : ((x) > (l)? (x) : (l))) + +/* sysctl variable to count number of dropped packets */ +extern unsigned long io_pkt_drop; + +/* + * Structure for holding data and function pointers that together represent a + * AQM algorithm. + */ + struct dn_aqm { +#define DN_AQM_NAME_MAX 50 + char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */ + uint32_t type; /* AQM type number */ + + /* Methods implemented by AQM algorithm: + * + * enqueue enqueue packet 'm' on queue 'q'. + * Return 0 on success, 1 on drop. + * + * dequeue dequeue a packet from queue 'q'. + * Return a packet, NULL if no packet available. + * + * config configure AQM algorithm + * If required, this function should allocate space to store + * the configurations and set 'fs->aqmcfg' to point to this space. + * 'dn_extra_parms' includes array of parameters send + * from ipfw userland command. + * Return 0 on success, non-zero otherwise. + * + * deconfig deconfigure AQM algorithm. + * The allocated configuration memory space should be freed here. + * Return 0 on success, non-zero otherwise. + * + * init initialise AQM status variables of queue 'q' + * This function is used to allocate space and init AQM status for a + * queue and q->aqm_status to point to this space. + * Return 0 on success, non-zero otherwise. + * + * cleanup cleanup AQM status variables of queue 'q' + * The allocated memory space for AQM status should be freed here. + * Return 0 on success, non-zero otherwise. + * + * getconfig retrieve AQM configurations + * This function is used to return AQM parameters to userland + * command. The function should fill 'dn_extra_parms' struct with + * the AQM configurations using 'par' array. + * + */ + + int (*enqueue)(struct dn_queue *, struct mbuf *); + struct mbuf * (*dequeue)(struct dn_queue *); + int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int); + int (*deconfig)(struct dn_fsk *); + int (*init)(struct dn_queue *); + int (*cleanup)(struct dn_queue *); + int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *); + + int ref_count; /*Number of queues instances in the system */ + int cfg_ref_count; /*Number of AQM instances in the system */ + SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */ +}; + +/* Helper function to update queue and scheduler statistics. 
+ * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +update_stats(struct dn_queue *q, int len, int drop) +{ + int inc = 0; + struct dn_flow *sni; + struct dn_flow *qni; + + sni = &q->_si->ni; + qni = &q->ni; + + if (len < 0) + inc = -1; + else if(len > 0) + inc = 1; + + if (drop) { + qni->drops++; + sni->drops++; + io_pkt_drop++; + } else { + /*update queue stats */ + qni->length += inc; + qni->len_bytes += len; + + /*update scheduler instance stats */ + sni->length += inc; + sni->len_bytes += len; + } + /* tot_pkts is updated in dn_enqueue function */ +} + + +/* kernel module related function */ +int +dn_aqm_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNAQM_MODULE(name, dnaqm) \ + static moduledata_t name##_mod = { \ + #name, dn_aqm_modevent, dnaqm \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_codel.c b/sys/netpfil/ipfw/dn_aqm_codel.c new file mode 100644 index 0000000..0080170 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_codel.c @@ -0,0 +1,444 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
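update_stats() above folds the four enqueue/dequeue/drop cases into one helper: a positive length means enqueue, a negative length means dequeue, and the drop flag counts a drop instead of touching the queue lengths. The same convention in a standalone form (the stats structure is flattened here and is not the dummynet type):

#include <stdio.h>

struct qstats {
    long length;    /* packets queued */
    long len_bytes; /* bytes queued */
    long drops;
};

/* len > 0: enqueue, len < 0: dequeue, drop != 0: count a drop instead. */
static void
update_stats(struct qstats *q, int len, int drop)
{
    int inc = (len > 0) - (len < 0);

    if (drop) {
        q->drops++;
        return;
    }
    q->length += inc;
    q->len_bytes += len;
}

int
main(void)
{
    struct qstats q = { 0, 0, 0 };

    update_stats(&q, 1500, 0);   /* enqueue a 1500-byte packet */
    update_stats(&q, -1500, 0);  /* dequeue it again */
    update_stats(&q, 0, 1);      /* tail drop before enqueue */
    printf("len=%ld bytes=%ld drops=%ld\n", q.length, q.len_bytes, q.drops);
    return (0);
}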
+ */ + +#include <sys/cdefs.h> +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> + +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/if_ether.h> /* various ether_* routines */ +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> +#include <netpfil/ipfw/dn_heap.h> + +#ifdef NEW_AQM +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_codel.h> +#include <netpfil/ipfw/dn_sched.h> + +#define DN_AQM_CODEL 1 + +static struct dn_aqm codel_desc; + +/* default codel parameters */ +struct dn_aqm_codel_parms codel_sysctl = {5000 * AQM_TIME_1US, + 100000 * AQM_TIME_1US, 0}; + +static int +codel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.interval = value * AQM_TIME_1US ; + return (0); +} + +static int +codel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + D("%ld", value); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.target = value * AQM_TIME_1US ; + return (0); +} + +/* defining Codel sysctl variables */ +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + codel, CTLFLAG_RW, 0, "CODEL"); + +#ifdef SYSCTL_NODE +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,codel_sysctl_target_handler, "L", + "CoDel target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, codel_sysctl_interval_handler, "L", + "CoDel interval in microsecond"); +#endif + +/* This function computes codel_interval/sqrt(count) + * Newton's method of approximation is used to compute 1/sqrt(count). + * http://betterexplained.com/articles/ + * understanding-quakes-fast-inverse-square-root/ + */ +aqm_time_t +control_law(struct codel_status *cst, struct dn_aqm_codel_parms *cprms, + aqm_time_t t) +{ + uint32_t count; + uint64_t temp; + count = cst->count; + + /* we don't calculate isqrt(1) to get more accurate result*/ + if (count == 1) { + /* prepare isqrt (old guess) for the next iteration i.e. 
1/sqrt(2)*/ + cst->isqrt = (1UL<< FIX_POINT_BITS) * 7/10; + /* return time + isqrt(1)*interval */ + return t + cprms->interval; + } + + /* newguess = g(1.5 - 0.5*c*g^2) + * Multiplying both sides by 2 to make all the constants intergers + * newguess * 2 = g(3 - c*g^2) g=old guess, c=count + * So, newguess = newguess /2 + * Fixed point operations are used here. + */ + + /* Calculate g^2 */ + temp = (uint32_t) cst->isqrt * cst->isqrt; + /* Calculate (3 - c*g^2) i.e. (3 - c * temp) */ + temp = (3ULL<< (FIX_POINT_BITS*2)) - (count * temp); + + /* + * Divide by 2 because we multiplied the original equation by two + * Also, we shift the result by 8 bits to prevent overflow. + * */ + temp >>= (1 + 8); + + /* Now, temp = (1.5 - 0.5*c*g^2) + * Calculate g (1.5 - 0.5*c*g^2) i.e. g * temp + */ + temp = (cst->isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8); + cst->isqrt = temp; + + /* calculate codel_interval/sqrt(count) */ + return t + ((cprms->interval * temp) >> FIX_POINT_BITS); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * Also extract packet's timestamp from mtag. + */ +struct mbuf * +codel_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("Codel timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + +/* + * Enqueue a packet 'm' in queue 'q' + */ +static int +aqm_codel_enqueue(struct dn_queue *q, struct mbuf *m) +{ + struct dn_fs *f; + uint64_t len; + struct codel_status *cst; /*codel status variables */ + struct m_tag *mtag; + + f = &(q->fs->fs); + len = m->m_pkthdr.len; + cst = q->aqm_status; + if(!cst) { + D("Codel queue is not initialized\n"); + goto drop; + } + + /* Finding maximum packet size */ + // XXX we can get MTU from driver instead + if (len > cst->maxpkt_size) + cst->maxpkt_size = len; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (f->flags & DN_QSIZE_BYTES) { + if ( q->ni.len_bytes > f->qsize) + goto drop; + } + else { + if ( q->ni.length >= f->qsize) + goto drop; + } + + /* Add timestamp as mtag */ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + +drop: + update_stats(q, 0, 1); + FREE_PKT(m); + return (1); +} + + +/* Dequeue a pcaket from queue q */ +static struct mbuf * +aqm_codel_dequeue(struct dn_queue *q) +{ + return codel_dequeue(q); +} + +/* + * initialize Codel for queue 'q' + * First allocate memory for codel status. 
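control_law() above computes interval/sqrt(count) without floating point by keeping a Q16 fixed-point estimate of 1/sqrt(count) and refining it with one Newton step, g' = g * (3 - c*g^2) / 2, per call. A self-contained rendition of that iteration, printed next to the floating-point value it approximates (only FIX_POINT_BITS is taken from the header; the rest is illustrative, and, as in the driver, one refinement per call only tracks the exact value approximately because count normally grows by one drop at a time):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define FIX_POINT_BITS 16

/* One Newton refinement of isqrt ~= 1/sqrt(count), in Q16 fixed point. */
static uint32_t
isqrt_step(uint32_t isqrt, uint32_t count)
{
    uint64_t temp;

    temp = (uint64_t)isqrt * isqrt;                        /* g^2 */
    temp = (3ULL << (FIX_POINT_BITS * 2)) - count * temp;  /* 3 - c*g^2 */
    temp >>= 1 + 8;                                        /* /2, drop 8 bits */
    return ((uint32_t)((isqrt * temp) >> (2 * FIX_POINT_BITS - 8)));
}

int
main(void)
{
    uint32_t isqrt = (1UL << FIX_POINT_BITS) * 7 / 10;     /* seed ~ 1/sqrt(2) */
    uint32_t count;

    for (count = 2; count <= 6; count++) {
        isqrt = isqrt_step(isqrt, count);
        printf("count=%u fixed=%.4f exact=%.4f\n", count,
            (double)isqrt / (1 << FIX_POINT_BITS), 1.0 / sqrt((double)count));
    }
    return (0);
}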
+ */ +static int +aqm_codel_init(struct dn_queue *q) +{ + struct codel_status *cst; + + if (!q->fs->aqmcfg) { + D("Codel is not configure!d"); + return EINVAL; + } + + q->aqm_status = malloc(sizeof(struct codel_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("Cannot allocate AQM_codel private data"); + return ENOMEM ; + } + + /* init codel status variables */ + cst = q->aqm_status; + cst->dropping=0; + cst->first_above_time=0; + cst->drop_next_time=0; + cst->count=0; + cst->maxpkt_size = 500; + + /* increase reference counters */ + codel_desc.ref_count++; + + return 0; +} + +/* + * Clean up Codel status for queue 'q' + * Destroy memory allocated for codel status. + */ +static int +aqm_codel_cleanup(struct dn_queue *q) +{ + + if (q && q->aqm_status) { + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + /* decrease reference counters */ + codel_desc.ref_count--; + } + else + D("Codel already cleaned up"); + return 0; +} + +/* + * Config codel parameters + * also allocate memory for codel configurations + */ +static int +aqm_codel_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_codel_parms *ccfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the original allocation + * not the same size as the new one (different AQM type). + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_codel_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate AQM_codel configuration parameters"); + return ENOMEM; + } + + /* configure codel parameters */ + ccfg = fs->aqmcfg; + + if (ep->par[0] < 0) + ccfg->target = codel_sysctl.target; + else + ccfg->target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + ccfg->interval = codel_sysctl.interval; + else + ccfg->interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + ccfg->flags = 0; + else + ccfg->flags = ep->par[2]; + + /* bound codel configurations */ + ccfg->target = BOUND_VAR(ccfg->target,1, 5 * AQM_TIME_1S); + ccfg->interval = BOUND_VAR(ccfg->interval,1, 5 * AQM_TIME_1S); + /* increase config reference counter */ + codel_desc.cfg_ref_count++; + + return 0; +} + +/* + * Deconfigure Codel and free memory allocation + */ +static int +aqm_codel_deconfig(struct dn_fsk* fs) +{ + + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + fs->aqmfp = NULL; + /* decrease config reference counter */ + codel_desc.cfg_ref_count--; + } + + return 0; +} + +/* + * Retrieve Codel configuration parameters. 
+ */ +static int +aqm_codel_getconfig(struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_codel_parms *ccfg; + + if (fs->aqmcfg) { + strcpy(ep->name, codel_desc.name); + ccfg = fs->aqmcfg; + ep->par[0] = ccfg->target / AQM_TIME_1US; + ep->par[1] = ccfg->interval / AQM_TIME_1US; + ep->par[2] = ccfg->flags; + return 0; + } + return 1; +} + +static struct dn_aqm codel_desc = { + _SI( .type = ) DN_AQM_CODEL, + _SI( .name = ) "CODEL", + _SI( .enqueue = ) aqm_codel_enqueue, + _SI( .dequeue = ) aqm_codel_dequeue, + _SI( .config = ) aqm_codel_config, + _SI( .getconfig = ) aqm_codel_getconfig, + _SI( .deconfig = ) aqm_codel_deconfig, + _SI( .init = ) aqm_codel_init, + _SI( .cleanup = ) aqm_codel_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_codel, &codel_desc); + + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_codel.h b/sys/netpfil/ipfw/dn_aqm_codel.h new file mode 100644 index 0000000..f5618e7 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_codel.h @@ -0,0 +1,222 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_AQM_CODEL_H +#define _IP_DN_AQM_CODEL_H + + +// XXX How to choose MTAG? 
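aqm_codel_config() and aqm_codel_getconfig() above translate between the userland dn_extra_parms.par[] array and kernel time units: par[0] and par[1] carry target and interval in microseconds, with negative values meaning "use the sysctl default", and par[2] carries the flags. A simplified userspace sketch of that translation (struct and defaults reduced to the essentials; not the dummynet types):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t aqm_time_t;
#define AQM_TIME_1US ((aqm_time_t)1)

struct codel_parms {
    aqm_time_t target;
    aqm_time_t interval;
    uint32_t flags;
};

/* Defaults matching the codel_sysctl initializer above: 5 ms / 100 ms. */
static const struct codel_parms defaults = {
    5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, 0
};

static void
codel_config(struct codel_parms *cfg, const int64_t par[3])
{
    cfg->target = par[0] < 0 ? defaults.target :
        (aqm_time_t)(par[0] * AQM_TIME_1US);
    cfg->interval = par[1] < 0 ? defaults.interval :
        (aqm_time_t)(par[1] * AQM_TIME_1US);
    cfg->flags = par[2] < 0 ? 0 : (uint32_t)par[2];
}

int
main(void)
{
    /* 20 ms target, default interval, ECN flag set. */
    int64_t par[3] = { 20000, -1, 1 };
    struct codel_parms cfg;

    codel_config(&cfg, par);
    printf("target=%uus interval=%uus flags=%#x\n",
        cfg.target, cfg.interval, cfg.flags);
    return (0);
}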
+#define FIX_POINT_BITS 16 + +enum { + CODEL_ECN_ENABLED = 1 +}; + +/* Codel parameters */ +struct dn_aqm_codel_parms { + aqm_time_t target; + aqm_time_t interval; + uint32_t flags; +}; + +/* codel status variables */ +struct codel_status { + uint32_t count; /* number of dropped pkts since entering drop state */ + uint16_t dropping; /* dropping state */ + aqm_time_t drop_next_time; /* time for next drop */ + aqm_time_t first_above_time; /* time for first ts over target we observed */ + uint16_t isqrt; /* last isqrt for control low */ + uint16_t maxpkt_size; /* max packet size seen so far */ +}; + +struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *); +aqm_time_t control_law(struct codel_status *, + struct dn_aqm_codel_parms *, aqm_time_t ); + +__inline static struct mbuf * +codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop) +{ + struct mbuf * m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = codel_extract_head(q, &pkt_ts); + + cst = q->aqm_status; + + if (m == NULL) { + /* queue is empty - we can't be above target */ + cst->first_above_time= 0; + return m; + } + + cprms = q->fs->aqmcfg; + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) { + /* went below - stay below for at least interval */ + cst->first_above_time = 0; + } else { + if (cst->first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + cst->first_above_time = now + cprms->interval; + } else if (now >= cst->first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* + * Dequeue a packet from queue 'q' + */ +__inline static struct mbuf * +codel_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + + cst = q->aqm_status;; + cprms = q->fs->aqmcfg; + now = AQM_UNOW; + + m = codel_dodequeue(q, now, &ok_to_drop); + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + /* + * Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. 
*/ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + return m; + } + + /* drop the packet */ + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? + cst->count - 2 : 1; + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_pie.c b/sys/netpfil/ipfw/dn_aqm_pie.c new file mode 100644 index 0000000..c4b9401 --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_pie.c @@ -0,0 +1,793 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
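codel_dodequeue()/codel_dequeue() above implement the CoDel state machine: a packet only becomes a drop candidate once its sojourn time has stayed above target for a full interval, tracked in first_above_time, and the real code additionally treats a backlog of at most one MTU as "below target" so the link never starves. The "ok to drop" test can be exercised on its own with synthetic timestamps (self-contained sketch, not the dummynet queue types):

#include <stdint.h>
#include <stdio.h>

#define TARGET   5000u      /* 5 ms, in microseconds */
#define INTERVAL 100000u    /* 100 ms */

static uint32_t first_above_time;

/* Return 1 when the sojourn time has been above TARGET for a full INTERVAL. */
static int
ok_to_drop(uint32_t sojourn, uint32_t now)
{
    if (sojourn < TARGET) {
        first_above_time = 0;   /* went below: reset the timer */
        return (0);
    }
    if (first_above_time == 0) {
        first_above_time = now + INTERVAL;  /* just went above */
        return (0);
    }
    return (now >= first_above_time);
}

int
main(void)
{
    uint32_t now;

    /* Sojourn time stuck at 8 ms: drops become eligible after 100 ms. */
    for (now = 0; now <= 200000; now += 50000)
        printf("t=%6uus ok_to_drop=%d\n", now, ok_to_drop(8000, now));
    return (0);
}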
+ */ + +#include <sys/cdefs.h> +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> + +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/if_ether.h> /* various ether_* routines */ +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> +#include <netpfil/ipfw/dn_heap.h> + +#ifdef NEW_AQM +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_pie.h> +#include <netpfil/ipfw/dn_sched.h> + +/* for debugging */ +#include <sys/syslog.h> + +static struct dn_aqm pie_desc; + +/* PIE defaults + * target=15ms, tupdate=15ms, max_burst=150ms, + * max_ecnth=0.1, alpha=0.125, beta=1.25, + */ +struct dn_aqm_pie_parms pie_sysctl = + { 15 * AQM_TIME_1MS, 15 * AQM_TIME_1MS, 150 * AQM_TIME_1MS, + PIE_SCALE/10 , PIE_SCALE * 0.125, PIE_SCALE * 1.25 , + PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED | PIE_DERAND_ENABLED }; + +static int +pie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = pie_sysctl.alpha; + else + value = pie_sysctl.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + pie_sysctl.alpha = value; + else + pie_sysctl.beta = value; + return (0); +} + +static int +pie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = pie_sysctl.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = pie_sysctl.tupdate; + else + value = pie_sysctl.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + pie_sysctl.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + pie_sysctl.tupdate = value; + else + pie_sysctl.max_burst = value; + return (0); +} + +static int +pie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = pie_sysctl.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + pie_sysctl.max_ecnth = value; + return (0); +} + +/* define PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + pie, CTLFLAG_RW, 0, "PIE"); + +#ifdef SYSCTL_NODE 
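The PIE sysctl handlers above expose alpha, beta and max_ecnth as integers scaled by 1000 while storing them internally as PIE_SCALE fixed-point fractions, converting on every read and write. PIE_SCALE itself is defined in dn_aqm_pie.h and is not visible in this hunk, so the sketch below uses a stand-in value purely to show the round trip:

#include <stdio.h>

/* Stand-in fixed-point unit; the real PIE_SCALE lives in dn_aqm_pie.h. */
#define PIE_SCALE 4096L

/* sysctl read: internal fixed point -> value scaled by 1000 */
static long
to_sysctl(long fixed)
{
    return (fixed * 1000 / PIE_SCALE);
}

/* sysctl write: value scaled by 1000 -> internal fixed point */
static long
from_sysctl(long milli)
{
    return (milli * PIE_SCALE / 1000);
}

int
main(void)
{
    long alpha = PIE_SCALE * 0.125;   /* default from the patch */
    long beta = PIE_SCALE * 1.25;

    printf("alpha: internal=%ld sysctl=%ld\n", alpha, to_sysctl(alpha));
    printf("beta : internal=%ld sysctl=%ld\n", beta, to_sysctl(beta));
    printf("write 125 -> internal %ld\n", from_sysctl(125));
    return (0);
}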
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "PIE alpha scaled by 1000"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "beta scaled by 1000"); +#endif + + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of PIE + * status variables as an argument + */ +static void +calculate_drop_prob(void *x) +{ + int64_t p, prob, oldprob; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst = (struct pie_status *) x; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(x, M_DUMMYNET); + //pst->pq->aqm_status = NULL; + pie_desc.ref_count--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)pst->pq->ni.len_bytes * + pst->avg_dq_time) >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */ + p >>= 11 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */ + p >>= 9 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */ + p >>= 7 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */ + p >>= 5 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */ + p >>= 3 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */ + p >>= 1 + PIE_FIX_POINT_BITS + 12; + else + p >>= PIE_FIX_POINT_BITS + 12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1- 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication overflow/underflow */ + if (p>0) { + if (prob<oldprob) { + D("overflow"); + prob= PIE_MAX_PROB; + } + 
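+		/*
+		 * p and prob are int64_t, so with p > 0 a wrap past the
+		 * 63-bit range shows up as prob < oldprob; the probability
+		 * is then pinned to PIE_MAX_PROB, and the mirror case below
+		 * handles p < 0 by pinning it to 0.
+		 */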
} + else + if (prob>oldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current queue delay value in old queue delay*/ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance>0) { + + if (pst->burst_allowance > pprms->tupdate ) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + /* reschedule calculate_drop_prob function */ + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, 0, calculate_drop_prob, pst, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +static struct mbuf * +pie_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts, int getts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Initiate PIE variable and optionally activate it + */ +__inline static void +init_activate_pie(struct pie_status *pst, int resettimer) +{ + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT; + pst->measurement_start = AQM_UNOW; + + if (resettimer) { + pst->sflags |= PIE_ACTIVE; + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, calculate_drop_prob, pst, 0); + } + //DX(2, "PIE Activated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Deactivate PIE and stop probe update callout + */ +__inline static void +deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Dequeue and return a pcaket from queue 'q' or NULL if 'q' is empty. 
+ * Also, caculate depature time or queue delay using timestamp + */ +static struct mbuf * +aqm_pie_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_flow *ni; /* stats for scheduler instance */ + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = q->aqm_status; + pprms = pst->parms; + ni = &q->_si->ni; + + /*we extarct packet ts only when Departure Rate Estimation dis not used*/ + m = pie_extract_head(q, &pkt_ts, !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average depature time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start new measurment cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->ni.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + +/* + * Enqueue a packet in q, subject to space and PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +static int +aqm_pie_enqueue(struct dn_queue *q, struct mbuf* m) +{ + struct dn_fs *f; + uint64_t len; + uint32_t qlen; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = q->aqm_status; + if(!pst) { + DX(2, "PIE queue is not initialized\n"); + update_stats(q, 0, 1); + FREE_PKT(m); + return 1; + } + + f = &(q->fs->fs); + pprms = pst->parms; + t = ENQUE; + + /* get current queue length in bytes or packets*/ + qlen = (f->flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (qlen >= f->qsize) + t = DROP; + /* drop/mark the packet when PIE is active and burst time elapsed */ + else if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance==0 + && drop_early(pst, q->ni.len_bytes) == DROP) { + /* + * if drop_prob over ECN threshold, drop the packet + * otherwise mark and enqueue it. 
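+		 * With the default max_ecnth of PIE_SCALE / 10, the shift by
+		 * (PIE_PROB_BITS - PIE_FIX_POINT_BITS) = 18 bits below places
+		 * the threshold at roughly 10% of PIE_MAX_PROB, i.e. on the
+		 * same scale as drop_prob.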
+ */ + if ((pprms->flags & PIE_ECN_ENABLED) && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && qlen >= pst->one_third_q_size) { + init_activate_pie(pst, 1); + } + + /* Reset burst tolerance and optinally turn PIE off*/ + if ((pst->sflags & PIE_ACTIVE) && pst->drop_prob == 0 && + pst->current_qdelay < (pprms->qdelay_ref >> 1) && + pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if ((pprms->flags & PIE_ON_OFF_MODE_ENABLED) && qlen<=0) + deactivate_pie(pst); + } + + /* Timestamp the packet if Departure Rate Estimation is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + t = DROP; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + + if (t != DROP) { + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + } else { + update_stats(q, 0, 1); + + /* reset accu_prob after packet drop */ + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + return 0; +} + +/* + * initialize PIE for queue 'q' + * First allocate memory for PIE status. + */ +static int +aqm_pie_init(struct dn_queue *q) +{ + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int err = 0; + + pprms = q->fs->aqmcfg; + + do { /* exit with break when error occurs*/ + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + break; + } + + q->aqm_status = malloc(sizeof(struct pie_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("cannot allocate PIE private data"); + err = ENOMEM ; + break; + } + + pst = q->aqm_status; + /* increase reference count for PIE module */ + pie_desc.ref_count++; + + pst->pq = q; + pst->parms = pprms; + + /* For speed optimization, we caculate 1/3 queue size once here */ + // we can use x/3 = (x >>2) + (x >>4) + (x >>7) + pst->one_third_q_size = q->fs->fs.qsize/3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + + pst->current_qdelay = 0; + init_activate_pie(pst, !(pprms->flags & PIE_ON_OFF_MODE_ENABLED)); + + //DX(2, "aqm_PIE_init"); + + } while(0); + + return err; +} + +/* + * Clean up PIE status for queue 'q' + * Destroy memory allocated for PIE status. 
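+ * Returns 0 once the status has been freed, or EBUSY when the
+ * drop-probability callout could not be stopped; in that case the callout
+ * itself frees the status the next time it runs.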
+ */ +static int +aqm_pie_cleanup(struct dn_queue *q) +{ + + if(!q) { + D("q is null"); + return 0; + } + struct pie_status *pst = q->aqm_status; + if(!pst) { + //D("queue is already cleaned up"); + return 0; + } + if(!q->fs || !q->fs->aqmcfg) { + D("fs is null or no cfg"); + return 1; + } + if (q->fs->aqmfp && q->fs->aqmfp->type !=DN_AQM_PIE) { + D("Not PIE fs (%d)", q->fs->fs.fs_nr); + return 1; + } + + mtx_lock(&pst->lock_mtx); + + /* stop callout timer */ + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + pie_desc.ref_count--; + return 0; + } else { + q->aqm_status = NULL; + mtx_unlock(&pst->lock_mtx); + DX(2, "PIE callout has not been stoped from cleanup!"); + return EBUSY; + } + return 0; +} + +/* + * Config PIE parameters + * also allocate memory for PIE configurations + */ +static int +aqm_pie_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_pie_parms *pcfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the orignal allocation + * was used for diffirent AQM type. + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_pie_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate PIE configuration parameters"); + return ENOMEM; + } + + /* par array contains pie configuration as follow + * 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + */ + + /* configure PIE parameters */ + pcfg = fs->aqmcfg; + + if (ep->par[0] < 0) + pcfg->qdelay_ref = pie_sysctl.qdelay_ref * AQM_TIME_1US; + else + pcfg->qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + pcfg->tupdate = pie_sysctl.tupdate * AQM_TIME_1US; + else + pcfg->tupdate = ep->par[1]; + if (ep->par[2] < 0) + pcfg->max_burst = pie_sysctl.max_burst * AQM_TIME_1US; + else + pcfg->max_burst = ep->par[2]; + if (ep->par[3] < 0) + pcfg->max_ecnth = pie_sysctl.max_ecnth; + else + pcfg->max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + pcfg->alpha = pie_sysctl.alpha; + else + pcfg->alpha = ep->par[4]; + if (ep->par[5] < 0) + pcfg->beta = pie_sysctl.beta; + else + pcfg->beta = ep->par[5]; + if (ep->par[6] < 0) + pcfg->flags = pie_sysctl.flags; + else + pcfg->flags = ep->par[6]; + + /* bound PIE configurations */ + pcfg->qdelay_ref = BOUND_VAR(pcfg->qdelay_ref, 1, 10 * AQM_TIME_1S); + pcfg->tupdate = BOUND_VAR(pcfg->tupdate, 1, 10 * AQM_TIME_1S); + pcfg->max_burst = BOUND_VAR(pcfg->max_burst, 0, 10 * AQM_TIME_1S); + pcfg->max_ecnth = BOUND_VAR(pcfg->max_ecnth, 0, PIE_SCALE); + pcfg->alpha = BOUND_VAR(pcfg->alpha, 0, 7 * PIE_SCALE); + pcfg->beta = BOUND_VAR(pcfg->beta, 0 , 7 * PIE_SCALE); + + pie_desc.cfg_ref_count++; + //D("pie cfg_ref_count=%d", pie_desc.cfg_ref_count); + return 0; +} + +/* + * Deconfigure PIE and free memory allocation + */ +static int +aqm_pie_deconfig(struct dn_fsk* fs) +{ + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + pie_desc.cfg_ref_count--; + } + return 0; +} + +/* + * Retrieve PIE configuration parameters. 
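+ * Time values are reported back in microseconds, while max_ecnth, alpha,
+ * beta and the flags are returned in their internal fixed-point/bitmask
+ * form.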
+ */ +static int +aqm_pie_getconfig (struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_pie_parms *pcfg; + if (fs->aqmcfg) { + strcpy(ep->name, pie_desc.name); + pcfg = fs->aqmcfg; + ep->par[0] = pcfg->qdelay_ref / AQM_TIME_1US; + ep->par[1] = pcfg->tupdate / AQM_TIME_1US; + ep->par[2] = pcfg->max_burst / AQM_TIME_1US; + ep->par[3] = pcfg->max_ecnth; + ep->par[4] = pcfg->alpha; + ep->par[5] = pcfg->beta; + ep->par[6] = pcfg->flags; + + return 0; + } + return 1; +} + +static struct dn_aqm pie_desc = { + _SI( .type = ) DN_AQM_PIE, + _SI( .name = ) "PIE", + _SI( .ref_count = ) 0, + _SI( .cfg_ref_count = ) 0, + _SI( .enqueue = ) aqm_pie_enqueue, + _SI( .dequeue = ) aqm_pie_dequeue, + _SI( .config = ) aqm_pie_config, + _SI( .deconfig = ) aqm_pie_deconfig, + _SI( .getconfig = ) aqm_pie_getconfig, + _SI( .init = ) aqm_pie_init, + _SI( .cleanup = ) aqm_pie_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_pie, &pie_desc); +#endif diff --git a/sys/netpfil/ipfw/dn_aqm_pie.h b/sys/netpfil/ipfw/dn_aqm_pie.h new file mode 100644 index 0000000..aa2fceb --- /dev/null +++ b/sys/netpfil/ipfw/dn_aqm_pie.h @@ -0,0 +1,153 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _IP_DN_AQM_PIE_H +#define _IP_DN_AQM_PIE_H + +#define DN_AQM_PIE 2 +#define PIE_DQ_THRESHOLD_BITS 14 +/* 2^14 =16KB */ +#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS) +#define MEAN_PKTSIZE 800 + +/* 31-bits because random() generates range from 0->(2**31)-1 */ +#define PIE_PROB_BITS 31 +#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1) + +/* for 16-bits, we have 3-bits for integer part and 13-bits for fraction */ +#define PIE_FIX_POINT_BITS 13 +#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS) + + +/* PIE options */ +enum { + PIE_ECN_ENABLED =1, + PIE_CAPDROP_ENABLED = 2, + PIE_ON_OFF_MODE_ENABLED = 4, + PIE_DEPRATEEST_ENABLED = 8, + PIE_DERAND_ENABLED = 16 +}; + +/* PIE parameters */ +struct dn_aqm_pie_parms { + aqm_time_t qdelay_ref; /* AQM Latency Target (default: 15ms) */ + aqm_time_t tupdate; /* a period to calculate drop probability (default:15ms) */ + aqm_time_t max_burst; /* AQM Max Burst Allowance (default: 150ms) */ + uint16_t max_ecnth; /*AQM Max ECN Marking Threshold (default: 10%) */ + uint16_t alpha; /* (default: 1/8) */ + uint16_t beta; /* (default: 1+1/4) */ + uint32_t flags; /* PIE options */ +}; + +/* PIE status variables */ +struct pie_status{ + struct callout aqm_pie_callout; + aqm_time_t burst_allowance; + uint32_t drop_prob; + aqm_time_t current_qdelay; + aqm_time_t qdelay_old; + uint64_t accu_prob; + aqm_time_t measurement_start; + aqm_time_t avg_dq_time; + uint32_t dq_count; + uint32_t sflags; + struct dn_aqm_pie_parms *parms; /* pointer to PIE configurations */ + /* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */ + struct dn_queue *pq; + struct mtx lock_mtx; + uint32_t one_third_q_size; /* 1/3 of queue size, for speed optization */ +}; + +enum { + ENQUE = 1, + DROP, + MARKECN +}; + +/* PIE current state */ +enum { + PIE_ACTIVE = 1, + PIE_INMEASUREMENT = 2 +}; + +/* + * Check if eneque should drop packet to control delay or not based on + * PIe algorithm. + * return DROP if it is time to drop or ENQUE otherwise. + * This function is used by PIE and FQ-PIE. 
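+ * Worked example of the de-randomization below: with drop_prob steady at
+ * PIE_MAX_PROB / 5 (20%) and the queue above the 2 * MEAN_PKTSIZE floor,
+ * accu_prob grows by 0.2 of PIE_MAX_PROB per packet, so the first four
+ * packets stay under the 0.85 threshold and are always enqueued; from the
+ * fifth packet on the random() test gets a chance to drop, and a drop
+ * resets accu_prob to 0.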
+ */ +__inline static int +drop_early(struct pie_status *pst, uint32_t qlen) +{ + struct dn_aqm_pie_parms *pprms; + + pprms = pst->parms; + + /* queue is not congested */ + + if ((pst->qdelay_old < (pprms->qdelay_ref >> 1) + && pst->drop_prob < PIE_MAX_PROB / 5 ) + || qlen <= 2 * MEAN_PKTSIZE) + return ENQUE; + + + if (pst->drop_prob == 0) + pst->accu_prob = 0; + + /* increment accu_prob */ + if (pprms->flags & PIE_DERAND_ENABLED) + pst->accu_prob += pst->drop_prob; + + /* De-randomize option + * if accu_prob < 0.85 -> enqueue + * if accu_prob>8.5 ->drop + * between 0.85 and 8.5 || !De-randomize --> drop on prob + * + * (0.85 = 17/20 ,8.5 = 17/2) + */ + if (pprms->flags & PIE_DERAND_ENABLED) { + if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 17 / 20)) + return ENQUE; + if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 17 / 2)) + return DROP; + } + + if (random() < pst->drop_prob) { + pst->accu_prob = 0; + return DROP; + } + + return ENQUE; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_sched.h b/sys/netpfil/ipfw/dn_sched.h index ab823fe..a359198 100644 --- a/sys/netpfil/ipfw/dn_sched.h +++ b/sys/netpfil/ipfw/dn_sched.h @@ -132,6 +132,10 @@ struct dn_alg { int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); +#ifdef NEW_AQM + /* Getting scheduler extra parameters */ + int (*getconfig)(struct dn_schk *, struct dn_extra_parms *); +#endif /* run-time fields */ int ref_count; /* XXX number of instances in the system */ @@ -165,6 +169,11 @@ dn_dequeue(struct dn_queue *q) struct mbuf *m = q->mq.head; if (m == NULL) return NULL; +#ifdef NEW_AQM + /* Call AQM dequeue function */ + if (q->fs->aqmfp && q->fs->aqmfp->dequeue ) + return q->fs->aqmfp->dequeue(q); +#endif q->mq.head = m->m_nextpkt; /* Update stats for the queue */ diff --git a/sys/netpfil/ipfw/dn_sched_fifo.c b/sys/netpfil/ipfw/dn_sched_fifo.c index e2aa608..a4a2a70 100644 --- a/sys/netpfil/ipfw/dn_sched_fifo.c +++ b/sys/netpfil/ipfw/dn_sched_fifo.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -115,6 +118,9 @@ static struct dn_alg fifo_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c new file mode 100644 index 0000000..c783730 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c @@ -0,0 +1,617 @@ +/* + * FQ_Codel - The FlowQueue-Codel scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +//#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> + +#include <sys/proc.h> +#include <sys/rwlock.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <sys/sysctl.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/queue.h> +#include <sys/hash.h> + +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> + +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_codel.h> +#include <netpfil/ipfw/dn_sched.h> +#include <netpfil/ipfw/dn_sched_fq_codel.h> +#include <netpfil/ipfw/dn_sched_fq_codel_helper.h> + +#else +#include <dn_test.h> +#endif + +/* NOTE: In fq_codel module, we reimplements CoDel AQM functions + * because fq_codel use different flows (sub-queues) structure and + * dn_queue includes many variables not needed by a flow (sub-queue + * )i.e. avoid extra overhead (88 bytes vs 208 bytes). + * Also, CoDel functions manages stats of sub-queues as well as the main queue. 
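+ * The codel_enqueue() and codel_drop_head() helpers below therefore go
+ * through fq_update_stats(), which updates the per-flow counters, the main
+ * queue and the scheduler instance in a single call.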
+ */ + +#define DN_SCHED_FQ_CODEL 6 + +static struct dn_alg fq_codel_desc; + +/* fq_codel default parameters including codel */ +struct dn_sch_fq_codel_parms +fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, + CODEL_ECN_ENABLED}, 1024, 10240, 1514}; + +static int +fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ; + + return (0); +} + +static int +fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ; + + return (0); +} + + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel, + CTLFLAG_RW, 0, "FQ_CODEL"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L", + "FQ_CoDel target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L", + "FQ_CoDel interval in microsecond"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum, + CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows, + CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024, + "Number of queues for FQ_CoDel"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit, + CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit"); +#endif + +/* Drop a packet form the head of codel queue */ +static void +codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + FREE_PKT(m); +} + +/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet. + * Return 1 when unable to add timestamp, otherwise return 0 + */ +static int +codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si) +{ + uint64_t len; + + len = m->m_pkthdr.len; + /* finding maximum packet size */ + if (len > q->cst.maxpkt_size) + q->cst.maxpkt_size = len; + + /* Add timestamp to mbuf as MTAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t), + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + +drop: + fq_update_stats(q, si, len, 1); + m_freem(m); + return 1; +} + +/* + * Classify a packet to queue number using Jenkins hash function. 
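+ * For IPv6 the hash covers a 41-byte tuple and for IPv4 a 17-byte tuple;
+ * the result is reduced modulo the configured number of flows.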
+ * Return: queue number + * the input of the hash are protocol no, perturbation, src IP, dst IP, + * src port, dst port, + */ +static inline int +fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * FQ_CODEL algorithm. + */ +static int +fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_codel_classify_flow(m, param->flows_cnt, si); + /* enqueue packet into appropriate queue using CoDel AQM. + * Note: 'codel_enqueue' function returns 1 only when it unable to + * add timestamp to packet (no limit check)*/ + drop = codel_enqueue(&si->flows[idx], m, si); + + /* codel unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active ,then add it to the tail of + * new flows list, initialize and activate it. 
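+	 * A newly activated flow starts on the new-flows list with a full
+	 * quantum of deficit; fq_codel_dequeue() moves it to the old-flows
+	 * list once it exhausts that deficit or goes empty.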
+ */ + if (!si->flows[idx].active ) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + si->flows[idx].cst.dropping = false; + si->flows[idx].cst.first_above_time = 0; + si->flows[idx].active = 1; + //D("activate %d",idx); + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { D("over limit"); + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub- queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + codel_drop_head(&si->flows[maxidx], si); + D("maxidx = %d",maxidx); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * FQ_CODEL algorithm. + */ +static struct mbuf * +fq_codel_dequeue(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct fq_codel_flow *f; + struct mbuf *mbuf; + struct fq_codel_list *fq_codel_flowlist; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_codel_flowlist = &si->oldflows; + else + fq_codel_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_codel_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_codel_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = fqc_codel_dequeue(f, si); + + /* Codel did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_codel_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_codel scheduler instance. + * also, allocate memory for flows array. 
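+ * Each instance also seeds its own hash perturbation from random(), so
+ * separate scheduler instances spread the same traffic across different
+ * buckets.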
+ */ +static int +fq_codel_new_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct dn_queue *q; + struct fq_codel_schk *schk; + int i; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_codel configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + /* init codel */ + si->flows[i].cst.maxpkt_size = 500; + } + + fq_codel_desc.ref_count++; + return 0; +} + +/* + * Free fq_codel scheduler instance. + */ +static int +fq_codel_free_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si = (struct fq_codel_si *)_si ; + + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + fq_codel_desc.ref_count--; + + return 0; +} + +/* + * Configure fq_codel scheduler. + * the configurations for the scheduler is passed from userland. + */ +static int +fq_codel_config(struct dn_schk *_schk) +{ + struct fq_codel_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_codel_parms *fqc_cfg; + + schk = (struct fq_codel_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_codel configuration as follow + * Codel: 0- target,1- interval, 2- flags + * FQ_CODEL: 3- quantum, 4- limit, 5- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqc_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target; + else + fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval; + else + fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + fqc_cfg->ccfg.flags = 0; + else + fqc_cfg->ccfg.flags = ep->par[2]; + + /* FQ configurations */ + if (ep->par[3] < 0) + fqc_cfg->quantum = fq_codel_sysctl.quantum; + else + fqc_cfg->quantum = ep->par[3]; + + if (ep->par[4] < 0) + fqc_cfg->limit = fq_codel_sysctl.limit; + else + fqc_cfg->limit = ep->par[4]; + + if (ep->par[5] < 0) + fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt; + else + fqc_cfg->flows_cnt = ep->par[5]; + + /* Bound the configurations */ + fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 , + 5 * AQM_TIME_1S); ; + fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1, + 100 * AQM_TIME_1S); + + fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000); + fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480); + fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536); + } + else + return 1; + + return 0; +} + +/* + * Return fq_codel scheduler configurations + * the configurations for the scheduler is passed to userland. 
+ */ +static int +fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1); + struct dn_sch_fq_codel_parms *fqc_cfg; + + fqc_cfg = &schk->cfg; + + strcpy(ep->name, fq_codel_desc.name); + ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US; + ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US; + ep->par[2] = fqc_cfg->ccfg.flags; + + ep->par[3] = fqc_cfg->quantum; + ep->par[4] = fqc_cfg->limit; + ep->par[5] = fqc_cfg->flows_cnt; + + return 0; +} + +/* + * fq_codel scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fq_codel_desc = { + _SI( .type = ) DN_SCHED_FQ_CODEL, + _SI( .name = ) "FQ_CODEL", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_codel_schk), + _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_codel_enqueue, + _SI( .dequeue = ) fq_codel_dequeue, + _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_codel_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc); diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.h b/sys/netpfil/ipfw/dn_sched_fq_codel.h new file mode 100644 index 0000000..4b65781 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * FQ_Codel Structures and helper functions + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_H +#define _IP_DN_SCHED_FQ_CODEL_H + +/* list of queues */ +STAILQ_HEAD(fq_codel_list, fq_codel_flow) ; + +/* fq_codel parameters including codel */ +struct dn_sch_fq_codel_parms { + struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */ + /* FQ_CODEL Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of fq_codel queue size*/ + uint32_t quantum; +}; /* defaults */ + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue).*/ +struct fq_codel_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct codel_status cst; + STAILQ_ENTRY(fq_codel_flow) flowchain; +}; + +/* extra fq_codel scheduler configurations */ +struct fq_codel_schk { + struct dn_sch_fq_codel_parms cfg; +}; + +/* fq_codel scheduler instance */ +struct fq_codel_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + + struct fq_codel_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_codel_list newflows; /* list of new queues */ + struct fq_codel_list oldflows; /* list of old queues */ +}; + +/* Helper function to update queue&main-queue and scheduler statistics. + * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* extract the head of fq_codel sub-queue */ +__inline static struct mbuf * +fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("timestamp tag is not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + + +#endif diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h new file mode 100644 index 0000000..da663dc --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h @@ -0,0 +1,187 
@@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H +#define _IP_DN_SCHED_FQ_CODEL_HELPER_H + +__inline static struct mbuf * +fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop, + struct fq_codel_si *si) +{ + struct mbuf * m; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = fq_codel_extract_head(q, &pkt_ts, si); + + if (m == NULL) { + /*queue is empty - we can't be above target*/ + q->cst.first_above_time= 0; + return m; + } + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. 
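+ * In this implementation the per-flow maxpkt_size tracked by
+ * codel_enqueue() stands in for the MTU, so a flow whose backlog is no
+ * larger than its largest-seen packet is never pushed into the dropping
+ * state.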
+ */ + sojourn_time = now - pkt_ts; + if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) { + /* went below - stay below for at least interval */ + q->cst.first_above_time = 0; + } else { + if (q->cst.first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + q->cst.first_above_time = now + schk->cfg.ccfg.interval; + } else if (now >= q->cst.first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* Codel dequeue function */ +__inline static struct mbuf * +fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + + cst = &q->cst; + cprms = &schk->cfg.ccfg; + + now = AQM_UNOW; + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + + /* Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + return m; + } + + /* drop the packet */ + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop,si); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? 
cst->count - 2 : 1; + + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif diff --git a/sys/netpfil/ipfw/dn_sched_fq_pie.c b/sys/netpfil/ipfw/dn_sched_fq_pie.c new file mode 100644 index 0000000..2883cf8 --- /dev/null +++ b/sys/netpfil/ipfw/dn_sched_fq_pie.c @@ -0,0 +1,1262 @@ +/* + * FQ_PIE - The FlowQueue-PIE scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Important note: + * As there is no an office document for FQ-PIE specification, we used + * FQ-CoDel algorithm with some modifications to implement FQ-PIE. + * This FQ-PIE implementation is a beta version and have not been tested + * extensively. Our FQ-PIE uses stand-alone PIE AQM per sub-queue. By + * default, timestamp is used to calculate queue delay instead of departure + * rate estimation method. Although departure rate estimation is available + * as testing option, the results could be incorrect. Moreover, turning PIE on + * and off option is available but it does not work properly in this version. 
+ */ + + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> + +#include <sys/proc.h> +#include <sys/rwlock.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <sys/sysctl.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/queue.h> +#include <sys/hash.h> + +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> + +#include <netpfil/ipfw/dn_aqm.h> +#include <netpfil/ipfw/dn_aqm_pie.h> +#include <netpfil/ipfw/dn_sched.h> + +#else +#include <dn_test.h> +#endif + +#define DN_SCHED_FQ_PIE 7 + +/* list of queues */ +STAILQ_HEAD(fq_pie_list, fq_pie_flow) ; + +/* FQ_PIE parameters including PIE */ +struct dn_sch_fq_pie_parms { + struct dn_aqm_pie_parms pcfg; /* PIE configuration Parameters */ + /* FQ_PIE Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of FQ_PIE queue size*/ + uint32_t quantum; +}; + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue)*/ +struct fq_pie_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct pie_status pst; /* pie status variables */ + struct fq_pie_si *psi; /* parent scheduler instance */ + STAILQ_ENTRY(fq_pie_flow) flowchain; +}; + +/* extra fq_pie scheduler configurations */ +struct fq_pie_schk { + struct dn_sch_fq_pie_parms cfg; +}; + +/* fq_pie scheduler instance */ +struct fq_pie_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + uint32_t nr_active_q; + struct fq_pie_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_pie_list newflows; /* list of new queues */ + struct fq_pie_list oldflows; /* list of old queues */ +}; + + +struct mem_to_free { + void *mem_flows; + void *mem_callout; +}; +static struct mtx freemem_mtx; +static struct dn_alg fq_pie_desc; + +/* Default FQ-PIE parameters including PIE */ +/* PIE defaults + * target=15ms, max_burst=150ms, max_ecnth=0.1, + * alpha=0.125, beta=1.25, tupdate=15ms + * FQ- + * flows=1024, limit=10240, quantum =1514 + */ +struct dn_sch_fq_pie_parms + fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US, + 150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125, + PIE_SCALE * 1.25, PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED}, + 1024, 10240, 1514}; + +static int +fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = fq_pie_sysctl.pcfg.alpha; + else + value = fq_pie_sysctl.pcfg.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + fq_pie_sysctl.pcfg.alpha = value; + else 
+ fq_pie_sysctl.pcfg.beta = value; + return (0); +} + +static int +fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = fq_pie_sysctl.pcfg.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = fq_pie_sysctl.pcfg.tupdate; + else + value = fq_pie_sysctl.pcfg.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + fq_pie_sysctl.pcfg.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + fq_pie_sysctl.pcfg.tupdate = value; + else + fq_pie_sysctl.pcfg.max_burst = value; + return (0); +} + +static int +fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_pie_sysctl.pcfg.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + fq_pie_sysctl.pcfg.max_ecnth = value; + return (0); +} + +/* define FQ- PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie, + CTLFLAG_RW, 0, "FQ_PIE"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum, + CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows, + CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit, + CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE"); +#endif + +/* Helper function to update queue&main-queue and scheduler statistics. 
+ * negative len & drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* + * Extract a packet from the head of sub-queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +__inline static struct mbuf * +fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts, + struct fq_pie_si *si, int getts) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of FQ-PIE + * flow as an argument + */ +static void +fq_calculate_drop_prob(void *x) +{ + struct fq_pie_flow *q = (struct fq_pie_flow *) x; + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + int64_t p, prob, oldprob; + aqm_time_t now; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + now = AQM_UNOW; + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)q->stats.len_bytes * pst->avg_dq_time) + >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */ + p >>= 11 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */ + p >>= 9 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */ + p >>= 7 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */ + p >>= 
5 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */ + p >>= 3 + PIE_FIX_POINT_BITS + 12; + else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */ + p >>= 1 + PIE_FIX_POINT_BITS + 12; + else + p >>= PIE_FIX_POINT_BITS + 12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1 - 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication over/under flow */ + if (p>0) { + if (prob<oldprob) { + D("overflow"); + prob= PIE_MAX_PROB; + } + } + else + if (prob>oldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current delay value */ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) { + if (pst->burst_allowance > pprms->tupdate) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Reset PIE variables & activate the queue + */ +__inline static void +fq_activate_pie(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE; + pst->measurement_start = AQM_UNOW; + + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + + + /* + * Deactivate PIE and stop the drop probability update callout + */ +__inline static void +fq_deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + + /* + * Initialize PIE for sub-queue 'q' + */ +static int +pie_init(struct fq_pie_flow *q) +{ + struct pie_status *pst=&q->pst; + struct dn_aqm_pie_parms *pprms = pst->parms; + struct fq_pie_schk *fqpie_schk; + + fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1); + int err = 0; + + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + } else { + q->psi->nr_active_q++; + + /* For speed optimization, we calculate 1/3 of the queue size once here */ + // XXX limit divided by number of queues divided by 3 ???
+ pst->one_third_q_size = (fqpie_schk->cfg.limit / + fqpie_schk->cfg.flows_cnt) / 3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + } + + return err; +} + +/* + * Clean up PIE status for sub-queue 'q' + * Stop callout timer and destroy mtx + */ +static int +pie_cleanup(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + + mtx_lock(&pst->lock_mtx); + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + } else { + mtx_unlock(&pst->lock_mtx); + return EBUSY; + } + return 0; +} + +/* + * Dequeue and return a packet from sub-queue 'q' or NULL if 'q' is empty. + * Also, calculate departure time or queue delay using the timestamp + */ + static struct mbuf * +pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = &q->pst; + pprms = q->pst.parms; + + /* we extract the packet ts only when Departure Rate Estimation is not used */ + m = fq_pie_extract_head(q, &pkt_ts, si, + !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average departure time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start a new measurement cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->stats.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + + + /* + * Enqueue a packet in q, subject to space and FQ-PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyway. + */ +static int +pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si) +{ + uint64_t len; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = &q->pst; + pprms = pst->parms; + t = ENQUE; + + /* drop/mark the packet when PIE is active and burst time elapsed */ + if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0 + && drop_early(pst, q->stats.len_bytes) == DROP) { + /* + * if drop_prob is over the ECN threshold, drop the packet, + * otherwise mark and enqueue it.
+ */ + if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >= + pst->one_third_q_size) { + fq_activate_pie(q); + } + + /* reset burst tolerance and optionally turn PIE off */ + if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1) + && pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0) + fq_deactivate_pie(pst); + } + + /* Use timestamp if Departure Rate Estimation mode is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + /* no tag available; drop the packet, it is freed in the DROP path below */ + t = DROP; + } else { + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + } + + if (t != DROP) { + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + } else { + fq_update_stats(q, si, len, 1); + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + + return 0; +} + +/* Drop a packet from the head of the FQ-PIE sub-queue */ +static void +pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + /* reset accu_prob after packet drop */ + q->pst.accu_prob = 0; + + FREE_PKT(m); +} + +/* + * Classify a packet to a queue number using the Jenkins hash function. + * Return: queue number + * the inputs of the hash are protocol no., perturbation, src IP, dst IP, + * src port and dst port + */ +static inline int +fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ?
1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * the FQ-PIE algorithm. + */ +static int +fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_pie_classify_flow(m, param->flows_cnt, si); + + /* enqueue packet into appropriate queue using PIE AQM. + * Note: the 'pie_enqueue' function returns 1 only when it is unable to + * add a timestamp to the packet (no limit check) */ + drop = pie_enqueue(&si->flows[idx], m, si); + + /* pie unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active, then add it to the tail of + * the new flows list, initialize and activate it. + */ + if (!si->flows[idx].active) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + fq_activate_pie(&si->flows[idx]); + si->flows[idx].active = 1; + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub-queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + pie_drop_head(&si->flows[maxidx], si); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * the FQ-PIE algorithm.
+ */ +static struct mbuf * +fq_pie_dequeue(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct fq_pie_flow *f; + struct mbuf *mbuf; + struct fq_pie_list *fq_pie_flowlist; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_pie_flowlist = &si->oldflows; + else + fq_pie_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_pie_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_pie_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = pie_dequeue(f, si); + + /* pie did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_pie_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + fq_deactivate_pie(&f->pst); + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_pie scheduler instance. + * also, allocate memory for flows array. + */ +static int +fq_pie_new_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct dn_queue *q; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_pie configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + si->nr_active_q = 0; + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + si->flows[i].pst.parms = &schk->cfg.pcfg; + si->flows[i].psi = si; + pie_init(&si->flows[i]); + } + + /* init mtx lock and callout function for free memory */ + if (!fq_pie_desc.ref_count) { + mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF); + } + + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count++; + mtx_unlock(&freemem_mtx); + + return 0; +} + +/* + * Free FQ-PIE flows memory callout function. 
+ * This function is scheduled when one or more flows are still active and + * the scheduler is about to be destroyed, to prevent a memory leak. + */ +static void +free_flows(void *_mem) +{ + struct mem_to_free *mem = _mem; + + free(mem->mem_flows, M_DUMMYNET); + free(mem->mem_callout, M_DUMMYNET); + free(_mem, M_DUMMYNET); + + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("mem freed ok!"); +} + +/* + * Free fq_pie scheduler instance. + */ +static int +fq_pie_free_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + for (i = 0; i < schk->cfg.flows_cnt; i++) { + pie_cleanup(&si->flows[i]); + } + + /* if some queues still have a callout pending, we cannot free the flows + * memory. If we did, a panic could happen because the drop probability + * calculation callout uses the flows memory. + */ + if (!si->nr_active_q) { + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("ok!"); + return 0; + } else { + /* a memory leak would happen here, so we register a callout function + * to free the flows memory later. + */ + D("unable to stop all fq_pie sub-queues!"); + mtx_lock(&freemem_mtx); + + struct callout *mem_callout; + struct mem_to_free *mem; + + mem = malloc(sizeof(*mem), M_DUMMYNET, + M_NOWAIT | M_ZERO); + mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET, + M_NOWAIT | M_ZERO); + + callout_init_mtx(mem_callout, &freemem_mtx, + CALLOUT_RETURNUNLOCKED); + + mem->mem_flows = si->flows; + mem->mem_callout = mem_callout; + callout_reset_sbt(mem_callout, + (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US, + 0, free_flows, mem, 0); + + si->flows = NULL; + mtx_unlock(&freemem_mtx); + + return EBUSY; + } +} + +/* + * Configure the FQ-PIE scheduler. + * The scheduler configuration is passed from ipfw userland.
+ */ +static int +fq_pie_config(struct dn_schk *_schk) +{ + struct fq_pie_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_pie_parms *fqp_cfg; + + schk = (struct fq_pie_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_pie configuration as follow + * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + * FQ_PIE: 7- quantum, 8- limit, 9- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqp_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref; + else + fqp_cfg->pcfg.qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate; + else + fqp_cfg->pcfg.tupdate = ep->par[1]; + if (ep->par[2] < 0) + fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst; + else + fqp_cfg->pcfg.max_burst = ep->par[2]; + if (ep->par[3] < 0) + fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth; + else + fqp_cfg->pcfg.max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha; + else + fqp_cfg->pcfg.alpha = ep->par[4]; + if (ep->par[5] < 0) + fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta; + else + fqp_cfg->pcfg.beta = ep->par[5]; + if (ep->par[6] < 0) + fqp_cfg->pcfg.flags = 0; + else + fqp_cfg->pcfg.flags = ep->par[6]; + + /* FQ configurations */ + if (ep->par[7] < 0) + fqp_cfg->quantum = fq_pie_sysctl.quantum; + else + fqp_cfg->quantum = ep->par[7]; + if (ep->par[8] < 0) + fqp_cfg->limit = fq_pie_sysctl.limit; + else + fqp_cfg->limit = ep->par[8]; + if (ep->par[9] < 0) + fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt; + else + fqp_cfg->flows_cnt = ep->par[9]; + + /* Bound the configurations */ + fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst, + 0, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth, + 0, PIE_SCALE); + fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE); + fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE); + + fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000); + fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480); + fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536); + } + else { + D("Wrong parameters for fq_pie scheduler"); + return 1; + } + + return 0; +} + +/* + * Return FQ-PIE scheduler configurations + * the configurations for the scheduler is passed to userland. + */ +static int +fq_pie_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1); + struct dn_sch_fq_pie_parms *fqp_cfg; + + fqp_cfg = &schk->cfg; + + strcpy(ep->name, fq_pie_desc.name); + ep->par[0] = fqp_cfg->pcfg.qdelay_ref; + ep->par[1] = fqp_cfg->pcfg.tupdate; + ep->par[2] = fqp_cfg->pcfg.max_burst; + ep->par[3] = fqp_cfg->pcfg.max_ecnth; + ep->par[4] = fqp_cfg->pcfg.alpha; + ep->par[5] = fqp_cfg->pcfg.beta; + ep->par[6] = fqp_cfg->pcfg.flags; + + ep->par[7] = fqp_cfg->quantum; + ep->par[8] = fqp_cfg->limit; + ep->par[9] = fqp_cfg->flows_cnt; + + return 0; +} + +/* + * FQ-PIE scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. 
+ */ +static struct dn_alg fq_pie_desc = { + _SI( .type = ) DN_SCHED_FQ_PIE, + _SI( .name = ) "FQ_PIE", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_pie_schk), + _SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_pie_enqueue, + _SI( .dequeue = ) fq_pie_dequeue, + _SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_pie_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_pie_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc); diff --git a/sys/netpfil/ipfw/dn_sched_prio.c b/sys/netpfil/ipfw/dn_sched_prio.c index b779515..915b4cb 100644 --- a/sys/netpfil/ipfw/dn_sched_prio.c +++ b/sys/netpfil/ipfw/dn_sched_prio.c @@ -41,6 +41,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -223,6 +226,9 @@ static struct dn_alg prio_desc = { _SI( .new_queue = ) prio_new_queue, _SI( .free_queue = ) prio_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; diff --git a/sys/netpfil/ipfw/dn_sched_qfq.c b/sys/netpfil/ipfw/dn_sched_qfq.c index 5bbff8a..87502d1 100644 --- a/sys/netpfil/ipfw/dn_sched_qfq.c +++ b/sys/netpfil/ipfw/dn_sched_qfq.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -824,6 +827,9 @@ static struct dn_alg qfq_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); diff --git a/sys/netpfil/ipfw/dn_sched_rr.c b/sys/netpfil/ipfw/dn_sched_rr.c index dd608d7..b3658a6 100644 --- a/sys/netpfil/ipfw/dn_sched_rr.c +++ b/sys/netpfil/ipfw/dn_sched_rr.c @@ -42,6 +42,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -301,6 +304,9 @@ static struct dn_alg rr_desc = { _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; diff --git a/sys/netpfil/ipfw/dn_sched_wf2q.c b/sys/netpfil/ipfw/dn_sched_wf2q.c index a91c1ce..06f92a9 100644 --- a/sys/netpfil/ipfw/dn_sched_wf2q.c +++ b/sys/netpfil/ipfw/dn_sched_wf2q.c @@ -43,6 +43,9 @@ #include <netinet/ip_dummynet.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> #else #include <dn_test.h> @@ -367,6 +370,10 @@ static struct dn_alg wf2qp_desc = { _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif + }; diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c index 7d7e695..d7b04af 100644 --- a/sys/netpfil/ipfw/ip_dn_glue.c +++ 
b/sys/netpfil/ipfw/ip_dn_glue.c @@ -55,6 +55,9 @@ #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* FREEBSD7.2 ip_dummynet.h r191715*/ diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c index 90e2ccf..b7213ce 100644 --- a/sys/netpfil/ipfw/ip_dn_io.c +++ b/sys/netpfil/ipfw/ip_dn_io.c @@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$"); #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* @@ -83,8 +86,12 @@ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; +#ifdef NEW_AQM +unsigned long io_pkt_drop; +#else +static unsigned long io_pkt_drop; +#endif /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events @@ -147,7 +154,11 @@ SYSBEGIN(f4) SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); +#ifdef NEW_AQM +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#else static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#endif /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) @@ -249,6 +260,14 @@ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); +#ifdef NEW_AQM + /* XXX: to skip ts m_tag. For Debugging only*/ + if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) { + m_tag_delete(m,mtag); + mtag = m_tag_first(m); + D("skip TS tag"); + } +#endif KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, @@ -256,6 +275,7 @@ dn_tag_get(struct mbuf *m) return (struct dn_pkt_tag *)(mtag+1); } +#ifndef NEW_AQM static inline void mq_append(struct mq *q, struct mbuf *m) { @@ -266,6 +286,7 @@ mq_append(struct mq *q, struct mbuf *m) q->tail = m; m->m_nextpkt = NULL; } +#endif /* * Dispose a list of packet. 
Use a functions so if we need to do @@ -390,7 +411,10 @@ red_drops (struct dn_queue *q, int len) /* * ECN/ECT Processing (partially adopted from altq) */ -static int +#ifndef NEW_AQM +static +#endif +int ecn_mark(struct mbuf* m) { struct ip *ip; @@ -482,6 +506,11 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) goto drop; if (f->plr && random() < f->plr) goto drop; +#ifdef NEW_AQM + /* Call AQM enqueue function */ + if (q->fs->aqmfp) + return q->fs->aqmfp->enqueue(q ,m); +#endif if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) goto drop; @@ -864,6 +893,10 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; + + /* dn_enqueue already increases io_pkt_drop */ + io_pkt_drop--; + goto dropit; } diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h index 159ddc9..b8b55e8 100644 --- a/sys/netpfil/ipfw/ip_dn_private.h +++ b/sys/netpfil/ipfw/ip_dn_private.h @@ -81,6 +81,10 @@ SLIST_HEAD(dn_fsk_head, dn_fsk); SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); +#ifdef NEW_AQM +SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */ +#endif + struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; }; @@ -135,6 +139,9 @@ struct dn_parms { /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ +#ifdef NEW_AQM + struct dn_aqm_head aqmlist; /* list of AQMs */ +#endif /* Store the fs/sch to scan when draining. The value is the * bucket number of the hash table. Expire can be disabled @@ -231,6 +238,10 @@ struct dn_fsk { /* kernel side of a flowset */ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ +#ifdef NEW_AQM + struct dn_aqm *aqmfp; /* Pointer to AQM functions */ + void *aqmcfg; /* configuration parameters for AQM */ +#endif }; /* @@ -253,6 +264,9 @@ struct dn_queue { int count; /* arrivals since last RED drop */ int random; /* random value (scaled) */ uint64_t q_time; /* start of queue idle time */ +#ifdef NEW_AQM + void *aqm_status; /* per-queue status variables*/ +#endif }; @@ -400,4 +414,20 @@ int do_config(void *p, int l); void dn_drain_scheduler(void); void dn_drain_queue(void); +#ifdef NEW_AQM +int ecn_mark(struct mbuf* m); + +/* moved from ip_dn_io.c to here to be available for AQMs modules*/ +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} +#endif /* NEW_AQM */ + #endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c index 420b491..09fbe84 100644 --- a/sys/netpfil/ipfw/ip_dummynet.c +++ b/sys/netpfil/ipfw/ip_dummynet.c @@ -1,4 +1,11 @@ /*- + * Codel/FQ_Codel and PIE/FQ-PIE Code: + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> + * * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. 
* All rights reserved @@ -57,6 +64,9 @@ __FBSDID("$FreeBSD$"); #include <netpfil/ipfw/ip_fw_private.h> #include <netpfil/ipfw/dn_heap.h> #include <netpfil/ipfw/ip_dn_private.h> +#ifdef NEW_AQM +#include <netpfil/ipfw/dn_aqm.h> +#endif #include <netpfil/ipfw/dn_sched.h> /* which objects to copy */ @@ -97,6 +107,21 @@ dn_reschedule(void) } /*----- end of callout hooks -----*/ +#ifdef NEW_AQM +/* Return AQM descriptor for given type or name. */ +static struct dn_aqm * +find_aqm_type(int type, char *name) +{ + struct dn_aqm *d; + + SLIST_FOREACH(d, &dn_cfg.aqmlist, next) { + if (d->type == type || (name && !strcasecmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} +#endif + /* Return a scheduler descriptor given the type or name. */ static struct dn_alg * find_sched_type(int type, char *name) @@ -319,7 +344,15 @@ q_new(uintptr_t key, int flags, void *arg) if (fs->sched->fp->new_queue) fs->sched->fp->new_queue(q); + +#ifdef NEW_AQM + /* call AQM init function after creating a queue*/ + if (fs->aqmfp && fs->aqmfp->init) + if(fs->aqmfp->init(q)) + D("unable to init AQM for fs %d", fs->fs.fs_nr); +#endif dn_cfg.queue_count++; + return q; } @@ -333,6 +366,13 @@ dn_delete_queue(struct dn_queue *q, int flags) { struct dn_fsk *fs = q->fs; +#ifdef NEW_AQM + /* clean up AQM status for queue 'q' + * cleanup here is called just with MULTIQUEUE + */ + if (fs && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup(q); +#endif // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) @@ -474,6 +514,16 @@ si_new(uintptr_t key, int flags, void *arg) if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; +#ifdef NEW_AQM + /* init AQM status for !DN_MULTIQUEUE sched*/ + if (!(s->fp->flags & DN_MULTIQUEUE)) + if (s->fs->aqmfp && s->fs->aqmfp->init) + if(s->fs->aqmfp->init((struct dn_queue *)(si + 1))) { + D("unable to init AQM for fs %d", s->fs->fs.fs_nr); + goto error; + } +#endif + dn_cfg.si_count++; return si; @@ -503,6 +553,20 @@ si_destroy(void *_si, void *arg) dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); + +#ifdef NEW_AQM + /* clean up AQM status for !DN_MULTIQUEUE sched + * Note that all queues belong to fs were cleaned up in fsk_detach. + * When drain_scheduler is called s->fs and q->fs are pointing + * to a correct fs, so we can use fs in this case. + */ + if (!(s->fp->flags & DN_MULTIQUEUE)) { + struct dn_queue *q = (struct dn_queue *)(si + 1); + if (q->aqm_status && q->fs->aqmfp) + if (q->fs->aqmfp->cleanup) + q->fs->aqmfp->cleanup(q); + } +#endif if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ @@ -591,6 +655,67 @@ fsk_new(uintptr_t key, int flags, void *arg) return fs; } +#ifdef NEW_AQM +/* callback function for cleaning up AQM queue status belongs to a flowset + * connected to scheduler instance '_si' (for !DN_MULTIQUEUE only). 
+ */ +static int +si_cleanup_q(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if (!(si->sched->fp->flags & DN_MULTIQUEUE)) { + if (si->sched->fs->aqmfp && si->sched->fs->aqmfp->cleanup) + si->sched->fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + return 0; +} + +/* callback to clean up queue AQM status.*/ +static int +q_cleanup_q(void *_q, void *arg) +{ + struct dn_queue *q = _q; + q->fs->aqmfp->cleanup(q); + return 0; +} + +/* Clean up all AQM queues status belongs to flowset 'fs' and then + * deconfig AQM for flowset 'fs' + */ +static void +aqm_cleanup_deconfig_fs(struct dn_fsk *fs) +{ + struct dn_sch_inst *si; + + /* clean up AQM status for all queues for !DN_MULTIQUEUE sched*/ + if (fs->fs.fs_nr > DN_MAX_ID) { + if (fs->sched && !(fs->sched->fp->flags & DN_MULTIQUEUE)) { + if (fs->sched->sch.flags & DN_HAVE_MASK) + dn_ht_scan(fs->sched->siht, si_cleanup_q, NULL); + else { + /* single si i.e. no sched mask */ + si = (struct dn_sch_inst *) fs->sched->siht; + if (si && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + } + } + + /* clean up AQM status for all queues for DN_MULTIQUEUE sched*/ + if (fs->sched && fs->sched->fp->flags & DN_MULTIQUEUE && fs->qht) { + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, q_cleanup_q, NULL); + else + fs->aqmfp->cleanup((struct dn_queue *)(fs->qht)); + } + + /* deconfig AQM */ + if(fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) + fs->aqmfp->deconfig(fs); +} +#endif + /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list @@ -619,6 +744,10 @@ fsk_detach(struct dn_fsk *fs, int flags) free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; qht_delete(fs, flags); +#ifdef NEW_AQM + aqm_cleanup_deconfig_fs(fs); +#endif + if (fs->sched && fs->sched->fp->free_fsk) fs->sched->fp->free_fsk(fs); fs->sched = NULL; @@ -1190,6 +1319,183 @@ update_fs(struct dn_schk *s) } } +#ifdef NEW_AQM +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_aqm_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_fsk *fs; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + fs = dn_ht_find(dn_cfg.fshash, ep->nr, 0, NULL); + if (!fs) { + D("fs %d not found", ep->nr); + err = EINVAL; + break; + } + + if (fs->aqmfp && fs->aqmfp->getconfig) { + if(fs->aqmfp->getconfig(fs, ep)) { + D("Error while trying to get AQM params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + + free(ep, M_DUMMYNET); + return err; +} + +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_sched_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_schk *schk; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + 
sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + schk = locate_scheduler(ep->nr); + if (!schk) { + D("sched %d not found", ep->nr); + err = EINVAL; + break; + } + + if (schk->fp && schk->fp->getconfig) { + if(schk->fp->getconfig(schk, ep)) { + D("Error while trying to get sched params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + free(ep, M_DUMMYNET); + + return err; +} + +/* Configure AQM for flowset 'fs'. + * extra parameters are passed from userland. + */ +static int +config_aqm(struct dn_fsk *fs, struct dn_extra_parms *ep, int busy) +{ + int err = 0; + + do { + /* no configurations */ + if (!ep) { + err = 0; + break; + } + + /* no AQM for this flowset*/ + if (!strcmp(ep->name,"")) { + err = 0; + break; + } + if (ep->oid.len < sizeof(*ep)) { + D("short aqm len %d", ep->oid.len); + err = EINVAL; + break; + } + + if (busy) { + D("Unable to configure flowset, flowset busy!"); + err = EINVAL; + break; + } + + /* deconfigure old aqm if exist */ + if (fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) { + aqm_cleanup_deconfig_fs(fs); + } + + if (!(fs->aqmfp = find_aqm_type(0, ep->name))) { + D("AQM functions not found for type %s!", ep->name); + fs->fs.flags &= ~DN_IS_AQM; + err = EINVAL; + break; + } else + fs->fs.flags |= DN_IS_AQM; + + if (ep->oid.subtype != DN_AQM_PARAMS) { + D("Wrong subtype"); + err = EINVAL; + break; + } + + if (fs->aqmfp->config) { + err = fs->aqmfp->config(fs, ep, ep->oid.len); + if (err) { + D("Unable to configure AQM for FS %d", fs->fs.fs_nr ); + fs->fs.flags &= ~DN_IS_AQM; + fs->aqmfp = NULL; + break; + } + } + } while(0); + + return err; +} +#endif + /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) @@ -1322,6 +1628,14 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); +#ifdef NEW_AQM + /* reconfigure AQM as the parameters can be changed. 
+ * we consider the flowsetis busy if it has scheduler instance(s) + */ + s = locate_scheduler(nfs->sched_nr); + config_aqm(fs, (struct dn_extra_parms *) arg, + s != NULL && s->siht != NULL); +#endif break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ @@ -1340,6 +1654,10 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ +#ifdef NEW_AQM + fs->aqmfp = NULL; + config_aqm(fs, (struct dn_extra_parms *) arg, s != NULL && s->siht != NULL); +#endif if (s != NULL) fsk_attach(fs, s); } while (0); @@ -1865,6 +2183,19 @@ dummynet_get(struct sockopt *sopt, void **compat) // cmd->id = sopt_valsize; D("compatibility mode"); } + +#ifdef NEW_AQM + /* get AQM params */ + if(cmd->subtype == DN_AQM_PARAMS) { + error = get_aqm_parms(sopt); + goto done; + /* get Scheduler params */ + } else if (cmd->subtype == DN_SCH_PARAMS) { + error = get_sched_parms(sopt); + goto done; + } +#endif + a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); @@ -2316,4 +2647,98 @@ MODULE_VERSION(dummynet, 3); */ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); +#ifdef NEW_AQM + +/* modevent helpers for the AQM modules */ +static int +load_dn_aqm(struct dn_aqm *d) +{ + struct dn_aqm *aqm=NULL; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if AQM already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(aqm, &dn_cfg.aqmlist, next) { + if (strcmp(aqm->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* AQM already exists */ + } + } + if (aqm == NULL) + SLIST_INSERT_HEAD(&dn_cfg.aqmlist, d, next); + DN_BH_WUNLOCK(); + D("dn_aqm %s %sloaded", d->name, aqm ? "not ":""); + return aqm ? 1 : 0; +} + + +/* Callback to clean up AQM status for queues connected to a flowset + * and then deconfigure the flowset. + * This function is called before an AQM module is unloaded + */ +static int +fs_cleanup(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + uint32_t type = *(uint32_t *)arg; + + if (fs->aqmfp && fs->aqmfp->type == type) + aqm_cleanup_deconfig_fs(fs); + + return 0; +} + +static int +unload_dn_aqm(struct dn_aqm *aqm) +{ + struct dn_aqm *tmp, *r; + int err = EINVAL; + err = 0; + ND("called for %s", aqm->name); + + DN_BH_WLOCK(); + + /* clean up AQM status and deconfig flowset */ + dn_ht_scan(dn_cfg.fshash, fs_cleanup, &aqm->type); + + SLIST_FOREACH_SAFE(r, &dn_cfg.aqmlist, next, tmp) { + if (strcmp(aqm->name, r->name) != 0) + continue; + ND("ref_count = %d", r->ref_count); + err = (r->ref_count != 0 || r->cfg_ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.aqmlist, r, dn_aqm, next); + break; + } + DN_BH_WUNLOCK(); + D("%s %sunloaded", aqm->name, err ? 
"not ":""); + if (err) + D("ref_count=%d, cfg_ref_count=%d", r->ref_count, r->cfg_ref_count); + return err; +} + +int +dn_aqm_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_aqm *aqm = arg; + + if (cmd == MOD_LOAD) + return load_dn_aqm(aqm); + else if (cmd == MOD_UNLOAD) + return unload_dn_aqm(aqm); + else + return EINVAL; +} +#endif + /* end of file */ + diff --git a/sys/rpc/svc.c b/sys/rpc/svc.c index b436c18..a4cc484 100644 --- a/sys/rpc/svc.c +++ b/sys/rpc/svc.c @@ -847,9 +847,7 @@ svc_xprt_alloc() SVCXPRT_EXT *ext; xprt = mem_alloc(sizeof(SVCXPRT)); - memset(xprt, 0, sizeof(SVCXPRT)); ext = mem_alloc(sizeof(SVCXPRT_EXT)); - memset(ext, 0, sizeof(SVCXPRT_EXT)); xprt->xp_p3 = ext; refcount_init(&xprt->xp_refs, 1); diff --git a/sys/rpc/svc_vc.c b/sys/rpc/svc_vc.c index be8e04e..92a926d 100644 --- a/sys/rpc/svc_vc.c +++ b/sys/rpc/svc_vc.c @@ -189,11 +189,11 @@ svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, SOCKBUF_UNLOCK(&so->so_rcv); return (xprt); + cleanup_svc_vc_create: - if (xprt) { - sx_destroy(&xprt->xp_lock); - svc_xprt_free(xprt); - } + sx_destroy(&xprt->xp_lock); + svc_xprt_free(xprt); + return (NULL); } @@ -203,8 +203,8 @@ cleanup_svc_vc_create: SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { - SVCXPRT *xprt = NULL; - struct cf_conn *cd = NULL; + SVCXPRT *xprt; + struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; @@ -279,12 +279,10 @@ svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) return (xprt); cleanup_svc_vc_create: - if (xprt) { - sx_destroy(&xprt->xp_lock); - svc_xprt_free(xprt); - } - if (cd) - mem_free(cd, sizeof(*cd)); + sx_destroy(&xprt->xp_lock); + svc_xprt_free(xprt); + mem_free(cd, sizeof(*cd)); + return (NULL); } diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h index 1729c7b..0b3ed26 100644 --- a/sys/sys/cdefs.h +++ b/sys/sys/cdefs.h @@ -273,7 +273,8 @@ #define _Alignof(x) __alignof(x) #endif -#if !__has_extension(c_atomic) && !__has_extension(cxx_atomic) +#if !defined(__cplusplus) && !__has_extension(c_atomic) && \ + !__has_extension(cxx_atomic) /* * No native support for _Atomic(). Place object in structure to prevent * most forms of direct non-atomic access. 
diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index d2ad920..14bd867 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -183,7 +183,8 @@ static __inline int vm_paging_needed(void) { - return (cnt.v_free_count + cnt.v_cache_count < vm_pageout_wakeup_thresh); + return (cnt.v_free_count + cnt.v_cache_count < + (u_int)vm_pageout_wakeup_thresh); } #endif diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 512151b..c250c5d 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -253,11 +253,11 @@ vm_page_domain_init(struct vm_domain *vmd) *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = + *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = + *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &cnt.v_active_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 7ecb6c7..3ab4c24 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -215,7 +215,7 @@ struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; - int * const pq_vcnt; + u_int * const pq_vcnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index a580f2a..e3c1571 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -284,7 +284,7 @@ lapic_init(vm_paddr_t addr) } #ifdef SMP -#define LOOPS 1000000 +#define LOOPS 100000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which @@ -440,7 +440,7 @@ lapic_setup(int boot) /* Program the CMCI LVT entry if present. */ if (maxlvt >= APIC_LVT_CMCI) lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci); - + intr_restore(saveintr); } @@ -1363,7 +1363,7 @@ static void apic_setup_local(void *dummy __unused) { int retval; - + if (best_enum == NULL) return; |