Merge remote-tracking branch 'origin/stable/10' into devel

author: Renato Botelho <renato@netgate.com> 2016-06-21 07:44:54 -0300
committer: Renato Botelho <renato@netgate.com> 2016-06-21 07:44:54 -0300
commit: 1fc6b0207cc2f3cce33817706603caa41a9de24d (patch)
tree: d2d812b76b08f42a002621f716dd5f3199c7ca7d /sys/dev/hyperv
parent: b8632c4f34175c7018be77059ab229e755eb67e0 (diff)
parent: bc9e0dd07a76c4d7a1c6fcf21824ca2cecff2c6d (diff)
download: FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.zip
FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.tar.gz
25 files changed, 2167 insertions, 682 deletions
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h
index f45543b..aeec8ec 100644
--- a/sys/dev/hyperv/include/hyperv.h
+++ b/sys/dev/hyperv/include/hyperv.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -124,6 +124,8 @@ typedef struct hv_guid {
 	 unsigned char data[16];
 } __packed hv_guid;
 
+int snprintf_hv_guid(char *, size_t, const hv_guid *);
+
 #define HV_NIC_GUID							\
 	.data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,	\
 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
@@ -689,7 +691,6 @@ typedef struct {
 } hv_vmbus_ring_buffer_info;
 
 typedef void (*hv_vmbus_pfn_channel_callback)(void *context);
-typedef void (*hv_vmbus_sc_creation_callback)(void *context);
 
 typedef enum {
 	HV_CHANNEL_OFFER_STATE,
@@ -753,8 +754,6 @@ typedef struct hv_vmbus_channel {
 	 */
 	hv_vmbus_ring_buffer_info	inbound;
 
-	struct mtx			inbound_lock;
-
 	struct taskqueue *		rxq;
 	struct task			channel_task;
 	hv_vmbus_pfn_channel_callback	on_channel_callback;
@@ -804,13 +803,6 @@ typedef struct hv_vmbus_channel {
 	 * response on the same channel.
 	 */
 
-	/*
-	 * Multi-channel creation callback. This callback will be called in
-	 * process context when a Multi-channel offer is received from the host.
-	 * The guest can open the Multi-channel in the context of this callback.
-	 */
-	hv_vmbus_sc_creation_callback	sc_creation_callback;
-
 	struct mtx			sc_lock;
 
 	/*
@@ -818,18 +810,24 @@ typedef struct hv_vmbus_channel {
 	 */
 	TAILQ_HEAD(, hv_vmbus_channel)	sc_list_anchor;
 	TAILQ_ENTRY(hv_vmbus_channel)	sc_list_entry;
+	int				subchan_cnt;
 
 	/*
 	 * The primary channel this sub-channle belongs to.
 	 * This will be NULL for the primary channel.
 	 */
 	struct hv_vmbus_channel		*primary_channel;
+
 	/*
-	 * Support per channel state for use by vmbus drivers.
+	 * Driver private data
 	 */
-	void				*per_channel_state;
+	void				*hv_chan_priv1;
+	void				*hv_chan_priv2;
+	void				*hv_chan_priv3;
 } hv_vmbus_channel;
 
+#define HV_VMBUS_CHAN_ISPRIMARY(chan)	((chan)->primary_channel == NULL)
+
 static inline void
 hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state)
 {
@@ -908,6 +906,11 @@ int		hv_vmbus_channel_teardown_gpdal(
 
 struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary);
 
+void		vmbus_channel_cpu_set(struct hv_vmbus_channel *chan, int cpu);
+struct hv_vmbus_channel **
+		vmbus_get_subchan(struct hv_vmbus_channel *pri_chan, int subchan_cnt);
+void		vmbus_rel_subchan(struct hv_vmbus_channel **subchan, int subchan_cnt);
+
 /**
  * @brief Get physical address from virtual
  */
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c
index 9a89b62..a62f450 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.c
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
@@ -48,21 +48,27 @@
 #include "hv_rndis.h"
 #include "hv_rndis_filter.h"
 
+/* priv1 and priv2 are consumed by the main driver */
+#define hv_chan_rdbuf	hv_chan_priv3
+
 MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper-V netvsc driver");
 
 /*
  * Forward declarations
  */
-static void hv_nv_on_channel_callback(void *context);
+static void hv_nv_on_channel_callback(void *xchan);
 static int  hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device);
 static int  hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device);
 static int  hv_nv_destroy_send_buffer(netvsc_dev *net_dev);
 static int  hv_nv_destroy_rx_buffer(netvsc_dev *net_dev);
 static int  hv_nv_connect_to_vsp(struct hv_device *device);
 static void hv_nv_on_send_completion(netvsc_dev *net_dev,
-    struct hv_device *device, hv_vm_packet_descriptor *pkt);
+    struct hv_device *device, struct hv_vmbus_channel *, hv_vm_packet_descriptor *pkt);
+static void hv_nv_on_receive_completion(struct hv_vmbus_channel *chan,
+    uint64_t tid, uint32_t status);
 static void hv_nv_on_receive(netvsc_dev *net_dev,
-    struct hv_device *device, hv_vm_packet_descriptor *pkt);
+    struct hv_device *device, struct hv_vmbus_channel *chan,
+    hv_vm_packet_descriptor *pkt);
 
 /*
  *
@@ -115,7 +121,7 @@ hv_nv_get_inbound_net_device(struct hv_device *device)
 	 * permit incoming packets if and only if there
 	 * are outstanding sends.
 	 */
-	if (net_dev->destroy && net_dev->num_outstanding_sends == 0) {
+	if (net_dev->destroy) {
 		return (NULL);
 	}
 
@@ -654,6 +660,16 @@ hv_nv_disconnect_from_vsp(netvsc_dev *net_dev)
 	hv_nv_destroy_send_buffer(net_dev);
 }
 
+void
+hv_nv_subchan_attach(struct hv_vmbus_channel *chan)
+{
+
+	chan->hv_chan_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
+	hv_vmbus_channel_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE,
+	    NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0,
+	    hv_nv_on_channel_callback, chan);
+}
+
 /*
  * Net VSC on device add
  * 
@@ -662,25 +678,30 @@ hv_nv_disconnect_from_vsp(netvsc_dev *net_dev)
 netvsc_dev *
 hv_nv_on_device_add(struct hv_device *device, void *additional_info)
 {
+	struct hv_vmbus_channel *chan = device->channel;
 	netvsc_dev *net_dev;
 	int ret = 0;
 
 	net_dev = hv_nv_alloc_net_device(device);
-	if (!net_dev)
-		goto cleanup;
+	if (net_dev == NULL)
+		return NULL;
 
 	/* Initialize the NetVSC channel extension */
 
 	sema_init(&net_dev->channel_init_sema, 0, "netdev_sema");
 
+	chan->hv_chan_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
+
 	/*
 	 * Open the channel
 	 */
-	ret = hv_vmbus_channel_open(device->channel,
+	ret = hv_vmbus_channel_open(chan,
 	    NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE,
-	    NULL, 0, hv_nv_on_channel_callback, device);
-	if (ret != 0)
+	    NULL, 0, hv_nv_on_channel_callback, chan);
+	if (ret != 0) {
+		free(chan->hv_chan_rdbuf, M_NETVSC);
 		goto cleanup;
+	}
 
 	/*
 	 * Connect with the NetVsp
@@ -693,18 +714,16 @@ hv_nv_on_device_add(struct hv_device *device, void *additional_info)
 
 close:
 	/* Now, we can close the channel safely */
-
-	hv_vmbus_channel_close(device->channel);
+	free(chan->hv_chan_rdbuf, M_NETVSC);
+	hv_vmbus_channel_close(chan);
 
 cleanup:
 	/*
 	 * Free the packet buffers on the netvsc device packet queue.
 	 * Release other resources.
 	 */
-	if (net_dev) {
-		sema_destroy(&net_dev->channel_init_sema);
-		free(net_dev, M_NETVSC);
-	}
+	sema_destroy(&net_dev->channel_init_sema);
+	free(net_dev, M_NETVSC);
 
 	return (NULL);
 }
@@ -719,14 +738,7 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel)
 	netvsc_dev *net_dev = sc->net_dev;;
 	
 	/* Stop outbound traffic ie sends and receives completions */
-	mtx_lock(&device->channel->inbound_lock);
 	net_dev->destroy = TRUE;
-	mtx_unlock(&device->channel->inbound_lock);
-
-	/* Wait for all send completions */
-	while (net_dev->num_outstanding_sends) {
-		DELAY(100);
-	}
 
 	hv_nv_disconnect_from_vsp(net_dev);
 
@@ -739,6 +751,7 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel)
 		    HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE;
 	}
 
+	free(device->channel->hv_chan_rdbuf, M_NETVSC);
 	hv_vmbus_channel_close(device->channel);
 
 	sema_destroy(&net_dev->channel_init_sema);
@@ -752,7 +765,8 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel)
  */
 static void
 hv_nv_on_send_completion(netvsc_dev *net_dev,
-    struct hv_device *device, hv_vm_packet_descriptor *pkt)
+    struct hv_device *device, struct hv_vmbus_channel *chan,
+    hv_vm_packet_descriptor *pkt)
 {
 	nvsp_msg *nvsp_msg_pkt;
 	netvsc_packet *net_vsc_pkt;
@@ -764,7 +778,9 @@ hv_nv_on_send_completion(netvsc_dev *net_dev,
 		|| nvsp_msg_pkt->hdr.msg_type
 			== nvsp_msg_1_type_send_rx_buf_complete
 		|| nvsp_msg_pkt->hdr.msg_type
-			== nvsp_msg_1_type_send_send_buf_complete) {
+			== nvsp_msg_1_type_send_send_buf_complete
+		|| nvsp_msg_pkt->hdr.msg_type
+			== nvsp_msg5_type_subchannel) {
 		/* Copy the response back */
 		memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt,
 		    sizeof(nvsp_msg));
@@ -801,12 +817,10 @@ hv_nv_on_send_completion(netvsc_dev *net_dev,
 			}
 			
 			/* Notify the layer above us */
-			net_vsc_pkt->compl.send.on_send_completion(
+			net_vsc_pkt->compl.send.on_send_completion(chan,
 			    net_vsc_pkt->compl.send.send_completion_context);
 
 		}
-
-		atomic_subtract_int(&net_dev->num_outstanding_sends, 1);
 	}
 }
 
@@ -816,16 +830,11 @@ hv_nv_on_send_completion(netvsc_dev *net_dev,
  * Returns 0 on success, non-zero on failure.
  */
 int
-hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt)
+hv_nv_on_send(struct hv_vmbus_channel *chan, netvsc_packet *pkt)
 {
-	netvsc_dev *net_dev;
 	nvsp_msg send_msg;
 	int ret;
 
-	net_dev = hv_nv_get_outbound_net_device(device);
-	if (!net_dev)
-		return (ENODEV);
-
 	send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt;
 	if (pkt->is_data_pkt) {
 		/* 0 is RMC_DATA */
@@ -841,20 +850,16 @@ hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt)
 	    pkt->send_buf_section_size;
 
 	if (pkt->page_buf_count) {
-		ret = hv_vmbus_channel_send_packet_pagebuffer(device->channel,
+		ret = hv_vmbus_channel_send_packet_pagebuffer(chan,
 		    pkt->page_buffers, pkt->page_buf_count,
 		    &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt);
 	} else {
-		ret = hv_vmbus_channel_send_packet(device->channel,
+		ret = hv_vmbus_channel_send_packet(chan,
 		    &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt,
 		    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
 		    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	}
 
-	/* Record outstanding send only if send_packet() succeeded */
-	if (ret == 0)
-		atomic_add_int(&net_dev->num_outstanding_sends, 1);
-
 	return (ret);
 }
 
@@ -866,7 +871,7 @@ hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt)
  */
 static void
 hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device,
-    hv_vm_packet_descriptor *pkt)
+    struct hv_vmbus_channel *chan, hv_vm_packet_descriptor *pkt)
 {
 	hv_vm_transfer_page_packet_header *vm_xfer_page_pkt;
 	nvsp_msg *nvsp_msg_pkt;
@@ -916,7 +921,7 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device,
 		net_vsc_pkt->tot_data_buf_len = 
 		    vm_xfer_page_pkt->ranges[i].byte_count;
 
-		hv_rf_on_receive(net_dev, device, net_vsc_pkt);
+		hv_rf_on_receive(net_dev, device, chan, net_vsc_pkt);
 		if (net_vsc_pkt->status != nvsp_status_success) {
 			status = nvsp_status_failure;
 		}
@@ -927,9 +932,8 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device,
 	 * messages (not just data messages) will trigger a response
 	 * message back to the host.
 	 */
-	hv_nv_on_receive_completion(device, vm_xfer_page_pkt->d.transaction_id,
+	hv_nv_on_receive_completion(chan, vm_xfer_page_pkt->d.transaction_id,
 	    status);
-	hv_rf_receive_rollup(net_dev);
 }
 
 /*
@@ -937,8 +941,8 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device,
  *
  * Send a receive completion packet to RNDIS device (ie NetVsp)
  */
-void
-hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid,
+static void
+hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, uint64_t tid,
     uint32_t status)
 {
 	nvsp_msg rx_comp_msg;
@@ -953,7 +957,7 @@ hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid,
 
 retry_send_cmplt:
 	/* Send the completion */
-	ret = hv_vmbus_channel_send_packet(device->channel, &rx_comp_msg,
+	ret = hv_vmbus_channel_send_packet(chan, &rx_comp_msg,
 	    sizeof(nvsp_msg), tid, HV_VMBUS_PACKET_TYPE_COMPLETION, 0);
 	if (ret == 0) {
 		/* success */
@@ -970,12 +974,53 @@ retry_send_cmplt:
 }
 
 /*
+ * Net VSC receiving vRSS send table from VSP
+ */
+static void
+hv_nv_send_table(struct hv_device *device, hv_vm_packet_descriptor *pkt)
+{
+	netvsc_dev *net_dev;
+	nvsp_msg *nvsp_msg_pkt;
+	int i;
+	uint32_t count, *table;
+
+	net_dev = hv_nv_get_inbound_net_device(device);
+	if (!net_dev)
+        	return;
+
+	nvsp_msg_pkt =
+	    (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3));
+
+	if (nvsp_msg_pkt->hdr.msg_type !=
+	    nvsp_msg5_type_send_indirection_table) {
+		printf("Netvsc: !Warning! receive msg type not "
+			"send_indirection_table. type = %d\n",
+			nvsp_msg_pkt->hdr.msg_type);
+		return;
+	}
+
+	count = nvsp_msg_pkt->msgs.vers_5_msgs.send_table.count;
+	if (count != VRSS_SEND_TABLE_SIZE) {
+        	printf("Netvsc: Received wrong send table size: %u\n", count);
+	        return;
+	}
+
+	table = (uint32_t *)
+	    ((unsigned long)&nvsp_msg_pkt->msgs.vers_5_msgs.send_table +
+	     nvsp_msg_pkt->msgs.vers_5_msgs.send_table.offset);
+
+	for (i = 0; i < count; i++)
+        	net_dev->vrss_send_table[i] = table[i];
+}
+
+/*
  * Net VSC on channel callback
  */
 static void
-hv_nv_on_channel_callback(void *context)
+hv_nv_on_channel_callback(void *xchan)
 {
-	struct hv_device *device = (struct hv_device *)context;
+	struct hv_vmbus_channel *chan = xchan;
+	struct hv_device *device = chan->device;
 	netvsc_dev *net_dev;
 	device_t dev = device->device;
 	uint32_t bytes_rxed;
@@ -989,20 +1034,24 @@ hv_nv_on_channel_callback(void *context)
 	if (net_dev == NULL)
 		return;
 
-	buffer = net_dev->callback_buf;
+	buffer = chan->hv_chan_rdbuf;
 
 	do {
-		ret = hv_vmbus_channel_recv_packet_raw(device->channel,
+		ret = hv_vmbus_channel_recv_packet_raw(chan,
 		    buffer, bufferlen, &bytes_rxed, &request_id);
 		if (ret == 0) {
 			if (bytes_rxed > 0) {
 				desc = (hv_vm_packet_descriptor *)buffer;
 				switch (desc->type) {
 				case HV_VMBUS_PACKET_TYPE_COMPLETION:
-					hv_nv_on_send_completion(net_dev, device, desc);
+					hv_nv_on_send_completion(net_dev, device,
+					    chan, desc);
 					break;
 				case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES:
-					hv_nv_on_receive(net_dev, device, desc);
+					hv_nv_on_receive(net_dev, device, chan, desc);
+					break;
+				case HV_VMBUS_PACKET_TYPE_DATA_IN_BAND:
+					hv_nv_send_table(device, desc);
 					break;
 				default:
 					device_printf(dev,
@@ -1036,5 +1085,5 @@ hv_nv_on_channel_callback(void *context)
 	if (bufferlen > NETVSC_PACKET_SIZE)
 		free(buffer, M_NETVSC);
 
-	hv_rf_channel_rollup(net_dev);
+	hv_rf_channel_rollup(chan);
 }
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h
index 95dee17..7c43f64 100644
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.h
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
@@ -86,6 +86,92 @@ MALLOC_DECLARE(M_NETVSC);
  */
 #define NVSP_MAX_PACKETS_PER_RECEIVE            375
 
+/* vRSS stuff */
+#define RNDIS_OBJECT_TYPE_RSS_CAPABILITIES      0x88
+#define RNDIS_OBJECT_TYPE_RSS_PARAMETERS        0x89
+
+#define RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2     2
+#define RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2       2
+
+struct rndis_obj_header {
+        uint8_t type;
+        uint8_t rev;
+        uint16_t size;
+} __packed;
+
+/* rndis_recv_scale_cap/cap_flag */
+#define RNDIS_RSS_CAPS_MESSAGE_SIGNALED_INTERRUPTS      0x01000000
+#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_ISR            0x02000000
+#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_DPC            0x04000000
+#define RNDIS_RSS_CAPS_USING_MSI_X                      0x08000000
+#define RNDIS_RSS_CAPS_RSS_AVAILABLE_ON_PORTS           0x10000000
+#define RNDIS_RSS_CAPS_SUPPORTS_MSI_X                   0x20000000
+#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV4               0x00000100
+#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6               0x00000200
+#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6_EX            0x00000400
+
+/* RNDIS_RECEIVE_SCALE_CAPABILITIES */
+struct rndis_recv_scale_cap {
+        struct rndis_obj_header hdr;
+        uint32_t cap_flag;
+        uint32_t num_int_msg;
+        uint32_t num_recv_que;
+        uint16_t num_indirect_tabent;
+} __packed;
+
+/* rndis_recv_scale_param flags */
+#define RNDIS_RSS_PARAM_FLAG_BASE_CPU_UNCHANGED         0x0001
+#define RNDIS_RSS_PARAM_FLAG_HASH_INFO_UNCHANGED        0x0002
+#define RNDIS_RSS_PARAM_FLAG_ITABLE_UNCHANGED           0x0004
+#define RNDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED         0x0008
+#define RNDIS_RSS_PARAM_FLAG_DISABLE_RSS                0x0010
+
+/* Hash info bits */
+#define RNDIS_HASH_FUNC_TOEPLITZ                0x00000001
+#define RNDIS_HASH_IPV4                         0x00000100
+#define RNDIS_HASH_TCP_IPV4                     0x00000200
+#define RNDIS_HASH_IPV6                         0x00000400
+#define RNDIS_HASH_IPV6_EX                      0x00000800
+#define RNDIS_HASH_TCP_IPV6                     0x00001000
+#define RNDIS_HASH_TCP_IPV6_EX                  0x00002000
+
+#define RNDIS_RSS_INDIRECTION_TABLE_MAX_SIZE_REVISION_2 (128 * 4)
+#define RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2   40
+
+#define ITAB_NUM                                        128
+#define HASH_KEYLEN RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2
+
+/* RNDIS_RECEIVE_SCALE_PARAMETERS */
+typedef struct rndis_recv_scale_param_ {
+        struct rndis_obj_header hdr;
+
+        /* Qualifies the rest of the information */
+        uint16_t flag;
+
+        /* The base CPU number to do receive processing. not used */
+        uint16_t base_cpu_number;
+
+        /* This describes the hash function and type being enabled */
+        uint32_t hashinfo;
+
+        /* The size of indirection table array */
+        uint16_t indirect_tabsize;
+
+        /* The offset of the indirection table from the beginning of this
+         * structure
+         */
+        uint32_t indirect_taboffset;
+
+        /* The size of the hash secret key */
+        uint16_t hashkey_size;
+
+        /* The offset of the secret key from the beginning of this structure */
+        uint32_t hashkey_offset;
+
+        uint32_t processor_masks_offset;
+        uint32_t num_processor_masks;
+        uint32_t processor_masks_entry_size;
+} rndis_recv_scale_param;
 
 typedef enum nvsp_msg_type_ {
 	nvsp_msg_type_none                      = 0,
@@ -146,6 +232,27 @@ typedef enum nvsp_msg_type_ {
 
 	nvsp_msg_2_type_alloc_chimney_handle,
 	nvsp_msg_2_type_alloc_chimney_handle_complete,
+
+	nvsp_msg2_max = nvsp_msg_2_type_alloc_chimney_handle_complete,
+
+	/*
+	 * Version 4 Messages
+	 */
+	nvsp_msg4_type_send_vf_association,
+	nvsp_msg4_type_switch_data_path,
+	nvsp_msg4_type_uplink_connect_state_deprecated,
+
+	nvsp_msg4_max = nvsp_msg4_type_uplink_connect_state_deprecated,
+
+	/*
+	 * Version 5 Messages
+	 */
+	nvsp_msg5_type_oid_query_ex,
+	nvsp_msg5_type_oid_query_ex_comp,
+	nvsp_msg5_type_subchannel,
+	nvsp_msg5_type_send_indirection_table,
+
+	nvsp_msg5_max = nvsp_msg5_type_send_indirection_table,
 } nvsp_msg_type;
 
 typedef enum nvsp_status_ {
@@ -793,6 +900,39 @@ typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_
 	uint32_t                                status;
 } __packed nvsp_2_msg_send_vmq_rndis_pkt_complete;
 
+/*
+ * Version 5 messages
+ */
+enum nvsp_subchannel_operation {
+        NVSP_SUBCHANNEL_NONE = 0,
+        NVSP_SUBCHANNE_ALLOCATE,
+        NVSP_SUBCHANNE_MAX
+};
+
+typedef struct nvsp_5_subchannel_request_
+{
+        uint32_t                                op;
+        uint32_t                                num_subchannels;
+} __packed nvsp_5_subchannel_request;
+
+typedef struct nvsp_5_subchannel_complete_
+{
+        uint32_t                                status;
+        /* Actual number of subchannels allocated */
+        uint32_t                                num_subchannels;
+} __packed nvsp_5_subchannel_complete;
+
+typedef struct nvsp_5_send_indirect_table_
+{
+        /* The number of entries in the send indirection table */
+        uint32_t                                count;
+        /*
+         * The offset of the send indireciton table from top of
+         * this struct. The send indirection table tells which channel
+         * to put the send traffic on. Each entry is a channel number.
+         */
+        uint32_t                                offset;
+} __packed nvsp_5_send_indirect_table;
 
 typedef union nvsp_1_msg_uber_ {
 	nvsp_1_msg_send_ndis_version            send_ndis_vers;
@@ -838,11 +978,18 @@ typedef union nvsp_2_msg_uber_ {
 	nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete;
 } __packed nvsp_2_msg_uber;
 
+typedef union nvsp_5_msg_uber_
+{
+        nvsp_5_subchannel_request               subchannel_request;
+        nvsp_5_subchannel_complete              subchn_complete;
+        nvsp_5_send_indirect_table              send_table;
+} __packed nvsp_5_msg_uber;
 
 typedef union nvsp_all_msgs_ {
 	nvsp_msg_init_uber                      init_msgs;
 	nvsp_1_msg_uber                         vers_1_msgs;
 	nvsp_2_msg_uber                         vers_2_msgs;
+	nvsp_5_msg_uber				vers_5_msgs;
 } __packed nvsp_all_msgs;
 
 /*
@@ -883,6 +1030,7 @@ typedef struct nvsp_msg_ {
 #define NETVSC_MAX_CONFIGURABLE_MTU		(9 * 1024)
 
 #define NETVSC_PACKET_SIZE			PAGE_SIZE
+#define VRSS_SEND_TABLE_SIZE			16
 
 /*
  * Data types
@@ -893,7 +1041,6 @@ typedef struct nvsp_msg_ {
  */
 typedef struct netvsc_dev_ {
 	struct hv_device			*dev;
-	int					num_outstanding_sends;
 
 	/* Send buffer allocated by us but manages by NetVSP */
 	void					*send_buf;
@@ -924,12 +1071,15 @@ typedef struct netvsc_dev_ {
 	hv_bool_uint8_t				destroy;
 	/* Negotiated NVSP version */
 	uint32_t				nvsp_version;
-	
-	uint8_t					callback_buf[NETVSC_PACKET_SIZE]; 
+
+	uint32_t                                num_channel;
+
+	uint32_t                                vrss_send_table[VRSS_SEND_TABLE_SIZE];
 } netvsc_dev;
 
+struct hv_vmbus_channel;
 
-typedef void (*pfn_on_send_rx_completion)(void *);
+typedef void (*pfn_on_send_rx_completion)(struct hv_vmbus_channel *, void *);
 
 #define NETVSC_DEVICE_RING_BUFFER_SIZE	(128 * PAGE_SIZE)
 #define NETVSC_PACKET_MAXPAGE		32 
@@ -1000,10 +1150,12 @@ struct buf_ring;
 #endif
 
 struct hn_rx_ring {
-	struct lro_ctrl	hn_lro;
+	struct ifnet	*hn_ifp;
+	int		hn_rx_idx;
 
 	/* Trust csum verification on host side */
 	int		hn_trust_hcsum;	/* HN_TRUST_HCSUM_ */
+	struct lro_ctrl	hn_lro;
 
 	u_long		hn_csum_ip;
 	u_long		hn_csum_tcp;
@@ -1011,12 +1163,20 @@ struct hn_rx_ring {
 	u_long		hn_csum_trusted;
 	u_long		hn_lro_tried;
 	u_long		hn_small_pkts;
+	u_long		hn_pkts;
+	u_long		hn_rss_pkts;
+
+	/* Rarely used stuffs */
+	struct sysctl_oid *hn_rx_sysctl_tree;
+	int		hn_rx_flags;
 } __aligned(CACHE_LINE_SIZE);
 
 #define HN_TRUST_HCSUM_IP	0x0001
 #define HN_TRUST_HCSUM_TCP	0x0002
 #define HN_TRUST_HCSUM_UDP	0x0004
 
+#define HN_RX_FLAG_ATTACHED	0x1
+
 struct hn_tx_ring {
 #ifndef HN_USE_TXDESC_BUFRING
 	struct mtx	hn_txlist_spin;
@@ -1026,7 +1186,8 @@ struct hn_tx_ring {
 #endif
 	int		hn_txdesc_cnt;
 	int		hn_txdesc_avail;
-	int		hn_has_txeof;
+	u_short		hn_has_txeof;
+	u_short		hn_txdone_cnt;
 
 	int		hn_sched_tx;
 	void		(*hn_txeof)(struct hn_tx_ring *);
@@ -1034,8 +1195,13 @@ struct hn_tx_ring {
 	struct task	hn_tx_task;
 	struct task	hn_txeof_task;
 
+	struct buf_ring	*hn_mbuf_br;
+	int		hn_oactive;
+	int		hn_tx_idx;
+
 	struct mtx	hn_tx_lock;
 	struct hn_softc	*hn_sc;
+	struct hv_vmbus_channel *hn_chan;
 
 	int		hn_direct_tx_size;
 	int		hn_tx_chimney_size;
@@ -1046,14 +1212,19 @@ struct hn_tx_ring {
 	u_long		hn_send_failed;
 	u_long		hn_txdma_failed;
 	u_long		hn_tx_collapsed;
+	u_long		hn_tx_chimney_tried;
 	u_long		hn_tx_chimney;
+	u_long		hn_pkts;
 
 	/* Rarely used stuffs */
 	struct hn_txdesc *hn_txdesc;
 	bus_dma_tag_t	hn_tx_rndis_dtag;
 	struct sysctl_oid *hn_tx_sysctl_tree;
+	int		hn_tx_flags;
 } __aligned(CACHE_LINE_SIZE);
 
+#define HN_TX_FLAG_ATTACHED	0x1
+
 /*
  * Device-specific softc structure
  */
@@ -1073,13 +1244,18 @@ typedef struct hn_softc {
 	netvsc_dev  	*net_dev;
 
 	int		hn_rx_ring_cnt;
+	int		hn_rx_ring_inuse;
 	struct hn_rx_ring *hn_rx_ring;
 
 	int		hn_tx_ring_cnt;
+	int		hn_tx_ring_inuse;
 	struct hn_tx_ring *hn_tx_ring;
+
+	int		hn_cpu;
 	int		hn_tx_chimney_max;
 	struct taskqueue *hn_tx_taskq;
 	struct sysctl_oid *hn_tx_sysctl_tree;
+	struct sysctl_oid *hn_rx_sysctl_tree;
 } hn_softc_t;
 
 /*
@@ -1088,14 +1264,13 @@ typedef struct hn_softc {
 extern int hv_promisc_mode;
 
 void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status);
-void hv_nv_on_receive_completion(struct hv_device *device,
-    uint64_t tid, uint32_t status);
 netvsc_dev *hv_nv_on_device_add(struct hv_device *device,
     void *additional_info);
 int hv_nv_on_device_remove(struct hv_device *device,
     boolean_t destroy_channel);
-int hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt);
+int hv_nv_on_send(struct hv_vmbus_channel *chan, netvsc_packet *pkt);
 int hv_nv_get_next_send_section(netvsc_dev *net_dev);
+void hv_nv_subchan_attach(struct hv_vmbus_channel *chan);
 
 #endif  /* __HV_NET_VSC_H__ */
 
diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
index 0f4425e..f670c12 100644
--- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2010-2012 Citrix Inc.
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
  *
@@ -119,6 +119,8 @@ __FBSDID("$FreeBSD$");
 #include "hv_rndis.h"
 #include "hv_rndis_filter.h"
 
+#define hv_chan_rxr	hv_chan_priv1
+#define hv_chan_txr	hv_chan_priv2
 
 /* Short for Hyper-V network interface */
 #define NETVSC_DEVNAME    "hn"
@@ -136,8 +138,11 @@ __FBSDID("$FreeBSD$");
 
 #define HN_LROENT_CNT_DEF		128
 
+#define HN_RING_CNT_DEF_MAX		8
+
 #define HN_RNDIS_MSG_LEN		\
     (sizeof(rndis_msg) +		\
+     RNDIS_HASHVAL_PPI_SIZE +		\
      RNDIS_VLAN_PPI_SIZE +		\
      RNDIS_TSO_PPI_SIZE +		\
      RNDIS_CSUM_PPI_SIZE)
@@ -152,6 +157,8 @@ __FBSDID("$FreeBSD$");
 
 #define HN_DIRECT_TX_SIZE_DEF		128
 
+#define HN_EARLY_TXEOF_THRESH		8
+
 struct hn_txdesc {
 #ifndef HN_USE_TXDESC_BUFRING
 	SLIST_ENTRY(hn_txdesc) link;
@@ -180,6 +187,7 @@ struct hn_txdesc {
 #define HN_CSUM_ASSIST_WIN8	(CSUM_IP | CSUM_TCP)
 #define HN_CSUM_ASSIST		(CSUM_IP | CSUM_UDP | CSUM_TCP)
 
+#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
 /* YYY 2*MTU is a bit rough, but should be good enough. */
 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
@@ -208,7 +216,8 @@ struct hn_txdesc {
 
 int hv_promisc_mode = 0;    /* normal mode by default */
 
-SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface");
+SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+    "Hyper-V network interface");
 
 /* Trust tcp segements verification on host side. */
 static int hn_trust_hosttcp = 1;
@@ -231,12 +240,10 @@ SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
     "Trust ip packet verification on host side, "
     "when csum info is missing (global setting)");
 
-#if __FreeBSD_version >= 1100045
 /* Limit TSO burst size */
 static int hn_tso_maxlen = 0;
 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
     &hn_tso_maxlen, 0, "TSO burst limit");
-#endif
 
 /* Limit chimney send size */
 static int hn_tx_chimney_size = 0;
@@ -274,6 +281,25 @@ static int hn_bind_tx_taskq = -1;
 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
     &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
 
+static int hn_use_if_start = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
+    &hn_use_if_start, 0, "Use if_start TX method");
+
+static int hn_chan_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
+    &hn_chan_cnt, 0,
+    "# of channels to use; each channel has one RX ring and one TX ring");
+
+static int hn_tx_ring_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
+    &hn_tx_ring_cnt, 0, "# of TX rings to use");
+
+static int hn_tx_swq_depth = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
+    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
+
+static u_int hn_cpu_index;
+
 /*
  * Forward declarations
  */
@@ -303,15 +329,45 @@ static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_check_iplen(const struct mbuf *, int);
 static int hn_create_tx_ring(struct hn_softc *, int);
 static void hn_destroy_tx_ring(struct hn_tx_ring *);
-static int hn_create_tx_data(struct hn_softc *);
+static int hn_create_tx_data(struct hn_softc *, int);
 static void hn_destroy_tx_data(struct hn_softc *);
 static void hn_start_taskfunc(void *, int);
 static void hn_start_txeof_taskfunc(void *, int);
 static void hn_stop_tx_tasks(struct hn_softc *);
 static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
-static void hn_create_rx_data(struct hn_softc *sc);
+static void hn_create_rx_data(struct hn_softc *sc, int);
 static void hn_destroy_rx_data(struct hn_softc *sc);
 static void hn_set_tx_chimney_size(struct hn_softc *, int);
+static void hn_channel_attach(struct hn_softc *, struct hv_vmbus_channel *);
+static void hn_subchan_attach(struct hn_softc *, struct hv_vmbus_channel *);
+
+static int hn_transmit(struct ifnet *, struct mbuf *);
+static void hn_xmit_qflush(struct ifnet *);
+static int hn_xmit(struct hn_tx_ring *, int);
+static void hn_xmit_txeof(struct hn_tx_ring *);
+static void hn_xmit_taskfunc(void *, int);
+static void hn_xmit_txeof_taskfunc(void *, int);
+
+#if __FreeBSD_version >= 1100099
+static void
+hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
+{
+	int i;
+
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
+		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
+}
+#endif
+
+static int
+hn_get_txswq_depth(const struct hn_tx_ring *txr)
+{
+
+	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
+	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
+		return txr->hn_txdesc_cnt;
+	return hn_tx_swq_depth;
+}
 
 static int
 hn_ifmedia_upd(struct ifnet *ifp __unused)
@@ -353,7 +409,7 @@ netvsc_probe(device_t dev)
 
 	p = vmbus_get_type(dev);
 	if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
-		device_set_desc(dev, "Synthetic Network Interface");
+		device_set_desc(dev, "Hyper-V Network Interface");
 		if (bootverbose)
 			printf("Netvsc probe... DONE \n");
 
@@ -386,21 +442,16 @@ static int
 netvsc_attach(device_t dev)
 {
 	struct hv_device *device_ctx = vmbus_get_devctx(dev);
+	struct hv_vmbus_channel *pri_chan;
 	netvsc_device_info device_info;
 	hn_softc_t *sc;
 	int unit = device_get_unit(dev);
 	struct ifnet *ifp = NULL;
-	int error;
-#if __FreeBSD_version >= 1100045
+	int error, ring_cnt, tx_ring_cnt;
 	int tso_maxlen;
-#endif
 
 	sc = device_get_softc(dev);
-	if (sc == NULL) {
-		return (ENOMEM);
-	}
 
-	bzero(sc, sizeof(hn_softc_t));
 	sc->hn_unit = unit;
 	sc->hn_dev = dev;
 
@@ -431,26 +482,69 @@ netvsc_attach(device_t dev)
 
 	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
 	ifp->if_softc = sc;
+	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+	/*
+	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
+	 * to use (tx_ring_cnt).
+	 *
+	 * NOTE:
+	 * The # of RX rings to use is same as the # of channels to use.
+	 */
+	ring_cnt = hn_chan_cnt;
+	if (ring_cnt <= 0) {
+		/* Default */
+		ring_cnt = mp_ncpus;
+		if (ring_cnt > HN_RING_CNT_DEF_MAX)
+			ring_cnt = HN_RING_CNT_DEF_MAX;
+	} else if (ring_cnt > mp_ncpus) {
+		ring_cnt = mp_ncpus;
+	}
+
+	tx_ring_cnt = hn_tx_ring_cnt;
+	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
+		tx_ring_cnt = ring_cnt;
+	if (hn_use_if_start) {
+		/* ifnet.if_start only needs one TX ring. */
+		tx_ring_cnt = 1;
+	}
+
+	/*
+	 * Set the leader CPU for channels.
+	 */
+	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
 
-	error = hn_create_tx_data(sc);
+	error = hn_create_tx_data(sc, tx_ring_cnt);
 	if (error)
 		goto failed;
+	hn_create_rx_data(sc, ring_cnt);
 
-	hn_create_rx_data(sc);
-
-	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
-	ifp->if_dunit = unit;
-	ifp->if_dname = NETVSC_DEVNAME;
+	/*
+	 * Associate the first TX/RX ring w/ the primary channel.
+	 */
+	pri_chan = device_ctx->channel;
+	KASSERT(HV_VMBUS_CHAN_ISPRIMARY(pri_chan), ("not primary channel"));
+	KASSERT(pri_chan->offer_msg.offer.sub_channel_index == 0,
+	    ("primary channel subidx %u",
+	     pri_chan->offer_msg.offer.sub_channel_index));
+	hn_channel_attach(sc, pri_chan);
 
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = hn_ioctl;
-	ifp->if_start = hn_start;
 	ifp->if_init = hn_ifinit;
 	/* needed by hv_rf_on_device_add() code */
 	ifp->if_mtu = ETHERMTU;
-	IFQ_SET_MAXLEN(&ifp->if_snd, 512);
-	ifp->if_snd.ifq_drv_maxlen = 511;
-	IFQ_SET_READY(&ifp->if_snd);
+	if (hn_use_if_start) {
+		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
+
+		ifp->if_start = hn_start;
+		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
+		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
+		IFQ_SET_READY(&ifp->if_snd);
+	} else {
+		ifp->if_transmit = hn_transmit;
+		ifp->if_qflush = hn_xmit_qflush;
+	}
 
 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
@@ -470,15 +564,58 @@ netvsc_attach(device_t dev)
 	    IFCAP_LRO;
 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO;
 
-	error = hv_rf_on_device_add(device_ctx, &device_info);
+	error = hv_rf_on_device_add(device_ctx, &device_info, ring_cnt);
 	if (error)
 		goto failed;
+	KASSERT(sc->net_dev->num_channel > 0 &&
+	    sc->net_dev->num_channel <= sc->hn_rx_ring_inuse,
+	    ("invalid channel count %u, should be less than %d",
+	     sc->net_dev->num_channel, sc->hn_rx_ring_inuse));
+
+	/*
+	 * Set the # of TX/RX rings that could be used according to
+	 * the # of channels that host offered.
+	 */
+	if (sc->hn_tx_ring_inuse > sc->net_dev->num_channel)
+		sc->hn_tx_ring_inuse = sc->net_dev->num_channel;
+	sc->hn_rx_ring_inuse = sc->net_dev->num_channel;
+	device_printf(dev, "%d TX ring, %d RX ring\n",
+	    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
+
+	if (sc->net_dev->num_channel > 1) {
+		struct hv_vmbus_channel **subchan;
+		int subchan_cnt = sc->net_dev->num_channel - 1;
+		int i;
+
+		/* Wait for sub-channels setup to complete. */
+		subchan = vmbus_get_subchan(pri_chan, subchan_cnt);
+
+		/* Attach the sub-channels. */
+		for (i = 0; i < subchan_cnt; ++i) {
+			/* NOTE: Calling order is critical. */
+			hn_subchan_attach(sc, subchan[i]);
+			hv_nv_subchan_attach(subchan[i]);
+		}
+
+		/* Release the sub-channels */
+		vmbus_rel_subchan(subchan, subchan_cnt);
+		device_printf(dev, "%d sub-channels setup done\n", subchan_cnt);
+	}
+
+#if __FreeBSD_version >= 1100099
+	if (sc->hn_rx_ring_inuse > 1) {
+		/*
+		 * Reduce TCP segment aggregation limit for multiple
+		 * RX rings to increase ACK timeliness.
+		 */
+		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
+	}
+#endif
 
 	if (device_info.link_state == 0) {
 		sc->hn_carrier = 1;
 	}
 
-#if __FreeBSD_version >= 1100045
 	tso_maxlen = hn_tso_maxlen;
 	if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
 		tso_maxlen = IP_MAXPACKET;
@@ -487,14 +624,11 @@ netvsc_attach(device_t dev)
 	ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
 	ifp->if_hw_tsomax = tso_maxlen -
 	    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
-#endif
 
 	ether_ifattach(ifp, device_info.mac_addr);
 
-#if __FreeBSD_version >= 1100045
 	if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
 	    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
-#endif
 
 	sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
 	hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max);
@@ -674,8 +808,15 @@ hn_txdesc_hold(struct hn_txdesc *txd)
 	atomic_add_int(&txd->refs, 1);
 }
 
+static __inline void
+hn_txeof(struct hn_tx_ring *txr)
+{
+	txr->hn_has_txeof = 0;
+	txr->hn_txeof(txr);
+}
+
 static void
-hn_tx_done(void *xpkt)
+hn_tx_done(struct hv_vmbus_channel *chan, void *xpkt)
 {
 	netvsc_packet *packet = xpkt;
 	struct hn_txdesc *txd;
@@ -685,17 +826,28 @@ hn_tx_done(void *xpkt)
 	    packet->compl.send.send_completion_tid;
 
 	txr = txd->txr;
+	KASSERT(txr->hn_chan == chan,
+	    ("channel mismatch, on channel%u, should be channel%u",
+	     chan->offer_msg.offer.sub_channel_index,
+	     txr->hn_chan->offer_msg.offer.sub_channel_index));
+
 	txr->hn_has_txeof = 1;
 	hn_txdesc_put(txr, txd);
+
+	++txr->hn_txdone_cnt;
+	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
+		txr->hn_txdone_cnt = 0;
+		if (txr->hn_oactive)
+			hn_txeof(txr);
+	}
 }
 
 void
-netvsc_channel_rollup(struct hv_device *device_ctx)
+netvsc_channel_rollup(struct hv_vmbus_channel *chan)
 {
-	struct hn_softc *sc = device_get_softc(device_ctx->device);
-	struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */
+	struct hn_tx_ring *txr = chan->hv_chan_txr;
 #if defined(INET) || defined(INET6)
-	struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
+	struct hn_rx_ring *rxr = chan->hv_chan_rxr;
 	struct lro_ctrl *lro = &rxr->hn_lro;
 	struct lro_entry *queued;
 
@@ -705,11 +857,16 @@ netvsc_channel_rollup(struct hv_device *device_ctx)
 	}
 #endif
 
-	if (!txr->hn_has_txeof)
+	/*
+	 * NOTE:
+	 * 'txr' could be NULL, if multiple channels and
+	 * ifnet.if_start method are enabled.
+	 */
+	if (txr == NULL || !txr->hn_has_txeof)
 		return;
 
-	txr->hn_has_txeof = 0;
-	txr->hn_txeof(txr);
+	txr->hn_txdone_cnt = 0;
+	hn_txeof(txr);
 }
 
 /*
@@ -726,6 +883,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
 	rndis_msg *rndis_mesg;
 	rndis_packet *rndis_pkt;
 	rndis_per_packet_info *rppi;
+	struct rndis_hash_value *hash_value;
 	uint32_t rndis_msg_size;
 
 	packet = &txd->netvsc_pkt;
@@ -750,6 +908,18 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
 
 	rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);
 
+	/*
+	 * Set the hash value for this packet, so that the host could
+	 * dispatch the TX done event for this packet back to this TX
+	 * ring's channel.
+	 */
+	rndis_msg_size += RNDIS_HASHVAL_PPI_SIZE;
+	rppi = hv_set_rppi_data(rndis_mesg, RNDIS_HASHVAL_PPI_SIZE,
+	    nbl_hash_value);
+	hash_value = (struct rndis_hash_value *)((uint8_t *)rppi +
+	    rppi->per_packet_info_offset);
+	hash_value->hash_value = txr->hn_tx_idx;
+
 	if (m_head->m_flags & M_VLANTAG) {
 		ndis_8021q_info *rppi_vlan_info;
 
@@ -851,6 +1021,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
 		netvsc_dev *net_dev = txr->hn_sc->net_dev;
 		uint32_t send_buf_section_idx;
 
+		txr->hn_tx_chimney_tried++;
 		send_buf_section_idx =
 		    hv_nv_get_next_send_section(net_dev);
 		if (send_buf_section_idx !=
@@ -932,8 +1103,7 @@ done:
  * associated w/ the txd will _not_ be freed.
  */
 static int
-hn_send_pkt(struct ifnet *ifp, struct hv_device *device_ctx,
-    struct hn_tx_ring *txr, struct hn_txdesc *txd)
+hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
 {
 	int error, send_failed = 0;
 
@@ -942,10 +1112,17 @@ again:
 	 * Make sure that txd is not freed before ETHER_BPF_MTAP.
 	 */
 	hn_txdesc_hold(txd);
-	error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt);
+	error = hv_nv_on_send(txr->hn_chan, &txd->netvsc_pkt);
 	if (!error) {
 		ETHER_BPF_MTAP(ifp, txd->m);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+		if (!hn_use_if_start) {
+			if_inc_counter(ifp, IFCOUNTER_OBYTES,
+			    txd->m->m_pkthdr.len);
+			if (txd->m->m_flags & M_MCAST)
+				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+		}
+		txr->hn_pkts++;
 	}
 	hn_txdesc_put(txr, txd);
 
@@ -996,8 +1173,9 @@ hn_start_locked(struct hn_tx_ring *txr, int len)
 {
 	struct hn_softc *sc = txr->hn_sc;
 	struct ifnet *ifp = sc->hn_ifp;
-	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
 
+	KASSERT(hn_use_if_start,
+	    ("hn_start_locked is called, when if_start is disabled"));
 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 
@@ -1038,7 +1216,7 @@ hn_start_locked(struct hn_tx_ring *txr, int len)
 			continue;
 		}
 
-		error = hn_send_pkt(ifp, device_ctx, txr, txd);
+		error = hn_send_pkt(ifp, txr, txd);
 		if (__predict_false(error)) {
 			/* txd is freed, but m_head is not */
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
@@ -1057,10 +1235,6 @@ netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
 {
 	hn_softc_t *sc = device_get_softc(device_obj->device);
 
-	if (sc == NULL) {
-		return;
-	}
-
 	if (status == 1) {
 		sc->hn_carrier = 1;
 	} else {
@@ -1133,26 +1307,18 @@ hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
  * Note:  This is no longer used as a callback
  */
 int
-netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
-    rndis_tcp_ip_csum_info *csum_info)
+netvsc_recv(struct hv_vmbus_channel *chan, netvsc_packet *packet,
+    const rndis_tcp_ip_csum_info *csum_info,
+    const struct rndis_hash_info *hash_info,
+    const struct rndis_hash_value *hash_value)
 {
-	struct hn_softc *sc = device_get_softc(device_ctx->device);
-	struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
+	struct hn_rx_ring *rxr = chan->hv_chan_rxr;
+	struct ifnet *ifp = rxr->hn_ifp;
 	struct mbuf *m_new;
-	struct ifnet *ifp;
 	int size, do_lro = 0, do_csum = 1;
 
-	if (sc == NULL) {
-		return (0); /* TODO: KYS how can this be! */
-	}
-
-	ifp = sc->hn_ifp;
-	
-	ifp = sc->arpcom.ac_ifp;
-
-	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
 		return (0);
-	}
 
 	/*
 	 * Bail out if packet contains more data than configured MTU.
@@ -1161,8 +1327,10 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
 		return (0);
 	} else if (packet->tot_data_buf_len <= MHLEN) {
 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
-		if (m_new == NULL)
+		if (m_new == NULL) {
+			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			return (0);
+		}
 		memcpy(mtod(m_new, void *), packet->data,
 		    packet->tot_data_buf_len);
 		m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
@@ -1182,7 +1350,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
 
 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 		if (m_new == NULL) {
-			if_printf(ifp, "alloc mbuf failed.\n");
+			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			return (0);
 		}
 
@@ -1251,7 +1419,6 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 					m_new->m_pkthdr.csum_data = 0xffff;
 				}
-				/* Rely on SW csum verification though... */
 				do_lro = 1;
 			} else if (pr == IPPROTO_UDP) {
 				if (do_csum &&
@@ -1278,12 +1445,58 @@ skip:
 		m_new->m_flags |= M_VLANTAG;
 	}
 
+	if (hash_info != NULL && hash_value != NULL) {
+		int hash_type = M_HASHTYPE_OPAQUE;
+
+		rxr->hn_rss_pkts++;
+		m_new->m_pkthdr.flowid = hash_value->hash_value;
+		if ((hash_info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
+		    NDIS_HASH_FUNCTION_TOEPLITZ) {
+			uint32_t type =
+			    (hash_info->hash_info & NDIS_HASH_TYPE_MASK);
+
+			switch (type) {
+			case NDIS_HASH_IPV4:
+				hash_type = M_HASHTYPE_RSS_IPV4;
+				break;
+
+			case NDIS_HASH_TCP_IPV4:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
+				break;
+
+			case NDIS_HASH_IPV6:
+				hash_type = M_HASHTYPE_RSS_IPV6;
+				break;
+
+			case NDIS_HASH_IPV6_EX:
+				hash_type = M_HASHTYPE_RSS_IPV6_EX;
+				break;
+
+			case NDIS_HASH_TCP_IPV6:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
+				break;
+
+			case NDIS_HASH_TCP_IPV6_EX:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
+				break;
+			}
+		}
+		M_HASHTYPE_SET(m_new, hash_type);
+	} else {
+		if (hash_value != NULL)
+			m_new->m_pkthdr.flowid = hash_value->hash_value;
+		else
+			m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
+		M_HASHTYPE_SET(m_new, M_HASHTYPE_OPAQUE);
+	}
+
 	/*
 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
 	 * messages (not just data messages) will trigger a response.
 	 */
 
 	ifp->if_ipackets++;
+	rxr->hn_pkts++;
 
 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
 #if defined(INET) || defined(INET6)
@@ -1305,11 +1518,6 @@ skip:
 	return (0);
 }
 
-void
-netvsc_recv_rollup(struct hv_device *device_ctx __unused)
-{
-}
-
 /*
  * Rules for using sc->temp_unusable:
  * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
@@ -1373,13 +1581,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 		 */
 		NV_LOCK(sc);
 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
-		    HN_LRO_LENLIM_MIN(ifp)) {
-			int i;
-			for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
-				sc->hn_rx_ring[i].hn_lro.lro_length_lim =
-				    HN_LRO_LENLIM_MIN(ifp);
-			}
-		}
+		    HN_LRO_LENLIM_MIN(ifp))
+			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
 		NV_UNLOCK(sc);
 #endif
 
@@ -1412,7 +1615,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 			NV_UNLOCK(sc);
 			break;
 		}
-		error = hv_rf_on_device_add(hn_dev, &device_info);
+		error = hv_rf_on_device_add(hn_dev, &device_info,
+		    sc->hn_rx_ring_inuse);
 		if (error) {
 			NV_LOCK(sc);
 			sc->temp_unusable = FALSE;
@@ -1555,7 +1759,7 @@ static void
 hn_stop(hn_softc_t *sc)
 {
 	struct ifnet *ifp;
-	int ret;
+	int ret, i;
 	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
 
 	ifp = sc->hn_ifp;
@@ -1565,6 +1769,9 @@ hn_stop(hn_softc_t *sc)
 
 	atomic_clear_int(&ifp->if_drv_flags,
 	    (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+		sc->hn_tx_ring[i].hn_oactive = 0;
+
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	sc->hn_initdone = 0;
 
@@ -1637,7 +1844,7 @@ hn_ifinit_locked(hn_softc_t *sc)
 {
 	struct ifnet *ifp;
 	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
-	int ret;
+	int ret, i;
 
 	ifp = sc->hn_ifp;
 
@@ -1653,7 +1860,11 @@ hn_ifinit_locked(hn_softc_t *sc)
 	} else {
 		sc->hn_initdone = 1;
 	}
+
 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+		sc->hn_tx_ring[i].hn_oactive = 0;
+
 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
 	if_link_state_change(ifp, LINK_STATE_UP);
 }
@@ -1704,7 +1915,7 @@ hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct hn_softc *sc = arg1;
 	unsigned int lenlim;
-	int error, i;
+	int error;
 
 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
@@ -1716,8 +1927,7 @@ hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
 		return EINVAL;
 
 	NV_LOCK(sc);
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
-		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
+	hn_set_lro_lenlim(sc, lenlim);
 	NV_UNLOCK(sc);
 	return 0;
 }
@@ -1746,7 +1956,7 @@ hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
 	 */
 	--ackcnt;
 	NV_LOCK(sc);
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
 	NV_UNLOCK(sc);
 	return 0;
@@ -1770,7 +1980,7 @@ hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
 		return error;
 
 	NV_LOCK(sc);
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 
 		if (on)
@@ -1810,7 +2020,7 @@ hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
 	uint64_t stat;
 
 	stat = 0;
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		stat += *((int *)((uint8_t *)rxr + ofs));
 	}
@@ -1820,7 +2030,7 @@ hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
 		return error;
 
 	/* Zero out this stat. */
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		*((int *)((uint8_t *)rxr + ofs)) = 0;
 	}
@@ -1836,7 +2046,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
 	uint64_t stat;
 
 	stat = 0;
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
 	}
@@ -1846,7 +2056,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
 		return error;
 
 	/* Zero out this stat. */
-	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
 		rxr = &sc->hn_rx_ring[i];
 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
 	}
@@ -1890,7 +2100,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 	u_long stat;
 
 	stat = 0;
-	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		stat += *((u_long *)((uint8_t *)txr + ofs));
 	}
@@ -1900,7 +2110,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 		return error;
 
 	/* Zero out this stat. */
-	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
 	}
@@ -1922,7 +2132,7 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
 		return error;
 
 	NV_LOCK(sc);
-	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		txr = &sc->hn_tx_ring[i];
 		*((int *)((uint8_t *)txr + ofs)) = conf;
 	}
@@ -2019,7 +2229,7 @@ hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 }
 
 static void
-hn_create_rx_data(struct hn_softc *sc)
+hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
@@ -2031,7 +2241,9 @@ hn_create_rx_data(struct hn_softc *sc)
 #endif
 	int i;
 
-	sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */
+	sc->hn_rx_ring_cnt = ring_cnt;
+	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
+
 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
 	    M_NETVSC, M_WAITOK | M_ZERO);
 
@@ -2044,6 +2256,13 @@ hn_create_rx_data(struct hn_softc *sc)
 #endif
 #endif	/* INET || INET6 */
 
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	/* Create dev.hn.UNIT.rx sysctl tree */
+	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
+	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 
@@ -2053,6 +2272,8 @@ hn_create_rx_data(struct hn_softc *sc)
 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
 		if (hn_trust_hostip)
 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
+		rxr->hn_ifp = sc->hn_ifp;
+		rxr->hn_rx_idx = i;
 
 		/*
 		 * Initialize LRO.
@@ -2069,13 +2290,35 @@ hn_create_rx_data(struct hn_softc *sc)
 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
 #endif
 #endif	/* INET || INET6 */
-	}
 
-	ctx = device_get_sysctl_ctx(dev);
-	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+		if (sc->hn_rx_sysctl_tree != NULL) {
+			char name[16];
+
+			/*
+			 * Create per RX ring sysctl tree:
+			 * dev.hn.UNIT.rx.RINGID
+			 */
+			snprintf(name, sizeof(name), "%d", i);
+			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
+			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
+			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+			if (rxr->hn_rx_sysctl_tree != NULL) {
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "packets", CTLFLAG_RW,
+				    &rxr->hn_pkts, "# of packets received");
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
+				    &rxr->hn_rss_pkts,
+				    "# of packets w/ RSS info received");
+			}
+		}
+	}
 
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
-	    CTLTYPE_U64 | CTLFLAG_RW, sc,
+	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
 #if __FreeBSD_version < 1100095
 	    hn_rx_stat_int_sysctl,
@@ -2084,7 +2327,7 @@ hn_create_rx_data(struct hn_softc *sc)
 #endif
 	    "LU", "LRO queued");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
-	    CTLTYPE_U64 | CTLFLAG_RW, sc,
+	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
 #if __FreeBSD_version < 1100095
 	    hn_rx_stat_int_sysctl,
@@ -2093,53 +2336,59 @@ hn_create_rx_data(struct hn_softc *sc)
 #endif
 	    "LU", "LRO flushed");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
 #if __FreeBSD_version >= 1100099
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
-	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU",
+	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_lro_lenlim_sysctl, "IU",
 	    "Max # of data bytes to be aggregated by LRO");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_lro_ackcnt_sysctl, "I",
 	    "Max # of ACKs to be aggregated by LRO");
 #endif
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust tcp segement verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust udp datagram verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
 	    hn_trust_hcsum_sysctl, "I",
 	    "Trust ip packet verification on host side, "
 	    "when csum info is missing");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
 	    hn_rx_stat_ulong_sysctl, "LU",
 	    "# of packets that we trust host's csum verification");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
+	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
+	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
 }
 
 static void
@@ -2160,6 +2409,7 @@ hn_destroy_rx_data(struct hn_softc *sc)
 	sc->hn_rx_ring = NULL;
 
 	sc->hn_rx_ring_cnt = 0;
+	sc->hn_rx_ring_inuse = 0;
 }
 
 static int
@@ -2170,6 +2420,7 @@ hn_create_tx_ring(struct hn_softc *sc, int id)
 	int error, i;
 
 	txr->hn_sc = sc;
+	txr->hn_tx_idx = id;
 
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
@@ -2187,8 +2438,22 @@ hn_create_tx_ring(struct hn_softc *sc, int id)
 #endif
 
 	txr->hn_tx_taskq = sc->hn_tx_taskq;
-	TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
-	TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
+
+	if (hn_use_if_start) {
+		txr->hn_txeof = hn_start_txeof;
+		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
+		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
+	} else {
+		int br_depth;
+
+		txr->hn_txeof = hn_xmit_txeof;
+		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
+		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
+
+		br_depth = hn_get_txswq_depth(txr);
+		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
+		    M_WAITOK, &txr->hn_tx_lock);
+	}
 
 	txr->hn_direct_tx_size = hn_direct_tx_size;
 	if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
@@ -2202,8 +2467,6 @@ hn_create_tx_ring(struct hn_softc *sc, int id)
 	 */
 	txr->hn_sched_tx = 1;
 
-	txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */
-
 	parent_dtag = bus_get_dma_tag(sc->hn_dev);
 
 	/* DMA tag for RNDIS messages. */
@@ -2312,7 +2575,7 @@ hn_create_tx_ring(struct hn_softc *sc, int id)
 
 		snprintf(name, sizeof(name), "%d", id);
 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
-		    name, CTLFLAG_RD, 0, "");
+		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 		if (txr->hn_tx_sysctl_tree != NULL) {
 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
@@ -2320,6 +2583,14 @@ hn_create_tx_ring(struct hn_softc *sc, int id)
 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
 			    "# of available TX descs");
+			if (!hn_use_if_start) {
+				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
+				    CTLFLAG_RD, &txr->hn_oactive, 0,
+				    "over active");
+			}
+			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
+			    CTLFLAG_RW, &txr->hn_pkts,
+			    "# of packets transmitted");
 		}
 	}
 
@@ -2354,8 +2625,10 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr)
 		hn_txdesc_dmamap_destroy(txd);
 	}
 #else
+	mtx_lock(&txr->hn_tx_lock);
 	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
 		hn_txdesc_dmamap_destroy(txd);
+	mtx_unlock(&txr->hn_tx_lock);
 #endif
 
 	if (txr->hn_tx_data_dtag != NULL)
@@ -2370,6 +2643,9 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr)
 	free(txr->hn_txdesc, M_NETVSC);
 	txr->hn_txdesc = NULL;
 
+	if (txr->hn_mbuf_br != NULL)
+		buf_ring_free(txr->hn_mbuf_br, M_NETVSC);
+
 #ifndef HN_USE_TXDESC_BUFRING
 	mtx_destroy(&txr->hn_txlist_spin);
 #endif
@@ -2377,13 +2653,15 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr)
 }
 
 static int
-hn_create_tx_data(struct hn_softc *sc)
+hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	int i;
 
-	sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */
+	sc->hn_tx_ring_cnt = ring_cnt;
+	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
+
 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
 	    M_NETVSC, M_WAITOK | M_ZERO);
 
@@ -2392,7 +2670,7 @@ hn_create_tx_data(struct hn_softc *sc)
 
 	/* Create dev.hn.UNIT.tx sysctl tree */
 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
-	    CTLFLAG_RD, 0, "");
+	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 		int error;
@@ -2403,25 +2681,29 @@ hn_create_tx_data(struct hn_softc *sc)
 	}
 
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_send_failed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
-	    CTLTYPE_ULONG | CTLFLAG_RW, sc,
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
 	    "# of total TX descs");
@@ -2429,19 +2711,24 @@ hn_create_tx_data(struct hn_softc *sc)
 	    CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
 	    "Chimney send packet size upper boundary");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_tx_chimney_size_sysctl,
 	    "I", "Chimney send packet size limit");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
-	    CTLTYPE_INT | CTLFLAG_RW, sc,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
 	    hn_tx_conf_int_sysctl, "I",
 	    "Size of the packet for direct transmission");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
-	    CTLTYPE_INT | CTLFLAG_RW, sc,
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
 	    hn_tx_conf_int_sysctl, "I",
 	    "Always schedule transmission "
 	    "instead of doing direct transmission");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
+	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
+	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
 
 	return 0;
 }
@@ -2452,7 +2739,7 @@ hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size)
 	int i;
 
 	NV_LOCK(sc);
-	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 		sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size;
 	NV_UNLOCK(sc);
 }
@@ -2472,6 +2759,7 @@ hn_destroy_tx_data(struct hn_softc *sc)
 	sc->hn_tx_ring = NULL;
 
 	sc->hn_tx_ring_cnt = 0;
+	sc->hn_tx_ring_inuse = 0;
 }
 
 static void
@@ -2500,7 +2788,7 @@ hn_stop_tx_tasks(struct hn_softc *sc)
 {
 	int i;
 
-	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 
 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
@@ -2508,6 +2796,224 @@ hn_stop_tx_tasks(struct hn_softc *sc)
 	}
 }
 
+static int
+hn_xmit(struct hn_tx_ring *txr, int len)
+{
+	struct hn_softc *sc = txr->hn_sc;
+	struct ifnet *ifp = sc->hn_ifp;
+	struct mbuf *m_head;
+
+	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+	KASSERT(hn_use_if_start == 0,
+	    ("hn_xmit is called, when if_start is enabled"));
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
+		return 0;
+
+	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
+		struct hn_txdesc *txd;
+		int error;
+
+		if (len > 0 && m_head->m_pkthdr.len > len) {
+			/*
+			 * This sending could be time consuming; let callers
+			 * dispatch this packet sending (and sending of any
+			 * following up packets) to tx taskqueue.
+			 */
+			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+			return 1;
+		}
+
+		txd = hn_txdesc_get(txr);
+		if (txd == NULL) {
+			txr->hn_no_txdescs++;
+			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+			txr->hn_oactive = 1;
+			break;
+		}
+
+		error = hn_encap(txr, txd, &m_head);
+		if (error) {
+			/* Both txd and m_head are freed; discard */
+			drbr_advance(ifp, txr->hn_mbuf_br);
+			continue;
+		}
+
+		error = hn_send_pkt(ifp, txr, txd);
+		if (__predict_false(error)) {
+			/* txd is freed, but m_head is not */
+			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+			txr->hn_oactive = 1;
+			break;
+		}
+
+		/* Sent */
+		drbr_advance(ifp, txr->hn_mbuf_br);
+	}
+	return 0;
+}
+
+static int
+hn_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	struct hn_tx_ring *txr;
+	int error, idx = 0;
+
+	/*
+	 * Select the TX ring based on flowid
+	 */
+	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
+	txr = &sc->hn_tx_ring[idx];
+
+	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
+	if (error) {
+		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+		return error;
+	}
+
+	if (txr->hn_oactive)
+		return 0;
+
+	if (txr->hn_sched_tx)
+		goto do_sched;
+
+	if (mtx_trylock(&txr->hn_tx_lock)) {
+		int sched;
+
+		sched = hn_xmit(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (!sched)
+			return 0;
+	}
+do_sched:
+	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+	return 0;
+}
+
+static void
+hn_xmit_qflush(struct ifnet *ifp)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	int i;
+
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+		struct mbuf *m;
+
+		mtx_lock(&txr->hn_tx_lock);
+		while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
+			m_freem(m);
+		mtx_unlock(&txr->hn_tx_lock);
+	}
+	if_qflush(ifp);
+}
+
+static void
+hn_xmit_txeof(struct hn_tx_ring *txr)
+{
+
+	if (txr->hn_sched_tx)
+		goto do_sched;
+
+	if (mtx_trylock(&txr->hn_tx_lock)) {
+		int sched;
+
+		txr->hn_oactive = 0;
+		sched = hn_xmit(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (sched) {
+			taskqueue_enqueue(txr->hn_tx_taskq,
+			    &txr->hn_tx_task);
+		}
+	} else {
+do_sched:
+		/*
+		 * Release the oactive earlier, with the hope, that
+		 * others could catch up.  The task will clear the
+		 * oactive again with the hn_tx_lock to avoid possible
+		 * races.
+		 */
+		txr->hn_oactive = 0;
+		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+	}
+}
+
+static void
+hn_xmit_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *txr = xtxr;
+
+	mtx_lock(&txr->hn_tx_lock);
+	hn_xmit(txr, 0);
+	mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *txr = xtxr;
+
+	mtx_lock(&txr->hn_tx_lock);
+	txr->hn_oactive = 0;
+	hn_xmit(txr, 0);
+	mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_channel_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan)
+{
+	struct hn_rx_ring *rxr;
+	int idx;
+
+	idx = chan->offer_msg.offer.sub_channel_index;
+
+	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
+	    ("invalid channel index %d, should > 0 && < %d",
+	     idx, sc->hn_rx_ring_inuse));
+	rxr = &sc->hn_rx_ring[idx];
+	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
+	    ("RX ring %d already attached", idx));
+	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
+
+	chan->hv_chan_rxr = rxr;
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "link RX ring %d to channel%u\n",
+		    idx, chan->offer_msg.child_rel_id);
+	}
+
+	if (idx < sc->hn_tx_ring_inuse) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
+
+		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
+		    ("TX ring %d already attached", idx));
+		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
+
+		chan->hv_chan_txr = txr;
+		txr->hn_chan = chan;
+		if (bootverbose) {
+			if_printf(sc->hn_ifp, "link TX ring %d to channel%u\n",
+			    idx, chan->offer_msg.child_rel_id);
+		}
+	}
+
+	/* Bind channel to a proper CPU */
+	vmbus_channel_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
+}
+
+static void
+hn_subchan_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan)
+{
+
+	KASSERT(!HV_VMBUS_CHAN_ISPRIMARY(chan),
+	    ("subchannel callback on primary channel"));
+	KASSERT(chan->offer_msg.offer.sub_channel_index > 0,
+	    ("invalid channel subidx %u",
+	     chan->offer_msg.offer.sub_channel_index));
+	hn_channel_attach(sc, chan);
+}
+
 static void
 hn_tx_taskq_create(void *arg __unused)
 {
diff --git a/sys/dev/hyperv/netvsc/hv_rndis.h b/sys/dev/hyperv/netvsc/hv_rndis.h
index cd46ecc..b27579d 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis.h
+++ b/sys/dev/hyperv/netvsc/hv_rndis.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
@@ -167,6 +167,14 @@
 #define RNDIS_OID_GEN_MACHINE_NAME                      0x0001021A
 #define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER            0x0001021B
 
+/*
+ * For receive side scale
+ */
+/* Query only */
+#define RNDIS_OID_GEN_RSS_CAPABILITIES			0x00010203
+/* Query and set */
+#define RNDIS_OID_GEN_RSS_PARAMETERS			0x00010204
+
 #define RNDIS_OID_GEN_XMIT_OK                           0x00020101
 #define RNDIS_OID_GEN_RCV_OK                            0x00020102
 #define RNDIS_OID_GEN_XMIT_ERROR                        0x00020103
@@ -608,6 +616,9 @@ typedef enum ndis_per_pkt_infotype_ {
 	max_perpkt_info
 } ndis_per_pkt_infotype;
 
+#define nbl_hash_value	pkt_cancel_id
+#define nbl_hash_info	original_netbuf_list
+
 typedef struct ndis_8021q_info_ {
 	union {
 		struct {
@@ -680,6 +691,28 @@ typedef struct rndis_tcp_ip_csum_info_ {
 	};
 } rndis_tcp_ip_csum_info;
 
+struct rndis_hash_value {
+	uint32_t	hash_value;
+} __packed;
+
+struct rndis_hash_info {
+	uint32_t	hash_info;
+} __packed;
+
+#define NDIS_HASH_FUNCTION_MASK		0x000000FF	/* see hash function */
+#define NDIS_HASH_TYPE_MASK		0x00FFFF00	/* see hash type */
+
+/* hash function */
+#define NDIS_HASH_FUNCTION_TOEPLITZ	0x00000001
+
+/* hash type */
+#define NDIS_HASH_IPV4			0x00000100
+#define NDIS_HASH_TCP_IPV4		0x00000200
+#define NDIS_HASH_IPV6			0x00000400
+#define NDIS_HASH_IPV6_EX		0x00000800
+#define NDIS_HASH_TCP_IPV6		0x00001000
+#define NDIS_HASH_TCP_IPV6_EX		0x00002000
+
 typedef struct rndis_tcp_tso_info_ {
 	union {
 		struct {
@@ -713,6 +746,9 @@ typedef struct rndis_tcp_tso_info_ {
 	};
 } rndis_tcp_tso_info;
 
+#define RNDIS_HASHVAL_PPI_SIZE	(sizeof(rndis_per_packet_info) + \
+				sizeof(struct rndis_hash_value))
+
 #define RNDIS_VLAN_PPI_SIZE	(sizeof(rndis_per_packet_info) + \
 				sizeof(ndis_8021q_info))
 
@@ -1046,11 +1082,13 @@ typedef struct rndismp_rx_bufs_info_ {
 /*
  * Externs
  */
-int netvsc_recv(struct hv_device *device_ctx, 
-    netvsc_packet *packet, 
-    rndis_tcp_ip_csum_info *csum_info);
-void netvsc_recv_rollup(struct hv_device *device_ctx);
-void netvsc_channel_rollup(struct hv_device *device_ctx);
+struct hv_vmbus_channel;
+
+int netvsc_recv(struct hv_vmbus_channel *chan,
+    netvsc_packet *packet, const rndis_tcp_ip_csum_info *csum_info,
+    const struct rndis_hash_info *hash_info,
+    const struct rndis_hash_value *hash_value);
+void netvsc_channel_rollup(struct hv_vmbus_channel *chan);
 
 void* hv_set_rppi_data(rndis_msg *rndis_mesg,
     uint32_t rppi_size,
diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
index 31ddbc0..8e95510 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c
+++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
@@ -45,10 +45,27 @@ __FBSDID("$FreeBSD$");
 #include <vm/pmap.h>
 
 #include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/vmbus/hv_vmbus_priv.h>
 #include "hv_net_vsc.h"
 #include "hv_rndis.h"
 #include "hv_rndis_filter.h"
 
+struct hv_rf_recvinfo {
+	const ndis_8021q_info		*vlan_info;
+	const rndis_tcp_ip_csum_info	*csum_info;
+	const struct rndis_hash_info	*hash_info;
+	const struct rndis_hash_value	*hash_value;
+};
+
+#define HV_RF_RECVINFO_VLAN	0x1
+#define HV_RF_RECVINFO_CSUM	0x2
+#define HV_RF_RECVINFO_HASHINF	0x4
+#define HV_RF_RECVINFO_HASHVAL	0x8
+#define HV_RF_RECVINFO_ALL		\
+	(HV_RF_RECVINFO_VLAN |		\
+	 HV_RF_RECVINFO_CSUM |		\
+	 HV_RF_RECVINFO_HASHINF |	\
+	 HV_RF_RECVINFO_HASHVAL)
 
 /*
  * Forward declarations
@@ -59,6 +76,7 @@ static void hv_rf_receive_response(rndis_device *device, rndis_msg *response);
 static void hv_rf_receive_indicate_status(rndis_device *device,
 					  rndis_msg *response);
 static void hv_rf_receive_data(rndis_device *device, rndis_msg *message,
+			       struct hv_vmbus_channel *chan,
 			       netvsc_packet *pkt);
 static int  hv_rf_query_device(rndis_device *device, uint32_t oid,
 			       void *result, uint32_t *result_size);
@@ -68,8 +86,8 @@ static int  hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter);
 static int  hv_rf_init_device(rndis_device *device);
 static int  hv_rf_open_device(rndis_device *device);
 static int  hv_rf_close_device(rndis_device *device);
-static void hv_rf_on_send_request_completion(void *context);
-static void hv_rf_on_send_request_halt_completion(void *context);
+static void hv_rf_on_send_request_completion(struct hv_vmbus_channel *, void *context);
+static void hv_rf_on_send_request_halt_completion(struct hv_vmbus_channel *, void *context);
 int
 hv_rf_send_offload_request(struct hv_device *device,
     rndis_offload_params *offloads);
@@ -223,6 +241,8 @@ hv_rf_send_request(rndis_device *device, rndis_request *request,
 {
 	int ret;
 	netvsc_packet *packet;
+	netvsc_dev      *net_dev = device->net_dev;
+	int send_buf_section_idx;
 
 	/* Set up the packet to send it */
 	packet = &request->pkt;
@@ -237,6 +257,20 @@ hv_rf_send_request(rndis_device *device, rndis_request *request,
 	packet->page_buffers[0].offset =
 	    (unsigned long)&request->request_msg & (PAGE_SIZE - 1);
 
+	if (packet->page_buffers[0].offset +
+		packet->page_buffers[0].length > PAGE_SIZE) {
+		packet->page_buf_count = 2;
+		packet->page_buffers[0].length =
+		        PAGE_SIZE - packet->page_buffers[0].offset;
+		packet->page_buffers[1].pfn =
+		        hv_get_phys_addr((char*)&request->request_msg +
+                		packet->page_buffers[0].length) >> PAGE_SHIFT;
+		packet->page_buffers[1].offset = 0;
+		packet->page_buffers[1].length =
+		        request->request_msg.msg_len -
+			        packet->page_buffers[0].length;
+	}
+
 	packet->compl.send.send_completion_context = request; /* packet */
 	if (message_type != REMOTE_NDIS_HALT_MSG) {
 		packet->compl.send.on_send_completion =
@@ -246,11 +280,26 @@ hv_rf_send_request(rndis_device *device, rndis_request *request,
 		    hv_rf_on_send_request_halt_completion;
 	}
 	packet->compl.send.send_completion_tid = (unsigned long)device;
-	packet->send_buf_section_idx =
-	    NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
+	if (packet->tot_data_buf_len < net_dev->send_section_size) {
+		send_buf_section_idx = hv_nv_get_next_send_section(net_dev);
+		if (send_buf_section_idx !=
+			NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
+			char *dest = ((char *)net_dev->send_buf +
+				send_buf_section_idx * net_dev->send_section_size);
+
+			memcpy(dest, &request->request_msg, request->request_msg.msg_len);
+			packet->send_buf_section_idx = send_buf_section_idx;
+			packet->send_buf_section_size = packet->tot_data_buf_len;
+			packet->page_buf_count = 0;
+			goto sendit;
+		}
+		/* Failed to allocate chimney send buffer; move on */
+	}
+	packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
 	packet->send_buf_section_size = 0;
 
-	ret = hv_nv_on_send(device->net_dev->dev, packet);
+sendit:
+	ret = hv_nv_on_send(device->net_dev->dev->channel, packet);
 
 	return (ret);
 }
@@ -373,8 +422,7 @@ hv_rf_send_offload_request(struct hv_device *device,
 	}
 
 cleanup:
-	if (request)
-		hv_put_rndis_request(rndis_dev, request);
+	hv_put_rndis_request(rndis_dev, request);
 
 	return (ret);
 }
@@ -402,17 +450,95 @@ hv_rf_receive_indicate_status(rndis_device *device, rndis_msg *response)
 	}
 }
 
+static int
+hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hv_rf_recvinfo *info)
+{
+	const rndis_per_packet_info *ppi;
+	uint32_t mask, len;
+
+	info->vlan_info = NULL;
+	info->csum_info = NULL;
+	info->hash_info = NULL;
+	info->hash_value = NULL;
+
+	if (rpkt->per_pkt_info_offset == 0)
+		return 0;
+
+	ppi = (const rndis_per_packet_info *)
+	    ((const uint8_t *)rpkt + rpkt->per_pkt_info_offset);
+	len = rpkt->per_pkt_info_length;
+	mask = 0;
+
+	while (len != 0) {
+		const void *ppi_dptr;
+		uint32_t ppi_dlen;
+
+		if (__predict_false(ppi->size < ppi->per_packet_info_offset))
+			return EINVAL;
+		ppi_dlen = ppi->size - ppi->per_packet_info_offset;
+		ppi_dptr = (const uint8_t *)ppi + ppi->per_packet_info_offset;
+
+		switch (ppi->type) {
+		case ieee_8021q_info:
+			if (__predict_false(ppi_dlen < sizeof(ndis_8021q_info)))
+				return EINVAL;
+			info->vlan_info = ppi_dptr;
+			mask |= HV_RF_RECVINFO_VLAN;
+			break;
+
+		case tcpip_chksum_info:
+			if (__predict_false(ppi_dlen <
+			    sizeof(rndis_tcp_ip_csum_info)))
+				return EINVAL;
+			info->csum_info = ppi_dptr;
+			mask |= HV_RF_RECVINFO_CSUM;
+			break;
+
+		case nbl_hash_value:
+			if (__predict_false(ppi_dlen <
+			    sizeof(struct rndis_hash_value)))
+				return EINVAL;
+			info->hash_value = ppi_dptr;
+			mask |= HV_RF_RECVINFO_HASHVAL;
+			break;
+
+		case nbl_hash_info:
+			if (__predict_false(ppi_dlen <
+			    sizeof(struct rndis_hash_info)))
+				return EINVAL;
+			info->hash_info = ppi_dptr;
+			mask |= HV_RF_RECVINFO_HASHINF;
+			break;
+
+		default:
+			goto skip;
+		}
+
+		if (mask == HV_RF_RECVINFO_ALL) {
+			/* All found; done */
+			break;
+		}
+skip:
+		if (__predict_false(len < ppi->size))
+			return EINVAL;
+		len -= ppi->size;
+		ppi = (const rndis_per_packet_info *)
+		    ((const uint8_t *)ppi + ppi->size);
+	}
+	return 0;
+}
+
 /*
  * RNDIS filter receive data
  */
 static void
-hv_rf_receive_data(rndis_device *device, rndis_msg *message, netvsc_packet *pkt)
+hv_rf_receive_data(rndis_device *device, rndis_msg *message,
+    struct hv_vmbus_channel *chan, netvsc_packet *pkt)
 {
 	rndis_packet *rndis_pkt;
-	ndis_8021q_info *rppi_vlan_info;
 	uint32_t data_offset;
-	rndis_tcp_ip_csum_info *csum_info = NULL;
 	device_t dev = device->net_dev->dev->device;
+	struct hv_rf_recvinfo info;
 
 	rndis_pkt = &message->msg.packet;
 
@@ -436,22 +562,26 @@ hv_rf_receive_data(rndis_device *device, rndis_msg *message, netvsc_packet *pkt)
 	pkt->tot_data_buf_len = rndis_pkt->data_length;
 	pkt->data = (void *)((unsigned long)pkt->data + data_offset);
 
-	rppi_vlan_info = hv_get_ppi_data(rndis_pkt, ieee_8021q_info);
-	if (rppi_vlan_info) {
-		pkt->vlan_tci = rppi_vlan_info->u1.s1.vlan_id;
-	} else {
-		pkt->vlan_tci = 0;
+	if (hv_rf_find_recvinfo(rndis_pkt, &info)) {
+		pkt->status = nvsp_status_failure;
+		device_printf(dev, "recvinfo parsing failed\n");
+		return;
 	}
 
-	csum_info = hv_get_ppi_data(rndis_pkt, tcpip_chksum_info);
-	netvsc_recv(device->net_dev->dev, pkt, csum_info);
+	if (info.vlan_info != NULL)
+		pkt->vlan_tci = info.vlan_info->u1.s1.vlan_id;
+	else
+		pkt->vlan_tci = 0;
+
+	netvsc_recv(chan, pkt, info.csum_info, info.hash_info, info.hash_value);
 }
 
 /*
  * RNDIS filter on receive
  */
 int
-hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, netvsc_packet *pkt)
+hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device,
+    struct hv_vmbus_channel *chan, netvsc_packet *pkt)
 {
 	rndis_device *rndis_dev;
 	rndis_msg *rndis_hdr;
@@ -474,7 +604,7 @@ hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, netvsc_packet *p
 
 	/* data message */
 	case REMOTE_NDIS_PACKET_MSG:
-		hv_rf_receive_data(rndis_dev, rndis_hdr, pkt);
+		hv_rf_receive_data(rndis_dev, rndis_hdr, chan, pkt);
 		break;
 	/* completion messages */
 	case REMOTE_NDIS_INITIALIZE_CMPLT:
@@ -525,6 +655,19 @@ hv_rf_query_device(rndis_device *device, uint32_t oid, void *result,
 	query->info_buffer_length = 0;
 	query->device_vc_handle = 0;
 
+	if (oid == RNDIS_OID_GEN_RSS_CAPABILITIES) {
+		struct rndis_recv_scale_cap *cap;
+
+		request->request_msg.msg_len += 
+			sizeof(struct rndis_recv_scale_cap);
+		query->info_buffer_length = sizeof(struct rndis_recv_scale_cap);
+		cap = (struct rndis_recv_scale_cap *)((unsigned long)query + 
+						query->info_buffer_offset);
+		cap->hdr.type = RNDIS_OBJECT_TYPE_RSS_CAPABILITIES;
+		cap->hdr.rev = RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2;
+		cap->hdr.size = sizeof(struct rndis_recv_scale_cap);
+	}
+
 	ret = hv_rf_send_request(device, request, REMOTE_NDIS_QUERY_MSG);
 	if (ret != 0) {
 		/* Fixme:  printf added */
@@ -579,6 +722,114 @@ hv_rf_query_device_link_status(rndis_device *device)
 	    RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, &device->link_status, &size));
 }
 
+static uint8_t netvsc_hash_key[HASH_KEYLEN] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
+};
+
+/*
+ * RNDIS set vRSS parameters
+ */
+static int
+hv_rf_set_rss_param(rndis_device *device, int num_queue)
+{
+	rndis_request *request;
+	rndis_set_request *set;
+	rndis_set_complete *set_complete;
+	rndis_recv_scale_param *rssp;
+	uint32_t extlen = sizeof(rndis_recv_scale_param) +
+	    (4 * ITAB_NUM) + HASH_KEYLEN;
+	uint32_t *itab, status;
+	uint8_t *keyp;
+	int i, ret;
+
+
+	request = hv_rndis_request(device, REMOTE_NDIS_SET_MSG,
+	    RNDIS_MESSAGE_SIZE(rndis_set_request) + extlen);
+	if (request == NULL) {
+		if (bootverbose)
+			printf("Netvsc: No memory to set vRSS parameters.\n");
+		ret = -1;
+		goto cleanup;
+	}
+
+	set = &request->request_msg.msg.set_request;
+	set->oid = RNDIS_OID_GEN_RSS_PARAMETERS;
+	set->info_buffer_length = extlen;
+	set->info_buffer_offset = sizeof(rndis_set_request);
+	set->device_vc_handle = 0;
+
+	/* Fill out the rssp parameter structure */
+	rssp = (rndis_recv_scale_param *)(set + 1);
+	rssp->hdr.type = RNDIS_OBJECT_TYPE_RSS_PARAMETERS;
+	rssp->hdr.rev = RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2;
+	rssp->hdr.size = sizeof(rndis_recv_scale_param);
+	rssp->flag = 0;
+	rssp->hashinfo = RNDIS_HASH_FUNC_TOEPLITZ | RNDIS_HASH_IPV4 |
+	    RNDIS_HASH_TCP_IPV4 | RNDIS_HASH_IPV6 | RNDIS_HASH_TCP_IPV6;
+	rssp->indirect_tabsize = 4 * ITAB_NUM;
+	rssp->indirect_taboffset = sizeof(rndis_recv_scale_param);
+	rssp->hashkey_size = HASH_KEYLEN;
+	rssp->hashkey_offset = rssp->indirect_taboffset +
+	    rssp->indirect_tabsize;
+
+	/* Set indirection table entries */
+	itab = (uint32_t *)(rssp + 1);
+	for (i = 0; i < ITAB_NUM; i++)
+		itab[i] = i % num_queue;
+
+	/* Set hash key values */
+	keyp = (uint8_t *)((unsigned long)rssp + rssp->hashkey_offset);
+	for (i = 0; i < HASH_KEYLEN; i++)
+		keyp[i] = netvsc_hash_key[i];
+
+	ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG);
+	if (ret != 0) {
+		goto cleanup;
+	}
+
+	/*
+	 * Wait for the response from the host.  Another thread will signal
+	 * us when the response has arrived.  In the failure case,
+	 * sema_timedwait() returns a non-zero status after waiting 5 seconds.
+	 */
+	ret = sema_timedwait(&request->wait_sema, 5 * hz);
+	if (ret == 0) {
+		/* Response received, check status */
+		set_complete = &request->response_msg.msg.set_complete;
+		status = set_complete->status;
+		if (status != RNDIS_STATUS_SUCCESS) {
+			/* Bad response status, return error */
+			if (bootverbose)
+				printf("Netvsc: Failed to set vRSS "
+				    "parameters.\n");
+			ret = -2;
+		} else {
+			if (bootverbose)
+				printf("Netvsc: Successfully set vRSS "
+				    "parameters.\n");
+		}
+	} else {
+		/*
+		 * We cannot deallocate the request since we may still
+		 * receive a send completion for it.
+		 */
+		printf("Netvsc: vRSS set timeout, id = %u, ret = %d\n",
+		    request->request_msg.msg.init_request.request_id, ret);
+		goto exit;
+	}
+
+cleanup:
+	if (request != NULL) {
+		hv_put_rndis_request(device, request);
+	}
+exit:
+	return (ret);
+}
+
 /*
  * RNDIS filter set packet filter
  * Sends an rndis request with the new filter, then waits for a response
@@ -752,10 +1003,8 @@ hv_rf_halt_device(rndis_device *device)
 	}
 
 	device->state = RNDIS_DEV_UNINITIALIZED;
-	
-	if (request != NULL) {
-		hv_put_rndis_request(device, request);
-	}
+
+	hv_put_rndis_request(device, request);
 
 	return (0);
 }
@@ -813,12 +1062,16 @@ hv_rf_close_device(rndis_device *device)
  * RNDIS filter on device add
  */
 int
-hv_rf_on_device_add(struct hv_device *device, void *additl_info)
+hv_rf_on_device_add(struct hv_device *device, void *additl_info,
+    int nchan)
 {
 	int ret;
 	netvsc_dev *net_dev;
 	rndis_device *rndis_dev;
+	nvsp_msg *init_pkt;
 	rndis_offload_params offloads;
+	struct rndis_recv_scale_cap rsscaps;
+	uint32_t rsscaps_size = sizeof(struct rndis_recv_scale_cap);
 	netvsc_device_info *dev_info = (netvsc_device_info *)additl_info;
 	device_t dev = device->device;
 
@@ -884,6 +1137,67 @@ hv_rf_on_device_add(struct hv_device *device, void *additl_info)
 	
 	dev_info->link_state = rndis_dev->link_status;
 
+	net_dev->num_channel = 1;
+	if (net_dev->nvsp_version < NVSP_PROTOCOL_VERSION_5 || nchan == 1)
+		return (0);
+
+	memset(&rsscaps, 0, rsscaps_size);
+	ret = hv_rf_query_device(rndis_dev,
+			RNDIS_OID_GEN_RSS_CAPABILITIES,
+			&rsscaps, &rsscaps_size);
+	if ((ret != 0) || (rsscaps.num_recv_que < 2)) {
+		device_printf(dev, "hv_rf_query_device failed or "
+			"rsscaps.num_recv_que < 2 \n");
+		goto out;
+	}
+	device_printf(dev, "channel, offered %u, requested %d\n",
+	    rsscaps.num_recv_que, nchan);
+	if (nchan > rsscaps.num_recv_que)
+		nchan = rsscaps.num_recv_que;
+	net_dev->num_channel = nchan;
+
+	if (net_dev->num_channel == 1) {
+		device_printf(dev, "net_dev->num_channel == 1 under VRSS\n");
+		goto out;
+	}
+	
+	/* request host to create sub channels */
+	init_pkt = &net_dev->channel_init_packet;
+	memset(init_pkt, 0, sizeof(nvsp_msg));
+
+	init_pkt->hdr.msg_type = nvsp_msg5_type_subchannel;
+	init_pkt->msgs.vers_5_msgs.subchannel_request.op =
+	    NVSP_SUBCHANNE_ALLOCATE;
+	init_pkt->msgs.vers_5_msgs.subchannel_request.num_subchannels =
+	    net_dev->num_channel - 1;
+
+	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
+	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
+	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
+	    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+	if (ret != 0) {
+		device_printf(dev, "Fail to allocate subchannel\n");
+		goto out;
+	}
+
+	sema_wait(&net_dev->channel_init_sema);
+
+	if (init_pkt->msgs.vers_5_msgs.subchn_complete.status !=
+	    nvsp_status_success) {
+		ret = ENODEV;
+		device_printf(dev, "sub channel complete error\n");
+		goto out;
+	}
+
+	net_dev->num_channel = 1 +
+	    init_pkt->msgs.vers_5_msgs.subchn_complete.num_subchannels;
+
+	ret = hv_rf_set_rss_param(rndis_dev, net_dev->num_channel);
+
+out:
+	if (ret)
+		net_dev->num_channel = 1;
+
 	return (ret);
 }
 
@@ -938,7 +1252,8 @@ hv_rf_on_close(struct hv_device *device)
  * RNDIS filter on send request completion callback
  */
 static void 
-hv_rf_on_send_request_completion(void *context)
+hv_rf_on_send_request_completion(struct hv_vmbus_channel *chan __unused,
+    void *context __unused)
 {
 }
 
@@ -946,7 +1261,8 @@ hv_rf_on_send_request_completion(void *context)
  * RNDIS filter on send request (halt only) completion callback
  */
 static void 
-hv_rf_on_send_request_halt_completion(void *context)
+hv_rf_on_send_request_halt_completion(struct hv_vmbus_channel *chan __unused,
+    void *context)
 {
 	rndis_request *request = context;
 
@@ -958,32 +1274,9 @@ hv_rf_on_send_request_halt_completion(void *context)
 	request->halt_complete_flag = 1;
 }
 
-/*
- * RNDIS filter when "all" reception is done
- */
-void
-hv_rf_receive_rollup(netvsc_dev *net_dev)
-{
-	rndis_device *rndis_dev;
-
-	rndis_dev = (rndis_device *)net_dev->extension;
-	netvsc_recv_rollup(rndis_dev->net_dev->dev);
-}
-
 void
-hv_rf_channel_rollup(netvsc_dev *net_dev)
+hv_rf_channel_rollup(struct hv_vmbus_channel *chan)
 {
-	rndis_device *rndis_dev;
-
-	rndis_dev = (rndis_device *)net_dev->extension;
 
-	/*
-	 * This could be called pretty early, so we need
-	 * to make sure everything has been setup.
-	 */
-	if (rndis_dev == NULL ||
-	    rndis_dev->net_dev == NULL ||
-	    rndis_dev->net_dev->dev == NULL)
-		return;
-	netvsc_channel_rollup(rndis_dev->net_dev->dev);
+	netvsc_channel_rollup(chan);
 }
diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/dev/hyperv/netvsc/hv_rndis_filter.h
index 9d7a38d..dbaaa42 100644
--- a/sys/dev/hyperv/netvsc/hv_rndis_filter.h
+++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
@@ -63,17 +63,32 @@ typedef struct rndis_request_ {
 	struct sema			wait_sema;	
 
 	/*
-	 * Fixme:  We assumed a fixed size response here.  If we do ever
-	 * need to handle a bigger response, we can either define a max
-	 * response message or add a response buffer variable above this field
+	 * The max response size is sizeof(rndis_msg) + PAGE_SIZE.
+	 *
+	 * XXX
+	 * This is ugly and should be cleaned up once we busdma-fy
+	 * RNDIS request bits.
 	 */
 	rndis_msg			response_msg;
+	uint8_t				buf_resp[PAGE_SIZE];
 
 	/* Simplify allocation by having a netvsc packet inline */
 	netvsc_packet			pkt;
 	hv_vmbus_page_buffer		buffer;
-	/* Fixme:  We assumed a fixed size request here. */
+
+	/*
+	 * The max request size is sizeof(rndis_msg) + PAGE_SIZE.
+	 *
+	 * NOTE:
+	 * This is required for the large request like RSS settings.
+	 *
+	 * XXX
+	 * This is ugly and should be cleaned up once we busdma-fy
+	 * RNDIS request bits.
+	 */
 	rndis_msg			request_msg;
+	uint8_t				buf_req[PAGE_SIZE];
+
 	/* Fixme:  Poor man's semaphore. */
 	uint32_t			halt_complete_flag;
 } rndis_request;
@@ -95,12 +110,13 @@ typedef struct rndis_device_ {
 /*
  * Externs
  */
+struct hv_vmbus_channel;
 
-int hv_rf_on_receive(netvsc_dev *net_dev,
-    struct hv_device *device, netvsc_packet *pkt);
+int hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device,
+    struct hv_vmbus_channel *chan, netvsc_packet *pkt);
 void hv_rf_receive_rollup(netvsc_dev *net_dev);
-void hv_rf_channel_rollup(netvsc_dev *net_dev);
-int hv_rf_on_device_add(struct hv_device *device, void *additl_info);
+void hv_rf_channel_rollup(struct hv_vmbus_channel *chan);
+int hv_rf_on_device_add(struct hv_device *device, void *additl_info, int nchan);
 int hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel);
 int hv_rf_on_open(struct hv_device *device);
 int hv_rf_on_close(struct hv_device *device);
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
index 27fb3fd..a89a762 100644
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -134,7 +134,6 @@ struct storvsc_softc {
 	uint32_t			hs_num_out_reqs;
 	boolean_t			hs_destroy;
 	boolean_t			hs_drain_notify;
-	boolean_t			hs_open_multi_channel;
 	struct sema 			hs_drain_sema;	
 	struct hv_storvsc_request	hs_init_req;
 	struct hv_storvsc_request	hs_reset_req;
@@ -324,9 +323,6 @@ get_stor_device(struct hv_device *device,
 	struct storvsc_softc *sc;
 
 	sc = device_get_softc(device->device);
-	if (sc == NULL) {
-		return NULL;
-	}
 
 	if (outbound) {
 		/*
@@ -350,29 +346,19 @@ get_stor_device(struct hv_device *device,
 	return sc;
 }
 
-/**
- * @brief Callback handler, will be invoked when receive mutil-channel offer
- *
- * @param context  new multi-channel
- */
 static void
-storvsc_handle_sc_creation(void *context)
+storvsc_subchan_attach(struct hv_vmbus_channel *new_channel)
 {
-	hv_vmbus_channel *new_channel;
 	struct hv_device *device;
 	struct storvsc_softc *sc;
 	struct vmstor_chan_props props;
 	int ret = 0;
 
-	new_channel = (hv_vmbus_channel *)context;
-	device = new_channel->primary_channel->device;
+	device = new_channel->device;
 	sc = get_stor_device(device, TRUE);
 	if (sc == NULL)
 		return;
 
-	if (FALSE == sc->hs_open_multi_channel)
-		return;
-	
 	memset(&props, 0, sizeof(props));
 
 	ret = hv_vmbus_channel_open(new_channel,
@@ -395,11 +381,12 @@ storvsc_handle_sc_creation(void *context)
 static void
 storvsc_send_multichannel_request(struct hv_device *dev, int max_chans)
 {
+	struct hv_vmbus_channel **subchan;
 	struct storvsc_softc *sc;
 	struct hv_storvsc_request *request;
 	struct vstor_packet *vstor_packet;	
 	int request_channels_cnt = 0;
-	int ret;
+	int ret, i;
 
 	/* get multichannels count that need to create */
 	request_channels_cnt = MIN(max_chans, mp_ncpus);
@@ -413,9 +400,6 @@ storvsc_send_multichannel_request(struct hv_device *dev, int max_chans)
 
 	request = &sc->hs_init_req;
 
-	/* Establish a handler for multi-channel */
-	dev->channel->sc_creation_callback = storvsc_handle_sc_creation;
-
 	/* request the host to create multi-channel */
 	memset(request, 0, sizeof(struct hv_storvsc_request));
 	
@@ -451,7 +435,15 @@ storvsc_send_multichannel_request(struct hv_device *dev, int max_chans)
 		return;
 	}
 
-	sc->hs_open_multi_channel = TRUE;
+	/* Wait for sub-channels setup to complete. */
+	subchan = vmbus_get_subchan(dev->channel, request_channels_cnt);
+
+	/* Attach the sub-channels. */
+	for (i = 0; i < request_channels_cnt; ++i)
+		storvsc_subchan_attach(subchan[i]);
+
+	/* Release the sub-channels. */
+	vmbus_rel_subchan(subchan, request_channels_cnt);
 
 	if (bootverbose)
 		printf("Storvsc create multi-channel success!\n");
@@ -883,12 +875,7 @@ hv_storvsc_on_channel_callback(void *context)
 	struct hv_storvsc_request *request;
 	struct vstor_packet *vstor_packet;
 
-	if (channel->primary_channel != NULL){
-		device = channel->primary_channel->device;
-	} else {
-		device = channel->device;
-	}
-
+	device = channel->device;
 	KASSERT(device, ("device is NULL"));
 
 	sc = get_stor_device(device, FALSE);
@@ -970,6 +957,7 @@ storvsc_probe(device_t dev)
 			if(bootverbose)
 				device_printf(dev,
 					"Enlightened ATA/IDE detected\n");
+			device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc);
 			ret = BUS_PROBE_DEFAULT;
 		} else if(bootverbose)
 			device_printf(dev, "Emulated ATA/IDE set (hw.ata.disk_enable set)\n");
@@ -977,6 +965,7 @@ storvsc_probe(device_t dev)
 	case DRIVER_STORVSC:
 		if(bootverbose)
 			device_printf(dev, "Enlightened SCSI device detected\n");
+		device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc);
 		ret = BUS_PROBE_DEFAULT;
 		break;
 	default:
@@ -1014,10 +1003,6 @@ storvsc_attach(device_t dev)
 	root_mount_token = root_mount_hold("storvsc");
 
 	sc = device_get_softc(dev);
-	if (sc == NULL) {
-		ret = ENOMEM;
-		goto cleanup;
-	}
 
 	stor_type = storvsc_get_storage_type(dev);
 
@@ -1026,15 +1011,12 @@ storvsc_attach(device_t dev)
 		goto cleanup;
 	}
 
-	bzero(sc, sizeof(struct storvsc_softc));
-
 	/* fill in driver specific properties */
 	sc->hs_drv_props = &g_drv_props_table[stor_type];
 
 	/* fill in device specific properties */
 	sc->hs_unit	= device_get_unit(dev);
 	sc->hs_dev	= hv_dev;
-	device_set_desc(dev, g_drv_props_table[stor_type].drv_desc);
 
 	LIST_INIT(&sc->hs_free_list);
 	mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF);
@@ -1081,7 +1063,6 @@ storvsc_attach(device_t dev)
 
 	sc->hs_destroy = FALSE;
 	sc->hs_drain_notify = FALSE;
-	sc->hs_open_multi_channel = FALSE;
 	sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");
 
 	ret = hv_storvsc_connect_vsp(hv_dev);
@@ -1186,9 +1167,7 @@ storvsc_detach(device_t dev)
 	struct hv_sgl_node *sgl_node = NULL;
 	int j = 0;
 
-	mtx_lock(&hv_device->channel->inbound_lock);
 	sc->hs_destroy = TRUE;
-	mtx_unlock(&hv_device->channel->inbound_lock);
 
 	/*
 	 * At this point, all outbound traffic should be disabled. We
@@ -2147,8 +2126,9 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
 		reqp->softc->hs_frozen = 0;
 	}
 	storvsc_free_request(sc, reqp);
-	xpt_done(ccb);
 	mtx_unlock(&sc->hs_lock);
+
+	xpt_done_direct(ccb);
 }
 
 /**
diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c
index c1b6da5..5f4fcf6 100644
--- a/sys/dev/hyperv/utilities/hv_heartbeat.c
+++ b/sys/dev/hyperv/utilities/hv_heartbeat.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -94,6 +94,10 @@ static int
 hv_heartbeat_probe(device_t dev)
 {
 	const char *p = vmbus_get_type(dev);
+
+	if (resource_disabled("hvheartbeat", 0))
+		return ENXIO;
+
 	if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
 		device_set_desc(dev, "Hyper-V Heartbeat Service");
 		return BUS_PROBE_DEFAULT;
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c
index 8517918..b1f6ec1 100644
--- a/sys/dev/hyperv/utilities/hv_kvp.c
+++ b/sys/dev/hyperv/utilities/hv_kvp.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -304,28 +304,11 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
 {
 	int err_ip, err_subnet, err_gway, err_dns, err_adap;
 	int UNUSED_FLAG = 1;
-	int guid_index;
 	struct hv_device *hv_dev;       /* GUID Data Structure */
 	hn_softc_t *sc;                 /* hn softc structure  */
 	char if_name[4];
-	unsigned char guid_instance[40];
-	char *guid_data = NULL;
 	char buf[39];
 
-	struct guid_extract {
-		char	a1[2];
-		char	a2[2];
-		char	a3[2];
-		char	a4[2];
-		char	b1[2];
-		char	b2[2];
-		char	c1[2];
-		char	c2[2];
-		char	d[4];
-		char	e[12];
-	};
-
-	struct guid_extract *id;
 	device_t *devs;
 	int devcnt;
 
@@ -352,17 +335,7 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
 			/* Trying to find GUID of Network Device */
 			hv_dev = sc->hn_dev_obj;
 
-			for (guid_index = 0; guid_index < 16; guid_index++) {
-				sprintf(&guid_instance[guid_index * 2], "%02x",
-				    hv_dev->device_id.data[guid_index]);
-			}
-
-			guid_data = (char *)guid_instance;
-			id = (struct guid_extract *)guid_data;
-			snprintf(buf, sizeof(buf), "{%.2s%.2s%.2s%.2s-%.2s%.2s-%.2s%.2s-%.4s-%s}",
-			    id->a4, id->a3, id->a2, id->a1,
-			    id->b2, id->b1, id->c2, id->c1, id->d, id->e);
-			guid_data = NULL;
+			snprintf_hv_guid(buf, sizeof(buf), &hv_dev->device_id);
 			sprintf(if_name, "%s%d", "hn", device_get_unit(devs[devcnt]));
 
 			if (strncmp(buf, (char *)umsg->body.kvp_ip_val.adapter_id, 39) == 0) {
@@ -890,6 +863,10 @@ static int
 hv_kvp_probe(device_t dev)
 {
 	const char *p = vmbus_get_type(dev);
+
+	if (resource_disabled("hvkvp", 0))
+		return ENXIO;
+
 	if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
 		device_set_desc(dev, "Hyper-V KVP Service");
 		return BUS_PROBE_DEFAULT;
diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h
index b62149e..6474e18 100644
--- a/sys/dev/hyperv/utilities/hv_kvp.h
+++ b/sys/dev/hyperv/utilities/hv_kvp.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c
index 20bc65e..3dfbf13 100644
--- a/sys/dev/hyperv/utilities/hv_shutdown.c
+++ b/sys/dev/hyperv/utilities/hv_shutdown.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -116,6 +116,10 @@ static int
 hv_shutdown_probe(device_t dev)
 {
 	const char *p = vmbus_get_type(dev);
+
+	if (resource_disabled("hvshutdown", 0))
+		return ENXIO;
+
 	if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
 		device_set_desc(dev, "Hyper-V Shutdown Service");
 		return BUS_PROBE_DEFAULT;
diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c
index d1ea904..eeb0434 100644
--- a/sys/dev/hyperv/utilities/hv_timesync.c
+++ b/sys/dev/hyperv/utilities/hv_timesync.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -171,6 +171,10 @@ static int
 hv_timesync_probe(device_t dev)
 {
 	const char *p = vmbus_get_type(dev);
+
+	if (resource_disabled("hvtimesync", 0))
+		return ENXIO;
+
 	if (!memcmp(p, &service_guid, sizeof(hv_guid))) {
 		device_set_desc(dev, "Hyper-V Time Synch Service");
 		return BUS_PROBE_DEFAULT;
diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c
index 7d19b3f..3119e3f 100644
--- a/sys/dev/hyperv/utilities/hv_util.c
+++ b/sys/dev/hyperv/utilities/hv_util.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014 Microsoft Corp.
+ * Copyright (c) 2014,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h
index 708dca8..e202784 100644
--- a/sys/dev/hyperv/utilities/hv_util.h
+++ b/sys/dev/hyperv/utilities/hv_util.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
diff --git a/sys/dev/hyperv/vmbus/amd64/hv_vector.S b/sys/dev/hyperv/vmbus/amd64/hv_vector.S
new file mode 100644
index 0000000..2594483
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/amd64/hv_vector.S
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+#include "assym.s"
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(hv_vmbus_callback)
+	PUSH_FRAME
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp, %rdi
+	call	hv_vector_handler
+	MEXITCOUNT
+	jmp	doreti
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c
index bb777cc..6da0643 100644
--- a/sys/dev/hyperv/vmbus/hv_channel.c
+++ b/sys/dev/hyperv/vmbus/hv_channel.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/sysctl.h>
 #include <machine/bus.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -80,6 +81,90 @@ vmbus_channel_set_event(hv_vmbus_channel *channel)
 
 }
 
+static int
+vmbus_channel_sysctl_monalloc(SYSCTL_HANDLER_ARGS)
+{
+	struct hv_vmbus_channel *chan = arg1;
+	int alloc = 0;
+
+	if (chan->offer_msg.monitor_allocated)
+		alloc = 1;
+	return sysctl_handle_int(oidp, &alloc, 0, req);
+}
+
+static void
+vmbus_channel_sysctl_create(hv_vmbus_channel* channel)
+{
+	device_t dev;
+	struct sysctl_oid *devch_sysctl;
+	struct sysctl_oid *devch_id_sysctl, *devch_sub_sysctl;
+	struct sysctl_oid *devch_id_in_sysctl, *devch_id_out_sysctl;
+	struct sysctl_ctx_list *ctx;
+	uint32_t ch_id;
+	uint16_t sub_ch_id;
+	char name[16];
+	
+	hv_vmbus_channel* primary_ch = channel->primary_channel;
+
+	if (primary_ch == NULL) {
+		dev = channel->device->device;
+		ch_id = channel->offer_msg.child_rel_id;
+	} else {
+		dev = primary_ch->device->device;
+		ch_id = primary_ch->offer_msg.child_rel_id;
+		sub_ch_id = channel->offer_msg.offer.sub_channel_index;
+	}
+	ctx = device_get_sysctl_ctx(dev);
+	/* This creates dev.DEVNAME.DEVUNIT.channel tree */
+	devch_sysctl = SYSCTL_ADD_NODE(ctx,
+		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+		    OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	/* This creates dev.DEVNAME.DEVUNIT.channel.CHANID tree */
+	snprintf(name, sizeof(name), "%d", ch_id);
+	devch_id_sysctl = SYSCTL_ADD_NODE(ctx,
+	    	    SYSCTL_CHILDREN(devch_sysctl),
+	    	    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+	if (primary_ch != NULL) {
+		devch_sub_sysctl = SYSCTL_ADD_NODE(ctx,
+			SYSCTL_CHILDREN(devch_id_sysctl),
+			OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		snprintf(name, sizeof(name), "%d", sub_ch_id);
+		devch_id_sysctl = SYSCTL_ADD_NODE(ctx,
+			SYSCTL_CHILDREN(devch_sub_sysctl),
+			OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(devch_id_sysctl),
+		    OID_AUTO, "chanid", CTLFLAG_RD,
+		    &channel->offer_msg.child_rel_id, 0, "channel id");
+	}
+	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(devch_id_sysctl), OID_AUTO,
+	    "cpu", CTLFLAG_RD, &channel->target_cpu, 0, "owner CPU id");
+	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(devch_id_sysctl), OID_AUTO,
+	    "monitor_allocated", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    channel, 0, vmbus_channel_sysctl_monalloc, "I",
+	    "is monitor allocated to this channel");
+
+	devch_id_in_sysctl = SYSCTL_ADD_NODE(ctx,
+                    SYSCTL_CHILDREN(devch_id_sysctl),
+                    OID_AUTO,
+		    "in",
+		    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	devch_id_out_sysctl = SYSCTL_ADD_NODE(ctx,
+                    SYSCTL_CHILDREN(devch_id_sysctl),
+                    OID_AUTO,
+		    "out",
+		    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	hv_ring_buffer_stat(ctx,
+		SYSCTL_CHILDREN(devch_id_in_sysctl),
+		&(channel->inbound),
+		"inbound ring buffer stats");
+	hv_ring_buffer_stat(ctx,
+		SYSCTL_CHILDREN(devch_id_out_sysctl),
+		&(channel->outbound),
+		"outbound ring buffer stats");
+}
+
 /**
  * @brief Open the specified channel
  */
@@ -143,6 +228,9 @@ hv_vmbus_channel_open(
 		in,
 		recv_ring_buffer_size);
 
+	/* Create sysctl tree for this channel */
+	vmbus_channel_sysctl_create(new_channel);
+
 	/**
 	 * Establish the gpadl for the ring buffer
 	 */
@@ -182,12 +270,12 @@ hv_vmbus_channel_open(
 	if (user_data_len)
 		memcpy(open_msg->user_data, user_data, user_data_len);
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_INSERT_TAIL(
 		&hv_vmbus_g_connection.channel_msg_anchor,
 		open_info,
 		msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	ret = hv_vmbus_post_message(
 		open_msg, sizeof(hv_vmbus_channel_open_channel));
@@ -214,12 +302,12 @@ hv_vmbus_channel_open(
 	}
 
 	cleanup:
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_REMOVE(
 		&hv_vmbus_g_connection.channel_msg_anchor,
 		open_info,
 		msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 	sema_destroy(&open_info->wait_sema);
 	free(open_info, M_DEVBUF);
 
@@ -384,17 +472,22 @@ hv_vmbus_channel_establish_gpadl(
 	hv_vmbus_channel_msg_info*	curr;
 	uint32_t			next_gpadl_handle;
 
-	next_gpadl_handle = hv_vmbus_g_connection.next_gpadl_handle;
-	atomic_add_int((int*) &hv_vmbus_g_connection.next_gpadl_handle, 1);
+	next_gpadl_handle = atomic_fetchadd_int(
+	    &hv_vmbus_g_connection.next_gpadl_handle, 1);
 
 	ret = vmbus_channel_create_gpadl_header(
 		contig_buffer, size, &msg_info, &msg_count);
 
-	if(ret != 0) { /* if(allocation failed) return immediately */
-	    /* reverse atomic_add_int above */
-	    atomic_subtract_int((int*)
-		    &hv_vmbus_g_connection.next_gpadl_handle, 1);
-	    return ret;
+	if(ret != 0) {
+		/*
+		 * XXX
+		 * We can _not_ even revert the above incremental,
+		 * if multiple GPADL establishments are running
+		 * parallelly, decrement the global next_gpadl_handle
+		 * is calling for _big_ trouble.  A better solution
+		 * is to have a 0-based GPADL id bitmap ...
+		 */
+		return ret;
 	}
 
 	sema_init(&msg_info->wait_sema, 0, "Open Info Sema");
@@ -403,13 +496,13 @@ hv_vmbus_channel_establish_gpadl(
 	gpadl_msg->child_rel_id = channel->offer_msg.child_rel_id;
 	gpadl_msg->gpadl = next_gpadl_handle;
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_INSERT_TAIL(
 		&hv_vmbus_g_connection.channel_msg_anchor,
 		msg_info,
 		msg_list_entry);
 
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	ret = hv_vmbus_post_message(
 		gpadl_msg,
@@ -448,10 +541,10 @@ hv_vmbus_channel_establish_gpadl(
 
 cleanup:
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor,
 		msg_info, msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	sema_destroy(&msg_info->wait_sema);
 	free(msg_info, M_DEVBUF);
@@ -490,10 +583,10 @@ hv_vmbus_channel_teardown_gpdal(
 	msg->child_rel_id = channel->offer_msg.child_rel_id;
 	msg->gpadl = gpadl_handle;
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_msg_anchor,
 			info, msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	ret = hv_vmbus_post_message(msg,
 			sizeof(hv_vmbus_channel_gpadl_teardown));
@@ -506,10 +599,10 @@ cleanup:
 	/*
 	 * Received a torndown response
 	 */
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor,
 			info, msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 	sema_destroy(&info->wait_sema);
 	free(info, M_DEVBUF);
 
@@ -525,20 +618,13 @@ hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
 	hv_vmbus_channel_msg_info* info;
 
 	channel->state = HV_CHANNEL_OPEN_STATE;
-	channel->sc_creation_callback = NULL;
 
 	/*
 	 * set rxq to NULL to avoid more requests be scheduled
 	 */
 	channel->rxq = NULL;
 	taskqueue_drain(rxq, &channel->channel_task);
-	/*
-	 * Grab the lock to prevent race condition when a packet received
-	 * and unloading driver is in the process.
-	 */
-	mtx_lock(&channel->inbound_lock);
 	channel->on_channel_callback = NULL;
-	mtx_unlock(&channel->inbound_lock);
 
 	/**
 	 * Send a closing message
@@ -857,7 +943,6 @@ hv_vmbus_channel_recv_packet_raw(
 {
 	int		ret;
 	uint32_t	packetLen;
-	uint32_t	userLen;
 	hv_vm_packet_descriptor	desc;
 
 	*buffer_actual_len = 0;
@@ -871,8 +956,6 @@ hv_vmbus_channel_recv_packet_raw(
 	    return (0);
 
 	packetLen = desc.length8 << 3;
-	userLen = packetLen - (desc.data_offset8 << 3);
-
 	*buffer_actual_len = packetLen;
 
 	if (packetLen > buffer_len)
@@ -915,12 +998,6 @@ VmbusProcessChannelEvent(void* context, int pending)
 	 * callback to NULL. This closes the window.
 	 */
 
-	/*
-	 * Disable the lock due to newly added WITNESS check in r277723.
-	 * Will seek other way to avoid race condition.
-	 * -- whu
-	 */
-	// mtx_lock(&channel->inbound_lock);
 	if (channel->on_channel_callback != NULL) {
 		arg = channel->channel_callback_context;
 		is_batched_reading = channel->batched_reading;
@@ -947,5 +1024,4 @@ VmbusProcessChannelEvent(void* context, int pending)
 				bytes_to_read = 0;
 		} while (is_batched_reading && (bytes_to_read != 0));
 	}
-	// mtx_unlock(&channel->inbound_lock);
 }
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index ab6e8ad..00b54ed 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -30,7 +30,10 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
 #include <sys/mbuf.h>
+#include <sys/mutex.h>
 
 #include "hv_vmbus_priv.h"
 
@@ -95,6 +98,14 @@ typedef struct hv_work_item {
 	void*		context;
 } hv_work_item;
 
+static struct mtx	vmbus_chwait_lock;
+MTX_SYSINIT(vmbus_chwait_lk, &vmbus_chwait_lock, "vmbus primarych wait lock",
+    MTX_DEF);
+static uint32_t		vmbus_chancnt;
+static uint32_t		vmbus_devcnt;
+
+#define VMBUS_CHANCNT_DONE	0x80000000
+
 /**
  * Implementation of the work abstraction.
  */
@@ -143,9 +154,7 @@ hv_vmbus_allocate_channel(void)
 					M_DEVBUF,
 					M_WAITOK | M_ZERO);
 
-	mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF);
 	mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF);
-
 	TAILQ_INIT(&channel->sc_list_anchor);
 
 	return (channel);
@@ -158,8 +167,6 @@ void
 hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
 {
 	mtx_destroy(&channel->sc_lock);
-	mtx_destroy(&channel->inbound_lock);
-
 	free(channel, M_DEVBUF);
 }
 
@@ -170,13 +177,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
 static void
 vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
 {
-	boolean_t		f_new;
 	hv_vmbus_channel*	channel;
 	int			ret;
 	uint32_t                relid;
 
-	f_new = TRUE;
-	channel = NULL;
 	relid = new_channel->offer_msg.child_rel_id;
 	/*
 	 * Make sure this is a new offer
@@ -185,31 +189,24 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
 	hv_vmbus_g_connection.channels[relid] = new_channel;
 
 	TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor,
-	    list_entry)
-	{
+	    list_entry) {
 		if (memcmp(&channel->offer_msg.offer.interface_type,
 		    &new_channel->offer_msg.offer.interface_type,
 		    sizeof(hv_guid)) == 0 &&
 		    memcmp(&channel->offer_msg.offer.interface_instance,
 		    &new_channel->offer_msg.offer.interface_instance,
-		    sizeof(hv_guid)) == 0) {
-			f_new = FALSE;
+		    sizeof(hv_guid)) == 0)
 			break;
-		}
 	}
 
-	if (f_new) {
-		/* Insert at tail */
-		TAILQ_INSERT_TAIL(
-		    &hv_vmbus_g_connection.channel_anchor,
-		    new_channel,
-		    list_entry);
+	if (channel == NULL) {
+		/* Install the new primary channel */
+		TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
+		    new_channel, list_entry);
 	}
 	mtx_unlock(&hv_vmbus_g_connection.channel_lock);
 
-	/*XXX add new channel to percpu_list */
-
-	if (!f_new) {
+	if (channel != NULL) {
 		/*
 		 * Check if this is a sub channel.
 		 */
@@ -218,17 +215,20 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
 			 * It is a sub channel offer, process it.
 			 */
 			new_channel->primary_channel = channel;
+			new_channel->device = channel->device;
 			mtx_lock(&channel->sc_lock);
-			TAILQ_INSERT_TAIL(
-			    &channel->sc_list_anchor,
-			    new_channel,
-			    sc_list_entry);
+			TAILQ_INSERT_TAIL(&channel->sc_list_anchor,
+			    new_channel, sc_list_entry);
 			mtx_unlock(&channel->sc_lock);
 
+			if (bootverbose) {
+				printf("VMBUS get multi-channel offer, "
+				    "rel=%u, sub=%u\n",
+				    new_channel->offer_msg.child_rel_id,
+				    new_channel->offer_msg.offer.sub_channel_index);	
+			}
+
 			/* Insert new channel into channel_anchor. */
-			printf("VMBUS get multi-channel offer, rel=%u,sub=%u\n",
-			    new_channel->offer_msg.child_rel_id,
-			    new_channel->offer_msg.offer.sub_channel_index);	
 			mtx_lock(&hv_vmbus_g_connection.channel_lock);
 			TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
 			    new_channel, list_entry);				
@@ -239,17 +239,25 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
 				    "its primary channel is <%p>.\n",
 				    new_channel, new_channel->primary_channel);
 
-			/*XXX add it to percpu_list */
-
 			new_channel->state = HV_CHANNEL_OPEN_STATE;
-			if (channel->sc_creation_callback != NULL) {
-				channel->sc_creation_callback(new_channel);
-			}
+
+			/*
+			 * Bump up sub-channel count and notify anyone that is
+			 * interested in this sub-channel, after this sub-channel
+			 * is setup.
+			 */
+			mtx_lock(&channel->sc_lock);
+			channel->subchan_cnt++;
+			mtx_unlock(&channel->sc_lock);
+			wakeup(channel);
+
 			return;
 		}
 
-	    hv_vmbus_free_vmbus_channel(new_channel);
-	    return;
+		printf("VMBUS: duplicated primary channel%u\n",
+		    new_channel->offer_msg.child_rel_id);
+		hv_vmbus_free_vmbus_channel(new_channel);
+		return;
 	}
 
 	new_channel->state = HV_CHANNEL_OPEN_STATE;
@@ -271,13 +279,37 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel)
 	ret = hv_vmbus_child_device_register(new_channel->device);
 	if (ret != 0) {
 		mtx_lock(&hv_vmbus_g_connection.channel_lock);
-		TAILQ_REMOVE(
-		    &hv_vmbus_g_connection.channel_anchor,
-		    new_channel,
-		    list_entry);
+		TAILQ_REMOVE(&hv_vmbus_g_connection.channel_anchor,
+		    new_channel, list_entry);
 		mtx_unlock(&hv_vmbus_g_connection.channel_lock);
 		hv_vmbus_free_vmbus_channel(new_channel);
 	}
+
+	mtx_lock(&vmbus_chwait_lock);
+	vmbus_devcnt++;
+	mtx_unlock(&vmbus_chwait_lock);
+	wakeup(&vmbus_devcnt);
+}
+
+void
+vmbus_channel_cpu_set(struct hv_vmbus_channel *chan, int cpu)
+{
+	KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu));
+
+	if (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008 ||
+	    hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) {
+		/* Only cpu0 is supported */
+		cpu = 0;
+	}
+
+	chan->target_cpu = cpu;
+	chan->target_vcpu = hv_vmbus_g_context.hv_vcpu_index[cpu];
+
+	if (bootverbose) {
+		printf("vmbus_chan%u: assigned to cpu%u [vcpu%u]\n",
+		    chan->offer_msg.child_rel_id,
+		    chan->target_cpu, chan->target_vcpu);
+	}
 }
 
 /**
@@ -312,11 +344,12 @@ static uint32_t next_vcpu;
  * distributed across all available CPUs.
  */
 static void
-vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid)
+vmbus_channel_select_defcpu(struct hv_vmbus_channel *channel)
 {
 	uint32_t current_cpu;
 	int i;
 	boolean_t is_perf_channel = FALSE;
+	const hv_guid *guid = &channel->offer_msg.offer.interface_type;
 
 	for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) {
 		if (memcmp(guid->data, high_perf_devices[i].data,
@@ -326,24 +359,14 @@ vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid)
 		}
 	}
 
-	if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
-	    (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) ||
-	    (!is_perf_channel)) {
-		/* Host's view of guest cpu */
-		channel->target_vcpu = 0;
-		/* Guest's own view of cpu */
-		channel->target_cpu = 0;
+	if (!is_perf_channel) {
+		/* Stick to cpu0 */
+		vmbus_channel_cpu_set(channel, 0);
 		return;
 	}
 	/* mp_ncpus should have the number cpus currently online */
 	current_cpu = (++next_vcpu % mp_ncpus);
-	channel->target_cpu = current_cpu;
-	channel->target_vcpu =
-	    hv_vmbus_g_context.hv_vcpu_index[current_cpu];
-	if (bootverbose)
-		printf("VMBUS: Total online cpus %d, assign perf channel %d "
-		    "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu,
-		    current_cpu);
+	vmbus_channel_cpu_set(channel, current_cpu);
 }
 
 /**
@@ -362,12 +385,6 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
 
 	offer = (hv_vmbus_channel_offer_channel*) hdr;
 
-	hv_guid *guidType;
-	hv_guid *guidInstance;
-
-	guidType = &offer->offer.interface_type;
-	guidInstance = &offer->offer.interface_instance;
-
 	// copy offer data
 	copied = malloc(sizeof(*copied), M_DEVBUF, M_NOWAIT);
 	if (copied == NULL) {
@@ -377,6 +394,11 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
 
 	memcpy(copied, hdr, sizeof(*copied));
 	hv_queue_work_item(vmbus_channel_on_offer_internal, copied);
+
+	mtx_lock(&vmbus_chwait_lock);
+	if ((vmbus_chancnt & VMBUS_CHANCNT_DONE) == 0)
+		vmbus_chancnt++;
+	mtx_unlock(&vmbus_chwait_lock);
 }
 
 static void
@@ -414,17 +436,14 @@ vmbus_channel_on_offer_internal(void* context)
 		    offer->connection_id;
 	}
 
-	/*
-	 * Bind the channel to a chosen cpu.
-	 */
-	vmbus_channel_select_cpu(new_channel,
-	    &offer->offer.interface_type);
-
 	memcpy(&new_channel->offer_msg, offer,
 	    sizeof(hv_vmbus_channel_offer_channel));
 	new_channel->monitor_group = (uint8_t) offer->monitor_id / 32;
 	new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32;
 
+	/* Select default cpu for this channel. */
+	vmbus_channel_select_defcpu(new_channel);
+
 	vmbus_channel_process_offer(new_channel);
 
 	free(offer, M_DEVBUF);
@@ -458,7 +477,10 @@ vmbus_channel_on_offer_rescind_internal(void *context)
 	hv_vmbus_channel*               channel;
 
 	channel = (hv_vmbus_channel*)context;
-	hv_vmbus_child_device_unregister(channel->device);
+	if (HV_VMBUS_CHAN_ISPRIMARY(channel)) {
+		/* Only primary channel owns the hv_device */
+		hv_vmbus_child_device_unregister(channel->device);
+	}
 }
 
 /**
@@ -468,6 +490,11 @@ vmbus_channel_on_offer_rescind_internal(void *context)
 static void
 vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr)
 {
+
+	mtx_lock(&vmbus_chwait_lock);
+	vmbus_chancnt |= VMBUS_CHANCNT_DONE;
+	mtx_unlock(&vmbus_chwait_lock);
+	wakeup(&vmbus_chancnt);
 }
 
 /**
@@ -490,7 +517,7 @@ vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr)
 	/*
 	 * Find the open msg, copy the result and signal/unblock the wait event
 	 */
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
 	    msg_list_entry) {
@@ -508,7 +535,7 @@ vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr)
 		}
 	    }
 	}
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 }
 
@@ -532,7 +559,7 @@ vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr)
 	/* Find the establish msg, copy the result and signal/unblock
 	 * the wait event
 	 */
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
 		msg_list_entry) {
 	    request_header = (hv_vmbus_channel_msg_header*) msg_info->msg;
@@ -551,7 +578,7 @@ vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr)
 		}
 	    }
 	}
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 }
 
 /**
@@ -576,7 +603,7 @@ vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr)
 	 * wait event.
 	 */
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
 		msg_list_entry) {
@@ -596,7 +623,7 @@ vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr)
 		}
 	    }
 	}
-    mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+    mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 }
 
 /**
@@ -616,7 +643,7 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr)
 
 	versionResponse = (hv_vmbus_channel_version_response*)hdr;
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
 	    msg_list_entry) {
 	    requestHeader = (hv_vmbus_channel_msg_header*) msg_info->msg;
@@ -630,7 +657,7 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr)
 		sema_post(&msg_info->wait_sema);
 	    }
 	}
-    mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+    mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 }
 
@@ -679,7 +706,10 @@ hv_vmbus_release_unattached_channels(void)
 	    TAILQ_REMOVE(&hv_vmbus_g_connection.channel_anchor,
 			    channel, list_entry);
 
-	    hv_vmbus_child_device_unregister(channel->device);
+	    if (HV_VMBUS_CHAN_ISPRIMARY(channel)) {
+		/* Only primary channel owns the hv_device */
+		hv_vmbus_child_device_unregister(channel->device);
+	    }
 	    hv_vmbus_free_vmbus_channel(channel);
 	}
 	bzero(hv_vmbus_g_connection.channels, 
@@ -742,3 +772,56 @@ vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary)
 
 	return(outgoing_channel);
 }
+
+void
+vmbus_scan(void)
+{
+	uint32_t chancnt;
+
+	mtx_lock(&vmbus_chwait_lock);
+	while ((vmbus_chancnt & VMBUS_CHANCNT_DONE) == 0)
+		mtx_sleep(&vmbus_chancnt, &vmbus_chwait_lock, 0, "waitch", 0);
+	chancnt = vmbus_chancnt & ~VMBUS_CHANCNT_DONE;
+
+	while (vmbus_devcnt != chancnt)
+		mtx_sleep(&vmbus_devcnt, &vmbus_chwait_lock, 0, "waitdev", 0);
+	mtx_unlock(&vmbus_chwait_lock);
+}
+
+struct hv_vmbus_channel **
+vmbus_get_subchan(struct hv_vmbus_channel *pri_chan, int subchan_cnt)
+{
+	struct hv_vmbus_channel **ret, *chan;
+	int i;
+
+	ret = malloc(subchan_cnt * sizeof(struct hv_vmbus_channel *), M_TEMP,
+	    M_WAITOK);
+
+	mtx_lock(&pri_chan->sc_lock);
+
+	while (pri_chan->subchan_cnt < subchan_cnt)
+		mtx_sleep(pri_chan, &pri_chan->sc_lock, 0, "subch", 0);
+
+	i = 0;
+	TAILQ_FOREACH(chan, &pri_chan->sc_list_anchor, sc_list_entry) {
+		/* TODO: refcnt chan */
+		ret[i] = chan;
+
+		++i;
+		if (i == subchan_cnt)
+			break;
+	}
+	KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d",
+	    pri_chan->subchan_cnt, subchan_cnt));
+
+	mtx_unlock(&pri_chan->sc_lock);
+
+	return ret;
+}
+
+void
+vmbus_rel_subchan(struct hv_vmbus_channel **subchan, int subchan_cnt __unused)
+{
+
+	free(subchan, M_TEMP);
+}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index fb1879d..0424b47 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -99,26 +99,26 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
 	 * Add to list before we send the request since we may receive the
 	 * response before returning from this routine
 	 */
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	TAILQ_INSERT_TAIL(
 		&hv_vmbus_g_connection.channel_msg_anchor,
 		msg_info,
 		msg_list_entry);
 
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	ret = hv_vmbus_post_message(
 		msg,
 		sizeof(hv_vmbus_channel_initiate_contact));
 
 	if (ret != 0) {
-		mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 		TAILQ_REMOVE(
 			&hv_vmbus_g_connection.channel_msg_anchor,
 			msg_info,
 			msg_list_entry);
-		mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 		return (ret);
 	}
 
@@ -127,12 +127,12 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
 	 */
 	ret = sema_timedwait(&msg_info->wait_sema, 5 * hz); /* KYS 5 seconds */
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_msg_lock);
 	TAILQ_REMOVE(
 		&hv_vmbus_g_connection.channel_msg_anchor,
 		msg_info,
 		msg_list_entry);
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock);
 
 	/**
 	 * Check if successful
@@ -169,7 +169,7 @@ hv_vmbus_connect(void) {
 
 	TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor);
 	mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg",
-		NULL, MTX_SPIN);
+		NULL, MTX_DEF);
 
 	TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor);
 	mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel",
@@ -308,14 +308,18 @@ hv_vmbus_on_events(int cpu)
 	KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: "
 	    "cpu out of range!"));
 
+	page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
+	event = (hv_vmbus_synic_event_flags *)
+	    page_addr + HV_VMBUS_MESSAGE_SINT;
 	if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
 	    (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
 		maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
 		/*
 		 * receive size is 1/2 page and divide that by 4 bytes
 		 */
-		recv_interrupt_page =
-		    hv_vmbus_g_connection.recv_interrupt_page;
+		if (synch_test_and_clear_bit(0, &event->flags32[0]))
+			recv_interrupt_page =
+			    hv_vmbus_g_connection.recv_interrupt_page;
 	} else {
 		/*
 		 * On Host with Win8 or above, the event page can be
@@ -323,9 +327,6 @@ hv_vmbus_on_events(int cpu)
 		 * that has the pending interrupt.
 		 */
 		maxdword = HV_EVENT_FLAGS_DWORD_COUNT;
-		page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
-		event = (hv_vmbus_synic_event_flags *)
-		    page_addr + HV_VMBUS_MESSAGE_SINT;
 		recv_interrupt_page = event->flags32;
 	}
 
@@ -367,31 +368,35 @@ hv_vmbus_on_events(int cpu)
 /**
  * Send a msg on the vmbus's message connection
  */
-int hv_vmbus_post_message(void *buffer, size_t bufferLen) {
-	int ret = 0;
+int hv_vmbus_post_message(void *buffer, size_t bufferLen)
+{
 	hv_vmbus_connection_id connId;
-	unsigned retries = 0;
+	sbintime_t time = SBT_1MS;
+	int retries;
+	int ret;
 
-	/* NetScaler delays from previous code were consolidated here */
-	static int delayAmount[] = {100, 100, 100, 500, 500, 5000, 5000, 5000};
+	connId.as_uint32_t = 0;
+	connId.u.id = HV_VMBUS_MESSAGE_CONNECTION_ID;
 
-	/* for(each entry in delayAmount) try to post message,
-	 *  delay a little bit before retrying
+	/*
+	 * We retry to cope with transient failures caused by host side's
+	 * insufficient resources. 20 times should suffice in practice.
 	 */
-	for (retries = 0;
-	    retries < sizeof(delayAmount)/sizeof(delayAmount[0]); retries++) {
-	    connId.as_uint32_t = 0;
-	    connId.u.id = HV_VMBUS_MESSAGE_CONNECTION_ID;
-	    ret = hv_vmbus_post_msg_via_msg_ipc(connId, 1, buffer, bufferLen);
-	    if (ret != HV_STATUS_INSUFFICIENT_BUFFERS)
-		break;
-	    /* TODO: KYS We should use a blocking wait call */
-	    DELAY(delayAmount[retries]);
+	for (retries = 0; retries < 20; retries++) {
+		ret = hv_vmbus_post_msg_via_msg_ipc(connId, 1, buffer,
+						    bufferLen);
+		if (ret == HV_STATUS_SUCCESS)
+			return (0);
+
+		pause_sbt("pstmsg", time, 0, C_HARDCLOCK);
+		if (time < SBT_1S * 2)
+			time *= 2;
 	}
 
-	KASSERT(ret == 0, ("Error VMBUS: Message Post Failed\n"));
+	KASSERT(ret == HV_STATUS_SUCCESS,
+		("Error VMBUS: Message Post Failed, ret=%d\n", ret));
 
-	return (ret);
+	return (EAGAIN);
 }
 
 /**
diff --git a/sys/dev/hyperv/vmbus/hv_et.c b/sys/dev/hyperv/vmbus/hv_et.c
index d961486..440b514 100644
--- a/sys/dev/hyperv/vmbus/hv_et.c
+++ b/sys/dev/hyperv/vmbus/hv_et.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2015 Microsoft Corp.
+ * Copyright (c) 2015,2016 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -28,6 +28,9 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
@@ -40,8 +43,7 @@ __FBSDID("$FreeBSD$");
 #define HV_MAX_DELTA_TICKS		0xffffffffLL
 #define HV_MIN_DELTA_TICKS		1LL
 
-static struct eventtimer et;
-static uint64_t periodticks[MAXCPU];
+static struct eventtimer *et;
 
 static inline uint64_t
 sbintime2tick(sbintime_t time)
@@ -60,11 +62,7 @@ hv_et_start(struct eventtimer *et, sbintime_t firsttime, sbintime_t periodtime)
 
 	timer_cfg.as_uint64 = 0;
 	timer_cfg.auto_enable = 1;
-	timer_cfg.sintx = HV_VMBUS_MESSAGE_SINT;
-
-	periodticks[curcpu] = sbintime2tick(periodtime);
-	if (firsttime == 0)
-		firsttime = periodtime;
+	timer_cfg.sintx = HV_VMBUS_TIMER_SINT;
 
 	current = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
 	current += sbintime2tick(firsttime);
@@ -87,45 +85,77 @@ hv_et_stop(struct eventtimer *et)
 void
 hv_et_intr(struct trapframe *frame)
 {
-	union hv_timer_config timer_cfg;
 	struct trapframe *oldframe;
 	struct thread *td;
 
-	if (periodticks[curcpu] != 0) {
-		uint64_t tick = sbintime2tick(periodticks[curcpu]);
-		timer_cfg.as_uint64 = rdmsr(HV_X64_MSR_STIMER0_CONFIG);
-		timer_cfg.enable = 0;
-		timer_cfg.auto_enable = 1;
-		timer_cfg.periodic = 1;
-		periodticks[curcpu] = 0;
-
-		wrmsr(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
-		wrmsr(HV_X64_MSR_STIMER0_COUNT, tick);
-	}
-
-	if (et.et_active) {
+	if (et->et_active) {
 		td = curthread;
 		td->td_intr_nesting_level++;
 		oldframe = td->td_intr_frame;
 		td->td_intr_frame = frame;
-		et.et_event_cb(&et, et.et_arg);
+		et->et_event_cb(et, et->et_arg);
 		td->td_intr_frame = oldframe;
 		td->td_intr_nesting_level--;
 	}
 }
 
-void
-hv_et_init(void)
+static void
+hv_et_identify(driver_t *driver, device_t parent)
 {
-	et.et_name = "HyperV";
-	et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU | ET_FLAGS_PERIODIC;
-	et.et_quality = 1000;
-	et.et_frequency = HV_TIMER_FREQUENCY;
-	et.et_min_period = (1LL << 32) / HV_TIMER_FREQUENCY;
-	et.et_max_period = HV_MAX_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY);
-	et.et_start = hv_et_start;
-	et.et_stop = hv_et_stop;
-	et.et_priv = &et;
-	et_register(&et);
+	if (device_find_child(parent, "hv_et", -1) != NULL)
+		return;
+
+	device_add_child(parent, "hv_et", -1);
+}
+
+static int
+hv_et_probe(device_t dev)
+{
+	device_set_desc(dev, "Hyper-V event timer");
+
+	return (BUS_PROBE_NOWILDCARD);
 }
 
+static int
+hv_et_attach(device_t dev)
+{
+	/* XXX: need allocate SINT and remove global et */
+	et = device_get_softc(dev);
+
+	et->et_name = "Hyper-V";
+	et->et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU;
+	et->et_quality = 1000;
+	et->et_frequency = HV_TIMER_FREQUENCY;
+	et->et_min_period = HV_MIN_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY);
+	et->et_max_period = HV_MAX_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY);
+	et->et_start = hv_et_start;
+	et->et_stop = hv_et_stop;
+	et->et_priv = dev;
+
+	return (et_register(et));
+}
+
+static int
+hv_et_detach(device_t dev)
+{
+	return (et_deregister(et));
+}
+
+static device_method_t hv_et_methods[] = {
+	DEVMETHOD(device_identify,      hv_et_identify),
+	DEVMETHOD(device_probe,         hv_et_probe),
+	DEVMETHOD(device_attach,        hv_et_attach),
+	DEVMETHOD(device_detach,        hv_et_detach),
+
+	DEVMETHOD_END
+};
+
+static driver_t hv_et_driver = {
+	"hv_et",
+	hv_et_methods,
+	sizeof(struct eventtimer)
+};
+
+static devclass_t hv_et_devclass;
+DRIVER_MODULE(hv_et, vmbus, hv_et_driver, hv_et_devclass, NULL, 0);
+MODULE_VERSION(hv_et, 1);
diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c
index 6afc2b8..70a5608 100644
--- a/sys/dev/hyperv/vmbus/hv_hv.c
+++ b/sys/dev/hyperv/vmbus/hv_hv.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -33,6 +33,7 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/timetc.h>
@@ -47,9 +48,16 @@ __FBSDID("$FreeBSD$");
 
 #define HV_NANOSECONDS_PER_SEC		1000000000L
 
+#define	HYPERV_INTERFACE		0x31237648	/* HV#1 */
 
 static u_int hv_get_timecount(struct timecounter *tc);
 
+u_int	hyperv_features;
+u_int	hyperv_recommends;
+
+static u_int	hyperv_pm_features;
+static u_int	hyperv_features3;
+
 /**
  * Globals
  */
@@ -70,47 +78,6 @@ hv_get_timecount(struct timecounter *tc)
 }
 
 /**
- * @brief Query the cpuid for presence of windows hypervisor
- */
-int
-hv_vmbus_query_hypervisor_presence(void) 
-{
-	if (vm_guest != VM_GUEST_HV)
-		return (0);
-
-	return (hv_high >= HV_X64_CPUID_MIN && hv_high <= HV_X64_CPUID_MAX);
-}
-
-/**
- * @brief Get version of the windows hypervisor
- */
-static int
-hv_vmbus_get_hypervisor_version(void) 
-{
-	u_int regs[4];
-	unsigned int maxLeaf;
-	unsigned int op;
-
-	/*
-	 * Its assumed that this is called after confirming that
-	 * Viridian is present
-	 * Query id and revision.
-	 */
-	op = HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION;
-	do_cpuid(op, regs);
-
-	maxLeaf = regs[0];
-	op = HV_CPU_ID_FUNCTION_HV_INTERFACE;
-	do_cpuid(op, regs);
-
-	if (maxLeaf >= HV_CPU_ID_FUNCTION_MS_HV_VERSION) {
-	    op = HV_CPU_ID_FUNCTION_MS_HV_VERSION;
-	    do_cpuid(op, regs);
-	}
-	return (maxLeaf);
-}
-
-/**
  * @brief Invoke the specified hypercall
  */
 static uint64_t
@@ -159,9 +126,8 @@ hv_vmbus_do_hypercall(uint64_t control, void* input, void* output)
 int
 hv_vmbus_init(void) 
 {
-	int					max_leaf;
 	hv_vmbus_x64_msr_hypercall_contents	hypercall_msr;
-	void* 					virt_addr = 0;
+	void* 					virt_addr = NULL;
 
 	memset(
 	    hv_vmbus_g_context.syn_ic_event_page,
@@ -176,8 +142,6 @@ hv_vmbus_init(void)
 	if (vm_guest != VM_GUEST_HV)
 	    goto cleanup;
 
-	max_leaf = hv_vmbus_get_hypervisor_version();
-
 	/*
 	 * Write our OS info
 	 */
@@ -207,10 +171,6 @@ hv_vmbus_init(void)
 
 	hv_vmbus_g_context.hypercall_page = virt_addr;
 
-	tc_init(&hv_timecounter); /* register virtual timecount */
-
-	hv_et_init();
-	
 	return (0);
 
 	cleanup:
@@ -368,6 +328,9 @@ hv_vmbus_synic_init(void *arg)
 	wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
 	    shared_sint.as_uint64_t);
 
+	wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT,
+	    shared_sint.as_uint64_t);
+
 	/* Enable the global synic bit */
 	sctrl.as_uint64_t = rdmsr(HV_X64_MSR_SCONTROL);
 	sctrl.u.enable = 1;
@@ -404,12 +367,23 @@ void hv_vmbus_synic_cleanup(void *arg)
 	shared_sint.u.masked = 1;
 
 	/*
-	 * Disable the interrupt
+	 * Disable the interrupt 0
 	 */
 	wrmsr(
 	    HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
 	    shared_sint.as_uint64_t);
 
+	shared_sint.as_uint64_t = rdmsr(
+	    HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT);
+
+	shared_sint.u.masked = 1;
+
+	/*
+	 * Disable the interrupt 1
+	 */
+	wrmsr(
+	    HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT,
+	    shared_sint.as_uint64_t);
 	simp.as_uint64_t = rdmsr(HV_X64_MSR_SIMP);
 	simp.u.simp_enabled = 0;
 	simp.u.base_simp_gpa = 0;
@@ -423,3 +397,117 @@ void hv_vmbus_synic_cleanup(void *arg)
 	wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
 }
 
+static bool
+hyperv_identify(void)
+{
+	u_int regs[4];
+	unsigned int maxLeaf;
+	unsigned int op;
+
+	if (vm_guest != VM_GUEST_HV)
+		return (false);
+
+	op = HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION;
+	do_cpuid(op, regs);
+	maxLeaf = regs[0];
+	if (maxLeaf < HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS)
+		return (false);
+
+	op = HV_CPU_ID_FUNCTION_HV_INTERFACE;
+	do_cpuid(op, regs);
+	if (regs[0] != HYPERV_INTERFACE)
+		return (false);
+
+	op = HV_CPU_ID_FUNCTION_MS_HV_FEATURES;
+	do_cpuid(op, regs);
+	if ((regs[0] & HV_FEATURE_MSR_HYPERCALL) == 0) {
+		/*
+		 * Hyper-V w/o Hypercall is impossible; someone
+		 * is faking Hyper-V.
+		 */
+		return (false);
+	}
+	hyperv_features = regs[0];
+	hyperv_pm_features = regs[2];
+	hyperv_features3 = regs[3];
+
+	op = HV_CPU_ID_FUNCTION_MS_HV_VERSION;
+	do_cpuid(op, regs);
+	printf("Hyper-V Version: %d.%d.%d [SP%d]\n",
+	    regs[1] >> 16, regs[1] & 0xffff, regs[0], regs[2]);
+
+	printf("  Features=0x%b\n", hyperv_features,
+	    "\020"
+	    "\001VPRUNTIME"	/* MSR_VP_RUNTIME */
+	    "\002TMREFCNT"	/* MSR_TIME_REF_COUNT */
+	    "\003SYNIC"		/* MSRs for SynIC */
+	    "\004SYNTM"		/* MSRs for SynTimer */
+	    "\005APIC"		/* MSR_{EOI,ICR,TPR} */
+	    "\006HYPERCALL"	/* MSR_{GUEST_OS_ID,HYPERCALL} */
+	    "\007VPINDEX"	/* MSR_VP_INDEX */
+	    "\010RESET"		/* MSR_RESET */
+	    "\011STATS"		/* MSR_STATS_ */
+	    "\012REFTSC"	/* MSR_REFERENCE_TSC */
+	    "\013IDLE"		/* MSR_GUEST_IDLE */
+	    "\014TMFREQ"	/* MSR_{TSC,APIC}_FREQUENCY */
+	    "\015DEBUG");	/* MSR_SYNTH_DEBUG_ */
+	printf("  PM Features=max C%u, 0x%b\n",
+	    HV_PM_FEATURE_CSTATE(hyperv_pm_features),
+	    (hyperv_pm_features & ~HV_PM_FEATURE_CSTATE_MASK),
+	    "\020"
+	    "\005C3HPET");	/* HPET is required for C3 state */
+	printf("  Features3=0x%b\n", hyperv_features3,
+	    "\020"
+	    "\001MWAIT"		/* MWAIT */
+	    "\002DEBUG"		/* guest debug support */
+	    "\003PERFMON"	/* performance monitor */
+	    "\004PCPUDPE"	/* physical CPU dynamic partition event */
+	    "\005XMMHC"		/* hypercall input through XMM regs */
+	    "\006IDLE"		/* guest idle support */
+	    "\007SLEEP"		/* hypervisor sleep support */
+	    "\010NUMA"		/* NUMA distance query support */
+	    "\011TMFREQ"	/* timer frequency query (TSC, LAPIC) */
+	    "\012SYNCMC"	/* inject synthetic machine checks */
+	    "\013CRASH"		/* MSRs for guest crash */
+	    "\014DEBUGMSR"	/* MSRs for guest debug */
+	    "\015NPIEP"		/* NPIEP */
+	    "\016HVDIS");	/* disabling hypervisor */
+
+	op = HV_CPU_ID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION;
+	do_cpuid(op, regs);
+	hyperv_recommends = regs[0];
+	if (bootverbose)
+		printf("  Recommends: %08x %08x\n", regs[0], regs[1]);
+
+	op = HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS;
+	do_cpuid(op, regs);
+	if (bootverbose) {
+		printf("  Limits: Vcpu:%d Lcpu:%d Int:%d\n",
+		    regs[0], regs[1], regs[2]);
+	}
+
+	if (maxLeaf >= HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE) {
+		op = HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE;
+		do_cpuid(op, regs);
+		if (bootverbose) {
+			printf("  HW Features: %08x AMD: %08x\n",
+			    regs[0], regs[3]);
+		}
+	}
+
+	return (true);
+}
+
+static void
+hyperv_init(void *dummy __unused)
+{
+	if (!hyperv_identify())
+		return;
+
+	if (hyperv_features & HV_FEATURE_MSR_TIME_REFCNT) {
+		/* Register virtual timecount */
+		tc_init(&hv_timecounter);
+	}
+}
+SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init,
+    NULL);
diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
index 0e51ef7..cd82b27 100644
--- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c
+++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/sysctl.h>
 
 #include "hv_vmbus_priv.h"
 
@@ -39,6 +40,47 @@ __FBSDID("$FreeBSD$");
 #define	HV_BYTES_AVAIL_TO_WRITE(r, w, z) ((w) >= (r))? \
 				((z) - ((w) - (r))):((r) - (w))
 
+static int
+hv_rbi_sysctl_stats(SYSCTL_HANDLER_ARGS)
+{
+	hv_vmbus_ring_buffer_info* rbi;
+	uint32_t read_index, write_index, interrupt_mask, sz;
+	uint32_t read_avail, write_avail;
+	char rbi_stats[256];
+
+	rbi = (hv_vmbus_ring_buffer_info*)arg1;
+	read_index = rbi->ring_buffer->read_index;
+	write_index = rbi->ring_buffer->write_index;
+	interrupt_mask = rbi->ring_buffer->interrupt_mask;
+	sz = rbi->ring_data_size;
+	write_avail = HV_BYTES_AVAIL_TO_WRITE(read_index,
+			write_index, sz);
+	read_avail = sz - write_avail;
+	snprintf(rbi_stats, sizeof(rbi_stats),
+		"r_idx:%d "
+		"w_idx:%d "
+		"int_mask:%d "
+		"r_avail:%d "
+		"w_avail:%d",
+		read_index, write_index, interrupt_mask,
+		read_avail, write_avail);
+
+	return (sysctl_handle_string(oidp, rbi_stats,
+			sizeof(rbi_stats), req));
+}
+
+void
+hv_ring_buffer_stat(
+	struct sysctl_ctx_list		*ctx,
+	struct sysctl_oid_list		*tree_node,
+	hv_vmbus_ring_buffer_info	*rbi,
+	const char			*desc)	
+{
+	SYSCTL_ADD_PROC(ctx, tree_node, OID_AUTO,
+	    "ring_buffer_stats",
+	    CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, rbi, 0,
+	    hv_rbi_sysctl_stats, "A", desc);
+}
 /**
  * @brief Get number of bytes available to read and to write to
  * for the specified ring buffer
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
index c8d6894..e274d59 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/pcpu.h>
 #include <machine/apicvar.h>
 
+#include <dev/hyperv/include/hyperv.h>
 #include "hv_vmbus_priv.h"
 
 #include <contrib/dev/acpica/include/acpi.h>
@@ -75,7 +76,7 @@ static char *vmbus_ids[] = { "VMBUS", NULL };
  * the hypervisor.
  */
 static void
-vmbus_msg_swintr(void *arg)
+vmbus_msg_swintr(void *arg, int pending __unused)
 {
 	int 			cpu;
 	void*			page_addr;
@@ -116,8 +117,12 @@ handled:
 	     * message_pending and EOMing. Otherwise, the EOMing will
 	     * not deliver any more messages
 	     * since there is no empty slot
+	     *
+	     * NOTE:
+	     * mb() is used here, since atomic_thread_fence_seq_cst()
+	     * will become compiler fence on UP kernel.
 	     */
-	    wmb();
+	    mb();
 
 	    if (msg->header.message_flags.u.message_pending) {
 			/*
@@ -140,7 +145,6 @@ hv_vmbus_isr(struct trapframe *frame)
 {
 	int				cpu;
 	hv_vmbus_message*		msg;
-	hv_vmbus_synic_event_flags*	event;
 	void*				page_addr;
 
 	cpu = PCPU_GET(cpuid);
@@ -151,43 +155,31 @@ hv_vmbus_isr(struct trapframe *frame)
 	 * in Windows when running as a guest in Hyper-V
 	 */
 
-	page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
-	event = (hv_vmbus_synic_event_flags*)
-		    page_addr + HV_VMBUS_MESSAGE_SINT;
-
-	if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
-	    (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
-		/* Since we are a child, we only need to check bit 0 */
-		if (synch_test_and_clear_bit(0, &event->flags32[0])) {
-			hv_vmbus_on_events(cpu);
-		}
-	} else {
-		/*
-		 * On host with Win8 or above, we can directly look at
-		 * the event page. If bit n is set, we have an interrupt 
-		 * on the channel with id n.
-		 * Directly schedule the event software interrupt on
-		 * current cpu.
-		 */
-		hv_vmbus_on_events(cpu);
-	}
+	hv_vmbus_on_events(cpu);
 
 	/* Check if there are actual msgs to be process */
 	page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
-	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
+	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_TIMER_SINT;
 
 	/* we call eventtimer process the message */
 	if (msg->header.message_type == HV_MESSAGE_TIMER_EXPIRED) {
 		msg->header.message_type = HV_MESSAGE_TYPE_NONE;
 
+		/* call intrrupt handler of event timer */
+		hv_et_intr(frame);
+
 		/*
 		 * Make sure the write to message_type (ie set to
 		 * HV_MESSAGE_TYPE_NONE) happens before we read the
 		 * message_pending and EOMing. Otherwise, the EOMing will
 		 * not deliver any more messages
 		 * since there is no empty slot
+		 *
+		 * NOTE:
+		 * mb() is used here, since atomic_thread_fence_seq_cst()
+		 * will become compiler fence on UP kernel.
 		 */
-		wmb();
+		mb();
 
 		if (msg->header.message_flags.u.message_pending) {
 			/*
@@ -196,12 +188,12 @@ hv_vmbus_isr(struct trapframe *frame)
 			 */
 			wrmsr(HV_X64_MSR_EOM, 0);
 		}
-		hv_et_intr(frame);
-		return (FILTER_HANDLED);
 	}
 
+	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
 	if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) {
-		swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0);
+		taskqueue_enqueue(hv_vmbus_g_context.hv_msg_tq[cpu],
+		    &hv_vmbus_g_context.hv_msg_task[cpu]);
 	}
 
 	return (FILTER_HANDLED);
@@ -279,6 +271,26 @@ vmbus_write_ivar(
 	return (ENOENT);
 }
 
+static int
+vmbus_child_pnpinfo_str(device_t dev, device_t child, char *buf, size_t buflen)
+{
+	char guidbuf[40];
+	struct hv_device *dev_ctx = device_get_ivars(child);
+
+	if (dev_ctx == NULL)
+		return (0);
+
+	strlcat(buf, "classid=", buflen);
+	snprintf_hv_guid(guidbuf, sizeof(guidbuf), &dev_ctx->class_id);
+	strlcat(buf, guidbuf, buflen);
+
+	strlcat(buf, " deviceid=", buflen);
+	snprintf_hv_guid(guidbuf, sizeof(guidbuf), &dev_ctx->device_id);
+	strlcat(buf, guidbuf, buflen);
+
+	return (0);
+}
+
 struct hv_device*
 hv_vmbus_child_device_create(
 	hv_guid		type,
@@ -300,34 +312,34 @@ hv_vmbus_child_device_create(
 	return (child_dev);
 }
 
-static void
-print_dev_guid(struct hv_device *dev)
+int
+snprintf_hv_guid(char *buf, size_t sz, const hv_guid *guid)
 {
-	int i;
-	unsigned char guid_name[100];
-	for (i = 0; i < 32; i += 2)
-		sprintf(&guid_name[i], "%02x", dev->class_id.data[i / 2]);
-	if(bootverbose)
-		printf("VMBUS: Class ID: %s\n", guid_name);
+	int cnt;
+	const unsigned char *d = guid->data;
+
+	cnt = snprintf(buf, sz,
+		"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+		d[3], d[2], d[1], d[0], d[5], d[4], d[7], d[6],
+		d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
+	return (cnt);
 }
 
 int
 hv_vmbus_child_device_register(struct hv_device *child_dev)
 {
 	device_t child;
-	int ret = 0;
-
-	print_dev_guid(child_dev);
 
+	if (bootverbose) {
+		char name[40];
+		snprintf_hv_guid(name, sizeof(name), &child_dev->class_id);
+		printf("VMBUS: Class ID: %s\n", name);
+	}
 
 	child = device_add_child(vmbus_devp, NULL, -1);
 	child_dev->device = child;
 	device_set_ivars(child, child_dev);
 
-	mtx_lock(&Giant);
-	ret = device_probe_and_attach(child);
-	mtx_unlock(&Giant);
-
 	return (0);
 }
 
@@ -356,7 +368,6 @@ vmbus_probe(device_t dev) {
 	return (BUS_PROBE_DEFAULT);
 }
 
-#ifdef HYPERV
 extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback);
 
 /**
@@ -416,21 +427,6 @@ vmbus_vector_free(int vector)
         setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 }
 
-#else /* HYPERV */
-
-static int
-vmbus_vector_alloc(void)
-{
-	return(0);
-}
-
-static void
-vmbus_vector_free(int vector)
-{
-}
-
-#endif /* HYPERV */
-
 static void
 vmbus_cpuset_setthread_task(void *xmask, int pending __unused)
 {
@@ -498,9 +494,6 @@ vmbus_bus_init(void)
 	setup_args.vector = hv_vmbus_g_context.hv_cb_vector;
 
 	CPU_FOREACH(j) {
-		hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
-		hv_vmbus_g_context.msg_swintr[j] = NULL;
-
 		snprintf(buf, sizeof(buf), "cpu%d:hyperv", j);
 		intrcnt_add(buf, &hv_vmbus_intr_cpu[j]);
 
@@ -519,11 +512,6 @@ vmbus_bus_init(void)
 		 */
 		hv_vmbus_g_context.hv_event_queue[j] = taskqueue_create_fast("hyperv event", M_WAITOK,
 			taskqueue_thread_enqueue, &hv_vmbus_g_context.hv_event_queue[j]);
-		if (hv_vmbus_g_context.hv_event_queue[j] == NULL) {
-			if (bootverbose)
-				printf("VMBUS: failed to setup taskqueue\n");
-			goto cleanup1;
-		}
 		taskqueue_start_threads(&hv_vmbus_g_context.hv_event_queue[j], 1, PI_NET,
 			"hvevent%d", j);
 
@@ -533,29 +521,20 @@ vmbus_bus_init(void)
 		taskqueue_drain(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task);
 
 		/*
-		 * Setup software interrupt thread and handler for msg handling.
+		 * Setup per-cpu tasks and taskqueues to handle msg.
 		 */
-		ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j],
-		    "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0,
-		    &hv_vmbus_g_context.msg_swintr[j]);
-		if (ret) {
-			if(bootverbose)
-				printf("VMBUS: failed to setup msg swi for "
-				    "cpu %d\n", j);
-			goto cleanup1;
-		}
+		hv_vmbus_g_context.hv_msg_tq[j] = taskqueue_create_fast(
+		    "hyperv msg", M_WAITOK, taskqueue_thread_enqueue,
+		    &hv_vmbus_g_context.hv_msg_tq[j]);
+		taskqueue_start_threads(&hv_vmbus_g_context.hv_msg_tq[j], 1, PI_NET,
+		    "hvmsg%d", j);
+		TASK_INIT(&hv_vmbus_g_context.hv_msg_task[j], 0,
+		    vmbus_msg_swintr, (void *)(long)j);
 
-		/*
-		 * Bind the swi thread to the cpu.
-		 */
-		ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j],
-		    j);
-		if (ret) {
-			if(bootverbose)
-				printf("VMBUS: failed to bind msg swi thread "
-				    "to cpu %d\n", j);
-			goto cleanup1;
-		}
+		CPU_SETOF(j, &cpu_mask);
+		TASK_INIT(&cpuset_task, 0, vmbus_cpuset_setthread_task, &cpu_mask);
+		taskqueue_enqueue(hv_vmbus_g_context.hv_msg_tq[j], &cpuset_task);
+		taskqueue_drain(hv_vmbus_g_context.hv_msg_tq[j], &cpuset_task);
 
 		/*
 		 * Prepare the per cpu msg and event pages to be called on each cpu.
@@ -581,6 +560,11 @@ vmbus_bus_init(void)
 		goto cleanup1;
 
 	hv_vmbus_request_channel_offers();
+
+	vmbus_scan();
+	bus_generic_attach(vmbus_devp);
+	device_printf(vmbus_devp, "device scan, probe and attach done\n");
+
 	return (ret);
 
 	cleanup1:
@@ -595,11 +579,10 @@ vmbus_bus_init(void)
 	 * remove swi and vmbus callback vector;
 	 */
 	CPU_FOREACH(j) {
-		if (hv_vmbus_g_context.hv_event_queue[j] != NULL)
+		if (hv_vmbus_g_context.hv_event_queue[j] != NULL) {
 			taskqueue_free(hv_vmbus_g_context.hv_event_queue[j]);
-		if (hv_vmbus_g_context.msg_swintr[j] != NULL)
-			swi_remove(hv_vmbus_g_context.msg_swintr[j]);
-		hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;	
+			hv_vmbus_g_context.hv_event_queue[j] = NULL;
+		}
 	}
 
 	vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
@@ -626,6 +609,7 @@ vmbus_attach(device_t dev)
 	if (!cold)
 		vmbus_bus_init();
 
+	bus_generic_probe(dev);
 	return (0);
 }
 
@@ -656,7 +640,7 @@ vmbus_bus_exit(void)
 	smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL);
 
 	for(i = 0; i < 2 * MAXCPU; i++) {
-		if (setup_args.page_buffers[i] != 0)
+		if (setup_args.page_buffers[i] != NULL)
 			free(setup_args.page_buffers[i], M_DEVBUF);
 	}
 
@@ -664,11 +648,10 @@ vmbus_bus_exit(void)
 
 	/* remove swi */
 	CPU_FOREACH(i) {
-		if (hv_vmbus_g_context.hv_event_queue[i] != NULL)
+		if (hv_vmbus_g_context.hv_event_queue[i] != NULL) {
 			taskqueue_free(hv_vmbus_g_context.hv_event_queue[i]);
-		if (hv_vmbus_g_context.msg_swintr[i] != NULL)
-			swi_remove(hv_vmbus_g_context.msg_swintr[i]);
-		hv_vmbus_g_context.hv_msg_intr_event[i] = NULL;	
+			hv_vmbus_g_context.hv_event_queue[i] = NULL;
+		}
 	}
 
 	vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
@@ -733,6 +716,7 @@ static device_method_t vmbus_methods[] = {
 	DEVMETHOD(bus_print_child, bus_generic_print_child),
 	DEVMETHOD(bus_read_ivar, vmbus_read_ivar),
 	DEVMETHOD(bus_write_ivar, vmbus_write_ivar),
+	DEVMETHOD(bus_child_pnpinfo_str, vmbus_child_pnpinfo_str),
 
 	{ 0, 0 } };
 
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 5f62072..f83102a 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * Copyright (c) 2012 Citrix Inc.
  * All rights reserved.
@@ -70,6 +70,7 @@ typedef uint16_t hv_vmbus_status;
  *    You did not supply enough message buffers to send a message.
  */
 
+#define HV_STATUS_SUCCESS                ((uint16_t)0)
 #define HV_STATUS_INSUFFICIENT_BUFFERS   ((uint16_t)0x0013)
 
 typedef void (*hv_vmbus_channel_callback)(void *context);
@@ -180,7 +181,8 @@ enum {
 	HV_VMBUS_EVENT_PORT_ID		= 2,
 	HV_VMBUS_MONITOR_CONNECTION_ID	= 3,
 	HV_VMBUS_MONITOR_PORT_ID	= 3,
-	HV_VMBUS_MESSAGE_SINT		= 2
+	HV_VMBUS_MESSAGE_SINT		= 2,
+	HV_VMBUS_TIMER_SINT		= 4,
 };
 
 #define HV_PRESENT_BIT		0x80000000
@@ -203,8 +205,8 @@ typedef struct {
 	 * event and msg handling.
 	 */
 	struct taskqueue		*hv_event_queue[MAXCPU];
-	struct intr_event		*hv_msg_intr_event[MAXCPU];
-	void				*msg_swintr[MAXCPU];
+	struct taskqueue		*hv_msg_tq[MAXCPU];
+	struct task			hv_msg_task[MAXCPU];
 	/*
 	 * Host use this vector to intrrupt guest for vmbus channel
 	 * event and msg.
@@ -469,10 +471,28 @@ typedef enum {
 	HV_CPU_ID_FUNCTION_MS_HV_VERSION			= 0x40000002,
 	HV_CPU_ID_FUNCTION_MS_HV_FEATURES			= 0x40000003,
 	HV_CPU_ID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION	= 0x40000004,
-	HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS		= 0x40000005
-
+	HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS		= 0x40000005,
+	HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE		= 0x40000006
 } hv_vmbus_cpuid_function;
 
+#define	HV_FEATURE_MSR_TIME_REFCNT	0x0002	/* MSR_TIME_REF_COUNT */
+#define	HV_FEATURE_MSR_SYNIC		0x0004	/* MSRs for SynIC */
+#define	HV_FEATURE_MSR_SYNTIMER		0x0008	/* MSRs for SynTimer */
+#define	HV_FEATURE_MSR_APIC		0x0010	/* MSR_{EOI,ICR,TPR} */
+#define	HV_FEATURE_MSR_HYPERCALL	0x0020	/* MSR_{GUEST_OS_ID,HYPERCALL} */
+#define	HV_FEATURE_MSR_GUEST_IDLE	0x0400	/* MSR_GUEST_IDLE */
+
+#define	HV_PM_FEATURE_CSTATE_MASK	0x000f
+#define	HV_PM_FEATURE_C3_HPET		0x0010	/* C3 requires HPET */
+#define	HV_PM_FEATURE_CSTATE(f)		((f) & HV_PM_FEATURE_CSTATE_MASK)
+
+#define	HV_FEATURE3_MWAIT		0x0001	/* MWAIT */
+#define	HV_FEATURE3_XMM_HYPERCALL	0x0010	/* hypercall input through XMM regs */
+#define	HV_FEATURE3_GUEST_IDLE		0x0020	/* guest idle support */
+#define	HV_FEATURE3_NUMA		0x0080	/* NUMA distance query support */
+#define	HV_FEATURE3_TIME_FREQ		0x0100	/* timer frequency query (TSC, LAPIC) */
+#define	HV_FEATURE3_MSR_CRASH		0x0400	/* MSRs for guest crash */
+
 /*
  * Define the format of the SIMP register
  */
@@ -626,6 +646,9 @@ typedef enum {
 extern hv_vmbus_context		hv_vmbus_g_context;
 extern hv_vmbus_connection	hv_vmbus_g_connection;
 
+extern u_int			hyperv_features;
+extern u_int			hyperv_recommends;
+
 typedef void (*vmbus_msg_handler)(hv_vmbus_channel_msg_header *msg);
 
 typedef struct hv_vmbus_channel_msg_table_entry {
@@ -639,6 +662,14 @@ extern hv_vmbus_channel_msg_table_entry	g_channel_message_table[];
 /*
  * Private, VM Bus functions
  */
+struct sysctl_ctx_list;
+struct sysctl_oid_list;
+
+void			hv_ring_buffer_stat(
+				struct sysctl_ctx_list		*ctx,
+				struct sysctl_oid_list		*tree_node,
+				hv_vmbus_ring_buffer_info	*rbi,
+				const char			*desc);
 
 int			hv_vmbus_ring_buffer_init(
 				hv_vmbus_ring_buffer_info	*ring_info,
@@ -694,7 +725,6 @@ uint16_t		hv_vmbus_post_msg_via_msg_ipc(
 uint16_t		hv_vmbus_signal_event(void *con_id);
 void			hv_vmbus_synic_init(void *irq_arg);
 void			hv_vmbus_synic_cleanup(void *arg);
-int			hv_vmbus_query_hypervisor_presence(void);
 
 struct hv_device*	hv_vmbus_child_device_create(
 				hv_guid			device_type,
@@ -721,6 +751,9 @@ void			hv_vmbus_on_events(int cpu);
 void			hv_et_init(void);
 void			hv_et_intr(struct trapframe*);
 
+/* Wait for device creation */
+void			vmbus_scan(void);
+
 /*
  * The guest OS needs to register the guest ID with the hypervisor.
  * The guest ID is a 64 bit entity and the structure of this ID is
diff --git a/sys/dev/hyperv/vmbus/i386/hv_vector.S b/sys/dev/hyperv/vmbus/i386/hv_vector.S
new file mode 100644
index 0000000..55a2613
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/i386/hv_vector.S
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+#include "assym.s"
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ */
+        .text
+        SUPERALIGN_TEXT
+IDTVEC(hv_vmbus_callback)
+        PUSH_FRAME
+        SET_KERNEL_SREGS
+        cld
+        FAKE_MCOUNT(TF_EIP(%esp))
+        pushl   %esp
+        call    hv_vector_handler
+        add     $4, %esp
+        MEXITCOUNT
+        jmp     doreti
author	Renato Botelho <renato@netgate.com>	2016-06-21 07:44:54 -0300
committer	Renato Botelho <renato@netgate.com>	2016-06-21 07:44:54 -0300
commit	1fc6b0207cc2f3cce33817706603caa41a9de24d (patch)
tree	d2d812b76b08f42a002621f716dd5f3199c7ca7d /sys/dev/hyperv
parent	b8632c4f34175c7018be77059ab229e755eb67e0 (diff)
parent	bc9e0dd07a76c4d7a1c6fcf21824ca2cecff2c6d (diff)
download	FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.zip FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.tar.gz