summary refs log tree commit diff stats
path: root/sys/dev/hyperv/vmbus
diff options
context:
space:
mode:
author whu <whu@FreeBSD.org> 2015-05-22 09:03:55 +0000
committer whu <whu@FreeBSD.org> 2015-05-22 09:03:55 +0000
commit 30cd3b9808be2ed4002993166f0790b8f07d95d8 (patch)
tree f2a61a02982282e7de44dcf271ef00cd723d44c5 /sys/dev/hyperv/vmbus
parent b453b295750133b89170fcb27025f932be66ad18 (diff)
download FreeBSD-src-30cd3b9808be2ed4002993166f0790b8f07d95d8.zip
FreeBSD-src-30cd3b9808be2ed4002993166f0790b8f07d95d8.tar.gz
MFC r282212:
Microsoft vmbus, storage and other related driver enhancements for HyperV.
- Vmbus multi channel support.
- Vector interrupt support.
- Signal optimization.
- Storvsc driver performance improvement.
- Scatter and gather support for storvsc driver.
- Minor bug fix for KVP driver.

Thanks royger, jhb and delphij from FreeBSD community for the reviews and comments. Also thanks Hovy Xu from NetApp for the contributions to the storvsc driver.

PR: 195238
Submitted by: whu
Reviewed by: royger
Approved by: royger
Relnotes: yes
Sponsored by: Microsoft OSTC
Differential Revision: https://reviews.freebsd.org/D2575
Diffstat (limited to 'sys/dev/hyperv/vmbus')
-rw-r--r-- sys/dev/hyperv/vmbus/hv_channel.c 98
-rw-r--r-- sys/dev/hyperv/vmbus/hv_channel_mgmt.c 268
-rw-r--r-- sys/dev/hyperv/vmbus/hv_connection.c 289
-rw-r--r-- sys/dev/hyperv/vmbus/hv_hv.c 66
-rw-r--r-- sys/dev/hyperv/vmbus/hv_ring_buffer.c 78
-rw-r--r-- sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c 364
-rw-r--r-- sys/dev/hyperv/vmbus/hv_vmbus_priv.h 71
7 files changed, 870 insertions, 364 deletions
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c
index 103260a..94137fb 100644
--- a/sys/dev/hyperv/vmbus/hv_channel.c
+++ b/sys/dev/hyperv/vmbus/hv_channel.c
@@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel)
(uint32_t *)&monitor_page->
trigger_group[channel->monitor_group].u.pending);
} else {
- hv_vmbus_set_event(channel->offer_msg.child_rel_id);
+ hv_vmbus_set_event(channel);
}
}
@@ -99,6 +99,18 @@ hv_vmbus_channel_open(
hv_vmbus_channel_open_channel* open_msg;
hv_vmbus_channel_msg_info* open_info;
+ mtx_lock(&new_channel->sc_lock);
+ if (new_channel->state == HV_CHANNEL_OPEN_STATE) {
+ new_channel->state = HV_CHANNEL_OPENING_STATE;
+ } else {
+ mtx_unlock(&new_channel->sc_lock);
+ if(bootverbose)
+ printf("VMBUS: Trying to open channel <%p> which in "
+ "%d state.\n", new_channel, new_channel->state);
+ return (EINVAL);
+ }
+ mtx_unlock(&new_channel->sc_lock);
+
new_channel->on_channel_callback = pfn_on_channel_callback;
new_channel->channel_callback_context = context;
@@ -162,7 +174,7 @@ hv_vmbus_channel_open(
new_channel->ring_buffer_gpadl_handle;
open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size
>> PAGE_SHIFT;
- open_msg->server_context_area_gpadl_handle = 0;
+ open_msg->target_vcpu = new_channel->target_vcpu;
if (user_data_len)
memcpy(open_msg->user_data, user_data, user_data_len);
@@ -182,10 +194,14 @@ hv_vmbus_channel_open(
ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */
- if (ret)
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: channel <%p> open timeout.\n", new_channel);
goto cleanup;
+ }
if (open_info->response.open_result.status == 0) {
+ new_channel->state = HV_CHANNEL_OPENED_STATE;
if(bootverbose)
printf("VMBUS: channel <%p> open success.\n", new_channel);
} else {
@@ -497,16 +513,20 @@ cleanup:
return (ret);
}
-/**
- * @brief Close the specified channel
- */
-void
-hv_vmbus_channel_close(hv_vmbus_channel *channel)
+static void
+hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
{
int ret = 0;
hv_vmbus_channel_close_channel* msg;
hv_vmbus_channel_msg_info* info;
+ channel->state = HV_CHANNEL_OPEN_STATE;
+ channel->sc_creation_callback = NULL;
+
+ /*
+ * Grab the lock to prevent race condition when a packet received
+ * and unloading driver is in the process.
+ */
mtx_lock(&channel->inbound_lock);
channel->on_channel_callback = NULL;
mtx_unlock(&channel->inbound_lock);
@@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel)
M_DEVBUF);
free(info, M_DEVBUF);
+}
+
+/**
+ * @brief Close the specified channel
+ */
+void
+hv_vmbus_channel_close(hv_vmbus_channel *channel)
+{
+ hv_vmbus_channel* sub_channel;
+
+ if (channel->primary_channel != NULL) {
+ /*
+ * We only close multi-channels when the primary is
+ * closed.
+ */
+ return;
+ }
/*
- * If we are closing the channel during an error path in
- * opening the channel, don't free the channel
- * since the caller will free the channel
+ * Close all multi-channels first.
*/
- if (channel->state == HV_CHANNEL_OPEN_STATE) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
-
- hv_vmbus_free_vmbus_channel(channel);
+ TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor,
+ sc_list_entry) {
+ if (sub_channel->state != HV_CHANNEL_OPENED_STATE)
+ continue;
+ hv_vmbus_channel_close_internal(sub_channel);
}
-
+ /*
+ * Then close the primary channel.
+ */
+ hv_vmbus_channel_close_internal(channel);
}
/**
@@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet(
uint32_t packet_len;
uint64_t aligned_data;
uint32_t packet_len_aligned;
+ boolean_t need_sig;
hv_vmbus_sg_buffer_list buffer_list[3];
packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len;
@@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0
- && !hv_vmbus_get_ring_buffer_interrupt_mask(
- &channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer(
int ret = 0;
int i = 0;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packetLen_aligned;
hv_vmbus_sg_buffer_list buffer_list[3];
@@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer(
buffer_list[2].data = &alignedData;
buffer_list[2].length = packetLen_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer(
int ret = 0;
uint32_t desc_size;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packet_len_aligned;
uint32_t pfn_count;
@@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index 011e305..d13ece5 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/mbuf.h>
@@ -50,6 +53,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_process_offer(void *context);
+/* Choose the channel (primary or one of its sub-channels) to send on. */
+struct hv_vmbus_channel*
+	vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary);
/**
* Channel message dispatch table
@@ -233,6 +238,9 @@ hv_vmbus_allocate_channel(void)
return (NULL);
mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF);
+ mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF);
+
+ TAILQ_INIT(&channel->sc_list_anchor);
channel->control_work_queue = hv_work_queue_create("control");
@@ -262,6 +270,7 @@ ReleaseVmbusChannel(void *context)
void
hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
{
+ mtx_destroy(&channel->sc_lock);
mtx_destroy(&channel->inbound_lock);
/*
* We have to release the channel's workqueue/thread in
@@ -279,10 +288,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
static void
vmbus_channel_process_offer(void *context)
{
- int ret;
hv_vmbus_channel* new_channel;
boolean_t f_new;
hv_vmbus_channel* channel;
+ int ret;
new_channel = (hv_vmbus_channel*) context;
f_new = TRUE;
@@ -291,38 +300,76 @@ vmbus_channel_process_offer(void *context)
/*
* Make sure this is a new offer
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor,
list_entry)
{
- if (!memcmp(
- &channel->offer_msg.offer.interface_type,
- &new_channel->offer_msg.offer.interface_type,
- sizeof(hv_guid))
- && !memcmp(
- &channel->offer_msg.offer.interface_instance,
+ if (memcmp(&channel->offer_msg.offer.interface_type,
+ &new_channel->offer_msg.offer.interface_type,
+ sizeof(hv_guid)) == 0 &&
+ memcmp(&channel->offer_msg.offer.interface_instance,
&new_channel->offer_msg.offer.interface_instance,
- sizeof(hv_guid))) {
- f_new = FALSE;
- break;
- }
+ sizeof(hv_guid)) == 0) {
+ f_new = FALSE;
+ break;
+ }
}
if (f_new) {
- /* Insert at tail */
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
+ /* Insert at tail */
+ TAILQ_INSERT_TAIL(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ /*XXX add new channel to percpu_list */
if (!f_new) {
+ /*
+ * Check if this is a sub channel.
+ */
+ if (new_channel->offer_msg.offer.sub_channel_index != 0) {
+ /*
+ * It is a sub channel offer, process it.
+ */
+ new_channel->primary_channel = channel;
+ mtx_lock(&channel->sc_lock);
+ TAILQ_INSERT_TAIL(
+ &channel->sc_list_anchor,
+ new_channel,
+ sc_list_entry);
+ mtx_unlock(&channel->sc_lock);
+
+ /* Insert new channel into channel_anchor. */
+ printf("Storvsc get multi-channel offer, rel=%u.\n",
+ new_channel->offer_msg.child_rel_id);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
+ new_channel, list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ if(bootverbose)
+ printf("VMBUS: new multi-channel offer <%p>.\n",
+ new_channel);
+
+ /*XXX add it to percpu_list */
+
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+ if (channel->sc_creation_callback != NULL) {
+ channel->sc_creation_callback(new_channel);
+ }
+ return;
+ }
+
hv_vmbus_free_vmbus_channel(new_channel);
return;
}
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+
/*
* Start the process of binding this offer to the driver
* (We need to set the device field before calling
@@ -333,35 +380,86 @@ vmbus_channel_process_offer(void *context)
new_channel->offer_msg.offer.interface_instance, new_channel);
/*
- * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below
- * but in the "open" channel request. The ret != 0 logic below
- * doesn't take into account that a channel
- * may have been opened successfully
- */
-
- /*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
*/
ret = hv_vmbus_child_device_register(new_channel->device);
if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
- hv_vmbus_free_vmbus_channel(new_channel);
- } else {
- /*
- * This state is used to indicate a successful open
- * so that when we do close the channel normally,
- * we can clean up properly
- */
- new_channel->state = HV_CHANNEL_OPEN_STATE;
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+ hv_vmbus_free_vmbus_channel(new_channel);
+ }
+}
+
+/*
+ * Slots of the performance-critical device table below.
+ * MAX_PERF_CHN is the number of slots, not a device type.
+ */
+enum {
+	PERF_CHN_NIC = 0,
+	PERF_CHN_IDE,
+	PERF_CHN_SCSI,
+	MAX_PERF_CHN,
+};
+
+/**
+ * Interface-type guids of the devices considered performance critical.
+ * Interrupts for channels of these devices are spread over all online
+ * cpus instead of being pinned to cpu 0.
+ */
+static const hv_guid high_perf_devices[] = {
+	[PERF_CHN_NIC] = {HV_NIC_GUID, },
+	[PERF_CHN_IDE] = {HV_IDE_GUID, },
+	[PERF_CHN_SCSI] = {HV_SCSI_GUID, },
+};
+
+/*
+ * Running counter used to hand out cpus round robin to
+ * performance-critical channels.
+ */
+static uint32_t next_vcpu;
+
+/**
+ * Bind a channel to a cpu for interrupt delivery.
+ *
+ * Channels whose guid is not listed in high_perf_devices, and every
+ * channel when the negotiated protocol is WS2008 or WIN7, are bound to
+ * cpu 0. All other channels (NIC, IDE, SCSI) are handed out round robin
+ * over mp_ncpus using the next_vcpu counter.
+ *
+ * @param channel - channel whose target_cpu / target_vcpu are set
+ * @param guid    - interface type guid of the offered device
+ */
+static void
+vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid)
+{
+	boolean_t perf_critical = FALSE;
+	uint32_t cpu;
+	int idx;
+
+	/* Is this one of the performance-critical device types? */
+	for (idx = PERF_CHN_NIC; idx < MAX_PERF_CHN; idx++) {
+		if (memcmp(guid->data, high_perf_devices[idx].data,
+		    sizeof(hv_guid)) != 0)
+			continue;
+		perf_critical = TRUE;
+		break;
+	}
+
+	if (!perf_critical ||
+	    hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008 ||
+	    hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) {
+		/* Guest's own view of cpu */
+		channel->target_cpu = 0;
+		/* Host's view of guest cpu */
+		channel->target_vcpu = 0;
+		return;
}
+
+	/* mp_ncpus should have the number of cpus currently online */
+	cpu = ++next_vcpu % mp_ncpus;
+	channel->target_cpu = cpu;
+	/* hv_vcpu_index[] maps the FreeBSD cpuid to the Hyper-V vcpu index. */
+	channel->target_vcpu = hv_vmbus_g_context.hv_vcpu_index[cpu];
+
+	if (bootverbose) {
+		printf("VMBUS: Total online cpus %d, assign perf channel %d "
+		    "to vcpu %d, cpu %d\n", mp_ncpus, idx,
+		    channel->target_vcpu, cpu);
+	}
}
/**
@@ -391,6 +489,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
if (new_channel == NULL)
return;
+ /*
+ * By default we setup state to enable batched
+ * reading. A specific service can choose to
+ * disable this prior to opening the channel.
+ */
+ new_channel->batched_reading = TRUE;
+
+ new_channel->signal_event_param =
+ (hv_vmbus_input_signal_event *)
+ (HV_ALIGN_UP((unsigned long)
+ &new_channel->signal_event_buffer,
+ HV_HYPERCALL_PARAM_ALIGN));
+
+ new_channel->signal_event_param->connection_id.as_uint32_t = 0;
+ new_channel->signal_event_param->connection_id.u.id =
+ HV_VMBUS_EVENT_CONNECTION_ID;
+ new_channel->signal_event_param->flag_number = 0;
+ new_channel->signal_event_param->rsvd_z = 0;
+
+ if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) {
+ new_channel->is_dedicated_interrupt =
+ (offer->is_dedicated_interrupt != 0);
+ new_channel->signal_event_param->connection_id.u.id =
+ offer->connection_id;
+ }
+
+ /*
+ * Bind the channel to a chosen cpu.
+ */
+ vmbus_channel_select_cpu(new_channel,
+ &offer->offer.interface_type);
+
memcpy(&new_channel->offer_msg, offer,
sizeof(hv_vmbus_channel_offer_channel));
new_channel->monitor_group = (uint8_t) offer->monitor_id / 32;
@@ -666,7 +796,7 @@ hv_vmbus_release_unattached_channels(void)
{
hv_vmbus_channel *channel;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) {
channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor);
@@ -676,5 +806,61 @@ hv_vmbus_release_unattached_channels(void)
hv_vmbus_child_device_unregister(channel->device);
hv_vmbus_free_vmbus_channel(channel);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+}
+
+/**
+ * @brief Select the best outgoing channel
+ *
+ * Walks the sub-channels of the primary channel and picks the opened one
+ * whose target vcpu is closest to the vcpu of the current cpu. An exact
+ * match is returned immediately; on equal distance the sub-channel seen
+ * later in the list wins. The primary channel is returned when it has no
+ * sub-channels, when the current cpuid is not below MAXCPU, or when no
+ * opened sub-channel is at least as close as the primary.
+ *
+ * @param primary - primary channel
+ */
+struct hv_vmbus_channel *
+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary)
+{
+	hv_vmbus_channel *best = primary;
+	hv_vmbus_channel *sub = NULL;
+	int best_distance = 0;
+	int sub_distance = 0;
+	int cur_vcpu = 0;
+	int cpu_id = PCPU_GET(cpuid);
+
+	/* Nothing to choose from, or no vcpu mapping for this cpu. */
+	if (TAILQ_EMPTY(&primary->sc_list_anchor) || cpu_id >= MAXCPU)
+		return best;
+
+	cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[cpu_id];
+
+	TAILQ_FOREACH(sub, &primary->sc_list_anchor, sc_list_entry) {
+		/* Only sub-channels that have been opened can carry data. */
+		if (sub->state != HV_CHANNEL_OPENED_STATE)
+			continue;
+
+		if (sub->target_vcpu == cur_vcpu)
+			return sub;
+
+		best_distance = ((best->target_vcpu > cur_vcpu) ?
+		    (best->target_vcpu - cur_vcpu) :
+		    (cur_vcpu - best->target_vcpu));
+
+		sub_distance = ((sub->target_vcpu > cur_vcpu) ?
+		    (sub->target_vcpu - cur_vcpu) :
+		    (cur_vcpu - sub->target_vcpu));
+
+		/* Switch unless the current pick is strictly closer. */
+		if (best_distance >= sub_distance)
+			best = sub;
+	}
+
+	return(best);
}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index c8e0b48..cc83037 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
@@ -45,14 +48,113 @@ hv_vmbus_connection hv_vmbus_g_connection =
{ .connect_state = HV_DISCONNECTED,
.next_gpadl_handle = 0xE1E10, };
+uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008;
+
+/*
+ * Return the protocol version to fall back to after current_ver:
+ * WIN8_1 -> WIN8 -> WIN7 -> WS2008. WS2008 and any other value have no
+ * fallback and yield HV_VMBUS_VERSION_INVALID.
+ */
+static uint32_t
+hv_vmbus_get_next_version(uint32_t current_ver)
+{
+	if (current_ver == HV_VMBUS_VERSION_WIN8_1)
+		return(HV_VMBUS_VERSION_WIN8);
+
+	if (current_ver == HV_VMBUS_VERSION_WIN8)
+		return(HV_VMBUS_VERSION_WIN7);
+
+	if (current_ver == HV_VMBUS_VERSION_WIN7)
+		return(HV_VMBUS_VERSION_WS2008);
+
+	/* HV_VMBUS_VERSION_WS2008 or an unknown version: nothing left. */
+	return(HV_VMBUS_VERSION_INVALID);
+}
+
+/**
+ * Try to establish the vmbus connection with one protocol version.
+ *
+ * Builds an INITIATED_CONTACT request for 'version' in msg_info->msg,
+ * queues msg_info on the connection's channel_msg_anchor list, posts the
+ * request and waits on msg_info->wait_sema for the response.
+ *
+ * @param msg_info - caller-owned request/response buffer; its wait_sema
+ *                   is initialized here
+ * @param version  - vmbus protocol version to request
+ *
+ * @return 0 if the response marks the version as supported (connect_state
+ *         becomes HV_CONNECTED); the hv_vmbus_post_message() error if the
+ *         request could not be posted; the sema_timedwait() error if no
+ *         response arrived in time; ECONNREFUSED if the response does not
+ *         mark the version as supported.
+ *
+ * NOTE(review): hv_vmbus_connect() calls this in a loop on the same
+ * msg_info and destroys wait_sema only once afterwards, so sema_init()
+ * runs repeatedly on the same semaphore - confirm this is acceptable.
+ */
+static int
+hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
+	uint32_t version)
+{
+	int ret = 0;
+	hv_vmbus_channel_initiate_contact *msg;
+
+	sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
+	msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
+
+	msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
+	msg->vmbus_version_requested = version;
+
+	msg->interrupt_page = hv_get_phys_addr(
+		hv_vmbus_g_connection.interrupt_page);
+
+	msg->monitor_page_1 = hv_get_phys_addr(
+		hv_vmbus_g_connection.monitor_pages);
+
+	/* The second monitor page directly follows the first one. */
+	msg->monitor_page_2 =
+		hv_get_phys_addr(
+			((uint8_t *) hv_vmbus_g_connection.monitor_pages
+			+ PAGE_SIZE));
+
+	/**
+	 * Add to list before we send the request since we may receive the
+	 * response before returning from this routine
+	 */
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	TAILQ_INSERT_TAIL(
+		&hv_vmbus_g_connection.channel_msg_anchor,
+		msg_info,
+		msg_list_entry);
+
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	ret = hv_vmbus_post_message(
+		msg,
+		sizeof(hv_vmbus_channel_initiate_contact));
+
+	if (ret != 0) {
+		/* Nothing was sent, so no response will ever arrive. */
+		mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		TAILQ_REMOVE(
+			&hv_vmbus_g_connection.channel_msg_anchor,
+			msg_info,
+			msg_list_entry);
+		mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		return (ret);
+	}
+
+	/**
+	 * Wait for the connection response
+	 *
+	 * NOTE(review): the timeout is given in ticks, so 500 is 5 seconds
+	 * only when hz is 100 - confirm the intended duration.
+	 */
+	ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	TAILQ_REMOVE(
+		&hv_vmbus_g_connection.channel_msg_anchor,
+		msg_info,
+		msg_list_entry);
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	/*
+	 * The wait failed, so the response was not waited for successfully
+	 * and must not be interpreted. Hand the wait error back unchanged:
+	 * the caller tests for EWOULDBLOCK to stop negotiating, and
+	 * overwriting it with ECONNREFUSED would make it retry lower
+	 * versions against a host that is not answering.
+	 */
+	if (ret != 0)
+		return (ret);
+
+	/**
+	 * Check if successful
+	 */
+	if (msg_info->response.version_response.version_supported) {
+		hv_vmbus_g_connection.connect_state = HV_CONNECTED;
+	} else {
+		ret = ECONNREFUSED;
+	}
+
+	return (ret);
+}
+
/**
* Send a connect request on the partition service connection
*/
int
hv_vmbus_connect(void) {
int ret = 0;
+ uint32_t version;
hv_vmbus_channel_msg_info* msg_info = NULL;
- hv_vmbus_channel_initiate_contact* msg;
/**
* Make sure we are not connecting or connected
@@ -74,7 +176,7 @@ hv_vmbus_connect(void) {
TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor);
mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel",
- NULL, MTX_SPIN);
+ NULL, MTX_DEF);
/**
* Setup the vmbus event connection for channel interrupt abstraction
@@ -130,71 +232,30 @@ hv_vmbus_connect(void) {
goto cleanup;
}
- sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
- msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
-
- msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
- msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER;
-
- msg->interrupt_page = hv_get_phys_addr(
- hv_vmbus_g_connection.interrupt_page);
-
- msg->monitor_page_1 = hv_get_phys_addr(
- hv_vmbus_g_connection.monitor_pages);
-
- msg->monitor_page_2 =
- hv_get_phys_addr(
- ((uint8_t *) hv_vmbus_g_connection.monitor_pages
- + PAGE_SIZE));
-
- /**
- * Add to list before we send the request since we may receive the
- * response before returning from this routine
+ /*
+ * Find the highest vmbus version number we can support.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
-
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- ret = hv_vmbus_post_message(
- msg,
- sizeof(hv_vmbus_channel_initiate_contact));
-
- if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- goto cleanup;
- }
+ version = HV_VMBUS_VERSION_CURRENT;
+
+ do {
+ ret = hv_vmbus_negotiate_version(msg_info, version);
+ if (ret == EWOULDBLOCK) {
+ /*
+ * We timed out.
+ */
+ goto cleanup;
+ }
- /**
- * Wait for the connection response
- */
- ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+ if (hv_vmbus_g_connection.connect_state == HV_CONNECTED)
+ break;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ version = hv_vmbus_get_next_version(version);
+ } while (version != HV_VMBUS_VERSION_INVALID);
- /**
- * Check if successful
- */
- if (msg_info->response.version_response.version_supported) {
- hv_vmbus_g_connection.connect_state = HV_CONNECTED;
- } else {
- ret = ECONNREFUSED;
- goto cleanup;
- }
+ hv_vmbus_protocal_version = version;
+ if (bootverbose)
+ printf("VMBUS: Portocal Version: %d.%d\n",
+ version >> 16, version & 0xFFFF);
sema_destroy(&msg_info->wait_sema);
free(msg_info, M_DEVBUF);
@@ -286,7 +347,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
* and channels are accessed without the need to take this lock or search
* the list.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel,
&hv_vmbus_g_connection.channel_anchor, list_entry) {
@@ -295,7 +356,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
break;
}
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
return (foundChannel);
}
@@ -306,7 +367,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
static void
VmbusProcessChannelEvent(uint32_t relid)
{
+ void* arg;
+ uint32_t bytes_to_read;
hv_vmbus_channel* channel;
+ boolean_t is_batched_reading;
/**
* Find the channel based on this relid and invokes
@@ -327,31 +391,98 @@ VmbusProcessChannelEvent(uint32_t relid)
* callback to NULL. This closes the window.
*/
- mtx_lock(&channel->inbound_lock);
+ /*
+ * Disable the lock due to newly added WITNESS check in r277723.
+ * Will seek other way to avoid race condition.
+ * -- whu
+ */
+ // mtx_lock(&channel->inbound_lock);
if (channel->on_channel_callback != NULL) {
- channel->on_channel_callback(channel->channel_callback_context);
+ arg = channel->channel_callback_context;
+ is_batched_reading = channel->batched_reading;
+ /*
+ * Optimize host to guest signaling by ensuring:
+ * 1. While reading the channel, we disable interrupts from
+ * host.
+ * 2. Ensure that we process all posted messages from the host
+ * before returning from this callback.
+ * 3. Once we return, enable signaling from the host. Once this
+ * state is set we check to see if additional packets are
+ * available to read. In this case we repeat the process.
+ */
+ do {
+ if (is_batched_reading)
+ hv_ring_buffer_read_begin(&channel->inbound);
+
+ channel->on_channel_callback(arg);
+
+ if (is_batched_reading)
+ bytes_to_read =
+ hv_ring_buffer_read_end(&channel->inbound);
+ else
+ bytes_to_read = 0;
+ } while (is_batched_reading && (bytes_to_read != 0));
}
- mtx_unlock(&channel->inbound_lock);
+ // mtx_unlock(&channel->inbound_lock);
}
+#ifdef HV_DEBUG_INTR
+extern uint32_t hv_intr_count;
+extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+extern uint32_t hv_vmbus_intr_cpu[MAXCPU];
+#endif
+
/**
* Handler for events
*/
void
hv_vmbus_on_events(void *arg)
{
- int dword;
int bit;
+ int cpu;
+ int dword;
+ void *page_addr;
+ uint32_t* recv_interrupt_page = NULL;
int rel_id;
- int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ int maxdword;
+ hv_vmbus_synic_event_flags *event;
/* int maxdword = PAGE_SIZE >> 3; */
- /*
- * receive size is 1/2 page and divide that by 4 bytes
- */
-
- uint32_t* recv_interrupt_page =
- hv_vmbus_g_connection.recv_interrupt_page;
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: "
+ "cpu out of range!"));
+
+#ifdef HV_DEBUG_INTR
+ int i;
+ hv_vmbus_swintr_event_cpu[cpu]++;
+ if (hv_intr_count % 10000 == 0) {
+ printf("VMBUS: Total interrupt %d\n", hv_intr_count);
+ for (i = 0; i < mp_ncpus; i++)
+ printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n",
+ i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]);
+ }
+#endif
+
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ /*
+ * receive size is 1/2 page and divide that by 4 bytes
+ */
+ recv_interrupt_page =
+ hv_vmbus_g_connection.recv_interrupt_page;
+ } else {
+ /*
+ * On Host with Win8 or above, the event page can be
+ * checked directly to get the id of the channel
+ * that has the pending interrupt.
+ */
+ maxdword = HV_EVENT_FLAGS_DWORD_COUNT;
+ page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
+ event = (hv_vmbus_synic_event_flags *)
+ page_addr + HV_VMBUS_MESSAGE_SINT;
+ recv_interrupt_page = event->flags32;
+ }
/*
* Check events
@@ -416,16 +547,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) {
* Send an event notification to the parent
*/
int
-hv_vmbus_set_event(uint32_t child_rel_id) {
+hv_vmbus_set_event(hv_vmbus_channel *channel) {
int ret = 0;
+ uint32_t child_rel_id = channel->offer_msg.child_rel_id;
/* Each uint32_t represents 32 channels */
synch_set_bit(child_rel_id & 31,
(((uint32_t *)hv_vmbus_g_connection.send_interrupt_page
+ (child_rel_id >> 5))));
- ret = hv_vmbus_signal_event();
+ ret = hv_vmbus_signal_event(channel->signal_event_param);
return (ret);
}
-
diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c
index 80a1f42..84e2a5e 100644
--- a/sys/dev/hyperv/vmbus/hv_hv.c
+++ b/sys/dev/hyperv/vmbus/hv_hv.c
@@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax,
hv_vmbus_context hv_vmbus_g_context = {
.syn_ic_initialized = FALSE,
.hypercall_page = NULL,
- .signal_event_param = NULL,
- .signal_event_buffer = NULL,
};
static struct timecounter hv_timecounter = {
@@ -256,28 +254,6 @@ hv_vmbus_init(void)
hv_vmbus_g_context.hypercall_page = virt_addr;
- /*
- * Setup the global signal event param for the signal event hypercall
- */
- hv_vmbus_g_context.signal_event_buffer =
- malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF,
- M_ZERO | M_NOWAIT);
- KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL,
- ("Error VMBUS: Failed to allocate signal_event_buffer\n"));
- if (hv_vmbus_g_context.signal_event_buffer == NULL)
- goto cleanup;
-
- hv_vmbus_g_context.signal_event_param =
- (hv_vmbus_input_signal_event*)
- (HV_ALIGN_UP((unsigned long)
- hv_vmbus_g_context.signal_event_buffer,
- HV_HYPERCALL_PARAM_ALIGN));
- hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0;
- hv_vmbus_g_context.signal_event_param->connection_id.u.id =
- HV_VMBUS_EVENT_CONNECTION_ID;
- hv_vmbus_g_context.signal_event_param->flag_number = 0;
- hv_vmbus_g_context.signal_event_param->rsvd_z = 0;
-
tc_init(&hv_timecounter); /* register virtual timecount */
return (0);
@@ -303,12 +279,6 @@ hv_vmbus_cleanup(void)
{
hv_vmbus_x64_msr_hypercall_contents hypercall_msr;
- if (hv_vmbus_g_context.signal_event_buffer != NULL) {
- free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF);
- hv_vmbus_g_context.signal_event_buffer = NULL;
- hv_vmbus_g_context.signal_event_param = NULL;
- }
-
if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) {
if (hv_vmbus_g_context.hypercall_page != NULL) {
hypercall_msr.as_uint64_t = 0;
@@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc(
* event IPC. (This involves a hypercall.)
*/
hv_vmbus_status
-hv_vmbus_signal_event()
+hv_vmbus_signal_event(void *con_id)
{
hv_vmbus_status status;
status = hv_vmbus_do_hypercall(
HV_CALL_SIGNAL_EVENT,
- hv_vmbus_g_context.signal_event_param,
+ con_id,
0) & 0xFFFF;
return (status);
@@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg)
{
int cpu;
+ uint64_t hv_vcpu_index;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
hv_vmbus_synic_scontrol sctrl;
@@ -403,23 +374,14 @@ hv_vmbus_synic_init(void *arg)
return;
/*
- * KYS: Looks like we can only initialize on cpu0; don't we support
- * SMP guests?
- *
- * TODO: Need to add SMP support for FreeBSD V9
- */
-
- if (cpu != 0)
- return;
-
- /*
* TODO: Check the version
*/
version = rdmsr(HV_X64_MSR_SVERSION);
-
- hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0];
- hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1];
+ hv_vmbus_g_context.syn_ic_msg_page[cpu] =
+ setup_args->page_buffers[2 * cpu];
+ hv_vmbus_g_context.syn_ic_event_page[cpu] =
+ setup_args->page_buffers[2 * cpu + 1];
/*
* Setup the Synic's message page
@@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg)
wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
/*HV_SHARED_SINT_IDT_VECTOR + 0x20; */
+ shared_sint.as_uint64_t = 0;
shared_sint.u.vector = setup_args->vector;
shared_sint.u.masked = FALSE;
- shared_sint.u.auto_eoi = FALSE;
+ shared_sint.u.auto_eoi = TRUE;
wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
shared_sint.as_uint64_t);
@@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg)
hv_vmbus_g_context.syn_ic_initialized = TRUE;
+ /*
+ * Set up the cpuid mapping from Hyper-V to FreeBSD.
+ * The array is indexed using FreeBSD cpuid.
+ */
+ hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX);
+ hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index;
+
return;
}
@@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg)
hv_vmbus_synic_sint shared_sint;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
- int cpu = PCPU_GET(cpuid);
if (!hv_vmbus_g_context.syn_ic_initialized)
return;
- if (cpu != 0)
- return; /* TODO: XXXKYS: SMP? */
-
shared_sint.as_uint64_t = rdmsr(
HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT);
diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
index f7c1965..0e51ef7 100644
--- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c
+++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
@@ -26,6 +26,8 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
@@ -144,6 +146,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info)
return (uint64_t) ring_info->ring_buffer->write_index << 32;
}
+void
+hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ ring_info->ring_buffer->interrupt_mask = 1;
+ mb();
+}
+
+uint32_t
+hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ uint32_t read, write;
+
+ ring_info->ring_buffer->interrupt_mask = 0;
+ mb();
+
+ /*
+ * Now check to see if the ring buffer is still empty.
+ * If it is not, we raced and we need to process new
+ * incoming messages.
+ */
+ get_ring_buffer_avail_bytes(ring_info, &read, &write);
+
+ return (read);
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to
+ * be signaled. Here are the details of this protocol:
+ *
+ * 1. The host guarantees that while it is draining the
+ * ring buffer, it will set the interrupt_mask to
+ * indicate it does not need to be interrupted when
+ * new data is placed.
+ *
+ * 2. The host guarantees that it will completely drain
+ * the ring buffer before exiting the read loop. Further,
+ * once the ring buffer is empty, it will clear the
+ * interrupt_mask and re-check to see if new data has
+ * arrived.
+ */
+static boolean_t
+hv_ring_buffer_needsig_on_write(
+ uint32_t old_write_location,
+ hv_vmbus_ring_buffer_info* rbi)
+{
+ mb();
+ if (rbi->ring_buffer->interrupt_mask)
+ return (FALSE);
+
+ /* Read memory barrier */
+ rmb();
+ /*
+ * The only case in which we need to signal is when the
+ * ring transitions from being empty to non-empty.
+ */
+ if (old_write_location == rbi->ring_buffer->read_index)
+ return (TRUE);
+
+ return (FALSE);
+}
+
static uint32_t copy_to_ring_buffer(
hv_vmbus_ring_buffer_info* ring_info,
uint32_t start_write_offset,
@@ -204,11 +269,13 @@ int
hv_ring_buffer_write(
hv_vmbus_ring_buffer_info* out_ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buffer_count)
+ uint32_t sg_buffer_count,
+ boolean_t *need_sig)
{
int i = 0;
uint32_t byte_avail_to_write;
uint32_t byte_avail_to_read;
+ uint32_t old_write_location;
uint32_t total_bytes_to_write = 0;
volatile uint32_t next_write_location;
@@ -242,6 +309,8 @@ hv_ring_buffer_write(
*/
next_write_location = get_next_write_location(out_ring_info);
+ old_write_location = next_write_location;
+
for (i = 0; i < sg_buffer_count; i++) {
next_write_location = copy_to_ring_buffer(out_ring_info,
next_write_location, (char *) sg_buffers[i].data,
@@ -258,9 +327,9 @@ hv_ring_buffer_write(
(char *) &prev_indices, sizeof(uint64_t));
/*
- * Make sure we flush all writes before updating the writeIndex
+ * Full memory barrier before updating the write index.
*/
- wmb();
+ mb();
/*
* Now, update the write location
@@ -269,6 +338,9 @@ hv_ring_buffer_write(
mtx_unlock_spin(&out_ring_info->ring_lock);
+ *need_sig = hv_ring_buffer_needsig_on_write(old_write_location,
+ out_ring_info);
+
return (0);
}
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
index ca28fd5..91813bb 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$");
#include <machine/stdarg.h>
#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/segments.h>
#include <sys/pcpu.h>
+#include <machine/apicvar.h>
#include "hv_vmbus_priv.h"
#define VMBUS_IRQ 0x5
-static struct intr_event *hv_msg_intr_event;
-static struct intr_event *hv_event_intr_event;
-static void *msg_swintr;
-static void *event_swintr;
static device_t vmbus_devp;
-static void *vmbus_cookiep;
-static int vmbus_rid;
-struct resource *intr_res;
-static int vmbus_irq = VMBUS_IRQ;
static int vmbus_inited;
static hv_setup_args setup_args; /* only CPU 0 supported at this time */
@@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */
* the hypervisor.
*/
static void
-vmbus_msg_swintr(void *dummy)
+vmbus_msg_swintr(void *arg)
{
int cpu;
void* page_addr;
hv_vmbus_message* msg;
hv_vmbus_message* copied;
- cpu = PCPU_GET(cpuid);
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: "
+ "cpu out of range!"));
+
page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
@@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy)
*
* The purpose of this routine is to determine the type of VMBUS protocol
* message to process - an event or a channel message.
- * As this is an interrupt filter routine, the function runs in a very
- * restricted envinronment. From the manpage for bus_setup_intr(9)
- *
- * In this restricted environment, care must be taken to account for all
- * races. A careful analysis of races should be done as well. It is gener-
- * ally cheaper to take an extra interrupt, for example, than to protect
- * variables with spinlocks. Read, modify, write cycles of hardware regis-
- * ters need to be carefully analyzed if other threads are accessing the
- * same registers.
*/
-static int
+static inline int
hv_vmbus_isr(void *unused)
{
int cpu;
@@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused)
void* page_addr;
cpu = PCPU_GET(cpuid);
- /* (Temporary limit) */
- KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero"));
/*
* The Windows team has advised that we check for events
@@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused)
event = (hv_vmbus_synic_event_flags*)
page_addr + HV_VMBUS_MESSAGE_SINT;
- /* Since we are a child, we only need to check bit 0 */
- if (synch_test_and_clear_bit(0, &event->flags32[0])) {
- swi_sched(event_swintr, 0);
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ /* Since we are a child, we only need to check bit 0 */
+ if (synch_test_and_clear_bit(0, &event->flags32[0])) {
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
+ }
+ } else {
+ /*
+ * On host with Win8 or above, we can directly look at
+ * the event page. If bit n is set, we have an interrupt
+ * on the channel with id n.
+ * Directly schedule the event software interrupt on
+ * current cpu.
+ */
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
}
/* Check if there are actual msgs to be process */
@@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused)
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) {
- swi_sched(msg_swintr, 0);
+ swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0);
}
return FILTER_HANDLED;
}
+#ifdef HV_DEBUG_INTR
+uint32_t hv_intr_count = 0;
+#endif
+uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+uint32_t hv_vmbus_intr_cpu[MAXCPU];
+
+void
+hv_vector_handler(struct trapframe *trap_frame)
+{
+#ifdef HV_DEBUG_INTR
+ int cpu;
+#endif
+
+ /*
+ * Disable preemption.
+ */
+ critical_enter();
+
+#ifdef HV_DEBUG_INTR
+ /*
+ * Do a little interrupt counting.
+ */
+ cpu = PCPU_GET(cpuid);
+ hv_vmbus_intr_cpu[cpu]++;
+ hv_intr_count++;
+#endif
+
+ hv_vmbus_isr(NULL);
+
+ /*
+ * Enable preemption.
+ */
+ critical_exit();
+}
+
static int
vmbus_read_ivar(
device_t dev,
@@ -316,6 +350,81 @@ vmbus_probe(device_t dev) {
return (BUS_PROBE_NOWILDCARD);
}
+#ifdef HYPERV
+extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback);
+
+/**
+ * @brief Find a free IDT slot and setup the interrupt handler.
+ */
+static int
+vmbus_vector_alloc(void)
+{
+ int vector;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as vmbus channel callback vector. We install 'hv_vmbus_callback'
+ * handler at that vector and use it to interrupt vcpus.
+ */
+ vector = APIC_SPURIOUS_INT;
+ while (--vector >= APIC_IPI_INTS) {
+ ip = &idt[vector];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+#ifdef __i386__
+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT,
+ SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#else
+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT,
+ SEL_KPL, 0);
+#endif
+
+ return (vector);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Restore the IDT slot to rsvd.
+ */
+static void
+vmbus_vector_free(int vector)
+{
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ if (vector == 0)
+ return;
+
+ KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT,
+ ("invalid vector %d", vector));
+
+ ip = &idt[vector];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback),
+ ("invalid vector %d", vector));
+
+ setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
+
+#else /* HYPERV */
+
+static int
+vmbus_vector_alloc(void)
+{
+ return(0);
+}
+
+static void
+vmbus_vector_free(int vector)
+{
+}
+
+#endif /* HYPERV */
+
/**
* @brief Main vmbus driver initialization routine.
*
@@ -331,22 +440,7 @@ vmbus_probe(device_t dev) {
static int
vmbus_bus_init(void)
{
- struct ioapic_intsrc {
- struct intsrc io_intsrc;
- u_int io_irq;
- u_int io_intpin:8;
- u_int io_vector:8;
- u_int io_cpu:8;
- u_int io_activehi:1;
- u_int io_edgetrigger:1;
- u_int io_masked:1;
- int io_bus:4;
- uint32_t io_lowreg;
- };
- int i, ret;
- unsigned int vector = 0;
- struct intsrc *isrc;
- struct ioapic_intsrc *intpin;
+ int i, j, n, ret;
if (vmbus_inited)
return (0);
@@ -361,80 +455,100 @@ vmbus_bus_init(void)
return (ret);
}
- ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr,
- NULL, SWI_CLOCK, 0, &msg_swintr);
-
- if (ret)
- goto cleanup;
-
/*
- * Message SW interrupt handler checks a per-CPU page and
- * thus the thread needs to be bound to CPU-0 - which is where
- * all interrupts are processed.
+ * Find a free IDT slot for vmbus callback.
*/
- ret = intr_event_bind(hv_msg_intr_event, 0);
-
- if (ret)
- goto cleanup1;
+ hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc();
- ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events,
- NULL, SWI_CLOCK, 0, &event_swintr);
-
- if (ret)
- goto cleanup1;
+ if (hv_vmbus_g_context.hv_cb_vector == 0) {
+ if(bootverbose)
+ printf("Error VMBUS: Cannot find free IDT slot for "
+ "vmbus callback!\n");
+ goto cleanup;
+ }
- intr_res = bus_alloc_resource(vmbus_devp,
- SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE);
+ if(bootverbose)
+ printf("VMBUS: vmbus callback vector %d\n",
+ hv_vmbus_g_context.hv_cb_vector);
- if (intr_res == NULL) {
- ret = ENOMEM; /* XXXKYS: Need a better errno */
- goto cleanup2;
+ /*
+ * Notify the hypervisor of our vector.
+ */
+ setup_args.vector = hv_vmbus_g_context.hv_cb_vector;
+
+ CPU_FOREACH(j) {
+ hv_vmbus_intr_cpu[j] = 0;
+ hv_vmbus_swintr_event_cpu[j] = 0;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.event_swintr[j] = NULL;
+ hv_vmbus_g_context.msg_swintr[j] = NULL;
+
+ for (i = 0; i < 2; i++)
+ setup_args.page_buffers[2 * j + i] = NULL;
}
/*
- * Setup interrupt filter handler
+ * Per cpu setup.
*/
- ret = bus_setup_intr(vmbus_devp, intr_res,
- INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL,
- NULL, &vmbus_cookiep);
-
- if (ret != 0)
- goto cleanup3;
-
- ret = bus_bind_intr(vmbus_devp, intr_res, 0);
- if (ret != 0)
- goto cleanup4;
-
- isrc = intr_lookup_source(vmbus_irq);
- if ((isrc == NULL) || (isrc->is_event == NULL)) {
- ret = EINVAL;
- goto cleanup4;
- }
+ CPU_FOREACH(j) {
+ /*
+ * Setup software interrupt thread and handler for msg handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j],
+ "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0,
+ &hv_vmbus_g_context.msg_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup msg swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /* vector = isrc->is_event->ie_vector; */
- intpin = (struct ioapic_intsrc *)isrc;
- vector = intpin->io_vector;
+ /*
+ * Bind the swi thread to the cpu.
+ */
+ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j],
+ j);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to bind msg swi thread "
+ "to cpu %d\n", j);
+ goto cleanup1;
+ }
- if(bootverbose)
- printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector);
+ /*
+ * Setup software interrupt thread and handler for
+ * event handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j],
+ "hv_event", hv_vmbus_on_events, (void *)(long)j,
+ SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup event swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /**
- * Notify the hypervisor of our irq.
- */
- setup_args.vector = vector;
- for(i = 0; i < 2; i++) {
- setup_args.page_buffers[i] =
+ /*
+ * Prepare the per cpu msg and event pages to be called on each cpu.
+ */
+ for(i = 0; i < 2; i++) {
+ setup_args.page_buffers[2 * j + i] =
malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (setup_args.page_buffers[i] == NULL) {
- KASSERT(setup_args.page_buffers[i] != NULL,
+ if (setup_args.page_buffers[2 * j + i] == NULL) {
+ KASSERT(setup_args.page_buffers[2 * j + i] != NULL,
("Error VMBUS: malloc failed!"));
- if (i > 0)
- free(setup_args.page_buffers[0], M_DEVBUF);
- goto cleanup4;
+ goto cleanup1;
+ }
}
}
- /* only CPU #0 supported at this time */
+ if (bootverbose)
+ printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n",
+ smp_started);
+
smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args);
/*
@@ -443,26 +557,32 @@ vmbus_bus_init(void)
ret = hv_vmbus_connect();
if (ret != 0)
- goto cleanup4;
+ goto cleanup1;
hv_vmbus_request_channel_offers();
return (ret);
- cleanup4:
-
+ cleanup1:
/*
- * remove swi, bus and intr resource
+ * Free pages alloc'ed
*/
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
+ for (n = 0; n < 2 * MAXCPU; n++)
+ if (setup_args.page_buffers[n] != NULL)
+ free(setup_args.page_buffers[n], M_DEVBUF);
- cleanup3:
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
-
- cleanup2:
- swi_remove(event_swintr);
+ /*
+ * remove swi and vmbus callback vector;
+ */
+ CPU_FOREACH(j) {
+ if (hv_vmbus_g_context.msg_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[j]);
+ if (hv_vmbus_g_context.event_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[j]);
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ }
- cleanup1:
- swi_remove(msg_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
cleanup:
hv_vmbus_cleanup();
@@ -515,20 +635,24 @@ vmbus_bus_exit(void)
smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL);
- for(i = 0; i < 2; i++) {
+ for(i = 0; i < 2 * MAXCPU; i++) {
if (setup_args.page_buffers[i] != 0)
free(setup_args.page_buffers[i], M_DEVBUF);
}
hv_vmbus_cleanup();
- /* remove swi, bus and intr resource */
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
-
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
+ /* remove swi */
+ CPU_FOREACH(i) {
+ if (hv_vmbus_g_context.msg_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[i]);
+ if (hv_vmbus_g_context.event_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[i]);
+ hv_vmbus_g_context.hv_msg_intr_event[i] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[i] = NULL;
+ }
- swi_remove(msg_swintr);
- swi_remove(event_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
return;
}
@@ -603,6 +727,6 @@ devclass_t vmbus_devclass;
DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0);
MODULE_VERSION(vmbus,1);
-/* TODO: We want to be earlier than SI_SUB_VFS */
-SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL);
+/* We want to be started after SMP is initialized */
+SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL);
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 6bc875d..faa6dec 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -181,49 +181,30 @@ enum {
#define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t)
-/*
- * Connection identifier type
- */
-typedef union {
- uint32_t as_uint32_t;
- struct {
- uint32_t id:24;
- uint32_t reserved:8;
- } u;
-
-} __packed hv_vmbus_connection_id;
-
-/*
- * Definition of the hv_vmbus_signal_event hypercall input structure
- */
-typedef struct {
- hv_vmbus_connection_id connection_id;
- uint16_t flag_number;
- uint16_t rsvd_z;
-} __packed hv_vmbus_input_signal_event;
-
-typedef struct {
- uint64_t align8;
- hv_vmbus_input_signal_event event;
-} __packed hv_vmbus_input_signal_event_buffer;
-
typedef struct {
uint64_t guest_id;
void* hypercall_page;
hv_bool_uint8_t syn_ic_initialized;
+
+ hv_vmbus_handle syn_ic_msg_page[MAXCPU];
+ hv_vmbus_handle syn_ic_event_page[MAXCPU];
/*
- * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall.
- * The input param is immutable in our usage and
- * must be dynamic mem (vs stack or global).
+ * For FreeBSD cpuid to Hyper-V vcpuid mapping.
*/
- hv_vmbus_input_signal_event_buffer *signal_event_buffer;
+ uint32_t hv_vcpu_index[MAXCPU];
/*
- * 8-bytes aligned of the buffer above
+ * Each cpu has its own software interrupt handler for channel
+ * event and msg handling.
*/
- hv_vmbus_input_signal_event *signal_event_param;
-
- hv_vmbus_handle syn_ic_msg_page[MAXCPU];
- hv_vmbus_handle syn_ic_event_page[MAXCPU];
+ struct intr_event *hv_event_intr_event[MAXCPU];
+ struct intr_event *hv_msg_intr_event[MAXCPU];
+ void *event_swintr[MAXCPU];
+ void *msg_swintr[MAXCPU];
+ /*
+ * Host uses this vector to interrupt the guest for vmbus channel
+ * event and msg.
+ */
+ unsigned int hv_cb_vector;
} hv_vmbus_context;
/*
@@ -368,7 +349,8 @@ typedef struct {
TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor;
struct mtx channel_msg_lock;
/**
- * List of channels
+ * List of primary channels. Sub channels will be linked
+ * under their primary channel.
*/
TAILQ_HEAD(, hv_vmbus_channel) channel_anchor;
struct mtx channel_lock;
@@ -560,6 +542,8 @@ typedef union {
uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT];
} hv_vmbus_synic_event_flags;
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX (0x40000002)
/*
* Define synthetic interrupt controller model specific registers
@@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup(
int hv_ring_buffer_write(
hv_vmbus_ring_buffer_info *ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buff_count);
+ uint32_t sg_buff_count,
+ boolean_t *need_sig);
int hv_ring_buffer_peek(
hv_vmbus_ring_buffer_info *ring_info,
@@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info(
hv_vmbus_ring_buffer_info *ring_info,
char *prefix);
+void hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info *ring_info);
+
+uint32_t hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info *ring_info);
+
hv_vmbus_channel* hv_vmbus_allocate_channel(void);
void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel);
void hv_vmbus_on_channel_message(void *context);
@@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc(
void *payload,
size_t payload_size);
-uint16_t hv_vmbus_signal_event(void);
+uint16_t hv_vmbus_signal_event(void *con_id);
void hv_vmbus_synic_init(void *irq_arg);
void hv_vmbus_synic_cleanup(void *arg);
int hv_vmbus_query_hypervisor_presence(void);
@@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id);
int hv_vmbus_connect(void);
int hv_vmbus_disconnect(void);
int hv_vmbus_post_message(void *buffer, size_t buf_size);
-int hv_vmbus_set_event(uint32_t child_rel_id);
+int hv_vmbus_set_event(hv_vmbus_channel *channel);
void hv_vmbus_on_events(void *);
@@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id(
typedef struct {
unsigned int vector;
- void *page_buffers[2];
+ void *page_buffers[2 * MAXCPU];
} hv_setup_args;
#endif /* __HYPERV_PRIV_H__ */
OpenPOWER on IntegriCloud