Diffstat (limited to 'sys/dev/hyperv/vmbus')
-rw-r--r--  sys/dev/hyperv/vmbus/hv_channel.c            |  98
-rw-r--r--  sys/dev/hyperv/vmbus/hv_channel_mgmt.c       | 268
-rw-r--r--  sys/dev/hyperv/vmbus/hv_connection.c         | 289
-rw-r--r--  sys/dev/hyperv/vmbus/hv_hv.c                 |  66
-rw-r--r--  sys/dev/hyperv/vmbus/hv_ring_buffer.c        |  78
-rw-r--r--  sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c  | 364
-rw-r--r--  sys/dev/hyperv/vmbus/hv_vmbus_priv.h         |  71
7 files changed, 870 insertions(+), 364 deletions(-)
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c
index 103260a..94137fb 100644
--- a/sys/dev/hyperv/vmbus/hv_channel.c
+++ b/sys/dev/hyperv/vmbus/hv_channel.c
@@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel)
(uint32_t *)&monitor_page->
trigger_group[channel->monitor_group].u.pending);
} else {
- hv_vmbus_set_event(channel->offer_msg.child_rel_id);
+ hv_vmbus_set_event(channel);
}
}
@@ -99,6 +99,18 @@ hv_vmbus_channel_open(
hv_vmbus_channel_open_channel* open_msg;
hv_vmbus_channel_msg_info* open_info;
+ mtx_lock(&new_channel->sc_lock);
+ if (new_channel->state == HV_CHANNEL_OPEN_STATE) {
+ new_channel->state = HV_CHANNEL_OPENING_STATE;
+ } else {
+ mtx_unlock(&new_channel->sc_lock);
+ if(bootverbose)
+ printf("VMBUS: Trying to open channel <%p> which in "
+ "%d state.\n", new_channel, new_channel->state);
+ return (EINVAL);
+ }
+ mtx_unlock(&new_channel->sc_lock);
+
new_channel->on_channel_callback = pfn_on_channel_callback;
new_channel->channel_callback_context = context;
@@ -162,7 +174,7 @@ hv_vmbus_channel_open(
new_channel->ring_buffer_gpadl_handle;
open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size
>> PAGE_SHIFT;
- open_msg->server_context_area_gpadl_handle = 0;
+ open_msg->target_vcpu = new_channel->target_vcpu;
if (user_data_len)
memcpy(open_msg->user_data, user_data, user_data_len);
@@ -182,10 +194,14 @@ hv_vmbus_channel_open(
ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */
- if (ret)
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: channel <%p> open timeout.\n", new_channel);
goto cleanup;
+ }
if (open_info->response.open_result.status == 0) {
+ new_channel->state = HV_CHANNEL_OPENED_STATE;
if(bootverbose)
printf("VMBUS: channel <%p> open success.\n", new_channel);
} else {
@@ -497,16 +513,20 @@ cleanup:
return (ret);
}
-/**
- * @brief Close the specified channel
- */
-void
-hv_vmbus_channel_close(hv_vmbus_channel *channel)
+static void
+hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
{
int ret = 0;
hv_vmbus_channel_close_channel* msg;
hv_vmbus_channel_msg_info* info;
+ channel->state = HV_CHANNEL_OPEN_STATE;
+ channel->sc_creation_callback = NULL;
+
+ /*
+ * Grab the lock to prevent a race condition when a packet is received
+ * while the driver is being unloaded.
+ */
mtx_lock(&channel->inbound_lock);
channel->on_channel_callback = NULL;
mtx_unlock(&channel->inbound_lock);
@@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel)
M_DEVBUF);
free(info, M_DEVBUF);
+}
+
+/**
+ * @brief Close the specified channel
+ */
+void
+hv_vmbus_channel_close(hv_vmbus_channel *channel)
+{
+ hv_vmbus_channel* sub_channel;
+
+ if (channel->primary_channel != NULL) {
+ /*
+ * We only close multi-channels when the primary is
+ * closed.
+ */
+ return;
+ }
/*
- * If we are closing the channel during an error path in
- * opening the channel, don't free the channel
- * since the caller will free the channel
+ * Close all multi-channels first.
*/
- if (channel->state == HV_CHANNEL_OPEN_STATE) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
-
- hv_vmbus_free_vmbus_channel(channel);
+ TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor,
+ sc_list_entry) {
+ if (sub_channel->state != HV_CHANNEL_OPENED_STATE)
+ continue;
+ hv_vmbus_channel_close_internal(sub_channel);
}
-
+ /*
+ * Then close the primary channel.
+ */
+ hv_vmbus_channel_close_internal(channel);
}
/**
@@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet(
uint32_t packet_len;
uint64_t aligned_data;
uint32_t packet_len_aligned;
+ boolean_t need_sig;
hv_vmbus_sg_buffer_list buffer_list[3];
packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len;
@@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0
- && !hv_vmbus_get_ring_buffer_interrupt_mask(
- &channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer(
int ret = 0;
int i = 0;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packetLen_aligned;
hv_vmbus_sg_buffer_list buffer_list[3];
@@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer(
buffer_list[2].data = &alignedData;
buffer_list[2].length = packetLen_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer(
int ret = 0;
uint32_t desc_size;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packet_len_aligned;
uint32_t pfn_count;
@@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index 011e305..d13ece5 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/mbuf.h>
@@ -50,6 +53,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_process_offer(void *context);
+struct hv_vmbus_channel*
+ vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary);
/**
* Channel message dispatch table
@@ -233,6 +238,9 @@ hv_vmbus_allocate_channel(void)
return (NULL);
mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF);
+ mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF);
+
+ TAILQ_INIT(&channel->sc_list_anchor);
channel->control_work_queue = hv_work_queue_create("control");
@@ -262,6 +270,7 @@ ReleaseVmbusChannel(void *context)
void
hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
{
+ mtx_destroy(&channel->sc_lock);
mtx_destroy(&channel->inbound_lock);
/*
* We have to release the channel's workqueue/thread in
@@ -279,10 +288,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
static void
vmbus_channel_process_offer(void *context)
{
- int ret;
hv_vmbus_channel* new_channel;
boolean_t f_new;
hv_vmbus_channel* channel;
+ int ret;
new_channel = (hv_vmbus_channel*) context;
f_new = TRUE;
@@ -291,38 +300,76 @@ vmbus_channel_process_offer(void *context)
/*
* Make sure this is a new offer
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor,
list_entry)
{
- if (!memcmp(
- &channel->offer_msg.offer.interface_type,
- &new_channel->offer_msg.offer.interface_type,
- sizeof(hv_guid))
- && !memcmp(
- &channel->offer_msg.offer.interface_instance,
+ if (memcmp(&channel->offer_msg.offer.interface_type,
+ &new_channel->offer_msg.offer.interface_type,
+ sizeof(hv_guid)) == 0 &&
+ memcmp(&channel->offer_msg.offer.interface_instance,
&new_channel->offer_msg.offer.interface_instance,
- sizeof(hv_guid))) {
- f_new = FALSE;
- break;
- }
+ sizeof(hv_guid)) == 0) {
+ f_new = FALSE;
+ break;
+ }
}
if (f_new) {
- /* Insert at tail */
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
+ /* Insert at tail */
+ TAILQ_INSERT_TAIL(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ /*XXX add new channel to percpu_list */
if (!f_new) {
+ /*
+ * Check if this is a sub channel.
+ */
+ if (new_channel->offer_msg.offer.sub_channel_index != 0) {
+ /*
+ * It is a sub channel offer, process it.
+ */
+ new_channel->primary_channel = channel;
+ mtx_lock(&channel->sc_lock);
+ TAILQ_INSERT_TAIL(
+ &channel->sc_list_anchor,
+ new_channel,
+ sc_list_entry);
+ mtx_unlock(&channel->sc_lock);
+
+ /* Insert new channel into channel_anchor. */
+ printf("Storvsc get multi-channel offer, rel=%u.\n",
+ new_channel->offer_msg.child_rel_id);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
+ new_channel, list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ if(bootverbose)
+ printf("VMBUS: new multi-channel offer <%p>.\n",
+ new_channel);
+
+ /*XXX add it to percpu_list */
+
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+ if (channel->sc_creation_callback != NULL) {
+ channel->sc_creation_callback(new_channel);
+ }
+ return;
+ }
+
hv_vmbus_free_vmbus_channel(new_channel);
return;
}
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+
/*
* Start the process of binding this offer to the driver
* (We need to set the device field before calling
@@ -333,35 +380,86 @@ vmbus_channel_process_offer(void *context)
new_channel->offer_msg.offer.interface_instance, new_channel);
/*
- * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below
- * but in the "open" channel request. The ret != 0 logic below
- * doesn't take into account that a channel
- * may have been opened successfully
- */
-
- /*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
*/
ret = hv_vmbus_child_device_register(new_channel->device);
if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
- hv_vmbus_free_vmbus_channel(new_channel);
- } else {
- /*
- * This state is used to indicate a successful open
- * so that when we do close the channel normally,
- * we can clean up properly
- */
- new_channel->state = HV_CHANNEL_OPEN_STATE;
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+ hv_vmbus_free_vmbus_channel(new_channel);
+ }
+}
+
+/**
+ * Array of device guids that are performance critical. We try to distribute
+ * the interrupt load for these devices across all online cpus.
+ */
+static const hv_guid high_perf_devices[] = {
+ {HV_NIC_GUID, },
+ {HV_IDE_GUID, },
+ {HV_SCSI_GUID, },
+};
+
+enum {
+ PERF_CHN_NIC = 0,
+ PERF_CHN_IDE,
+ PERF_CHN_SCSI,
+ MAX_PERF_CHN,
+};
+
+/*
+ * We use this static number to distribute the channel interrupt load.
+ */
+static uint32_t next_vcpu;
+
+/**
+ * Starting with Win8, we can statically distribute the incoming
+ * channel interrupt load by binding a channel to VCPU. We
+ * implement here a simple round robin scheme for distributing
+ * the interrupt load.
+ * We will bind channels that are not performance critical to cpu 0 and
+ * performance critical channels (IDE, SCSI and Network) will be uniformly
+ * distributed across all available CPUs.
+ */
+static void
+vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid)
+{
+ uint32_t current_cpu;
+ int i;
+ boolean_t is_perf_channel = FALSE;
+
+ for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) {
+ if (memcmp(guid->data, high_perf_devices[i].data,
+ sizeof(hv_guid)) == 0) {
+ is_perf_channel = TRUE;
+ break;
+ }
+ }
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) ||
+ (!is_perf_channel)) {
+ /* Host's view of guest cpu */
+ channel->target_vcpu = 0;
+ /* Guest's own view of cpu */
+ channel->target_cpu = 0;
+ return;
}
+ /* mp_ncpus should have the number of cpus currently online */
+ current_cpu = (++next_vcpu % mp_ncpus);
+ channel->target_cpu = current_cpu;
+ channel->target_vcpu =
+ hv_vmbus_g_context.hv_vcpu_index[current_cpu];
+ if (bootverbose)
+ printf("VMBUS: Total online cpus %d, assign perf channel %d "
+ "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu,
+ current_cpu);
}
/**
@@ -391,6 +489,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
if (new_channel == NULL)
return;
+ /*
+ * By default we setup state to enable batched
+ * reading. A specific service can choose to
+ * disable this prior to opening the channel.
+ */
+ new_channel->batched_reading = TRUE;
+
+ new_channel->signal_event_param =
+ (hv_vmbus_input_signal_event *)
+ (HV_ALIGN_UP((unsigned long)
+ &new_channel->signal_event_buffer,
+ HV_HYPERCALL_PARAM_ALIGN));
+
+ new_channel->signal_event_param->connection_id.as_uint32_t = 0;
+ new_channel->signal_event_param->connection_id.u.id =
+ HV_VMBUS_EVENT_CONNECTION_ID;
+ new_channel->signal_event_param->flag_number = 0;
+ new_channel->signal_event_param->rsvd_z = 0;
+
+ if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) {
+ new_channel->is_dedicated_interrupt =
+ (offer->is_dedicated_interrupt != 0);
+ new_channel->signal_event_param->connection_id.u.id =
+ offer->connection_id;
+ }
+
+ /*
+ * Bind the channel to a chosen cpu.
+ */
+ vmbus_channel_select_cpu(new_channel,
+ &offer->offer.interface_type);
+
memcpy(&new_channel->offer_msg, offer,
sizeof(hv_vmbus_channel_offer_channel));
new_channel->monitor_group = (uint8_t) offer->monitor_id / 32;
@@ -666,7 +796,7 @@ hv_vmbus_release_unattached_channels(void)
{
hv_vmbus_channel *channel;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) {
channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor);
@@ -676,5 +806,61 @@ hv_vmbus_release_unattached_channels(void)
hv_vmbus_child_device_unregister(channel->device);
hv_vmbus_free_vmbus_channel(channel);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+}
+
+/**
+ * @brief Select the best outgoing channel
+ *
+ * The channel whose vcpu binding is closest to the current vcpu will
+ * be selected.
+ * If there are no multi-channels, the primary channel is always selected.
+ *
+ * @param primary - primary channel
+ */
+struct hv_vmbus_channel *
+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary)
+{
+ hv_vmbus_channel *new_channel = NULL;
+ hv_vmbus_channel *outgoing_channel = primary;
+ int old_cpu_distance = 0;
+ int new_cpu_distance = 0;
+ int cur_vcpu = 0;
+ int smp_pro_id = PCPU_GET(cpuid);
+
+ if (TAILQ_EMPTY(&primary->sc_list_anchor)) {
+ return outgoing_channel;
+ }
+
+ if (smp_pro_id >= MAXCPU) {
+ return outgoing_channel;
+ }
+
+ cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id];
+
+ TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) {
+ if (new_channel->state != HV_CHANNEL_OPENED_STATE){
+ continue;
+ }
+
+ if (new_channel->target_vcpu == cur_vcpu){
+ return new_channel;
+ }
+
+ old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ?
+ (outgoing_channel->target_vcpu - cur_vcpu) :
+ (cur_vcpu - outgoing_channel->target_vcpu));
+
+ new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ?
+ (new_channel->target_vcpu - cur_vcpu) :
+ (cur_vcpu - new_channel->target_vcpu));
+
+ if (old_cpu_distance < new_cpu_distance) {
+ continue;
+ }
+
+ outgoing_channel = new_channel;
+ }
+
+ return(outgoing_channel);
}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index c8e0b48..cc83037 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
@@ -45,14 +48,113 @@ hv_vmbus_connection hv_vmbus_g_connection =
{ .connect_state = HV_DISCONNECTED,
.next_gpadl_handle = 0xE1E10, };
+uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008;
+
+static uint32_t
+hv_vmbus_get_next_version(uint32_t current_ver)
+{
+ switch (current_ver) {
+ case (HV_VMBUS_VERSION_WIN7):
+ return(HV_VMBUS_VERSION_WS2008);
+
+ case (HV_VMBUS_VERSION_WIN8):
+ return(HV_VMBUS_VERSION_WIN7);
+
+ case (HV_VMBUS_VERSION_WIN8_1):
+ return(HV_VMBUS_VERSION_WIN8);
+
+ case (HV_VMBUS_VERSION_WS2008):
+ default:
+ return(HV_VMBUS_VERSION_INVALID);
+ }
+}
+
+/**
+ * Negotiate the highest supported hypervisor version.
+ */
+static int
+hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
+ uint32_t version)
+{
+ int ret = 0;
+ hv_vmbus_channel_initiate_contact *msg;
+
+ sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
+ msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
+
+ msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
+ msg->vmbus_version_requested = version;
+
+ msg->interrupt_page = hv_get_phys_addr(
+ hv_vmbus_g_connection.interrupt_page);
+
+ msg->monitor_page_1 = hv_get_phys_addr(
+ hv_vmbus_g_connection.monitor_pages);
+
+ msg->monitor_page_2 =
+ hv_get_phys_addr(
+ ((uint8_t *) hv_vmbus_g_connection.monitor_pages
+ + PAGE_SIZE));
+
+ /**
+ * Add to list before we send the request since we may receive the
+ * response before returning from this routine
+ */
+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+ TAILQ_INSERT_TAIL(
+ &hv_vmbus_g_connection.channel_msg_anchor,
+ msg_info,
+ msg_list_entry);
+
+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+ ret = hv_vmbus_post_message(
+ msg,
+ sizeof(hv_vmbus_channel_initiate_contact));
+
+ if (ret != 0) {
+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_msg_anchor,
+ msg_info,
+ msg_list_entry);
+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ return (ret);
+ }
+
+ /**
+ * Wait for the connection response
+ */
+ ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+
+ mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_msg_anchor,
+ msg_info,
+ msg_list_entry);
+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+ /**
+ * Check if successful
+ */
+ if (msg_info->response.version_response.version_supported) {
+ hv_vmbus_g_connection.connect_state = HV_CONNECTED;
+ } else {
+ ret = ECONNREFUSED;
+ }
+
+ return (ret);
+}
+
/**
* Send a connect request on the partition service connection
*/
int
hv_vmbus_connect(void) {
int ret = 0;
+ uint32_t version;
hv_vmbus_channel_msg_info* msg_info = NULL;
- hv_vmbus_channel_initiate_contact* msg;
/**
* Make sure we are not connecting or connected
@@ -74,7 +176,7 @@ hv_vmbus_connect(void) {
TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor);
mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel",
- NULL, MTX_SPIN);
+ NULL, MTX_DEF);
/**
* Setup the vmbus event connection for channel interrupt abstraction
@@ -130,71 +232,30 @@ hv_vmbus_connect(void) {
goto cleanup;
}
- sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
- msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
-
- msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
- msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER;
-
- msg->interrupt_page = hv_get_phys_addr(
- hv_vmbus_g_connection.interrupt_page);
-
- msg->monitor_page_1 = hv_get_phys_addr(
- hv_vmbus_g_connection.monitor_pages);
-
- msg->monitor_page_2 =
- hv_get_phys_addr(
- ((uint8_t *) hv_vmbus_g_connection.monitor_pages
- + PAGE_SIZE));
-
- /**
- * Add to list before we send the request since we may receive the
- * response before returning from this routine
+ /*
+ * Find the highest vmbus version number we can support.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
-
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- ret = hv_vmbus_post_message(
- msg,
- sizeof(hv_vmbus_channel_initiate_contact));
-
- if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- goto cleanup;
- }
+ version = HV_VMBUS_VERSION_CURRENT;
+
+ do {
+ ret = hv_vmbus_negotiate_version(msg_info, version);
+ if (ret == EWOULDBLOCK) {
+ /*
+ * We timed out.
+ */
+ goto cleanup;
+ }
- /**
- * Wait for the connection response
- */
- ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+ if (hv_vmbus_g_connection.connect_state == HV_CONNECTED)
+ break;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ version = hv_vmbus_get_next_version(version);
+ } while (version != HV_VMBUS_VERSION_INVALID);
- /**
- * Check if successful
- */
- if (msg_info->response.version_response.version_supported) {
- hv_vmbus_g_connection.connect_state = HV_CONNECTED;
- } else {
- ret = ECONNREFUSED;
- goto cleanup;
- }
+ hv_vmbus_protocal_version = version;
+ if (bootverbose)
+ printf("VMBUS: Portocal Version: %d.%d\n",
+ version >> 16, version & 0xFFFF);
sema_destroy(&msg_info->wait_sema);
free(msg_info, M_DEVBUF);
@@ -286,7 +347,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
* and channels are accessed without the need to take this lock or search
* the list.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel,
&hv_vmbus_g_connection.channel_anchor, list_entry) {
@@ -295,7 +356,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
break;
}
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
return (foundChannel);
}
@@ -306,7 +367,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
static void
VmbusProcessChannelEvent(uint32_t relid)
{
+ void* arg;
+ uint32_t bytes_to_read;
hv_vmbus_channel* channel;
+ boolean_t is_batched_reading;
/**
* Find the channel based on this relid and invokes
@@ -327,31 +391,98 @@ VmbusProcessChannelEvent(uint32_t relid)
* callback to NULL. This closes the window.
*/
- mtx_lock(&channel->inbound_lock);
+ /*
+ * Disable the lock due to the newly added WITNESS check in r277723.
+ * Will seek another way to avoid the race condition.
+ * -- whu
+ */
+ // mtx_lock(&channel->inbound_lock);
if (channel->on_channel_callback != NULL) {
- channel->on_channel_callback(channel->channel_callback_context);
+ arg = channel->channel_callback_context;
+ is_batched_reading = channel->batched_reading;
+ /*
+ * Optimize host to guest signaling by ensuring:
+ * 1. While reading the channel, we disable interrupts from
+ * host.
+ * 2. Ensure that we process all posted messages from the host
+ * before returning from this callback.
+ * 3. Once we return, enable signaling from the host. Once this
+ * state is set we check to see if additional packets are
+ * available to read. In this case we repeat the process.
+ */
+ do {
+ if (is_batched_reading)
+ hv_ring_buffer_read_begin(&channel->inbound);
+
+ channel->on_channel_callback(arg);
+
+ if (is_batched_reading)
+ bytes_to_read =
+ hv_ring_buffer_read_end(&channel->inbound);
+ else
+ bytes_to_read = 0;
+ } while (is_batched_reading && (bytes_to_read != 0));
}
- mtx_unlock(&channel->inbound_lock);
+ // mtx_unlock(&channel->inbound_lock);
}
+#ifdef HV_DEBUG_INTR
+extern uint32_t hv_intr_count;
+extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+extern uint32_t hv_vmbus_intr_cpu[MAXCPU];
+#endif
+
/**
* Handler for events
*/
void
hv_vmbus_on_events(void *arg)
{
- int dword;
int bit;
+ int cpu;
+ int dword;
+ void *page_addr;
+ uint32_t* recv_interrupt_page = NULL;
int rel_id;
- int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ int maxdword;
+ hv_vmbus_synic_event_flags *event;
/* int maxdword = PAGE_SIZE >> 3; */
- /*
- * receive size is 1/2 page and divide that by 4 bytes
- */
-
- uint32_t* recv_interrupt_page =
- hv_vmbus_g_connection.recv_interrupt_page;
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: "
+ "cpu out of range!"));
+
+#ifdef HV_DEBUG_INTR
+ int i;
+ hv_vmbus_swintr_event_cpu[cpu]++;
+ if (hv_intr_count % 10000 == 0) {
+ printf("VMBUS: Total interrupt %d\n", hv_intr_count);
+ for (i = 0; i < mp_ncpus; i++)
+ printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n",
+ i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]);
+ }
+#endif
+
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ /*
+ * receive size is 1/2 page and divide that by 4 bytes
+ */
+ recv_interrupt_page =
+ hv_vmbus_g_connection.recv_interrupt_page;
+ } else {
+ /*
+ * On Host with Win8 or above, the event page can be
+ * checked directly to get the id of the channel
+ * that has the pending interrupt.
+ */
+ maxdword = HV_EVENT_FLAGS_DWORD_COUNT;
+ page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
+ event = (hv_vmbus_synic_event_flags *)
+ page_addr + HV_VMBUS_MESSAGE_SINT;
+ recv_interrupt_page = event->flags32;
+ }
/*
* Check events
@@ -416,16 +547,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) {
* Send an event notification to the parent
*/
int
-hv_vmbus_set_event(uint32_t child_rel_id) {
+hv_vmbus_set_event(hv_vmbus_channel *channel) {
int ret = 0;
+ uint32_t child_rel_id = channel->offer_msg.child_rel_id;
/* Each uint32_t represents 32 channels */
synch_set_bit(child_rel_id & 31,
(((uint32_t *)hv_vmbus_g_connection.send_interrupt_page
+ (child_rel_id >> 5))));
- ret = hv_vmbus_signal_event();
+ ret = hv_vmbus_signal_event(channel->signal_event_param);
return (ret);
}
-
diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c
index 80a1f42..84e2a5e 100644
--- a/sys/dev/hyperv/vmbus/hv_hv.c
+++ b/sys/dev/hyperv/vmbus/hv_hv.c
@@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax,
hv_vmbus_context hv_vmbus_g_context = {
.syn_ic_initialized = FALSE,
.hypercall_page = NULL,
- .signal_event_param = NULL,
- .signal_event_buffer = NULL,
};
static struct timecounter hv_timecounter = {
@@ -256,28 +254,6 @@ hv_vmbus_init(void)
hv_vmbus_g_context.hypercall_page = virt_addr;
- /*
- * Setup the global signal event param for the signal event hypercall
- */
- hv_vmbus_g_context.signal_event_buffer =
- malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF,
- M_ZERO | M_NOWAIT);
- KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL,
- ("Error VMBUS: Failed to allocate signal_event_buffer\n"));
- if (hv_vmbus_g_context.signal_event_buffer == NULL)
- goto cleanup;
-
- hv_vmbus_g_context.signal_event_param =
- (hv_vmbus_input_signal_event*)
- (HV_ALIGN_UP((unsigned long)
- hv_vmbus_g_context.signal_event_buffer,
- HV_HYPERCALL_PARAM_ALIGN));
- hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0;
- hv_vmbus_g_context.signal_event_param->connection_id.u.id =
- HV_VMBUS_EVENT_CONNECTION_ID;
- hv_vmbus_g_context.signal_event_param->flag_number = 0;
- hv_vmbus_g_context.signal_event_param->rsvd_z = 0;
-
tc_init(&hv_timecounter); /* register virtual timecount */
return (0);
@@ -303,12 +279,6 @@ hv_vmbus_cleanup(void)
{
hv_vmbus_x64_msr_hypercall_contents hypercall_msr;
- if (hv_vmbus_g_context.signal_event_buffer != NULL) {
- free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF);
- hv_vmbus_g_context.signal_event_buffer = NULL;
- hv_vmbus_g_context.signal_event_param = NULL;
- }
-
if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) {
if (hv_vmbus_g_context.hypercall_page != NULL) {
hypercall_msr.as_uint64_t = 0;
@@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc(
* event IPC. (This involves a hypercall.)
*/
hv_vmbus_status
-hv_vmbus_signal_event()
+hv_vmbus_signal_event(void *con_id)
{
hv_vmbus_status status;
status = hv_vmbus_do_hypercall(
HV_CALL_SIGNAL_EVENT,
- hv_vmbus_g_context.signal_event_param,
+ con_id,
0) & 0xFFFF;
return (status);
@@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg)
{
int cpu;
+ uint64_t hv_vcpu_index;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
hv_vmbus_synic_scontrol sctrl;
@@ -403,23 +374,14 @@ hv_vmbus_synic_init(void *arg)
return;
/*
- * KYS: Looks like we can only initialize on cpu0; don't we support
- * SMP guests?
- *
- * TODO: Need to add SMP support for FreeBSD V9
- */
-
- if (cpu != 0)
- return;
-
- /*
* TODO: Check the version
*/
version = rdmsr(HV_X64_MSR_SVERSION);
-
- hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0];
- hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1];
+ hv_vmbus_g_context.syn_ic_msg_page[cpu] =
+ setup_args->page_buffers[2 * cpu];
+ hv_vmbus_g_context.syn_ic_event_page[cpu] =
+ setup_args->page_buffers[2 * cpu + 1];
/*
* Setup the Synic's message page
@@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg)
wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
/*HV_SHARED_SINT_IDT_VECTOR + 0x20; */
+ shared_sint.as_uint64_t = 0;
shared_sint.u.vector = setup_args->vector;
shared_sint.u.masked = FALSE;
- shared_sint.u.auto_eoi = FALSE;
+ shared_sint.u.auto_eoi = TRUE;
wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
shared_sint.as_uint64_t);
@@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg)
hv_vmbus_g_context.syn_ic_initialized = TRUE;
+ /*
+ * Set up the cpuid mapping from Hyper-V to FreeBSD.
+ * The array is indexed using FreeBSD cpuid.
+ */
+ hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX);
+ hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index;
+
return;
}
@@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg)
hv_vmbus_synic_sint shared_sint;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
- int cpu = PCPU_GET(cpuid);
if (!hv_vmbus_g_context.syn_ic_initialized)
return;
- if (cpu != 0)
- return; /* TODO: XXXKYS: SMP? */
-
shared_sint.as_uint64_t = rdmsr(
HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT);
diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
index f7c1965..0e51ef7 100644
--- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c
+++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
@@ -26,6 +26,8 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
@@ -144,6 +146,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info)
return (uint64_t) ring_info->ring_buffer->write_index << 32;
}
+void
+hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ ring_info->ring_buffer->interrupt_mask = 1;
+ mb();
+}
+
+uint32_t
+hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ uint32_t read, write;
+
+ ring_info->ring_buffer->interrupt_mask = 0;
+ mb();
+
+ /*
+ * Now check to see if the ring buffer is still empty.
+ * If it is not, we raced and we need to process new
+ * incoming messages.
+ */
+ get_ring_buffer_avail_bytes(ring_info, &read, &write);
+
+ return (read);
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to
+ * be signaled. Here is the details of this protocol:
+ *
+ * 1. The host guarantees that while it is draining the
+ * ring buffer, it will set the interrupt_mask to
+ * indicate it does not need to be interrupted when
+ * new data is placed.
+ *
+ * 2. The host guarantees that it will completely drain
+ * the ring buffer before exiting the read loop. Further,
+ * once the ring buffer is empty, it will clear the
+ * interrupt_mask and re-check to see if new data has
+ * arrived.
+ */
+static boolean_t
+hv_ring_buffer_needsig_on_write(
+ uint32_t old_write_location,
+ hv_vmbus_ring_buffer_info* rbi)
+{
+ mb();
+ if (rbi->ring_buffer->interrupt_mask)
+ return (FALSE);
+
+ /* Read memory barrier */
+ rmb();
+ /*
+ * This is the only case we need to signal when the
+ * ring transitions from being empty to non-empty.
+ */
+ if (old_write_location == rbi->ring_buffer->read_index)
+ return (TRUE);
+
+ return (FALSE);
+}
+
static uint32_t copy_to_ring_buffer(
hv_vmbus_ring_buffer_info* ring_info,
uint32_t start_write_offset,
@@ -204,11 +269,13 @@ int
hv_ring_buffer_write(
hv_vmbus_ring_buffer_info* out_ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buffer_count)
+ uint32_t sg_buffer_count,
+ boolean_t *need_sig)
{
int i = 0;
uint32_t byte_avail_to_write;
uint32_t byte_avail_to_read;
+ uint32_t old_write_location;
uint32_t total_bytes_to_write = 0;
volatile uint32_t next_write_location;
@@ -242,6 +309,8 @@ hv_ring_buffer_write(
*/
next_write_location = get_next_write_location(out_ring_info);
+ old_write_location = next_write_location;
+
for (i = 0; i < sg_buffer_count; i++) {
next_write_location = copy_to_ring_buffer(out_ring_info,
next_write_location, (char *) sg_buffers[i].data,
@@ -258,9 +327,9 @@ hv_ring_buffer_write(
(char *) &prev_indices, sizeof(uint64_t));
/*
- * Make sure we flush all writes before updating the writeIndex
+ * Full memory barrier before updating the write index.
*/
- wmb();
+ mb();
/*
* Now, update the write location
@@ -269,6 +338,9 @@ hv_ring_buffer_write(
mtx_unlock_spin(&out_ring_info->ring_lock);
+ *need_sig = hv_ring_buffer_needsig_on_write(old_write_location,
+ out_ring_info);
+
return (0);
}
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
index ca28fd5..91813bb 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$");
#include <machine/stdarg.h>
#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/segments.h>
#include <sys/pcpu.h>
+#include <machine/apicvar.h>
#include "hv_vmbus_priv.h"
#define VMBUS_IRQ 0x5
-static struct intr_event *hv_msg_intr_event;
-static struct intr_event *hv_event_intr_event;
-static void *msg_swintr;
-static void *event_swintr;
static device_t vmbus_devp;
-static void *vmbus_cookiep;
-static int vmbus_rid;
-struct resource *intr_res;
-static int vmbus_irq = VMBUS_IRQ;
static int vmbus_inited;
static hv_setup_args setup_args; /* only CPU 0 supported at this time */
@@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */
* the hypervisor.
*/
static void
-vmbus_msg_swintr(void *dummy)
+vmbus_msg_swintr(void *arg)
{
int cpu;
void* page_addr;
hv_vmbus_message* msg;
hv_vmbus_message* copied;
- cpu = PCPU_GET(cpuid);
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: "
+ "cpu out of range!"));
+
page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
@@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy)
*
* The purpose of this routine is to determine the type of VMBUS protocol
* message to process - an event or a channel message.
- * As this is an interrupt filter routine, the function runs in a very
- * restricted envinronment. From the manpage for bus_setup_intr(9)
- *
- * In this restricted environment, care must be taken to account for all
- * races. A careful analysis of races should be done as well. It is gener-
- * ally cheaper to take an extra interrupt, for example, than to protect
- * variables with spinlocks. Read, modify, write cycles of hardware regis-
- * ters need to be carefully analyzed if other threads are accessing the
- * same registers.
*/
-static int
+static inline int
hv_vmbus_isr(void *unused)
{
int cpu;
@@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused)
void* page_addr;
cpu = PCPU_GET(cpuid);
- /* (Temporary limit) */
- KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero"));
/*
* The Windows team has advised that we check for events
@@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused)
event = (hv_vmbus_synic_event_flags*)
page_addr + HV_VMBUS_MESSAGE_SINT;
- /* Since we are a child, we only need to check bit 0 */
- if (synch_test_and_clear_bit(0, &event->flags32[0])) {
- swi_sched(event_swintr, 0);
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ /* Since we are a child, we only need to check bit 0 */
+ if (synch_test_and_clear_bit(0, &event->flags32[0])) {
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
+ }
+ } else {
+ /*
+ * On host with Win8 or above, we can directly look at
+ * the event page. If bit n is set, we have an interrupt
+ * on the channel with id n.
+ * Directly schedule the event software interrupt on
+ * current cpu.
+ */
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
}
/* Check if there are actual msgs to be process */
@@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused)
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) {
- swi_sched(msg_swintr, 0);
+ swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0);
}
return FILTER_HANDLED;
}
+#ifdef HV_DEBUG_INTR
+uint32_t hv_intr_count = 0;
+#endif
+uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+uint32_t hv_vmbus_intr_cpu[MAXCPU];
+
+void
+hv_vector_handler(struct trapframe *trap_frame)
+{
+#ifdef HV_DEBUG_INTR
+ int cpu;
+#endif
+
+ /*
+ * Disable preemption.
+ */
+ critical_enter();
+
+#ifdef HV_DEBUG_INTR
+ /*
+ * Do a little interrupt counting.
+ */
+ cpu = PCPU_GET(cpuid);
+ hv_vmbus_intr_cpu[cpu]++;
+ hv_intr_count++;
+#endif
+
+ hv_vmbus_isr(NULL);
+
+ /*
+ * Enable preemption.
+ */
+ critical_exit();
+}
+
static int
vmbus_read_ivar(
device_t dev,
@@ -316,6 +350,81 @@ vmbus_probe(device_t dev) {
return (BUS_PROBE_NOWILDCARD);
}
+#ifdef HYPERV
+extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback);
+
+/**
+ * @brief Find a free IDT slot and setup the interrupt handler.
+ */
+static int
+vmbus_vector_alloc(void)
+{
+ int vector;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as vmbus channel callback vector. We install 'hv_vmbus_callback'
+ * handler at that vector and use it to interrupt vcpus.
+ */
+ vector = APIC_SPURIOUS_INT;
+ while (--vector >= APIC_IPI_INTS) {
+ ip = &idt[vector];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+#ifdef __i386__
+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT,
+ SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#else
+ setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT,
+ SEL_KPL, 0);
+#endif
+
+ return (vector);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Restore the IDT slot to rsvd.
+ */
+static void
+vmbus_vector_free(int vector)
+{
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ if (vector == 0)
+ return;
+
+ KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT,
+ ("invalid vector %d", vector));
+
+ ip = &idt[vector];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback),
+ ("invalid vector %d", vector));
+
+ setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
+
+#else /* HYPERV */
+
+static int
+vmbus_vector_alloc(void)
+{
+ return(0);
+}
+
+static void
+vmbus_vector_free(int vector)
+{
+}
+
+#endif /* HYPERV */
+
/**
* @brief Main vmbus driver initialization routine.
*
@@ -331,22 +440,7 @@ vmbus_probe(device_t dev) {
static int
vmbus_bus_init(void)
{
- struct ioapic_intsrc {
- struct intsrc io_intsrc;
- u_int io_irq;
- u_int io_intpin:8;
- u_int io_vector:8;
- u_int io_cpu:8;
- u_int io_activehi:1;
- u_int io_edgetrigger:1;
- u_int io_masked:1;
- int io_bus:4;
- uint32_t io_lowreg;
- };
- int i, ret;
- unsigned int vector = 0;
- struct intsrc *isrc;
- struct ioapic_intsrc *intpin;
+ int i, j, n, ret;
if (vmbus_inited)
return (0);
@@ -361,80 +455,100 @@ vmbus_bus_init(void)
return (ret);
}
- ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr,
- NULL, SWI_CLOCK, 0, &msg_swintr);
-
- if (ret)
- goto cleanup;
-
/*
- * Message SW interrupt handler checks a per-CPU page and
- * thus the thread needs to be bound to CPU-0 - which is where
- * all interrupts are processed.
+ * Find a free IDT slot for vmbus callback.
*/
- ret = intr_event_bind(hv_msg_intr_event, 0);
-
- if (ret)
- goto cleanup1;
+ hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc();
- ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events,
- NULL, SWI_CLOCK, 0, &event_swintr);
-
- if (ret)
- goto cleanup1;
+ if (hv_vmbus_g_context.hv_cb_vector == 0) {
+ if(bootverbose)
+ printf("Error VMBUS: Cannot find free IDT slot for "
+ "vmbus callback!\n");
+ goto cleanup;
+ }
- intr_res = bus_alloc_resource(vmbus_devp,
- SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE);
+ if(bootverbose)
+ printf("VMBUS: vmbus callback vector %d\n",
+ hv_vmbus_g_context.hv_cb_vector);
- if (intr_res == NULL) {
- ret = ENOMEM; /* XXXKYS: Need a better errno */
- goto cleanup2;
+ /*
+ * Notify the hypervisor of our vector.
+ */
+ setup_args.vector = hv_vmbus_g_context.hv_cb_vector;
+
+ CPU_FOREACH(j) {
+ hv_vmbus_intr_cpu[j] = 0;
+ hv_vmbus_swintr_event_cpu[j] = 0;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.event_swintr[j] = NULL;
+ hv_vmbus_g_context.msg_swintr[j] = NULL;
+
+ for (i = 0; i < 2; i++)
+ setup_args.page_buffers[2 * j + i] = NULL;
}
/*
- * Setup interrupt filter handler
+ * Per cpu setup.
*/
- ret = bus_setup_intr(vmbus_devp, intr_res,
- INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL,
- NULL, &vmbus_cookiep);
-
- if (ret != 0)
- goto cleanup3;
-
- ret = bus_bind_intr(vmbus_devp, intr_res, 0);
- if (ret != 0)
- goto cleanup4;
-
- isrc = intr_lookup_source(vmbus_irq);
- if ((isrc == NULL) || (isrc->is_event == NULL)) {
- ret = EINVAL;
- goto cleanup4;
- }
+ CPU_FOREACH(j) {
+ /*
+ * Setup software interrupt thread and handler for msg handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j],
+ "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0,
+ &hv_vmbus_g_context.msg_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup msg swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /* vector = isrc->is_event->ie_vector; */
- intpin = (struct ioapic_intsrc *)isrc;
- vector = intpin->io_vector;
+ /*
+ * Bind the swi thread to the cpu.
+ */
+ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j],
+ j);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to bind msg swi thread "
+ "to cpu %d\n", j);
+ goto cleanup1;
+ }
- if(bootverbose)
- printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector);
+ /*
+ * Setup software interrupt thread and handler for
+ * event handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j],
+ "hv_event", hv_vmbus_on_events, (void *)(long)j,
+ SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup event swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /**
- * Notify the hypervisor of our irq.
- */
- setup_args.vector = vector;
- for(i = 0; i < 2; i++) {
- setup_args.page_buffers[i] =
+ /*
+ * Prepare the per cpu msg and event pages to be called on each cpu.
+ */
+ for(i = 0; i < 2; i++) {
+ setup_args.page_buffers[2 * j + i] =
malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (setup_args.page_buffers[i] == NULL) {
- KASSERT(setup_args.page_buffers[i] != NULL,
+ if (setup_args.page_buffers[2 * j + i] == NULL) {
+ KASSERT(setup_args.page_buffers[2 * j + i] != NULL,
("Error VMBUS: malloc failed!"));
- if (i > 0)
- free(setup_args.page_buffers[0], M_DEVBUF);
- goto cleanup4;
+ goto cleanup1;
+ }
}
}
- /* only CPU #0 supported at this time */
+ if (bootverbose)
+ printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n",
+ smp_started);
+
smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args);
/*
@@ -443,26 +557,32 @@ vmbus_bus_init(void)
ret = hv_vmbus_connect();
if (ret != 0)
- goto cleanup4;
+ goto cleanup1;
hv_vmbus_request_channel_offers();
return (ret);
- cleanup4:
-
+ cleanup1:
/*
- * remove swi, bus and intr resource
+ * Free pages alloc'ed
*/
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
+ for (n = 0; n < 2 * MAXCPU; n++)
+ if (setup_args.page_buffers[n] != NULL)
+ free(setup_args.page_buffers[n], M_DEVBUF);
- cleanup3:
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
-
- cleanup2:
- swi_remove(event_swintr);
+ /*
+ * remove swi and vmbus callback vector;
+ */
+ CPU_FOREACH(j) {
+ if (hv_vmbus_g_context.msg_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[j]);
+ if (hv_vmbus_g_context.event_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[j]);
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ }
- cleanup1:
- swi_remove(msg_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
cleanup:
hv_vmbus_cleanup();
@@ -515,20 +635,24 @@ vmbus_bus_exit(void)
smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL);
- for(i = 0; i < 2; i++) {
+ for(i = 0; i < 2 * MAXCPU; i++) {
if (setup_args.page_buffers[i] != 0)
free(setup_args.page_buffers[i], M_DEVBUF);
}
hv_vmbus_cleanup();
- /* remove swi, bus and intr resource */
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
-
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
+ /* remove swi */
+ CPU_FOREACH(i) {
+ if (hv_vmbus_g_context.msg_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[i]);
+ if (hv_vmbus_g_context.event_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[i]);
+ hv_vmbus_g_context.hv_msg_intr_event[i] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[i] = NULL;
+ }
- swi_remove(msg_swintr);
- swi_remove(event_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
return;
}
@@ -603,6 +727,6 @@ devclass_t vmbus_devclass;
DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0);
MODULE_VERSION(vmbus,1);
-/* TODO: We want to be earlier than SI_SUB_VFS */
-SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL);
+/* We want to be started after SMP is initialized */
+SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL);
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 6bc875d..faa6dec 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -181,49 +181,30 @@ enum {
#define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t)
-/*
- * Connection identifier type
- */
-typedef union {
- uint32_t as_uint32_t;
- struct {
- uint32_t id:24;
- uint32_t reserved:8;
- } u;
-
-} __packed hv_vmbus_connection_id;
-
-/*
- * Definition of the hv_vmbus_signal_event hypercall input structure
- */
-typedef struct {
- hv_vmbus_connection_id connection_id;
- uint16_t flag_number;
- uint16_t rsvd_z;
-} __packed hv_vmbus_input_signal_event;
-
-typedef struct {
- uint64_t align8;
- hv_vmbus_input_signal_event event;
-} __packed hv_vmbus_input_signal_event_buffer;
-
typedef struct {
uint64_t guest_id;
void* hypercall_page;
hv_bool_uint8_t syn_ic_initialized;
+
+ hv_vmbus_handle syn_ic_msg_page[MAXCPU];
+ hv_vmbus_handle syn_ic_event_page[MAXCPU];
/*
- * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall.
- * The input param is immutable in our usage and
- * must be dynamic mem (vs stack or global).
+ * For FreeBSD cpuid to Hyper-V vcpuid mapping.
*/
- hv_vmbus_input_signal_event_buffer *signal_event_buffer;
+ uint32_t hv_vcpu_index[MAXCPU];
/*
- * 8-bytes aligned of the buffer above
+ * Each cpu has its own software interrupt handler for channel
+ * event and msg handling.
*/
- hv_vmbus_input_signal_event *signal_event_param;
-
- hv_vmbus_handle syn_ic_msg_page[MAXCPU];
- hv_vmbus_handle syn_ic_event_page[MAXCPU];
+ struct intr_event *hv_event_intr_event[MAXCPU];
+ struct intr_event *hv_msg_intr_event[MAXCPU];
+ void *event_swintr[MAXCPU];
+ void *msg_swintr[MAXCPU];
+ /*
+ * The host uses this vector to interrupt the guest for vmbus channel
+ * events and messages.
+ */
+ unsigned int hv_cb_vector;
} hv_vmbus_context;
/*
@@ -368,7 +349,8 @@ typedef struct {
TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor;
struct mtx channel_msg_lock;
/**
- * List of channels
+ * List of primary channels. Sub channels will be linked
+ * under their primary channel.
*/
TAILQ_HEAD(, hv_vmbus_channel) channel_anchor;
struct mtx channel_lock;
@@ -560,6 +542,8 @@ typedef union {
uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT];
} hv_vmbus_synic_event_flags;
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX (0x40000002)
/*
* Define synthetic interrupt controller model specific registers
@@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup(
int hv_ring_buffer_write(
hv_vmbus_ring_buffer_info *ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buff_count);
+ uint32_t sg_buff_count,
+ boolean_t *need_sig);
int hv_ring_buffer_peek(
hv_vmbus_ring_buffer_info *ring_info,
@@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info(
hv_vmbus_ring_buffer_info *ring_info,
char *prefix);
+void hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info *ring_info);
+
+uint32_t hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info *ring_info);
+
hv_vmbus_channel* hv_vmbus_allocate_channel(void);
void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel);
void hv_vmbus_on_channel_message(void *context);
@@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc(
void *payload,
size_t payload_size);
-uint16_t hv_vmbus_signal_event(void);
+uint16_t hv_vmbus_signal_event(void *con_id);
void hv_vmbus_synic_init(void *irq_arg);
void hv_vmbus_synic_cleanup(void *arg);
int hv_vmbus_query_hypervisor_presence(void);
@@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id);
int hv_vmbus_connect(void);
int hv_vmbus_disconnect(void);
int hv_vmbus_post_message(void *buffer, size_t buf_size);
-int hv_vmbus_set_event(uint32_t child_rel_id);
+int hv_vmbus_set_event(hv_vmbus_channel *channel);
void hv_vmbus_on_events(void *);
@@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id(
typedef struct {
unsigned int vector;
- void *page_buffers[2];
+ void *page_buffers[2 * MAXCPU];
} hv_setup_args;
#endif /* __HYPERV_PRIV_H__ */