diff options
Diffstat (limited to 'sys/dev/hyperv/vmbus')
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_channel.c | 98 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_channel_mgmt.c | 268 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_connection.c | 289 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_hv.c | 66 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_ring_buffer.c | 78 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c | 364 | ||||
-rw-r--r-- | sys/dev/hyperv/vmbus/hv_vmbus_priv.h | 71 |
7 files changed, 870 insertions, 364 deletions
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index 103260a..94137fb 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) (uint32_t *)&monitor_page-> trigger_group[channel->monitor_group].u.pending); } else { - hv_vmbus_set_event(channel->offer_msg.child_rel_id); + hv_vmbus_set_event(channel); } } @@ -99,6 +99,18 @@ hv_vmbus_channel_open( hv_vmbus_channel_open_channel* open_msg; hv_vmbus_channel_msg_info* open_info; + mtx_lock(&new_channel->sc_lock); + if (new_channel->state == HV_CHANNEL_OPEN_STATE) { + new_channel->state = HV_CHANNEL_OPENING_STATE; + } else { + mtx_unlock(&new_channel->sc_lock); + if(bootverbose) + printf("VMBUS: Trying to open channel <%p> which in " + "%d state.\n", new_channel, new_channel->state); + return (EINVAL); + } + mtx_unlock(&new_channel->sc_lock); + new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; @@ -162,7 +174,7 @@ hv_vmbus_channel_open( new_channel->ring_buffer_gpadl_handle; open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size >> PAGE_SHIFT; - open_msg->server_context_area_gpadl_handle = 0; + open_msg->target_vcpu = new_channel->target_vcpu; if (user_data_len) memcpy(open_msg->user_data, user_data, user_data_len); @@ -182,10 +194,14 @@ hv_vmbus_channel_open( ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ - if (ret) + if (ret) { + if(bootverbose) + printf("VMBUS: channel <%p> open timeout.\n", new_channel); goto cleanup; + } if (open_info->response.open_result.status == 0) { + new_channel->state = HV_CHANNEL_OPENED_STATE; if(bootverbose) printf("VMBUS: channel <%p> open success.\n", new_channel); } else { @@ -497,16 +513,20 @@ cleanup: return (ret); } -/** - * @brief Close the specified channel - */ -void -hv_vmbus_channel_close(hv_vmbus_channel *channel) +static void +hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; + channel->state = HV_CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + + /* + * Grab the lock to prevent race condition when a packet received + * and unloading driver is in the process. + */ mtx_lock(&channel->inbound_lock); channel->on_channel_callback = NULL; mtx_unlock(&channel->inbound_lock); @@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel) M_DEVBUF); free(info, M_DEVBUF); +} + +/** + * @brief Close the specified channel + */ +void +hv_vmbus_channel_close(hv_vmbus_channel *channel) +{ + hv_vmbus_channel* sub_channel; + + if (channel->primary_channel != NULL) { + /* + * We only close multi-channels when the primary is + * closed. + */ + return; + } /* - * If we are closing the channel during an error path in - * opening the channel, don't free the channel - * since the caller will free the channel + * Close all multi-channels first. */ - if (channel->state == HV_CHANNEL_OPEN_STATE) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - - hv_vmbus_free_vmbus_channel(channel); + TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, + sc_list_entry) { + if (sub_channel->state != HV_CHANNEL_OPENED_STATE) + continue; + hv_vmbus_channel_close_internal(sub_channel); } - + /* + * Then close the primary channel. + */ + hv_vmbus_channel_close_internal(channel); } /** @@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet( uint32_t packet_len; uint64_t aligned_data; uint32_t packet_len_aligned; + boolean_t need_sig; hv_vmbus_sg_buffer_list buffer_list[3]; packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; @@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 - && !hv_vmbus_get_ring_buffer_interrupt_mask( - &channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer( int ret = 0; int i = 0; + boolean_t need_sig; uint32_t packet_len; uint32_t packetLen_aligned; hv_vmbus_sg_buffer_list buffer_list[3]; @@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer( buffer_list[2].data = &alignedData; buffer_list[2].length = packetLen_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer( int ret = 0; uint32_t desc_size; + boolean_t need_sig; uint32_t packet_len; uint32_t packet_len_aligned; uint32_t pfn_count; @@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index 011e305..d13ece5 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -26,6 +26,9 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + #include <sys/param.h> #include <sys/mbuf.h> @@ -50,6 +53,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_process_offer(void *context); +struct hv_vmbus_channel* + vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); /** * Channel message dispatch table @@ -233,6 +238,9 @@ hv_vmbus_allocate_channel(void) return (NULL); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); + mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); + + TAILQ_INIT(&channel->sc_list_anchor); channel->control_work_queue = hv_work_queue_create("control"); @@ -262,6 +270,7 @@ ReleaseVmbusChannel(void *context) void hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { + mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); /* * We have to release the channel's workqueue/thread in @@ -279,10 +288,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) static void vmbus_channel_process_offer(void *context) { - int ret; hv_vmbus_channel* new_channel; boolean_t f_new; hv_vmbus_channel* channel; + int ret; new_channel = (hv_vmbus_channel*) context; f_new = TRUE; @@ -291,38 +300,76 @@ vmbus_channel_process_offer(void *context) /* * Make sure this is a new offer */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { - if (!memcmp( - &channel->offer_msg.offer.interface_type, - &new_channel->offer_msg.offer.interface_type, - sizeof(hv_guid)) - && !memcmp( - &channel->offer_msg.offer.interface_instance, + if (memcmp(&channel->offer_msg.offer.interface_type, + &new_channel->offer_msg.offer.interface_type, + sizeof(hv_guid)) == 0 && + memcmp(&channel->offer_msg.offer.interface_instance, &new_channel->offer_msg.offer.interface_instance, - sizeof(hv_guid))) { - f_new = FALSE; - break; - } + sizeof(hv_guid)) == 0) { + f_new = FALSE; + break; + } } if (f_new) { - /* Insert at tail */ - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + /* Insert at tail */ + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + /*XXX add new channel to percpu_list */ if (!f_new) { + /* + * Check if this is a sub channel. + */ + if (new_channel->offer_msg.offer.sub_channel_index != 0) { + /* + * It is a sub channel offer, process it. + */ + new_channel->primary_channel = channel; + mtx_lock(&channel->sc_lock); + TAILQ_INSERT_TAIL( + &channel->sc_list_anchor, + new_channel, + sc_list_entry); + mtx_unlock(&channel->sc_lock); + + /* Insert new channel into channel_anchor. */ + printf("Storvsc get multi-channel offer, rel=%u.\n", + new_channel->offer_msg.child_rel_id); + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + if(bootverbose) + printf("VMBUS: new multi-channel offer <%p>.\n", + new_channel); + + /*XXX add it to percpu_list */ + + new_channel->state = HV_CHANNEL_OPEN_STATE; + if (channel->sc_creation_callback != NULL) { + channel->sc_creation_callback(new_channel); + } + return; + } + hv_vmbus_free_vmbus_channel(new_channel); return; } + new_channel->state = HV_CHANNEL_OPEN_STATE; + /* * Start the process of binding this offer to the driver * (We need to set the device field before calling @@ -333,35 +380,86 @@ vmbus_channel_process_offer(void *context) new_channel->offer_msg.offer.interface_instance, new_channel); /* - * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below - * but in the "open" channel request. The ret != 0 logic below - * doesn't take into account that a channel - * may have been opened successfully - */ - - /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() * method. */ ret = hv_vmbus_child_device_register(new_channel->device); if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - hv_vmbus_free_vmbus_channel(new_channel); - } else { - /* - * This state is used to indicate a successful open - * so that when we do close the channel normally, - * we can clean up properly - */ - new_channel->state = HV_CHANNEL_OPEN_STATE; + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_free_vmbus_channel(new_channel); + } +} + +/** + * Array of device guids that are performance critical. We try to distribute + * the interrupt load for these devices across all online cpus. + */ +static const hv_guid high_perf_devices[] = { + {HV_NIC_GUID, }, + {HV_IDE_GUID, }, + {HV_SCSI_GUID, }, +}; + +enum { + PERF_CHN_NIC = 0, + PERF_CHN_IDE, + PERF_CHN_SCSI, + MAX_PERF_CHN, +}; + +/* + * We use this static number to distribute the channel interrupt load. + */ +static uint32_t next_vcpu; + +/** + * Starting with Win8, we can statically distribute the incoming + * channel interrupt load by binding a channel to VCPU. We + * implement here a simple round robin scheme for distributing + * the interrupt load. + * We will bind channels that are not performance critical to cpu 0 and + * performance critical channels (IDE, SCSI and Network) will be uniformly + * distributed across all available CPUs. + */ +static void +vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) +{ + uint32_t current_cpu; + int i; + boolean_t is_perf_channel = FALSE; + + for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { + if (memcmp(guid->data, high_perf_devices[i].data, + sizeof(hv_guid)) == 0) { + is_perf_channel = TRUE; + break; + } + } + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || + (!is_perf_channel)) { + /* Host's view of guest cpu */ + channel->target_vcpu = 0; + /* Guest's own view of cpu */ + channel->target_cpu = 0; + return; } + /* mp_ncpus should have the number cpus currently online */ + current_cpu = (++next_vcpu % mp_ncpus); + channel->target_cpu = current_cpu; + channel->target_vcpu = + hv_vmbus_g_context.hv_vcpu_index[current_cpu]; + if (bootverbose) + printf("VMBUS: Total online cpus %d, assign perf channel %d " + "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, + current_cpu); } /** @@ -391,6 +489,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) if (new_channel == NULL) return; + /* + * By default we setup state to enable batched + * reading. A specific service can choose to + * disable this prior to opening the channel. + */ + new_channel->batched_reading = TRUE; + + new_channel->signal_event_param = + (hv_vmbus_input_signal_event *) + (HV_ALIGN_UP((unsigned long) + &new_channel->signal_event_buffer, + HV_HYPERCALL_PARAM_ALIGN)); + + new_channel->signal_event_param->connection_id.as_uint32_t = 0; + new_channel->signal_event_param->connection_id.u.id = + HV_VMBUS_EVENT_CONNECTION_ID; + new_channel->signal_event_param->flag_number = 0; + new_channel->signal_event_param->rsvd_z = 0; + + if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { + new_channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + new_channel->signal_event_param->connection_id.u.id = + offer->connection_id; + } + + /* + * Bind the channel to a chosen cpu. + */ + vmbus_channel_select_cpu(new_channel, + &offer->offer.interface_type); + memcpy(&new_channel->offer_msg, offer, sizeof(hv_vmbus_channel_offer_channel)); new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; @@ -666,7 +796,7 @@ hv_vmbus_release_unattached_channels(void) { hv_vmbus_channel *channel; - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) { channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor); @@ -676,5 +806,61 @@ hv_vmbus_release_unattached_channels(void) hv_vmbus_child_device_unregister(channel->device); hv_vmbus_free_vmbus_channel(channel); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); +} + +/** + * @brief Select the best outgoing channel + * + * The channel whose vcpu binding is closest to the currect vcpu will + * be selected. + * If no multi-channel, always select primary channel + * + * @param primary - primary channel + */ +struct hv_vmbus_channel * +vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) +{ + hv_vmbus_channel *new_channel = NULL; + hv_vmbus_channel *outgoing_channel = primary; + int old_cpu_distance = 0; + int new_cpu_distance = 0; + int cur_vcpu = 0; + int smp_pro_id = PCPU_GET(cpuid); + + if (TAILQ_EMPTY(&primary->sc_list_anchor)) { + return outgoing_channel; + } + + if (smp_pro_id >= MAXCPU) { + return outgoing_channel; + } + + cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id]; + + TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) { + if (new_channel->state != HV_CHANNEL_OPENED_STATE){ + continue; + } + + if (new_channel->target_vcpu == cur_vcpu){ + return new_channel; + } + + old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ? + (outgoing_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - outgoing_channel->target_vcpu)); + + new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ? + (new_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - new_channel->target_vcpu)); + + if (old_cpu_distance < new_cpu_distance) { + continue; + } + + outgoing_channel = new_channel; + } + + return(outgoing_channel); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index c8e0b48..cc83037 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -26,6 +26,9 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + #include <sys/param.h> #include <sys/malloc.h> #include <sys/systm.h> @@ -45,14 +48,113 @@ hv_vmbus_connection hv_vmbus_g_connection = { .connect_state = HV_DISCONNECTED, .next_gpadl_handle = 0xE1E10, }; +uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; + +static uint32_t +hv_vmbus_get_next_version(uint32_t current_ver) +{ + switch (current_ver) { + case (HV_VMBUS_VERSION_WIN7): + return(HV_VMBUS_VERSION_WS2008); + + case (HV_VMBUS_VERSION_WIN8): + return(HV_VMBUS_VERSION_WIN7); + + case (HV_VMBUS_VERSION_WIN8_1): + return(HV_VMBUS_VERSION_WIN8); + + case (HV_VMBUS_VERSION_WS2008): + default: + return(HV_VMBUS_VERSION_INVALID); + } +} + +/** + * Negotiate the highest supported hypervisor version. + */ +static int +hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, + uint32_t version) +{ + int ret = 0; + hv_vmbus_channel_initiate_contact *msg; + + sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); + msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; + msg->vmbus_version_requested = version; + + msg->interrupt_page = hv_get_phys_addr( + hv_vmbus_g_connection.interrupt_page); + + msg->monitor_page_1 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_pages); + + msg->monitor_page_2 = + hv_get_phys_addr( + ((uint8_t *) hv_vmbus_g_connection.monitor_pages + + PAGE_SIZE)); + + /** + * Add to list before we send the request since we may receive the + * response before returning from this routine + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + msg, + sizeof(hv_vmbus_channel_initiate_contact)); + + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + return (ret); + } + + /** + * Wait for the connection response + */ + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + /** + * Check if successful + */ + if (msg_info->response.version_response.version_supported) { + hv_vmbus_g_connection.connect_state = HV_CONNECTED; + } else { + ret = ECONNREFUSED; + } + + return (ret); +} + /** * Send a connect request on the partition service connection */ int hv_vmbus_connect(void) { int ret = 0; + uint32_t version; hv_vmbus_channel_msg_info* msg_info = NULL; - hv_vmbus_channel_initiate_contact* msg; /** * Make sure we are not connecting or connected @@ -74,7 +176,7 @@ hv_vmbus_connect(void) { TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", - NULL, MTX_SPIN); + NULL, MTX_DEF); /** * Setup the vmbus event connection for channel interrupt abstraction @@ -130,71 +232,30 @@ hv_vmbus_connect(void) { goto cleanup; } - sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); - msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; - - msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; - msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; - - msg->interrupt_page = hv_get_phys_addr( - hv_vmbus_g_connection.interrupt_page); - - msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); - - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); - - /** - * Add to list before we send the request since we may receive the - * response before returning from this routine + /* + * Find the highest vmbus version number we can support. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - ret = hv_vmbus_post_message( - msg, - sizeof(hv_vmbus_channel_initiate_contact)); - - if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - goto cleanup; - } + version = HV_VMBUS_VERSION_CURRENT; + + do { + ret = hv_vmbus_negotiate_version(msg_info, version); + if (ret == EWOULDBLOCK) { + /* + * We timed out. + */ + goto cleanup; + } - /** - * Wait for the connection response - */ - ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) + break; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + version = hv_vmbus_get_next_version(version); + } while (version != HV_VMBUS_VERSION_INVALID); - /** - * Check if successful - */ - if (msg_info->response.version_response.version_supported) { - hv_vmbus_g_connection.connect_state = HV_CONNECTED; - } else { - ret = ECONNREFUSED; - goto cleanup; - } + hv_vmbus_protocal_version = version; + if (bootverbose) + printf("VMBUS: Portocal Version: %d.%d\n", + version >> 16, version & 0xFFFF); sema_destroy(&msg_info->wait_sema); free(msg_info, M_DEVBUF); @@ -286,7 +347,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { * and channels are accessed without the need to take this lock or search * the list. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { @@ -295,7 +356,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { break; } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); return (foundChannel); } @@ -306,7 +367,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { static void VmbusProcessChannelEvent(uint32_t relid) { + void* arg; + uint32_t bytes_to_read; hv_vmbus_channel* channel; + boolean_t is_batched_reading; /** * Find the channel based on this relid and invokes @@ -327,31 +391,98 @@ VmbusProcessChannelEvent(uint32_t relid) * callback to NULL. This closes the window. */ - mtx_lock(&channel->inbound_lock); + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. + * -- whu + */ + // mtx_lock(&channel->inbound_lock); if (channel->on_channel_callback != NULL) { - channel->on_channel_callback(channel->channel_callback_context); + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); } - mtx_unlock(&channel->inbound_lock); + // mtx_unlock(&channel->inbound_lock); } +#ifdef HV_DEBUG_INTR +extern uint32_t hv_intr_count; +extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; +#endif + /** * Handler for events */ void hv_vmbus_on_events(void *arg) { - int dword; int bit; + int cpu; + int dword; + void *page_addr; + uint32_t* recv_interrupt_page = NULL; int rel_id; - int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + int maxdword; + hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - /* - * receive size is 1/2 page and divide that by 4 bytes - */ - - uint32_t* recv_interrupt_page = - hv_vmbus_g_connection.recv_interrupt_page; + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " + "cpu out of range!")); + +#ifdef HV_DEBUG_INTR + int i; + hv_vmbus_swintr_event_cpu[cpu]++; + if (hv_intr_count % 10000 == 0) { + printf("VMBUS: Total interrupt %d\n", hv_intr_count); + for (i = 0; i < mp_ncpus; i++) + printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", + i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); + } +#endif + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + /* + * receive size is 1/2 page and divide that by 4 bytes + */ + recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; + } else { + /* + * On Host with Win8 or above, the event page can be + * checked directly to get the id of the channel + * that has the pending interrupt. + */ + maxdword = HV_EVENT_FLAGS_DWORD_COUNT; + page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; + event = (hv_vmbus_synic_event_flags *) + page_addr + HV_VMBUS_MESSAGE_SINT; + recv_interrupt_page = event->flags32; + } /* * Check events @@ -416,16 +547,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) { * Send an event notification to the parent */ int -hv_vmbus_set_event(uint32_t child_rel_id) { +hv_vmbus_set_event(hv_vmbus_channel *channel) { int ret = 0; + uint32_t child_rel_id = channel->offer_msg.child_rel_id; /* Each uint32_t represents 32 channels */ synch_set_bit(child_rel_id & 31, (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + (child_rel_id >> 5)))); - ret = hv_vmbus_signal_event(); + ret = hv_vmbus_signal_event(channel->signal_event_param); return (ret); } - diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index 80a1f42..84e2a5e 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax, hv_vmbus_context hv_vmbus_g_context = { .syn_ic_initialized = FALSE, .hypercall_page = NULL, - .signal_event_param = NULL, - .signal_event_buffer = NULL, }; static struct timecounter hv_timecounter = { @@ -256,28 +254,6 @@ hv_vmbus_init(void) hv_vmbus_g_context.hypercall_page = virt_addr; - /* - * Setup the global signal event param for the signal event hypercall - */ - hv_vmbus_g_context.signal_event_buffer = - malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, - M_ZERO | M_NOWAIT); - KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, - ("Error VMBUS: Failed to allocate signal_event_buffer\n")); - if (hv_vmbus_g_context.signal_event_buffer == NULL) - goto cleanup; - - hv_vmbus_g_context.signal_event_param = - (hv_vmbus_input_signal_event*) - (HV_ALIGN_UP((unsigned long) - hv_vmbus_g_context.signal_event_buffer, - HV_HYPERCALL_PARAM_ALIGN)); - hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; - hv_vmbus_g_context.signal_event_param->connection_id.u.id = - HV_VMBUS_EVENT_CONNECTION_ID; - hv_vmbus_g_context.signal_event_param->flag_number = 0; - hv_vmbus_g_context.signal_event_param->rsvd_z = 0; - tc_init(&hv_timecounter); /* register virtual timecount */ return (0); @@ -303,12 +279,6 @@ hv_vmbus_cleanup(void) { hv_vmbus_x64_msr_hypercall_contents hypercall_msr; - if (hv_vmbus_g_context.signal_event_buffer != NULL) { - free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); - hv_vmbus_g_context.signal_event_buffer = NULL; - hv_vmbus_g_context.signal_event_param = NULL; - } - if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { if (hv_vmbus_g_context.hypercall_page != NULL) { hypercall_msr.as_uint64_t = 0; @@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc( * event IPC. (This involves a hypercall.) */ hv_vmbus_status -hv_vmbus_signal_event() +hv_vmbus_signal_event(void *con_id) { hv_vmbus_status status; status = hv_vmbus_do_hypercall( HV_CALL_SIGNAL_EVENT, - hv_vmbus_g_context.signal_event_param, + con_id, 0) & 0xFFFF; return (status); @@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg) { int cpu; + uint64_t hv_vcpu_index; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; hv_vmbus_synic_scontrol sctrl; @@ -403,23 +374,14 @@ hv_vmbus_synic_init(void *arg) return; /* - * KYS: Looks like we can only initialize on cpu0; don't we support - * SMP guests? - * - * TODO: Need to add SMP support for FreeBSD V9 - */ - - if (cpu != 0) - return; - - /* * TODO: Check the version */ version = rdmsr(HV_X64_MSR_SVERSION); - - hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; - hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; + hv_vmbus_g_context.syn_ic_msg_page[cpu] = + setup_args->page_buffers[2 * cpu]; + hv_vmbus_g_context.syn_ic_event_page[cpu] = + setup_args->page_buffers[2 * cpu + 1]; /* * Setup the Synic's message page @@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg) wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ + shared_sint.as_uint64_t = 0; shared_sint.u.vector = setup_args->vector; shared_sint.u.masked = FALSE; - shared_sint.u.auto_eoi = FALSE; + shared_sint.u.auto_eoi = TRUE; wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); @@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg) hv_vmbus_g_context.syn_ic_initialized = TRUE; + /* + * Set up the cpuid mapping from Hyper-V to FreeBSD. + * The array is indexed using FreeBSD cpuid. + */ + hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); + hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; + return; } @@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg) hv_vmbus_synic_sint shared_sint; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; - int cpu = PCPU_GET(cpuid); if (!hv_vmbus_g_context.syn_ic_initialized) return; - if (cpu != 0) - return; /* TODO: XXXKYS: SMP? */ - shared_sint.as_uint64_t = rdmsr( HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c index f7c1965..0e51ef7 100644 --- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c +++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c @@ -26,6 +26,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/lock.h> @@ -144,6 +146,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info) return (uint64_t) ring_info->ring_buffer->write_index << 32; } +void +hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info* ring_info) +{ + ring_info->ring_buffer->interrupt_mask = 1; + mb(); +} + +uint32_t +hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info* ring_info) +{ + uint32_t read, write; + + ring_info->ring_buffer->interrupt_mask = 0; + mb(); + + /* + * Now check to see if the ring buffer is still empty. + * If it is not, we raced and we need to process new + * incoming messages. + */ + get_ring_buffer_avail_bytes(ring_info, &read, &write); + + return (read); +} + +/* + * When we write to the ring buffer, check if the host needs to + * be signaled. Here is the details of this protocol: + * + * 1. The host guarantees that while it is draining the + * ring buffer, it will set the interrupt_mask to + * indicate it does not need to be interrupted when + * new data is placed. + * + * 2. The host guarantees that it will completely drain + * the ring buffer before exiting the read loop. Further, + * once the ring buffer is empty, it will clear the + * interrupt_mask and re-check to see if new data has + * arrived. + */ +static boolean_t +hv_ring_buffer_needsig_on_write( + uint32_t old_write_location, + hv_vmbus_ring_buffer_info* rbi) +{ + mb(); + if (rbi->ring_buffer->interrupt_mask) + return (FALSE); + + /* Read memory barrier */ + rmb(); + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. + */ + if (old_write_location == rbi->ring_buffer->read_index) + return (TRUE); + + return (FALSE); +} + static uint32_t copy_to_ring_buffer( hv_vmbus_ring_buffer_info* ring_info, uint32_t start_write_offset, @@ -204,11 +269,13 @@ int hv_ring_buffer_write( hv_vmbus_ring_buffer_info* out_ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buffer_count) + uint32_t sg_buffer_count, + boolean_t *need_sig) { int i = 0; uint32_t byte_avail_to_write; uint32_t byte_avail_to_read; + uint32_t old_write_location; uint32_t total_bytes_to_write = 0; volatile uint32_t next_write_location; @@ -242,6 +309,8 @@ hv_ring_buffer_write( */ next_write_location = get_next_write_location(out_ring_info); + old_write_location = next_write_location; + for (i = 0; i < sg_buffer_count; i++) { next_write_location = copy_to_ring_buffer(out_ring_info, next_write_location, (char *) sg_buffers[i].data, @@ -258,9 +327,9 @@ hv_ring_buffer_write( (char *) &prev_indices, sizeof(uint64_t)); /* - * Make sure we flush all writes before updating the writeIndex + * Full memory barrier before upding the write index. */ - wmb(); + mb(); /* * Now, update the write location @@ -269,6 +338,9 @@ hv_ring_buffer_write( mtx_unlock_spin(&out_ring_info->ring_lock); + *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, + out_ring_info); + return (0); } diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index ca28fd5..91813bb 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$"); #include <machine/stdarg.h> #include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/segments.h> #include <sys/pcpu.h> +#include <machine/apicvar.h> #include "hv_vmbus_priv.h" #define VMBUS_IRQ 0x5 -static struct intr_event *hv_msg_intr_event; -static struct intr_event *hv_event_intr_event; -static void *msg_swintr; -static void *event_swintr; static device_t vmbus_devp; -static void *vmbus_cookiep; -static int vmbus_rid; -struct resource *intr_res; -static int vmbus_irq = VMBUS_IRQ; static int vmbus_inited; static hv_setup_args setup_args; /* only CPU 0 supported at this time */ @@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */ * the hypervisor. */ static void -vmbus_msg_swintr(void *dummy) +vmbus_msg_swintr(void *arg) { int cpu; void* page_addr; hv_vmbus_message* msg; hv_vmbus_message* copied; - cpu = PCPU_GET(cpuid); + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " + "cpu out of range!")); + page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; @@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy) * * The purpose of this routine is to determine the type of VMBUS protocol * message to process - an event or a channel message. - * As this is an interrupt filter routine, the function runs in a very - * restricted envinronment. From the manpage for bus_setup_intr(9) - * - * In this restricted environment, care must be taken to account for all - * races. A careful analysis of races should be done as well. It is gener- - * ally cheaper to take an extra interrupt, for example, than to protect - * variables with spinlocks. Read, modify, write cycles of hardware regis- - * ters need to be carefully analyzed if other threads are accessing the - * same registers. */ -static int +static inline int hv_vmbus_isr(void *unused) { int cpu; @@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused) void* page_addr; cpu = PCPU_GET(cpuid); - /* (Temporary limit) */ - KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); /* * The Windows team has advised that we check for events @@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused) event = (hv_vmbus_synic_event_flags*) page_addr + HV_VMBUS_MESSAGE_SINT; - /* Since we are a child, we only need to check bit 0 */ - if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(event_swintr, 0); + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + /* Since we are a child, we only need to check bit 0 */ + if (synch_test_and_clear_bit(0, &event->flags32[0])) { + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + } + } else { + /* + * On host with Win8 or above, we can directly look at + * the event page. If bit n is set, we have an interrupt + * on the channel with id n. + * Directly schedule the event software interrupt on + * current cpu. + */ + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); } /* Check if there are actual msgs to be process */ @@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused) msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { - swi_sched(msg_swintr, 0); + swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); } return FILTER_HANDLED; } +#ifdef HV_DEBUG_INTR +uint32_t hv_intr_count = 0; +#endif +uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +uint32_t hv_vmbus_intr_cpu[MAXCPU]; + +void +hv_vector_handler(struct trapframe *trap_frame) +{ +#ifdef HV_DEBUG_INTR + int cpu; +#endif + + /* + * Disable preemption. + */ + critical_enter(); + +#ifdef HV_DEBUG_INTR + /* + * Do a little interrupt counting. + */ + cpu = PCPU_GET(cpuid); + hv_vmbus_intr_cpu[cpu]++; + hv_intr_count++; +#endif + + hv_vmbus_isr(NULL); + + /* + * Enable preemption. + */ + critical_exit(); +} + static int vmbus_read_ivar( device_t dev, @@ -316,6 +350,81 @@ vmbus_probe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } +#ifdef HYPERV +extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); + +/** + * @brief Find a free IDT slot and setup the interrupt handler. + */ +static int +vmbus_vector_alloc(void) +{ + int vector; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards form the highest IDT vector available for use + * as vmbus channel callback vector. We install 'hv_vmbus_callback' + * handler at that vector and use it to interrupt vcpus. + */ + vector = APIC_SPURIOUS_INT; + while (--vector >= APIC_IPI_INTS) { + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { +#ifdef __i386__ + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#else + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, + SEL_KPL, 0); +#endif + + return (vector); + } + } + return (0); +} + +/** + * @brief Restore the IDT slot to rsvd. + */ +static void +vmbus_vector_free(int vector) +{ + uintptr_t func; + struct gate_descriptor *ip; + + if (vector == 0) + return; + + KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, + ("invalid vector %d", vector)); + + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), + ("invalid vector %d", vector)); + + setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +#else /* HYPERV */ + +static int +vmbus_vector_alloc(void) +{ + return(0); +} + +static void +vmbus_vector_free(int vector) +{ +} + +#endif /* HYPERV */ + /** * @brief Main vmbus driver initialization routine. * @@ -331,22 +440,7 @@ vmbus_probe(device_t dev) { static int vmbus_bus_init(void) { - struct ioapic_intsrc { - struct intsrc io_intsrc; - u_int io_irq; - u_int io_intpin:8; - u_int io_vector:8; - u_int io_cpu:8; - u_int io_activehi:1; - u_int io_edgetrigger:1; - u_int io_masked:1; - int io_bus:4; - uint32_t io_lowreg; - }; - int i, ret; - unsigned int vector = 0; - struct intsrc *isrc; - struct ioapic_intsrc *intpin; + int i, j, n, ret; if (vmbus_inited) return (0); @@ -361,80 +455,100 @@ vmbus_bus_init(void) return (ret); } - ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, - NULL, SWI_CLOCK, 0, &msg_swintr); - - if (ret) - goto cleanup; - /* - * Message SW interrupt handler checks a per-CPU page and - * thus the thread needs to be bound to CPU-0 - which is where - * all interrupts are processed. + * Find a free IDT slot for vmbus callback. */ - ret = intr_event_bind(hv_msg_intr_event, 0); - - if (ret) - goto cleanup1; + hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); - ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, - NULL, SWI_CLOCK, 0, &event_swintr); - - if (ret) - goto cleanup1; + if (hv_vmbus_g_context.hv_cb_vector == 0) { + if(bootverbose) + printf("Error VMBUS: Cannot find free IDT slot for " + "vmbus callback!\n"); + goto cleanup; + } - intr_res = bus_alloc_resource(vmbus_devp, - SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); + if(bootverbose) + printf("VMBUS: vmbus callback vector %d\n", + hv_vmbus_g_context.hv_cb_vector); - if (intr_res == NULL) { - ret = ENOMEM; /* XXXKYS: Need a better errno */ - goto cleanup2; + /* + * Notify the hypervisor of our vector. + */ + setup_args.vector = hv_vmbus_g_context.hv_cb_vector; + + CPU_FOREACH(j) { + hv_vmbus_intr_cpu[j] = 0; + hv_vmbus_swintr_event_cpu[j] = 0; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.event_swintr[j] = NULL; + hv_vmbus_g_context.msg_swintr[j] = NULL; + + for (i = 0; i < 2; i++) + setup_args.page_buffers[2 * j + i] = NULL; } /* - * Setup interrupt filter handler + * Per cpu setup. */ - ret = bus_setup_intr(vmbus_devp, intr_res, - INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, - NULL, &vmbus_cookiep); - - if (ret != 0) - goto cleanup3; - - ret = bus_bind_intr(vmbus_devp, intr_res, 0); - if (ret != 0) - goto cleanup4; - - isrc = intr_lookup_source(vmbus_irq); - if ((isrc == NULL) || (isrc->is_event == NULL)) { - ret = EINVAL; - goto cleanup4; - } + CPU_FOREACH(j) { + /* + * Setup software interrupt thread and handler for msg handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], + "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, + &hv_vmbus_g_context.msg_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup msg swi for " + "cpu %d\n", j); + goto cleanup1; + } - /* vector = isrc->is_event->ie_vector; */ - intpin = (struct ioapic_intsrc *)isrc; - vector = intpin->io_vector; + /* + * Bind the swi thread to the cpu. + */ + ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], + j); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to bind msg swi thread " + "to cpu %d\n", j); + goto cleanup1; + } - if(bootverbose) - printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); + /* + * Setup software interrupt thread and handler for + * event handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], + "hv_event", hv_vmbus_on_events, (void *)(long)j, + SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup event swi for " + "cpu %d\n", j); + goto cleanup1; + } - /** - * Notify the hypervisor of our irq. - */ - setup_args.vector = vector; - for(i = 0; i < 2; i++) { - setup_args.page_buffers[i] = + /* + * Prepare the per cpu msg and event pages to be called on each cpu. + */ + for(i = 0; i < 2; i++) { + setup_args.page_buffers[2 * j + i] = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[i] == NULL) { - KASSERT(setup_args.page_buffers[i] != NULL, + if (setup_args.page_buffers[2 * j + i] == NULL) { + KASSERT(setup_args.page_buffers[2 * j + i] != NULL, ("Error VMBUS: malloc failed!")); - if (i > 0) - free(setup_args.page_buffers[0], M_DEVBUF); - goto cleanup4; + goto cleanup1; + } } } - /* only CPU #0 supported at this time */ + if (bootverbose) + printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", + smp_started); + smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); /* @@ -443,26 +557,32 @@ vmbus_bus_init(void) ret = hv_vmbus_connect(); if (ret != 0) - goto cleanup4; + goto cleanup1; hv_vmbus_request_channel_offers(); return (ret); - cleanup4: - + cleanup1: /* - * remove swi, bus and intr resource + * Free pages alloc'ed */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + for (n = 0; n < 2 * MAXCPU; n++) + if (setup_args.page_buffers[n] != NULL) + free(setup_args.page_buffers[n], M_DEVBUF); - cleanup3: - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); - - cleanup2: - swi_remove(event_swintr); + /* + * remove swi and vmbus callback vector; + */ + CPU_FOREACH(j) { + if (hv_vmbus_g_context.msg_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[j]); + if (hv_vmbus_g_context.event_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[j]); + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + } - cleanup1: - swi_remove(msg_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); cleanup: hv_vmbus_cleanup(); @@ -515,20 +635,24 @@ vmbus_bus_exit(void) smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); - for(i = 0; i < 2; i++) { + for(i = 0; i < 2 * MAXCPU; i++) { if (setup_args.page_buffers[i] != 0) free(setup_args.page_buffers[i], M_DEVBUF); } hv_vmbus_cleanup(); - /* remove swi, bus and intr resource */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); - - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); + /* remove swi */ + CPU_FOREACH(i) { + if (hv_vmbus_g_context.msg_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[i]); + if (hv_vmbus_g_context.event_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[i]); + hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; + hv_vmbus_g_context.hv_event_intr_event[i] = NULL; + } - swi_remove(msg_swintr); - swi_remove(event_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); return; } @@ -603,6 +727,6 @@ devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); MODULE_VERSION(vmbus,1); -/* TODO: We want to be earlier than SI_SUB_VFS */ -SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); +/* We want to be started after SMP is initialized */ +SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 6bc875d..faa6dec 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -181,49 +181,30 @@ enum { #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) -/* - * Connection identifier type - */ -typedef union { - uint32_t as_uint32_t; - struct { - uint32_t id:24; - uint32_t reserved:8; - } u; - -} __packed hv_vmbus_connection_id; - -/* - * Definition of the hv_vmbus_signal_event hypercall input structure - */ -typedef struct { - hv_vmbus_connection_id connection_id; - uint16_t flag_number; - uint16_t rsvd_z; -} __packed hv_vmbus_input_signal_event; - -typedef struct { - uint64_t align8; - hv_vmbus_input_signal_event event; -} __packed hv_vmbus_input_signal_event_buffer; - typedef struct { uint64_t guest_id; void* hypercall_page; hv_bool_uint8_t syn_ic_initialized; + + hv_vmbus_handle syn_ic_msg_page[MAXCPU]; + hv_vmbus_handle syn_ic_event_page[MAXCPU]; /* - * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. - * The input param is immutable in our usage and - * must be dynamic mem (vs stack or global). + * For FreeBSD cpuid to Hyper-V vcpuid mapping. */ - hv_vmbus_input_signal_event_buffer *signal_event_buffer; + uint32_t hv_vcpu_index[MAXCPU]; /* - * 8-bytes aligned of the buffer above + * Each cpu has its own software interrupt handler for channel + * event and msg handling. */ - hv_vmbus_input_signal_event *signal_event_param; - - hv_vmbus_handle syn_ic_msg_page[MAXCPU]; - hv_vmbus_handle syn_ic_event_page[MAXCPU]; + struct intr_event *hv_event_intr_event[MAXCPU]; + struct intr_event *hv_msg_intr_event[MAXCPU]; + void *event_swintr[MAXCPU]; + void *msg_swintr[MAXCPU]; + /* + * Host use this vector to intrrupt guest for vmbus channel + * event and msg. + */ + unsigned int hv_cb_vector; } hv_vmbus_context; /* @@ -368,7 +349,8 @@ typedef struct { TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** - * List of channels + * List of primary channels. Sub channels will be linked + * under their primary channel. */ TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; struct mtx channel_lock; @@ -560,6 +542,8 @@ typedef union { uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; } hv_vmbus_synic_event_flags; +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX (0x40000002) /* * Define synthetic interrupt controller model specific registers @@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup( int hv_ring_buffer_write( hv_vmbus_ring_buffer_info *ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buff_count); + uint32_t sg_buff_count, + boolean_t *need_sig); int hv_ring_buffer_peek( hv_vmbus_ring_buffer_info *ring_info, @@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info( hv_vmbus_ring_buffer_info *ring_info, char *prefix); +void hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info *ring_info); + +uint32_t hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info *ring_info); + hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); void hv_vmbus_on_channel_message(void *context); @@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc( void *payload, size_t payload_size); -uint16_t hv_vmbus_signal_event(void); +uint16_t hv_vmbus_signal_event(void *con_id); void hv_vmbus_synic_init(void *irq_arg); void hv_vmbus_synic_cleanup(void *arg); int hv_vmbus_query_hypervisor_presence(void); @@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id); int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); -int hv_vmbus_set_event(uint32_t child_rel_id); +int hv_vmbus_set_event(hv_vmbus_channel *channel); void hv_vmbus_on_events(void *); @@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id( typedef struct { unsigned int vector; - void *page_buffers[2]; + void *page_buffers[2 * MAXCPU]; } hv_setup_args; #endif /* __HYPERV_PRIV_H__ */ |