diff options
author | Renato Botelho <renato@netgate.com> | 2016-06-21 07:44:54 -0300 |
---|---|---|
committer | Renato Botelho <renato@netgate.com> | 2016-06-21 07:44:54 -0300 |
commit | 1fc6b0207cc2f3cce33817706603caa41a9de24d (patch) | |
tree | d2d812b76b08f42a002621f716dd5f3199c7ca7d /sys/dev/hyperv | |
parent | b8632c4f34175c7018be77059ab229e755eb67e0 (diff) | |
parent | bc9e0dd07a76c4d7a1c6fcf21824ca2cecff2c6d (diff) | |
download | FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.zip FreeBSD-src-1fc6b0207cc2f3cce33817706603caa41a9de24d.tar.gz |
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys/dev/hyperv')
25 files changed, 2167 insertions, 682 deletions
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h index f45543b..aeec8ec 100644 --- a/sys/dev/hyperv/include/hyperv.h +++ b/sys/dev/hyperv/include/hyperv.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -124,6 +124,8 @@ typedef struct hv_guid { unsigned char data[16]; } __packed hv_guid; +int snprintf_hv_guid(char *, size_t, const hv_guid *); + #define HV_NIC_GUID \ .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} @@ -689,7 +691,6 @@ typedef struct { } hv_vmbus_ring_buffer_info; typedef void (*hv_vmbus_pfn_channel_callback)(void *context); -typedef void (*hv_vmbus_sc_creation_callback)(void *context); typedef enum { HV_CHANNEL_OFFER_STATE, @@ -753,8 +754,6 @@ typedef struct hv_vmbus_channel { */ hv_vmbus_ring_buffer_info inbound; - struct mtx inbound_lock; - struct taskqueue * rxq; struct task channel_task; hv_vmbus_pfn_channel_callback on_channel_callback; @@ -804,13 +803,6 @@ typedef struct hv_vmbus_channel { * response on the same channel. */ - /* - * Multi-channel creation callback. This callback will be called in - * process context when a Multi-channel offer is received from the host. - * The guest can open the Multi-channel in the context of this callback. - */ - hv_vmbus_sc_creation_callback sc_creation_callback; - struct mtx sc_lock; /* @@ -818,18 +810,24 @@ typedef struct hv_vmbus_channel { */ TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; + int subchan_cnt; /* * The primary channel this sub-channle belongs to. * This will be NULL for the primary channel. */ struct hv_vmbus_channel *primary_channel; + /* - * Support per channel state for use by vmbus drivers. + * Driver private data */ - void *per_channel_state; + void *hv_chan_priv1; + void *hv_chan_priv2; + void *hv_chan_priv3; } hv_vmbus_channel; +#define HV_VMBUS_CHAN_ISPRIMARY(chan) ((chan)->primary_channel == NULL) + static inline void hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) { @@ -908,6 +906,11 @@ int hv_vmbus_channel_teardown_gpdal( struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); +void vmbus_channel_cpu_set(struct hv_vmbus_channel *chan, int cpu); +struct hv_vmbus_channel ** + vmbus_get_subchan(struct hv_vmbus_channel *pri_chan, int subchan_cnt); +void vmbus_rel_subchan(struct hv_vmbus_channel **subchan, int subchan_cnt); + /** * @brief Get physical address from virtual */ diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c index 9a89b62..a62f450 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.c +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. @@ -48,21 +48,27 @@ #include "hv_rndis.h" #include "hv_rndis_filter.h" +/* priv1 and priv2 are consumed by the main driver */ +#define hv_chan_rdbuf hv_chan_priv3 + MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper-V netvsc driver"); /* * Forward declarations */ -static void hv_nv_on_channel_callback(void *context); +static void hv_nv_on_channel_callback(void *xchan); static int hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); static int hv_nv_connect_to_vsp(struct hv_device *device); static void hv_nv_on_send_completion(netvsc_dev *net_dev, - struct hv_device *device, hv_vm_packet_descriptor *pkt); + struct hv_device *device, struct hv_vmbus_channel *, hv_vm_packet_descriptor *pkt); +static void hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, + uint64_t tid, uint32_t status); static void hv_nv_on_receive(netvsc_dev *net_dev, - struct hv_device *device, hv_vm_packet_descriptor *pkt); + struct hv_device *device, struct hv_vmbus_channel *chan, + hv_vm_packet_descriptor *pkt); /* * @@ -115,7 +121,7 @@ hv_nv_get_inbound_net_device(struct hv_device *device) * permit incoming packets if and only if there * are outstanding sends. */ - if (net_dev->destroy && net_dev->num_outstanding_sends == 0) { + if (net_dev->destroy) { return (NULL); } @@ -654,6 +660,16 @@ hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) hv_nv_destroy_send_buffer(net_dev); } +void +hv_nv_subchan_attach(struct hv_vmbus_channel *chan) +{ + + chan->hv_chan_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); + hv_vmbus_channel_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, + NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, + hv_nv_on_channel_callback, chan); +} + /* * Net VSC on device add * @@ -662,25 +678,30 @@ hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) netvsc_dev * hv_nv_on_device_add(struct hv_device *device, void *additional_info) { + struct hv_vmbus_channel *chan = device->channel; netvsc_dev *net_dev; int ret = 0; net_dev = hv_nv_alloc_net_device(device); - if (!net_dev) - goto cleanup; + if (net_dev == NULL) + return NULL; /* Initialize the NetVSC channel extension */ sema_init(&net_dev->channel_init_sema, 0, "netdev_sema"); + chan->hv_chan_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); + /* * Open the channel */ - ret = hv_vmbus_channel_open(device->channel, + ret = hv_vmbus_channel_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, - NULL, 0, hv_nv_on_channel_callback, device); - if (ret != 0) + NULL, 0, hv_nv_on_channel_callback, chan); + if (ret != 0) { + free(chan->hv_chan_rdbuf, M_NETVSC); goto cleanup; + } /* * Connect with the NetVsp @@ -693,18 +714,16 @@ hv_nv_on_device_add(struct hv_device *device, void *additional_info) close: /* Now, we can close the channel safely */ - - hv_vmbus_channel_close(device->channel); + free(chan->hv_chan_rdbuf, M_NETVSC); + hv_vmbus_channel_close(chan); cleanup: /* * Free the packet buffers on the netvsc device packet queue. * Release other resources. */ - if (net_dev) { - sema_destroy(&net_dev->channel_init_sema); - free(net_dev, M_NETVSC); - } + sema_destroy(&net_dev->channel_init_sema); + free(net_dev, M_NETVSC); return (NULL); } @@ -719,14 +738,7 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel) netvsc_dev *net_dev = sc->net_dev;; /* Stop outbound traffic ie sends and receives completions */ - mtx_lock(&device->channel->inbound_lock); net_dev->destroy = TRUE; - mtx_unlock(&device->channel->inbound_lock); - - /* Wait for all send completions */ - while (net_dev->num_outstanding_sends) { - DELAY(100); - } hv_nv_disconnect_from_vsp(net_dev); @@ -739,6 +751,7 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel) HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE; } + free(device->channel->hv_chan_rdbuf, M_NETVSC); hv_vmbus_channel_close(device->channel); sema_destroy(&net_dev->channel_init_sema); @@ -752,7 +765,8 @@ hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel) */ static void hv_nv_on_send_completion(netvsc_dev *net_dev, - struct hv_device *device, hv_vm_packet_descriptor *pkt) + struct hv_device *device, struct hv_vmbus_channel *chan, + hv_vm_packet_descriptor *pkt) { nvsp_msg *nvsp_msg_pkt; netvsc_packet *net_vsc_pkt; @@ -764,7 +778,9 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rx_buf_complete || nvsp_msg_pkt->hdr.msg_type - == nvsp_msg_1_type_send_send_buf_complete) { + == nvsp_msg_1_type_send_send_buf_complete + || nvsp_msg_pkt->hdr.msg_type + == nvsp_msg5_type_subchannel) { /* Copy the response back */ memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, sizeof(nvsp_msg)); @@ -801,12 +817,10 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, } /* Notify the layer above us */ - net_vsc_pkt->compl.send.on_send_completion( + net_vsc_pkt->compl.send.on_send_completion(chan, net_vsc_pkt->compl.send.send_completion_context); } - - atomic_subtract_int(&net_dev->num_outstanding_sends, 1); } } @@ -816,16 +830,11 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, * Returns 0 on success, non-zero on failure. */ int -hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt) +hv_nv_on_send(struct hv_vmbus_channel *chan, netvsc_packet *pkt) { - netvsc_dev *net_dev; nvsp_msg send_msg; int ret; - net_dev = hv_nv_get_outbound_net_device(device); - if (!net_dev) - return (ENODEV); - send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt; if (pkt->is_data_pkt) { /* 0 is RMC_DATA */ @@ -841,20 +850,16 @@ hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt) pkt->send_buf_section_size; if (pkt->page_buf_count) { - ret = hv_vmbus_channel_send_packet_pagebuffer(device->channel, + ret = hv_vmbus_channel_send_packet_pagebuffer(chan, pkt->page_buffers, pkt->page_buf_count, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); } else { - ret = hv_vmbus_channel_send_packet(device->channel, + ret = hv_vmbus_channel_send_packet(chan, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); } - /* Record outstanding send only if send_packet() succeeded */ - if (ret == 0) - atomic_add_int(&net_dev->num_outstanding_sends, 1); - return (ret); } @@ -866,7 +871,7 @@ hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt) */ static void hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device, - hv_vm_packet_descriptor *pkt) + struct hv_vmbus_channel *chan, hv_vm_packet_descriptor *pkt) { hv_vm_transfer_page_packet_header *vm_xfer_page_pkt; nvsp_msg *nvsp_msg_pkt; @@ -916,7 +921,7 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device, net_vsc_pkt->tot_data_buf_len = vm_xfer_page_pkt->ranges[i].byte_count; - hv_rf_on_receive(net_dev, device, net_vsc_pkt); + hv_rf_on_receive(net_dev, device, chan, net_vsc_pkt); if (net_vsc_pkt->status != nvsp_status_success) { status = nvsp_status_failure; } @@ -927,9 +932,8 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device, * messages (not just data messages) will trigger a response * message back to the host. */ - hv_nv_on_receive_completion(device, vm_xfer_page_pkt->d.transaction_id, + hv_nv_on_receive_completion(chan, vm_xfer_page_pkt->d.transaction_id, status); - hv_rf_receive_rollup(net_dev); } /* @@ -937,8 +941,8 @@ hv_nv_on_receive(netvsc_dev *net_dev, struct hv_device *device, * * Send a receive completion packet to RNDIS device (ie NetVsp) */ -void -hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid, +static void +hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, uint64_t tid, uint32_t status) { nvsp_msg rx_comp_msg; @@ -953,7 +957,7 @@ hv_nv_on_receive_completion(struct hv_device *device, uint64_t tid, retry_send_cmplt: /* Send the completion */ - ret = hv_vmbus_channel_send_packet(device->channel, &rx_comp_msg, + ret = hv_vmbus_channel_send_packet(chan, &rx_comp_msg, sizeof(nvsp_msg), tid, HV_VMBUS_PACKET_TYPE_COMPLETION, 0); if (ret == 0) { /* success */ @@ -970,12 +974,53 @@ retry_send_cmplt: } /* + * Net VSC receiving vRSS send table from VSP + */ +static void +hv_nv_send_table(struct hv_device *device, hv_vm_packet_descriptor *pkt) +{ + netvsc_dev *net_dev; + nvsp_msg *nvsp_msg_pkt; + int i; + uint32_t count, *table; + + net_dev = hv_nv_get_inbound_net_device(device); + if (!net_dev) + return; + + nvsp_msg_pkt = + (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); + + if (nvsp_msg_pkt->hdr.msg_type != + nvsp_msg5_type_send_indirection_table) { + printf("Netvsc: !Warning! receive msg type not " + "send_indirection_table. type = %d\n", + nvsp_msg_pkt->hdr.msg_type); + return; + } + + count = nvsp_msg_pkt->msgs.vers_5_msgs.send_table.count; + if (count != VRSS_SEND_TABLE_SIZE) { + printf("Netvsc: Received wrong send table size: %u\n", count); + return; + } + + table = (uint32_t *) + ((unsigned long)&nvsp_msg_pkt->msgs.vers_5_msgs.send_table + + nvsp_msg_pkt->msgs.vers_5_msgs.send_table.offset); + + for (i = 0; i < count; i++) + net_dev->vrss_send_table[i] = table[i]; +} + +/* * Net VSC on channel callback */ static void -hv_nv_on_channel_callback(void *context) +hv_nv_on_channel_callback(void *xchan) { - struct hv_device *device = (struct hv_device *)context; + struct hv_vmbus_channel *chan = xchan; + struct hv_device *device = chan->device; netvsc_dev *net_dev; device_t dev = device->device; uint32_t bytes_rxed; @@ -989,20 +1034,24 @@ hv_nv_on_channel_callback(void *context) if (net_dev == NULL) return; - buffer = net_dev->callback_buf; + buffer = chan->hv_chan_rdbuf; do { - ret = hv_vmbus_channel_recv_packet_raw(device->channel, + ret = hv_vmbus_channel_recv_packet_raw(chan, buffer, bufferlen, &bytes_rxed, &request_id); if (ret == 0) { if (bytes_rxed > 0) { desc = (hv_vm_packet_descriptor *)buffer; switch (desc->type) { case HV_VMBUS_PACKET_TYPE_COMPLETION: - hv_nv_on_send_completion(net_dev, device, desc); + hv_nv_on_send_completion(net_dev, device, + chan, desc); break; case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES: - hv_nv_on_receive(net_dev, device, desc); + hv_nv_on_receive(net_dev, device, chan, desc); + break; + case HV_VMBUS_PACKET_TYPE_DATA_IN_BAND: + hv_nv_send_table(device, desc); break; default: device_printf(dev, @@ -1036,5 +1085,5 @@ hv_nv_on_channel_callback(void *context) if (bufferlen > NETVSC_PACKET_SIZE) free(buffer, M_NETVSC); - hv_rf_channel_rollup(net_dev); + hv_rf_channel_rollup(chan); } diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h index 95dee17..7c43f64 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. @@ -86,6 +86,92 @@ MALLOC_DECLARE(M_NETVSC); */ #define NVSP_MAX_PACKETS_PER_RECEIVE 375 +/* vRSS stuff */ +#define RNDIS_OBJECT_TYPE_RSS_CAPABILITIES 0x88 +#define RNDIS_OBJECT_TYPE_RSS_PARAMETERS 0x89 + +#define RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2 2 +#define RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2 2 + +struct rndis_obj_header { + uint8_t type; + uint8_t rev; + uint16_t size; +} __packed; + +/* rndis_recv_scale_cap/cap_flag */ +#define RNDIS_RSS_CAPS_MESSAGE_SIGNALED_INTERRUPTS 0x01000000 +#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_ISR 0x02000000 +#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_DPC 0x04000000 +#define RNDIS_RSS_CAPS_USING_MSI_X 0x08000000 +#define RNDIS_RSS_CAPS_RSS_AVAILABLE_ON_PORTS 0x10000000 +#define RNDIS_RSS_CAPS_SUPPORTS_MSI_X 0x20000000 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV4 0x00000100 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6 0x00000200 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6_EX 0x00000400 + +/* RNDIS_RECEIVE_SCALE_CAPABILITIES */ +struct rndis_recv_scale_cap { + struct rndis_obj_header hdr; + uint32_t cap_flag; + uint32_t num_int_msg; + uint32_t num_recv_que; + uint16_t num_indirect_tabent; +} __packed; + +/* rndis_recv_scale_param flags */ +#define RNDIS_RSS_PARAM_FLAG_BASE_CPU_UNCHANGED 0x0001 +#define RNDIS_RSS_PARAM_FLAG_HASH_INFO_UNCHANGED 0x0002 +#define RNDIS_RSS_PARAM_FLAG_ITABLE_UNCHANGED 0x0004 +#define RNDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED 0x0008 +#define RNDIS_RSS_PARAM_FLAG_DISABLE_RSS 0x0010 + +/* Hash info bits */ +#define RNDIS_HASH_FUNC_TOEPLITZ 0x00000001 +#define RNDIS_HASH_IPV4 0x00000100 +#define RNDIS_HASH_TCP_IPV4 0x00000200 +#define RNDIS_HASH_IPV6 0x00000400 +#define RNDIS_HASH_IPV6_EX 0x00000800 +#define RNDIS_HASH_TCP_IPV6 0x00001000 +#define RNDIS_HASH_TCP_IPV6_EX 0x00002000 + +#define RNDIS_RSS_INDIRECTION_TABLE_MAX_SIZE_REVISION_2 (128 * 4) +#define RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 40 + +#define ITAB_NUM 128 +#define HASH_KEYLEN RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 + +/* RNDIS_RECEIVE_SCALE_PARAMETERS */ +typedef struct rndis_recv_scale_param_ { + struct rndis_obj_header hdr; + + /* Qualifies the rest of the information */ + uint16_t flag; + + /* The base CPU number to do receive processing. not used */ + uint16_t base_cpu_number; + + /* This describes the hash function and type being enabled */ + uint32_t hashinfo; + + /* The size of indirection table array */ + uint16_t indirect_tabsize; + + /* The offset of the indirection table from the beginning of this + * structure + */ + uint32_t indirect_taboffset; + + /* The size of the hash secret key */ + uint16_t hashkey_size; + + /* The offset of the secret key from the beginning of this structure */ + uint32_t hashkey_offset; + + uint32_t processor_masks_offset; + uint32_t num_processor_masks; + uint32_t processor_masks_entry_size; +} rndis_recv_scale_param; typedef enum nvsp_msg_type_ { nvsp_msg_type_none = 0, @@ -146,6 +232,27 @@ typedef enum nvsp_msg_type_ { nvsp_msg_2_type_alloc_chimney_handle, nvsp_msg_2_type_alloc_chimney_handle_complete, + + nvsp_msg2_max = nvsp_msg_2_type_alloc_chimney_handle_complete, + + /* + * Version 4 Messages + */ + nvsp_msg4_type_send_vf_association, + nvsp_msg4_type_switch_data_path, + nvsp_msg4_type_uplink_connect_state_deprecated, + + nvsp_msg4_max = nvsp_msg4_type_uplink_connect_state_deprecated, + + /* + * Version 5 Messages + */ + nvsp_msg5_type_oid_query_ex, + nvsp_msg5_type_oid_query_ex_comp, + nvsp_msg5_type_subchannel, + nvsp_msg5_type_send_indirection_table, + + nvsp_msg5_max = nvsp_msg5_type_send_indirection_table, } nvsp_msg_type; typedef enum nvsp_status_ { @@ -793,6 +900,39 @@ typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_ uint32_t status; } __packed nvsp_2_msg_send_vmq_rndis_pkt_complete; +/* + * Version 5 messages + */ +enum nvsp_subchannel_operation { + NVSP_SUBCHANNEL_NONE = 0, + NVSP_SUBCHANNE_ALLOCATE, + NVSP_SUBCHANNE_MAX +}; + +typedef struct nvsp_5_subchannel_request_ +{ + uint32_t op; + uint32_t num_subchannels; +} __packed nvsp_5_subchannel_request; + +typedef struct nvsp_5_subchannel_complete_ +{ + uint32_t status; + /* Actual number of subchannels allocated */ + uint32_t num_subchannels; +} __packed nvsp_5_subchannel_complete; + +typedef struct nvsp_5_send_indirect_table_ +{ + /* The number of entries in the send indirection table */ + uint32_t count; + /* + * The offset of the send indireciton table from top of + * this struct. The send indirection table tells which channel + * to put the send traffic on. Each entry is a channel number. + */ + uint32_t offset; +} __packed nvsp_5_send_indirect_table; typedef union nvsp_1_msg_uber_ { nvsp_1_msg_send_ndis_version send_ndis_vers; @@ -838,11 +978,18 @@ typedef union nvsp_2_msg_uber_ { nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete; } __packed nvsp_2_msg_uber; +typedef union nvsp_5_msg_uber_ +{ + nvsp_5_subchannel_request subchannel_request; + nvsp_5_subchannel_complete subchn_complete; + nvsp_5_send_indirect_table send_table; +} __packed nvsp_5_msg_uber; typedef union nvsp_all_msgs_ { nvsp_msg_init_uber init_msgs; nvsp_1_msg_uber vers_1_msgs; nvsp_2_msg_uber vers_2_msgs; + nvsp_5_msg_uber vers_5_msgs; } __packed nvsp_all_msgs; /* @@ -883,6 +1030,7 @@ typedef struct nvsp_msg_ { #define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) #define NETVSC_PACKET_SIZE PAGE_SIZE +#define VRSS_SEND_TABLE_SIZE 16 /* * Data types @@ -893,7 +1041,6 @@ typedef struct nvsp_msg_ { */ typedef struct netvsc_dev_ { struct hv_device *dev; - int num_outstanding_sends; /* Send buffer allocated by us but manages by NetVSP */ void *send_buf; @@ -924,12 +1071,15 @@ typedef struct netvsc_dev_ { hv_bool_uint8_t destroy; /* Negotiated NVSP version */ uint32_t nvsp_version; - - uint8_t callback_buf[NETVSC_PACKET_SIZE]; + + uint32_t num_channel; + + uint32_t vrss_send_table[VRSS_SEND_TABLE_SIZE]; } netvsc_dev; +struct hv_vmbus_channel; -typedef void (*pfn_on_send_rx_completion)(void *); +typedef void (*pfn_on_send_rx_completion)(struct hv_vmbus_channel *, void *); #define NETVSC_DEVICE_RING_BUFFER_SIZE (128 * PAGE_SIZE) #define NETVSC_PACKET_MAXPAGE 32 @@ -1000,10 +1150,12 @@ struct buf_ring; #endif struct hn_rx_ring { - struct lro_ctrl hn_lro; + struct ifnet *hn_ifp; + int hn_rx_idx; /* Trust csum verification on host side */ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ + struct lro_ctrl hn_lro; u_long hn_csum_ip; u_long hn_csum_tcp; @@ -1011,12 +1163,20 @@ struct hn_rx_ring { u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; + u_long hn_pkts; + u_long hn_rss_pkts; + + /* Rarely used stuffs */ + struct sysctl_oid *hn_rx_sysctl_tree; + int hn_rx_flags; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 #define HN_TRUST_HCSUM_TCP 0x0002 #define HN_TRUST_HCSUM_UDP 0x0004 +#define HN_RX_FLAG_ATTACHED 0x1 + struct hn_tx_ring { #ifndef HN_USE_TXDESC_BUFRING struct mtx hn_txlist_spin; @@ -1026,7 +1186,8 @@ struct hn_tx_ring { #endif int hn_txdesc_cnt; int hn_txdesc_avail; - int hn_has_txeof; + u_short hn_has_txeof; + u_short hn_txdone_cnt; int hn_sched_tx; void (*hn_txeof)(struct hn_tx_ring *); @@ -1034,8 +1195,13 @@ struct hn_tx_ring { struct task hn_tx_task; struct task hn_txeof_task; + struct buf_ring *hn_mbuf_br; + int hn_oactive; + int hn_tx_idx; + struct mtx hn_tx_lock; struct hn_softc *hn_sc; + struct hv_vmbus_channel *hn_chan; int hn_direct_tx_size; int hn_tx_chimney_size; @@ -1046,14 +1212,19 @@ struct hn_tx_ring { u_long hn_send_failed; u_long hn_txdma_failed; u_long hn_tx_collapsed; + u_long hn_tx_chimney_tried; u_long hn_tx_chimney; + u_long hn_pkts; /* Rarely used stuffs */ struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; + int hn_tx_flags; } __aligned(CACHE_LINE_SIZE); +#define HN_TX_FLAG_ATTACHED 0x1 + /* * Device-specific softc structure */ @@ -1073,13 +1244,18 @@ typedef struct hn_softc { netvsc_dev *net_dev; int hn_rx_ring_cnt; + int hn_rx_ring_inuse; struct hn_rx_ring *hn_rx_ring; int hn_tx_ring_cnt; + int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; + + int hn_cpu; int hn_tx_chimney_max; struct taskqueue *hn_tx_taskq; struct sysctl_oid *hn_tx_sysctl_tree; + struct sysctl_oid *hn_rx_sysctl_tree; } hn_softc_t; /* @@ -1088,14 +1264,13 @@ typedef struct hn_softc { extern int hv_promisc_mode; void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status); -void hv_nv_on_receive_completion(struct hv_device *device, - uint64_t tid, uint32_t status); netvsc_dev *hv_nv_on_device_add(struct hv_device *device, void *additional_info); int hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel); -int hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt); +int hv_nv_on_send(struct hv_vmbus_channel *chan, netvsc_packet *pkt); int hv_nv_get_next_send_section(netvsc_dev *net_dev); +void hv_nv_subchan_attach(struct hv_vmbus_channel *chan); #endif /* __HV_NET_VSC_H__ */ diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index 0f4425e..f670c12 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -1,6 +1,6 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * @@ -119,6 +119,8 @@ __FBSDID("$FreeBSD$"); #include "hv_rndis.h" #include "hv_rndis_filter.h" +#define hv_chan_rxr hv_chan_priv1 +#define hv_chan_txr hv_chan_priv2 /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" @@ -136,8 +138,11 @@ __FBSDID("$FreeBSD$"); #define HN_LROENT_CNT_DEF 128 +#define HN_RING_CNT_DEF_MAX 8 + #define HN_RNDIS_MSG_LEN \ (sizeof(rndis_msg) + \ + RNDIS_HASHVAL_PPI_SIZE + \ RNDIS_VLAN_PPI_SIZE + \ RNDIS_TSO_PPI_SIZE + \ RNDIS_CSUM_PPI_SIZE) @@ -152,6 +157,8 @@ __FBSDID("$FreeBSD$"); #define HN_DIRECT_TX_SIZE_DEF 128 +#define HN_EARLY_TXEOF_THRESH 8 + struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; @@ -180,6 +187,7 @@ struct hn_txdesc { #define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP) #define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) +#define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) @@ -208,7 +216,8 @@ struct hn_txdesc { int hv_promisc_mode = 0; /* normal mode by default */ -SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface"); +SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; @@ -231,12 +240,10 @@ SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); -#if __FreeBSD_version >= 1100045 /* Limit TSO burst size */ static int hn_tso_maxlen = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); -#endif /* Limit chimney send size */ static int hn_tx_chimney_size = 0; @@ -274,6 +281,25 @@ static int hn_bind_tx_taskq = -1; SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); +static int hn_use_if_start = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, + &hn_use_if_start, 0, "Use if_start TX method"); + +static int hn_chan_cnt = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, + &hn_chan_cnt, 0, + "# of channels to use; each channel has one RX ring and one TX ring"); + +static int hn_tx_ring_cnt = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, + &hn_tx_ring_cnt, 0, "# of TX rings to use"); + +static int hn_tx_swq_depth = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, + &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); + +static u_int hn_cpu_index; + /* * Forward declarations */ @@ -303,15 +329,45 @@ static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *, int); static void hn_destroy_tx_ring(struct hn_tx_ring *); -static int hn_create_tx_data(struct hn_softc *); +static int hn_create_tx_data(struct hn_softc *, int); static void hn_destroy_tx_data(struct hn_softc *); static void hn_start_taskfunc(void *, int); static void hn_start_txeof_taskfunc(void *, int); static void hn_stop_tx_tasks(struct hn_softc *); static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); -static void hn_create_rx_data(struct hn_softc *sc); +static void hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); static void hn_set_tx_chimney_size(struct hn_softc *, int); +static void hn_channel_attach(struct hn_softc *, struct hv_vmbus_channel *); +static void hn_subchan_attach(struct hn_softc *, struct hv_vmbus_channel *); + +static int hn_transmit(struct ifnet *, struct mbuf *); +static void hn_xmit_qflush(struct ifnet *); +static int hn_xmit(struct hn_tx_ring *, int); +static void hn_xmit_txeof(struct hn_tx_ring *); +static void hn_xmit_taskfunc(void *, int); +static void hn_xmit_txeof_taskfunc(void *, int); + +#if __FreeBSD_version >= 1100099 +static void +hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) +{ + int i; + + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) + sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; +} +#endif + +static int +hn_get_txswq_depth(const struct hn_tx_ring *txr) +{ + + KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); + if (hn_tx_swq_depth < txr->hn_txdesc_cnt) + return txr->hn_txdesc_cnt; + return hn_tx_swq_depth; +} static int hn_ifmedia_upd(struct ifnet *ifp __unused) @@ -353,7 +409,7 @@ netvsc_probe(device_t dev) p = vmbus_get_type(dev); if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) { - device_set_desc(dev, "Synthetic Network Interface"); + device_set_desc(dev, "Hyper-V Network Interface"); if (bootverbose) printf("Netvsc probe... DONE \n"); @@ -386,21 +442,16 @@ static int netvsc_attach(device_t dev) { struct hv_device *device_ctx = vmbus_get_devctx(dev); + struct hv_vmbus_channel *pri_chan; netvsc_device_info device_info; hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp = NULL; - int error; -#if __FreeBSD_version >= 1100045 + int error, ring_cnt, tx_ring_cnt; int tso_maxlen; -#endif sc = device_get_softc(dev); - if (sc == NULL) { - return (ENOMEM); - } - bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; @@ -431,26 +482,69 @@ netvsc_attach(device_t dev) ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + + /* + * Figure out the # of RX rings (ring_cnt) and the # of TX rings + * to use (tx_ring_cnt). + * + * NOTE: + * The # of RX rings to use is same as the # of channels to use. + */ + ring_cnt = hn_chan_cnt; + if (ring_cnt <= 0) { + /* Default */ + ring_cnt = mp_ncpus; + if (ring_cnt > HN_RING_CNT_DEF_MAX) + ring_cnt = HN_RING_CNT_DEF_MAX; + } else if (ring_cnt > mp_ncpus) { + ring_cnt = mp_ncpus; + } + + tx_ring_cnt = hn_tx_ring_cnt; + if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) + tx_ring_cnt = ring_cnt; + if (hn_use_if_start) { + /* ifnet.if_start only needs one TX ring. */ + tx_ring_cnt = 1; + } + + /* + * Set the leader CPU for channels. + */ + sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; - error = hn_create_tx_data(sc); + error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; + hn_create_rx_data(sc, ring_cnt); - hn_create_rx_data(sc); - - if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_dunit = unit; - ifp->if_dname = NETVSC_DEVNAME; + /* + * Associate the first TX/RX ring w/ the primary channel. + */ + pri_chan = device_ctx->channel; + KASSERT(HV_VMBUS_CHAN_ISPRIMARY(pri_chan), ("not primary channel")); + KASSERT(pri_chan->offer_msg.offer.sub_channel_index == 0, + ("primary channel subidx %u", + pri_chan->offer_msg.offer.sub_channel_index)); + hn_channel_attach(sc, pri_chan); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; - ifp->if_start = hn_start; ifp->if_init = hn_ifinit; /* needed by hv_rf_on_device_add() code */ ifp->if_mtu = ETHERMTU; - IFQ_SET_MAXLEN(&ifp->if_snd, 512); - ifp->if_snd.ifq_drv_maxlen = 511; - IFQ_SET_READY(&ifp->if_snd); + if (hn_use_if_start) { + int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); + + ifp->if_start = hn_start; + IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); + ifp->if_snd.ifq_drv_maxlen = qdepth - 1; + IFQ_SET_READY(&ifp->if_snd); + } else { + ifp->if_transmit = hn_transmit; + ifp->if_qflush = hn_xmit_qflush; + } ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); @@ -470,15 +564,58 @@ netvsc_attach(device_t dev) IFCAP_LRO; ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO; - error = hv_rf_on_device_add(device_ctx, &device_info); + error = hv_rf_on_device_add(device_ctx, &device_info, ring_cnt); if (error) goto failed; + KASSERT(sc->net_dev->num_channel > 0 && + sc->net_dev->num_channel <= sc->hn_rx_ring_inuse, + ("invalid channel count %u, should be less than %d", + sc->net_dev->num_channel, sc->hn_rx_ring_inuse)); + + /* + * Set the # of TX/RX rings that could be used according to + * the # of channels that host offered. + */ + if (sc->hn_tx_ring_inuse > sc->net_dev->num_channel) + sc->hn_tx_ring_inuse = sc->net_dev->num_channel; + sc->hn_rx_ring_inuse = sc->net_dev->num_channel; + device_printf(dev, "%d TX ring, %d RX ring\n", + sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); + + if (sc->net_dev->num_channel > 1) { + struct hv_vmbus_channel **subchan; + int subchan_cnt = sc->net_dev->num_channel - 1; + int i; + + /* Wait for sub-channels setup to complete. */ + subchan = vmbus_get_subchan(pri_chan, subchan_cnt); + + /* Attach the sub-channels. */ + for (i = 0; i < subchan_cnt; ++i) { + /* NOTE: Calling order is critical. */ + hn_subchan_attach(sc, subchan[i]); + hv_nv_subchan_attach(subchan[i]); + } + + /* Release the sub-channels */ + vmbus_rel_subchan(subchan, subchan_cnt); + device_printf(dev, "%d sub-channels setup done\n", subchan_cnt); + } + +#if __FreeBSD_version >= 1100099 + if (sc->hn_rx_ring_inuse > 1) { + /* + * Reduce TCP segment aggregation limit for multiple + * RX rings to increase ACK timeliness. + */ + hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); + } +#endif if (device_info.link_state == 0) { sc->hn_carrier = 1; } -#if __FreeBSD_version >= 1100045 tso_maxlen = hn_tso_maxlen; if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; @@ -487,14 +624,11 @@ netvsc_attach(device_t dev) ifp->if_hw_tsomaxsegsize = PAGE_SIZE; ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); -#endif ether_ifattach(ifp, device_info.mac_addr); -#if __FreeBSD_version >= 1100045 if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); -#endif sc->hn_tx_chimney_max = sc->net_dev->send_section_size; hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); @@ -674,8 +808,15 @@ hn_txdesc_hold(struct hn_txdesc *txd) atomic_add_int(&txd->refs, 1); } +static __inline void +hn_txeof(struct hn_tx_ring *txr) +{ + txr->hn_has_txeof = 0; + txr->hn_txeof(txr); +} + static void -hn_tx_done(void *xpkt) +hn_tx_done(struct hv_vmbus_channel *chan, void *xpkt) { netvsc_packet *packet = xpkt; struct hn_txdesc *txd; @@ -685,17 +826,28 @@ hn_tx_done(void *xpkt) packet->compl.send.send_completion_tid; txr = txd->txr; + KASSERT(txr->hn_chan == chan, + ("channel mismatch, on channel%u, should be channel%u", + chan->offer_msg.offer.sub_channel_index, + txr->hn_chan->offer_msg.offer.sub_channel_index)); + txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); + + ++txr->hn_txdone_cnt; + if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { + txr->hn_txdone_cnt = 0; + if (txr->hn_oactive) + hn_txeof(txr); + } } void -netvsc_channel_rollup(struct hv_device *device_ctx) +netvsc_channel_rollup(struct hv_vmbus_channel *chan) { - struct hn_softc *sc = device_get_softc(device_ctx->device); - struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */ + struct hn_tx_ring *txr = chan->hv_chan_txr; #if defined(INET) || defined(INET6) - struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ + struct hn_rx_ring *rxr = chan->hv_chan_rxr; struct lro_ctrl *lro = &rxr->hn_lro; struct lro_entry *queued; @@ -705,11 +857,16 @@ netvsc_channel_rollup(struct hv_device *device_ctx) } #endif - if (!txr->hn_has_txeof) + /* + * NOTE: + * 'txr' could be NULL, if multiple channels and + * ifnet.if_start method are enabled. + */ + if (txr == NULL || !txr->hn_has_txeof) return; - txr->hn_has_txeof = 0; - txr->hn_txeof(txr); + txr->hn_txdone_cnt = 0; + hn_txeof(txr); } /* @@ -726,6 +883,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; rndis_per_packet_info *rppi; + struct rndis_hash_value *hash_value; uint32_t rndis_msg_size; packet = &txd->netvsc_pkt; @@ -750,6 +908,18 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); + /* + * Set the hash value for this packet, so that the host could + * dispatch the TX done event for this packet back to this TX + * ring's channel. + */ + rndis_msg_size += RNDIS_HASHVAL_PPI_SIZE; + rppi = hv_set_rppi_data(rndis_mesg, RNDIS_HASHVAL_PPI_SIZE, + nbl_hash_value); + hash_value = (struct rndis_hash_value *)((uint8_t *)rppi + + rppi->per_packet_info_offset); + hash_value->hash_value = txr->hn_tx_idx; + if (m_head->m_flags & M_VLANTAG) { ndis_8021q_info *rppi_vlan_info; @@ -851,6 +1021,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) netvsc_dev *net_dev = txr->hn_sc->net_dev; uint32_t send_buf_section_idx; + txr->hn_tx_chimney_tried++; send_buf_section_idx = hv_nv_get_next_send_section(net_dev); if (send_buf_section_idx != @@ -932,8 +1103,7 @@ done: * associated w/ the txd will _not_ be freed. */ static int -hn_send_pkt(struct ifnet *ifp, struct hv_device *device_ctx, - struct hn_tx_ring *txr, struct hn_txdesc *txd) +hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0; @@ -942,10 +1112,17 @@ again: * Make sure that txd is not freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); - error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt); + error = hv_nv_on_send(txr->hn_chan, &txd->netvsc_pkt); if (!error) { ETHER_BPF_MTAP(ifp, txd->m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if (!hn_use_if_start) { + if_inc_counter(ifp, IFCOUNTER_OBYTES, + txd->m->m_pkthdr.len); + if (txd->m->m_flags & M_MCAST) + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); + } + txr->hn_pkts++; } hn_txdesc_put(txr, txd); @@ -996,8 +1173,9 @@ hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; - struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + KASSERT(hn_use_if_start, + ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); @@ -1038,7 +1216,7 @@ hn_start_locked(struct hn_tx_ring *txr, int len) continue; } - error = hn_send_pkt(ifp, device_ctx, txr, txd); + error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); @@ -1057,10 +1235,6 @@ netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status) { hn_softc_t *sc = device_get_softc(device_obj->device); - if (sc == NULL) { - return; - } - if (status == 1) { sc->hn_carrier = 1; } else { @@ -1133,26 +1307,18 @@ hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) * Note: This is no longer used as a callback */ int -netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, - rndis_tcp_ip_csum_info *csum_info) +netvsc_recv(struct hv_vmbus_channel *chan, netvsc_packet *packet, + const rndis_tcp_ip_csum_info *csum_info, + const struct rndis_hash_info *hash_info, + const struct rndis_hash_value *hash_value) { - struct hn_softc *sc = device_get_softc(device_ctx->device); - struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */ + struct hn_rx_ring *rxr = chan->hv_chan_rxr; + struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; - struct ifnet *ifp; int size, do_lro = 0, do_csum = 1; - if (sc == NULL) { - return (0); /* TODO: KYS how can this be! */ - } - - ifp = sc->hn_ifp; - - ifp = sc->arpcom.ac_ifp; - - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); - } /* * Bail out if packet contains more data than configured MTU. @@ -1161,8 +1327,10 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, return (0); } else if (packet->tot_data_buf_len <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); - if (m_new == NULL) + if (m_new == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); + } memcpy(mtod(m_new, void *), packet->data, packet->tot_data_buf_len); m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; @@ -1182,7 +1350,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { - if_printf(ifp, "alloc mbuf failed.\n"); + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } @@ -1251,7 +1419,6 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } - /* Rely on SW csum verification though... */ do_lro = 1; } else if (pr == IPPROTO_UDP) { if (do_csum && @@ -1278,12 +1445,58 @@ skip: m_new->m_flags |= M_VLANTAG; } + if (hash_info != NULL && hash_value != NULL) { + int hash_type = M_HASHTYPE_OPAQUE; + + rxr->hn_rss_pkts++; + m_new->m_pkthdr.flowid = hash_value->hash_value; + if ((hash_info->hash_info & NDIS_HASH_FUNCTION_MASK) == + NDIS_HASH_FUNCTION_TOEPLITZ) { + uint32_t type = + (hash_info->hash_info & NDIS_HASH_TYPE_MASK); + + switch (type) { + case NDIS_HASH_IPV4: + hash_type = M_HASHTYPE_RSS_IPV4; + break; + + case NDIS_HASH_TCP_IPV4: + hash_type = M_HASHTYPE_RSS_TCP_IPV4; + break; + + case NDIS_HASH_IPV6: + hash_type = M_HASHTYPE_RSS_IPV6; + break; + + case NDIS_HASH_IPV6_EX: + hash_type = M_HASHTYPE_RSS_IPV6_EX; + break; + + case NDIS_HASH_TCP_IPV6: + hash_type = M_HASHTYPE_RSS_TCP_IPV6; + break; + + case NDIS_HASH_TCP_IPV6_EX: + hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; + break; + } + } + M_HASHTYPE_SET(m_new, hash_type); + } else { + if (hash_value != NULL) + m_new->m_pkthdr.flowid = hash_value->hash_value; + else + m_new->m_pkthdr.flowid = rxr->hn_rx_idx; + M_HASHTYPE_SET(m_new, M_HASHTYPE_OPAQUE); + } + /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. */ ifp->if_ipackets++; + rxr->hn_pkts++; if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) @@ -1305,11 +1518,6 @@ skip: return (0); } -void -netvsc_recv_rollup(struct hv_device *device_ctx __unused) -{ -} - /* * Rules for using sc->temp_unusable: * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() @@ -1373,13 +1581,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) */ NV_LOCK(sc); if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < - HN_LRO_LENLIM_MIN(ifp)) { - int i; - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { - sc->hn_rx_ring[i].hn_lro.lro_length_lim = - HN_LRO_LENLIM_MIN(ifp); - } - } + HN_LRO_LENLIM_MIN(ifp)) + hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); NV_UNLOCK(sc); #endif @@ -1412,7 +1615,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) NV_UNLOCK(sc); break; } - error = hv_rf_on_device_add(hn_dev, &device_info); + error = hv_rf_on_device_add(hn_dev, &device_info, + sc->hn_rx_ring_inuse); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; @@ -1555,7 +1759,7 @@ static void hn_stop(hn_softc_t *sc) { struct ifnet *ifp; - int ret; + int ret, i; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); ifp = sc->hn_ifp; @@ -1565,6 +1769,9 @@ hn_stop(hn_softc_t *sc) atomic_clear_int(&ifp->if_drv_flags, (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + sc->hn_tx_ring[i].hn_oactive = 0; + if_link_state_change(ifp, LINK_STATE_DOWN); sc->hn_initdone = 0; @@ -1637,7 +1844,7 @@ hn_ifinit_locked(hn_softc_t *sc) { struct ifnet *ifp; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); - int ret; + int ret, i; ifp = sc->hn_ifp; @@ -1653,7 +1860,11 @@ hn_ifinit_locked(hn_softc_t *sc) } else { sc->hn_initdone = 1; } + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + sc->hn_tx_ring[i].hn_oactive = 0; + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); if_link_state_change(ifp, LINK_STATE_UP); } @@ -1704,7 +1915,7 @@ hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; - int error, i; + int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); @@ -1716,8 +1927,7 @@ hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) return EINVAL; NV_LOCK(sc); - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) - sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; + hn_set_lro_lenlim(sc, lenlim); NV_UNLOCK(sc); return 0; } @@ -1746,7 +1956,7 @@ hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) */ --ackcnt; NV_LOCK(sc); - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; NV_UNLOCK(sc); return 0; @@ -1770,7 +1980,7 @@ hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) return error; NV_LOCK(sc); - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) @@ -1810,7 +2020,7 @@ hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) uint64_t stat; stat = 0; - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } @@ -1820,7 +2030,7 @@ hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) return error; /* Zero out this stat. */ - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } @@ -1836,7 +2046,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) uint64_t stat; stat = 0; - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } @@ -1846,7 +2056,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) return error; /* Zero out this stat. */ - for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } @@ -1890,7 +2100,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) u_long stat; stat = 0; - for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } @@ -1900,7 +2110,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) return error; /* Zero out this stat. */ - for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } @@ -1922,7 +2132,7 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) return error; NV_LOCK(sc); - for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } @@ -2019,7 +2229,7 @@ hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) } static void -hn_create_rx_data(struct hn_softc *sc) +hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; @@ -2031,7 +2241,9 @@ hn_create_rx_data(struct hn_softc *sc) #endif int i; - sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_rx_ring_cnt = ring_cnt; + sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; + sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); @@ -2044,6 +2256,13 @@ hn_create_rx_data(struct hn_softc *sc) #endif #endif /* INET || INET6 */ + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + /* Create dev.hn.UNIT.rx sysctl tree */ + sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; @@ -2053,6 +2272,8 @@ hn_create_rx_data(struct hn_softc *sc) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; + rxr->hn_ifp = sc->hn_ifp; + rxr->hn_rx_idx = i; /* * Initialize LRO. @@ -2069,13 +2290,35 @@ hn_create_rx_data(struct hn_softc *sc) rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ - } - ctx = device_get_sysctl_ctx(dev); - child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + if (sc->hn_rx_sysctl_tree != NULL) { + char name[16]; + + /* + * Create per RX ring sysctl tree: + * dev.hn.UNIT.rx.RINGID + */ + snprintf(name, sizeof(name), "%d", i); + rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + if (rxr->hn_rx_sysctl_tree != NULL) { + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "packets", CTLFLAG_RW, + &rxr->hn_pkts, "# of packets received"); + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "rss_pkts", CTLFLAG_RW, + &rxr->hn_rss_pkts, + "# of packets w/ RSS info received"); + } + } + } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", - CTLTYPE_U64 | CTLFLAG_RW, sc, + CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, @@ -2084,7 +2327,7 @@ hn_create_rx_data(struct hn_softc *sc) #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", - CTLTYPE_U64 | CTLFLAG_RW, sc, + CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, @@ -2093,53 +2336,59 @@ hn_create_rx_data(struct hn_softc *sc) #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", - CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", - CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", - CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", - CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", + CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", + CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); } static void @@ -2160,6 +2409,7 @@ hn_destroy_rx_data(struct hn_softc *sc) sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; + sc->hn_rx_ring_inuse = 0; } static int @@ -2170,6 +2420,7 @@ hn_create_tx_ring(struct hn_softc *sc, int id) int error, i; txr->hn_sc = sc; + txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); @@ -2187,8 +2438,22 @@ hn_create_tx_ring(struct hn_softc *sc, int id) #endif txr->hn_tx_taskq = sc->hn_tx_taskq; - TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); - TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); + + if (hn_use_if_start) { + txr->hn_txeof = hn_start_txeof; + TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); + } else { + int br_depth; + + txr->hn_txeof = hn_xmit_txeof; + TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); + + br_depth = hn_get_txswq_depth(txr); + txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC, + M_WAITOK, &txr->hn_tx_lock); + } txr->hn_direct_tx_size = hn_direct_tx_size; if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) @@ -2202,8 +2467,6 @@ hn_create_tx_ring(struct hn_softc *sc, int id) */ txr->hn_sched_tx = 1; - txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */ - parent_dtag = bus_get_dma_tag(sc->hn_dev); /* DMA tag for RNDIS messages. */ @@ -2312,7 +2575,7 @@ hn_create_tx_ring(struct hn_softc *sc, int id) snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, - name, CTLFLAG_RD, 0, ""); + name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); @@ -2320,6 +2583,14 @@ hn_create_tx_ring(struct hn_softc *sc, int id) SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); + if (!hn_use_if_start) { + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", + CTLFLAG_RD, &txr->hn_oactive, 0, + "over active"); + } + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", + CTLFLAG_RW, &txr->hn_pkts, + "# of packets transmitted"); } } @@ -2354,8 +2625,10 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr) hn_txdesc_dmamap_destroy(txd); } #else + mtx_lock(&txr->hn_tx_lock); while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) hn_txdesc_dmamap_destroy(txd); + mtx_unlock(&txr->hn_tx_lock); #endif if (txr->hn_tx_data_dtag != NULL) @@ -2370,6 +2643,9 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr) free(txr->hn_txdesc, M_NETVSC); txr->hn_txdesc = NULL; + if (txr->hn_mbuf_br != NULL) + buf_ring_free(txr->hn_mbuf_br, M_NETVSC); + #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif @@ -2377,13 +2653,15 @@ hn_destroy_tx_ring(struct hn_tx_ring *txr) } static int -hn_create_tx_data(struct hn_softc *sc) +hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; - sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */ + sc->hn_tx_ring_cnt = ring_cnt; + sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; + sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); @@ -2392,7 +2670,7 @@ hn_create_tx_data(struct hn_softc *sc) /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", - CTLFLAG_RD, 0, ""); + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; @@ -2403,25 +2681,29 @@ hn_create_tx_data(struct hn_softc *sc) } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", - CTLTYPE_ULONG | CTLFLAG_RW, sc, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), + hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); @@ -2429,19 +2711,24 @@ hn_create_tx_data(struct hn_softc *sc) CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_tx_chimney_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", - CTLTYPE_INT | CTLFLAG_RW, sc, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", - CTLTYPE_INT | CTLFLAG_RW, sc, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", + CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", + CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); return 0; } @@ -2452,7 +2739,7 @@ hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size) int i; NV_LOCK(sc); - for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size; NV_UNLOCK(sc); } @@ -2472,6 +2759,7 @@ hn_destroy_tx_data(struct hn_softc *sc) sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; + sc->hn_tx_ring_inuse = 0; } static void @@ -2500,7 +2788,7 @@ hn_stop_tx_tasks(struct hn_softc *sc) { int i; - for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); @@ -2508,6 +2796,224 @@ hn_stop_tx_tasks(struct hn_softc *sc) } } +static int +hn_xmit(struct hn_tx_ring *txr, int len) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + struct mbuf *m_head; + + mtx_assert(&txr->hn_tx_lock, MA_OWNED); + KASSERT(hn_use_if_start == 0, + ("hn_xmit is called, when if_start is enabled")); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) + return 0; + + while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { + struct hn_txdesc *txd; + int error; + + if (len > 0 && m_head->m_pkthdr.len > len) { + /* + * This sending could be time consuming; let callers + * dispatch this packet sending (and sending of any + * following up packets) to tx taskqueue. + */ + drbr_putback(ifp, txr->hn_mbuf_br, m_head); + return 1; + } + + txd = hn_txdesc_get(txr); + if (txd == NULL) { + txr->hn_no_txdescs++; + drbr_putback(ifp, txr->hn_mbuf_br, m_head); + txr->hn_oactive = 1; + break; + } + + error = hn_encap(txr, txd, &m_head); + if (error) { + /* Both txd and m_head are freed; discard */ + drbr_advance(ifp, txr->hn_mbuf_br); + continue; + } + + error = hn_send_pkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + drbr_putback(ifp, txr->hn_mbuf_br, m_head); + txr->hn_oactive = 1; + break; + } + + /* Sent */ + drbr_advance(ifp, txr->hn_mbuf_br); + } + return 0; +} + +static int +hn_transmit(struct ifnet *ifp, struct mbuf *m) +{ + struct hn_softc *sc = ifp->if_softc; + struct hn_tx_ring *txr; + int error, idx = 0; + + /* + * Select the TX ring based on flowid + */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; + txr = &sc->hn_tx_ring[idx]; + + error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); + if (error) { + if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); + return error; + } + + if (txr->hn_oactive) + return 0; + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + sched = hn_xmit(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (!sched) + return 0; + } +do_sched: + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); + return 0; +} + +static void +hn_xmit_qflush(struct ifnet *ifp) +{ + struct hn_softc *sc = ifp->if_softc; + int i; + + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + struct mbuf *m; + + mtx_lock(&txr->hn_tx_lock); + while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) + m_freem(m); + mtx_unlock(&txr->hn_tx_lock); + } + if_qflush(ifp); +} + +static void +hn_xmit_txeof(struct hn_tx_ring *txr) +{ + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + txr->hn_oactive = 0; + sched = hn_xmit(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (sched) { + taskqueue_enqueue(txr->hn_tx_taskq, + &txr->hn_tx_task); + } + } else { +do_sched: + /* + * Release the oactive earlier, with the hope, that + * others could catch up. The task will clear the + * oactive again with the hn_tx_lock to avoid possible + * races. + */ + txr->hn_oactive = 0; + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +static void +hn_xmit_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + hn_xmit(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_oactive = 0; + hn_xmit(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_channel_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan) +{ + struct hn_rx_ring *rxr; + int idx; + + idx = chan->offer_msg.offer.sub_channel_index; + + KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, + ("invalid channel index %d, should > 0 && < %d", + idx, sc->hn_rx_ring_inuse)); + rxr = &sc->hn_rx_ring[idx]; + KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, + ("RX ring %d already attached", idx)); + rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; + + chan->hv_chan_rxr = rxr; + if (bootverbose) { + if_printf(sc->hn_ifp, "link RX ring %d to channel%u\n", + idx, chan->offer_msg.child_rel_id); + } + + if (idx < sc->hn_tx_ring_inuse) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; + + KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, + ("TX ring %d already attached", idx)); + txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; + + chan->hv_chan_txr = txr; + txr->hn_chan = chan; + if (bootverbose) { + if_printf(sc->hn_ifp, "link TX ring %d to channel%u\n", + idx, chan->offer_msg.child_rel_id); + } + } + + /* Bind channel to a proper CPU */ + vmbus_channel_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); +} + +static void +hn_subchan_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan) +{ + + KASSERT(!HV_VMBUS_CHAN_ISPRIMARY(chan), + ("subchannel callback on primary channel")); + KASSERT(chan->offer_msg.offer.sub_channel_index > 0, + ("invalid channel subidx %u", + chan->offer_msg.offer.sub_channel_index)); + hn_channel_attach(sc, chan); +} + static void hn_tx_taskq_create(void *arg __unused) { diff --git a/sys/dev/hyperv/netvsc/hv_rndis.h b/sys/dev/hyperv/netvsc/hv_rndis.h index cd46ecc..b27579d 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis.h +++ b/sys/dev/hyperv/netvsc/hv_rndis.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. @@ -167,6 +167,14 @@ #define RNDIS_OID_GEN_MACHINE_NAME 0x0001021A #define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B +/* + * For receive side scale + */ +/* Query only */ +#define RNDIS_OID_GEN_RSS_CAPABILITIES 0x00010203 +/* Query and set */ +#define RNDIS_OID_GEN_RSS_PARAMETERS 0x00010204 + #define RNDIS_OID_GEN_XMIT_OK 0x00020101 #define RNDIS_OID_GEN_RCV_OK 0x00020102 #define RNDIS_OID_GEN_XMIT_ERROR 0x00020103 @@ -608,6 +616,9 @@ typedef enum ndis_per_pkt_infotype_ { max_perpkt_info } ndis_per_pkt_infotype; +#define nbl_hash_value pkt_cancel_id +#define nbl_hash_info original_netbuf_list + typedef struct ndis_8021q_info_ { union { struct { @@ -680,6 +691,28 @@ typedef struct rndis_tcp_ip_csum_info_ { }; } rndis_tcp_ip_csum_info; +struct rndis_hash_value { + uint32_t hash_value; +} __packed; + +struct rndis_hash_info { + uint32_t hash_info; +} __packed; + +#define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */ +#define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */ + +/* hash function */ +#define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001 + +/* hash type */ +#define NDIS_HASH_IPV4 0x00000100 +#define NDIS_HASH_TCP_IPV4 0x00000200 +#define NDIS_HASH_IPV6 0x00000400 +#define NDIS_HASH_IPV6_EX 0x00000800 +#define NDIS_HASH_TCP_IPV6 0x00001000 +#define NDIS_HASH_TCP_IPV6_EX 0x00002000 + typedef struct rndis_tcp_tso_info_ { union { struct { @@ -713,6 +746,9 @@ typedef struct rndis_tcp_tso_info_ { }; } rndis_tcp_tso_info; +#define RNDIS_HASHVAL_PPI_SIZE (sizeof(rndis_per_packet_info) + \ + sizeof(struct rndis_hash_value)) + #define RNDIS_VLAN_PPI_SIZE (sizeof(rndis_per_packet_info) + \ sizeof(ndis_8021q_info)) @@ -1046,11 +1082,13 @@ typedef struct rndismp_rx_bufs_info_ { /* * Externs */ -int netvsc_recv(struct hv_device *device_ctx, - netvsc_packet *packet, - rndis_tcp_ip_csum_info *csum_info); -void netvsc_recv_rollup(struct hv_device *device_ctx); -void netvsc_channel_rollup(struct hv_device *device_ctx); +struct hv_vmbus_channel; + +int netvsc_recv(struct hv_vmbus_channel *chan, + netvsc_packet *packet, const rndis_tcp_ip_csum_info *csum_info, + const struct rndis_hash_info *hash_info, + const struct rndis_hash_value *hash_value); +void netvsc_channel_rollup(struct hv_vmbus_channel *chan); void* hv_set_rppi_data(rndis_msg *rndis_mesg, uint32_t rppi_size, diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 31ddbc0..8e95510 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. @@ -45,10 +45,27 @@ __FBSDID("$FreeBSD$"); #include <vm/pmap.h> #include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/vmbus/hv_vmbus_priv.h> #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" +struct hv_rf_recvinfo { + const ndis_8021q_info *vlan_info; + const rndis_tcp_ip_csum_info *csum_info; + const struct rndis_hash_info *hash_info; + const struct rndis_hash_value *hash_value; +}; + +#define HV_RF_RECVINFO_VLAN 0x1 +#define HV_RF_RECVINFO_CSUM 0x2 +#define HV_RF_RECVINFO_HASHINF 0x4 +#define HV_RF_RECVINFO_HASHVAL 0x8 +#define HV_RF_RECVINFO_ALL \ + (HV_RF_RECVINFO_VLAN | \ + HV_RF_RECVINFO_CSUM | \ + HV_RF_RECVINFO_HASHINF | \ + HV_RF_RECVINFO_HASHVAL) /* * Forward declarations @@ -59,6 +76,7 @@ static void hv_rf_receive_response(rndis_device *device, rndis_msg *response); static void hv_rf_receive_indicate_status(rndis_device *device, rndis_msg *response); static void hv_rf_receive_data(rndis_device *device, rndis_msg *message, + struct hv_vmbus_channel *chan, netvsc_packet *pkt); static int hv_rf_query_device(rndis_device *device, uint32_t oid, void *result, uint32_t *result_size); @@ -68,8 +86,8 @@ static int hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter); static int hv_rf_init_device(rndis_device *device); static int hv_rf_open_device(rndis_device *device); static int hv_rf_close_device(rndis_device *device); -static void hv_rf_on_send_request_completion(void *context); -static void hv_rf_on_send_request_halt_completion(void *context); +static void hv_rf_on_send_request_completion(struct hv_vmbus_channel *, void *context); +static void hv_rf_on_send_request_halt_completion(struct hv_vmbus_channel *, void *context); int hv_rf_send_offload_request(struct hv_device *device, rndis_offload_params *offloads); @@ -223,6 +241,8 @@ hv_rf_send_request(rndis_device *device, rndis_request *request, { int ret; netvsc_packet *packet; + netvsc_dev *net_dev = device->net_dev; + int send_buf_section_idx; /* Set up the packet to send it */ packet = &request->pkt; @@ -237,6 +257,20 @@ hv_rf_send_request(rndis_device *device, rndis_request *request, packet->page_buffers[0].offset = (unsigned long)&request->request_msg & (PAGE_SIZE - 1); + if (packet->page_buffers[0].offset + + packet->page_buffers[0].length > PAGE_SIZE) { + packet->page_buf_count = 2; + packet->page_buffers[0].length = + PAGE_SIZE - packet->page_buffers[0].offset; + packet->page_buffers[1].pfn = + hv_get_phys_addr((char*)&request->request_msg + + packet->page_buffers[0].length) >> PAGE_SHIFT; + packet->page_buffers[1].offset = 0; + packet->page_buffers[1].length = + request->request_msg.msg_len - + packet->page_buffers[0].length; + } + packet->compl.send.send_completion_context = request; /* packet */ if (message_type != REMOTE_NDIS_HALT_MSG) { packet->compl.send.on_send_completion = @@ -246,11 +280,26 @@ hv_rf_send_request(rndis_device *device, rndis_request *request, hv_rf_on_send_request_halt_completion; } packet->compl.send.send_completion_tid = (unsigned long)device; - packet->send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; + if (packet->tot_data_buf_len < net_dev->send_section_size) { + send_buf_section_idx = hv_nv_get_next_send_section(net_dev); + if (send_buf_section_idx != + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { + char *dest = ((char *)net_dev->send_buf + + send_buf_section_idx * net_dev->send_section_size); + + memcpy(dest, &request->request_msg, request->request_msg.msg_len); + packet->send_buf_section_idx = send_buf_section_idx; + packet->send_buf_section_size = packet->tot_data_buf_len; + packet->page_buf_count = 0; + goto sendit; + } + /* Failed to allocate chimney send buffer; move on */ + } + packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; packet->send_buf_section_size = 0; - ret = hv_nv_on_send(device->net_dev->dev, packet); +sendit: + ret = hv_nv_on_send(device->net_dev->dev->channel, packet); return (ret); } @@ -373,8 +422,7 @@ hv_rf_send_offload_request(struct hv_device *device, } cleanup: - if (request) - hv_put_rndis_request(rndis_dev, request); + hv_put_rndis_request(rndis_dev, request); return (ret); } @@ -402,17 +450,95 @@ hv_rf_receive_indicate_status(rndis_device *device, rndis_msg *response) } } +static int +hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hv_rf_recvinfo *info) +{ + const rndis_per_packet_info *ppi; + uint32_t mask, len; + + info->vlan_info = NULL; + info->csum_info = NULL; + info->hash_info = NULL; + info->hash_value = NULL; + + if (rpkt->per_pkt_info_offset == 0) + return 0; + + ppi = (const rndis_per_packet_info *) + ((const uint8_t *)rpkt + rpkt->per_pkt_info_offset); + len = rpkt->per_pkt_info_length; + mask = 0; + + while (len != 0) { + const void *ppi_dptr; + uint32_t ppi_dlen; + + if (__predict_false(ppi->size < ppi->per_packet_info_offset)) + return EINVAL; + ppi_dlen = ppi->size - ppi->per_packet_info_offset; + ppi_dptr = (const uint8_t *)ppi + ppi->per_packet_info_offset; + + switch (ppi->type) { + case ieee_8021q_info: + if (__predict_false(ppi_dlen < sizeof(ndis_8021q_info))) + return EINVAL; + info->vlan_info = ppi_dptr; + mask |= HV_RF_RECVINFO_VLAN; + break; + + case tcpip_chksum_info: + if (__predict_false(ppi_dlen < + sizeof(rndis_tcp_ip_csum_info))) + return EINVAL; + info->csum_info = ppi_dptr; + mask |= HV_RF_RECVINFO_CSUM; + break; + + case nbl_hash_value: + if (__predict_false(ppi_dlen < + sizeof(struct rndis_hash_value))) + return EINVAL; + info->hash_value = ppi_dptr; + mask |= HV_RF_RECVINFO_HASHVAL; + break; + + case nbl_hash_info: + if (__predict_false(ppi_dlen < + sizeof(struct rndis_hash_info))) + return EINVAL; + info->hash_info = ppi_dptr; + mask |= HV_RF_RECVINFO_HASHINF; + break; + + default: + goto skip; + } + + if (mask == HV_RF_RECVINFO_ALL) { + /* All found; done */ + break; + } +skip: + if (__predict_false(len < ppi->size)) + return EINVAL; + len -= ppi->size; + ppi = (const rndis_per_packet_info *) + ((const uint8_t *)ppi + ppi->size); + } + return 0; +} + /* * RNDIS filter receive data */ static void -hv_rf_receive_data(rndis_device *device, rndis_msg *message, netvsc_packet *pkt) +hv_rf_receive_data(rndis_device *device, rndis_msg *message, + struct hv_vmbus_channel *chan, netvsc_packet *pkt) { rndis_packet *rndis_pkt; - ndis_8021q_info *rppi_vlan_info; uint32_t data_offset; - rndis_tcp_ip_csum_info *csum_info = NULL; device_t dev = device->net_dev->dev->device; + struct hv_rf_recvinfo info; rndis_pkt = &message->msg.packet; @@ -436,22 +562,26 @@ hv_rf_receive_data(rndis_device *device, rndis_msg *message, netvsc_packet *pkt) pkt->tot_data_buf_len = rndis_pkt->data_length; pkt->data = (void *)((unsigned long)pkt->data + data_offset); - rppi_vlan_info = hv_get_ppi_data(rndis_pkt, ieee_8021q_info); - if (rppi_vlan_info) { - pkt->vlan_tci = rppi_vlan_info->u1.s1.vlan_id; - } else { - pkt->vlan_tci = 0; + if (hv_rf_find_recvinfo(rndis_pkt, &info)) { + pkt->status = nvsp_status_failure; + device_printf(dev, "recvinfo parsing failed\n"); + return; } - csum_info = hv_get_ppi_data(rndis_pkt, tcpip_chksum_info); - netvsc_recv(device->net_dev->dev, pkt, csum_info); + if (info.vlan_info != NULL) + pkt->vlan_tci = info.vlan_info->u1.s1.vlan_id; + else + pkt->vlan_tci = 0; + + netvsc_recv(chan, pkt, info.csum_info, info.hash_info, info.hash_value); } /* * RNDIS filter on receive */ int -hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, netvsc_packet *pkt) +hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, + struct hv_vmbus_channel *chan, netvsc_packet *pkt) { rndis_device *rndis_dev; rndis_msg *rndis_hdr; @@ -474,7 +604,7 @@ hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, netvsc_packet *p /* data message */ case REMOTE_NDIS_PACKET_MSG: - hv_rf_receive_data(rndis_dev, rndis_hdr, pkt); + hv_rf_receive_data(rndis_dev, rndis_hdr, chan, pkt); break; /* completion messages */ case REMOTE_NDIS_INITIALIZE_CMPLT: @@ -525,6 +655,19 @@ hv_rf_query_device(rndis_device *device, uint32_t oid, void *result, query->info_buffer_length = 0; query->device_vc_handle = 0; + if (oid == RNDIS_OID_GEN_RSS_CAPABILITIES) { + struct rndis_recv_scale_cap *cap; + + request->request_msg.msg_len += + sizeof(struct rndis_recv_scale_cap); + query->info_buffer_length = sizeof(struct rndis_recv_scale_cap); + cap = (struct rndis_recv_scale_cap *)((unsigned long)query + + query->info_buffer_offset); + cap->hdr.type = RNDIS_OBJECT_TYPE_RSS_CAPABILITIES; + cap->hdr.rev = RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2; + cap->hdr.size = sizeof(struct rndis_recv_scale_cap); + } + ret = hv_rf_send_request(device, request, REMOTE_NDIS_QUERY_MSG); if (ret != 0) { /* Fixme: printf added */ @@ -579,6 +722,114 @@ hv_rf_query_device_link_status(rndis_device *device) RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, &device->link_status, &size)); } +static uint8_t netvsc_hash_key[HASH_KEYLEN] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + +/* + * RNDIS set vRSS parameters + */ +static int +hv_rf_set_rss_param(rndis_device *device, int num_queue) +{ + rndis_request *request; + rndis_set_request *set; + rndis_set_complete *set_complete; + rndis_recv_scale_param *rssp; + uint32_t extlen = sizeof(rndis_recv_scale_param) + + (4 * ITAB_NUM) + HASH_KEYLEN; + uint32_t *itab, status; + uint8_t *keyp; + int i, ret; + + + request = hv_rndis_request(device, REMOTE_NDIS_SET_MSG, + RNDIS_MESSAGE_SIZE(rndis_set_request) + extlen); + if (request == NULL) { + if (bootverbose) + printf("Netvsc: No memory to set vRSS parameters.\n"); + ret = -1; + goto cleanup; + } + + set = &request->request_msg.msg.set_request; + set->oid = RNDIS_OID_GEN_RSS_PARAMETERS; + set->info_buffer_length = extlen; + set->info_buffer_offset = sizeof(rndis_set_request); + set->device_vc_handle = 0; + + /* Fill out the rssp parameter structure */ + rssp = (rndis_recv_scale_param *)(set + 1); + rssp->hdr.type = RNDIS_OBJECT_TYPE_RSS_PARAMETERS; + rssp->hdr.rev = RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2; + rssp->hdr.size = sizeof(rndis_recv_scale_param); + rssp->flag = 0; + rssp->hashinfo = RNDIS_HASH_FUNC_TOEPLITZ | RNDIS_HASH_IPV4 | + RNDIS_HASH_TCP_IPV4 | RNDIS_HASH_IPV6 | RNDIS_HASH_TCP_IPV6; + rssp->indirect_tabsize = 4 * ITAB_NUM; + rssp->indirect_taboffset = sizeof(rndis_recv_scale_param); + rssp->hashkey_size = HASH_KEYLEN; + rssp->hashkey_offset = rssp->indirect_taboffset + + rssp->indirect_tabsize; + + /* Set indirection table entries */ + itab = (uint32_t *)(rssp + 1); + for (i = 0; i < ITAB_NUM; i++) + itab[i] = i % num_queue; + + /* Set hash key values */ + keyp = (uint8_t *)((unsigned long)rssp + rssp->hashkey_offset); + for (i = 0; i < HASH_KEYLEN; i++) + keyp[i] = netvsc_hash_key[i]; + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG); + if (ret != 0) { + goto cleanup; + } + + /* + * Wait for the response from the host. Another thread will signal + * us when the response has arrived. In the failure case, + * sema_timedwait() returns a non-zero status after waiting 5 seconds. + */ + ret = sema_timedwait(&request->wait_sema, 5 * hz); + if (ret == 0) { + /* Response received, check status */ + set_complete = &request->response_msg.msg.set_complete; + status = set_complete->status; + if (status != RNDIS_STATUS_SUCCESS) { + /* Bad response status, return error */ + if (bootverbose) + printf("Netvsc: Failed to set vRSS " + "parameters.\n"); + ret = -2; + } else { + if (bootverbose) + printf("Netvsc: Successfully set vRSS " + "parameters.\n"); + } + } else { + /* + * We cannot deallocate the request since we may still + * receive a send completion for it. + */ + printf("Netvsc: vRSS set timeout, id = %u, ret = %d\n", + request->request_msg.msg.init_request.request_id, ret); + goto exit; + } + +cleanup: + if (request != NULL) { + hv_put_rndis_request(device, request); + } +exit: + return (ret); +} + /* * RNDIS filter set packet filter * Sends an rndis request with the new filter, then waits for a response @@ -752,10 +1003,8 @@ hv_rf_halt_device(rndis_device *device) } device->state = RNDIS_DEV_UNINITIALIZED; - - if (request != NULL) { - hv_put_rndis_request(device, request); - } + + hv_put_rndis_request(device, request); return (0); } @@ -813,12 +1062,16 @@ hv_rf_close_device(rndis_device *device) * RNDIS filter on device add */ int -hv_rf_on_device_add(struct hv_device *device, void *additl_info) +hv_rf_on_device_add(struct hv_device *device, void *additl_info, + int nchan) { int ret; netvsc_dev *net_dev; rndis_device *rndis_dev; + nvsp_msg *init_pkt; rndis_offload_params offloads; + struct rndis_recv_scale_cap rsscaps; + uint32_t rsscaps_size = sizeof(struct rndis_recv_scale_cap); netvsc_device_info *dev_info = (netvsc_device_info *)additl_info; device_t dev = device->device; @@ -884,6 +1137,67 @@ hv_rf_on_device_add(struct hv_device *device, void *additl_info) dev_info->link_state = rndis_dev->link_status; + net_dev->num_channel = 1; + if (net_dev->nvsp_version < NVSP_PROTOCOL_VERSION_5 || nchan == 1) + return (0); + + memset(&rsscaps, 0, rsscaps_size); + ret = hv_rf_query_device(rndis_dev, + RNDIS_OID_GEN_RSS_CAPABILITIES, + &rsscaps, &rsscaps_size); + if ((ret != 0) || (rsscaps.num_recv_que < 2)) { + device_printf(dev, "hv_rf_query_device failed or " + "rsscaps.num_recv_que < 2 \n"); + goto out; + } + device_printf(dev, "channel, offered %u, requested %d\n", + rsscaps.num_recv_que, nchan); + if (nchan > rsscaps.num_recv_que) + nchan = rsscaps.num_recv_que; + net_dev->num_channel = nchan; + + if (net_dev->num_channel == 1) { + device_printf(dev, "net_dev->num_channel == 1 under VRSS\n"); + goto out; + } + + /* request host to create sub channels */ + init_pkt = &net_dev->channel_init_packet; + memset(init_pkt, 0, sizeof(nvsp_msg)); + + init_pkt->hdr.msg_type = nvsp_msg5_type_subchannel; + init_pkt->msgs.vers_5_msgs.subchannel_request.op = + NVSP_SUBCHANNE_ALLOCATE; + init_pkt->msgs.vers_5_msgs.subchannel_request.num_subchannels = + net_dev->num_channel - 1; + + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) { + device_printf(dev, "Fail to allocate subchannel\n"); + goto out; + } + + sema_wait(&net_dev->channel_init_sema); + + if (init_pkt->msgs.vers_5_msgs.subchn_complete.status != + nvsp_status_success) { + ret = ENODEV; + device_printf(dev, "sub channel complete error\n"); + goto out; + } + + net_dev->num_channel = 1 + + init_pkt->msgs.vers_5_msgs.subchn_complete.num_subchannels; + + ret = hv_rf_set_rss_param(rndis_dev, net_dev->num_channel); + +out: + if (ret) + net_dev->num_channel = 1; + return (ret); } @@ -938,7 +1252,8 @@ hv_rf_on_close(struct hv_device *device) * RNDIS filter on send request completion callback */ static void -hv_rf_on_send_request_completion(void *context) +hv_rf_on_send_request_completion(struct hv_vmbus_channel *chan __unused, + void *context __unused) { } @@ -946,7 +1261,8 @@ hv_rf_on_send_request_completion(void *context) * RNDIS filter on send request (halt only) completion callback */ static void -hv_rf_on_send_request_halt_completion(void *context) +hv_rf_on_send_request_halt_completion(struct hv_vmbus_channel *chan __unused, + void *context) { rndis_request *request = context; @@ -958,32 +1274,9 @@ hv_rf_on_send_request_halt_completion(void *context) request->halt_complete_flag = 1; } -/* - * RNDIS filter when "all" reception is done - */ -void -hv_rf_receive_rollup(netvsc_dev *net_dev) -{ - rndis_device *rndis_dev; - - rndis_dev = (rndis_device *)net_dev->extension; - netvsc_recv_rollup(rndis_dev->net_dev->dev); -} - void -hv_rf_channel_rollup(netvsc_dev *net_dev) +hv_rf_channel_rollup(struct hv_vmbus_channel *chan) { - rndis_device *rndis_dev; - - rndis_dev = (rndis_device *)net_dev->extension; - /* - * This could be called pretty early, so we need - * to make sure everything has been setup. - */ - if (rndis_dev == NULL || - rndis_dev->net_dev == NULL || - rndis_dev->net_dev->dev == NULL) - return; - netvsc_channel_rollup(rndis_dev->net_dev->dev); + netvsc_channel_rollup(chan); } diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/dev/hyperv/netvsc/hv_rndis_filter.h index 9d7a38d..dbaaa42 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.h +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. @@ -63,17 +63,32 @@ typedef struct rndis_request_ { struct sema wait_sema; /* - * Fixme: We assumed a fixed size response here. If we do ever - * need to handle a bigger response, we can either define a max - * response message or add a response buffer variable above this field + * The max response size is sizeof(rndis_msg) + PAGE_SIZE. + * + * XXX + * This is ugly and should be cleaned up once we busdma-fy + * RNDIS request bits. */ rndis_msg response_msg; + uint8_t buf_resp[PAGE_SIZE]; /* Simplify allocation by having a netvsc packet inline */ netvsc_packet pkt; hv_vmbus_page_buffer buffer; - /* Fixme: We assumed a fixed size request here. */ + + /* + * The max request size is sizeof(rndis_msg) + PAGE_SIZE. + * + * NOTE: + * This is required for the large request like RSS settings. + * + * XXX + * This is ugly and should be cleaned up once we busdma-fy + * RNDIS request bits. + */ rndis_msg request_msg; + uint8_t buf_req[PAGE_SIZE]; + /* Fixme: Poor man's semaphore. */ uint32_t halt_complete_flag; } rndis_request; @@ -95,12 +110,13 @@ typedef struct rndis_device_ { /* * Externs */ +struct hv_vmbus_channel; -int hv_rf_on_receive(netvsc_dev *net_dev, - struct hv_device *device, netvsc_packet *pkt); +int hv_rf_on_receive(netvsc_dev *net_dev, struct hv_device *device, + struct hv_vmbus_channel *chan, netvsc_packet *pkt); void hv_rf_receive_rollup(netvsc_dev *net_dev); -void hv_rf_channel_rollup(netvsc_dev *net_dev); -int hv_rf_on_device_add(struct hv_device *device, void *additl_info); +void hv_rf_channel_rollup(struct hv_vmbus_channel *chan); +int hv_rf_on_device_add(struct hv_device *device, void *additl_info, int nchan); int hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel); int hv_rf_on_open(struct hv_device *device); int hv_rf_on_close(struct hv_device *device); diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index 27fb3fd..a89a762 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -134,7 +134,6 @@ struct storvsc_softc { uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; - boolean_t hs_open_multi_channel; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; @@ -324,9 +323,6 @@ get_stor_device(struct hv_device *device, struct storvsc_softc *sc; sc = device_get_softc(device->device); - if (sc == NULL) { - return NULL; - } if (outbound) { /* @@ -350,29 +346,19 @@ get_stor_device(struct hv_device *device, return sc; } -/** - * @brief Callback handler, will be invoked when receive mutil-channel offer - * - * @param context new multi-channel - */ static void -storvsc_handle_sc_creation(void *context) +storvsc_subchan_attach(struct hv_vmbus_channel *new_channel) { - hv_vmbus_channel *new_channel; struct hv_device *device; struct storvsc_softc *sc; struct vmstor_chan_props props; int ret = 0; - new_channel = (hv_vmbus_channel *)context; - device = new_channel->primary_channel->device; + device = new_channel->device; sc = get_stor_device(device, TRUE); if (sc == NULL) return; - if (FALSE == sc->hs_open_multi_channel) - return; - memset(&props, 0, sizeof(props)); ret = hv_vmbus_channel_open(new_channel, @@ -395,11 +381,12 @@ storvsc_handle_sc_creation(void *context) static void storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) { + struct hv_vmbus_channel **subchan; struct storvsc_softc *sc; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; int request_channels_cnt = 0; - int ret; + int ret, i; /* get multichannels count that need to create */ request_channels_cnt = MIN(max_chans, mp_ncpus); @@ -413,9 +400,6 @@ storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) request = &sc->hs_init_req; - /* Establish a handler for multi-channel */ - dev->channel->sc_creation_callback = storvsc_handle_sc_creation; - /* request the host to create multi-channel */ memset(request, 0, sizeof(struct hv_storvsc_request)); @@ -451,7 +435,15 @@ storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) return; } - sc->hs_open_multi_channel = TRUE; + /* Wait for sub-channels setup to complete. */ + subchan = vmbus_get_subchan(dev->channel, request_channels_cnt); + + /* Attach the sub-channels. */ + for (i = 0; i < request_channels_cnt; ++i) + storvsc_subchan_attach(subchan[i]); + + /* Release the sub-channels. */ + vmbus_rel_subchan(subchan, request_channels_cnt); if (bootverbose) printf("Storvsc create multi-channel success!\n"); @@ -883,12 +875,7 @@ hv_storvsc_on_channel_callback(void *context) struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; - if (channel->primary_channel != NULL){ - device = channel->primary_channel->device; - } else { - device = channel->device; - } - + device = channel->device; KASSERT(device, ("device is NULL")); sc = get_stor_device(device, FALSE); @@ -970,6 +957,7 @@ storvsc_probe(device_t dev) if(bootverbose) device_printf(dev, "Enlightened ATA/IDE detected\n"); + device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); ret = BUS_PROBE_DEFAULT; } else if(bootverbose) device_printf(dev, "Emulated ATA/IDE set (hw.ata.disk_enable set)\n"); @@ -977,6 +965,7 @@ storvsc_probe(device_t dev) case DRIVER_STORVSC: if(bootverbose) device_printf(dev, "Enlightened SCSI device detected\n"); + device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc); ret = BUS_PROBE_DEFAULT; break; default: @@ -1014,10 +1003,6 @@ storvsc_attach(device_t dev) root_mount_token = root_mount_hold("storvsc"); sc = device_get_softc(dev); - if (sc == NULL) { - ret = ENOMEM; - goto cleanup; - } stor_type = storvsc_get_storage_type(dev); @@ -1026,15 +1011,12 @@ storvsc_attach(device_t dev) goto cleanup; } - bzero(sc, sizeof(struct storvsc_softc)); - /* fill in driver specific properties */ sc->hs_drv_props = &g_drv_props_table[stor_type]; /* fill in device specific properties */ sc->hs_unit = device_get_unit(dev); sc->hs_dev = hv_dev; - device_set_desc(dev, g_drv_props_table[stor_type].drv_desc); LIST_INIT(&sc->hs_free_list); mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF); @@ -1081,7 +1063,6 @@ storvsc_attach(device_t dev) sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; - sc->hs_open_multi_channel = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = hv_storvsc_connect_vsp(hv_dev); @@ -1186,9 +1167,7 @@ storvsc_detach(device_t dev) struct hv_sgl_node *sgl_node = NULL; int j = 0; - mtx_lock(&hv_device->channel->inbound_lock); sc->hs_destroy = TRUE; - mtx_unlock(&hv_device->channel->inbound_lock); /* * At this point, all outbound traffic should be disabled. We @@ -2147,8 +2126,9 @@ storvsc_io_done(struct hv_storvsc_request *reqp) reqp->softc->hs_frozen = 0; } storvsc_free_request(sc, reqp); - xpt_done(ccb); mtx_unlock(&sc->hs_lock); + + xpt_done_direct(ccb); } /** diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c index c1b6da5..5f4fcf6 100644 --- a/sys/dev/hyperv/utilities/hv_heartbeat.c +++ b/sys/dev/hyperv/utilities/hv_heartbeat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -94,6 +94,10 @@ static int hv_heartbeat_probe(device_t dev) { const char *p = vmbus_get_type(dev); + + if (resource_disabled("hvheartbeat", 0)) + return ENXIO; + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { device_set_desc(dev, "Hyper-V Heartbeat Service"); return BUS_PROBE_DEFAULT; diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index 8517918..b1f6ec1 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -304,28 +304,11 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; - int guid_index; struct hv_device *hv_dev; /* GUID Data Structure */ hn_softc_t *sc; /* hn softc structure */ char if_name[4]; - unsigned char guid_instance[40]; - char *guid_data = NULL; char buf[39]; - struct guid_extract { - char a1[2]; - char a2[2]; - char a3[2]; - char a4[2]; - char b1[2]; - char b2[2]; - char c1[2]; - char c2[2]; - char d[4]; - char e[12]; - }; - - struct guid_extract *id; device_t *devs; int devcnt; @@ -352,17 +335,7 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, /* Trying to find GUID of Network Device */ hv_dev = sc->hn_dev_obj; - for (guid_index = 0; guid_index < 16; guid_index++) { - sprintf(&guid_instance[guid_index * 2], "%02x", - hv_dev->device_id.data[guid_index]); - } - - guid_data = (char *)guid_instance; - id = (struct guid_extract *)guid_data; - snprintf(buf, sizeof(buf), "{%.2s%.2s%.2s%.2s-%.2s%.2s-%.2s%.2s-%.4s-%s}", - id->a4, id->a3, id->a2, id->a1, - id->b2, id->b1, id->c2, id->c1, id->d, id->e); - guid_data = NULL; + snprintf_hv_guid(buf, sizeof(buf), &hv_dev->device_id); sprintf(if_name, "%s%d", "hn", device_get_unit(devs[devcnt])); if (strncmp(buf, (char *)umsg->body.kvp_ip_val.adapter_id, 39) == 0) { @@ -890,6 +863,10 @@ static int hv_kvp_probe(device_t dev) { const char *p = vmbus_get_type(dev); + + if (resource_disabled("hvkvp", 0)) + return ENXIO; + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { device_set_desc(dev, "Hyper-V KVP Service"); return BUS_PROBE_DEFAULT; diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h index b62149e..6474e18 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.h +++ b/sys/dev/hyperv/utilities/hv_kvp.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c index 20bc65e..3dfbf13 100644 --- a/sys/dev/hyperv/utilities/hv_shutdown.c +++ b/sys/dev/hyperv/utilities/hv_shutdown.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,6 +116,10 @@ static int hv_shutdown_probe(device_t dev) { const char *p = vmbus_get_type(dev); + + if (resource_disabled("hvshutdown", 0)) + return ENXIO; + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { device_set_desc(dev, "Hyper-V Shutdown Service"); return BUS_PROBE_DEFAULT; diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c index d1ea904..eeb0434 100644 --- a/sys/dev/hyperv/utilities/hv_timesync.c +++ b/sys/dev/hyperv/utilities/hv_timesync.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -171,6 +171,10 @@ static int hv_timesync_probe(device_t dev) { const char *p = vmbus_get_type(dev); + + if (resource_disabled("hvtimesync", 0)) + return ENXIO; + if (!memcmp(p, &service_guid, sizeof(hv_guid))) { device_set_desc(dev, "Hyper-V Time Synch Service"); return BUS_PROBE_DEFAULT; diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index 7d19b3f..3119e3f 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ b/sys/dev/hyperv/utilities/hv_util.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014 Microsoft Corp. + * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h index 708dca8..e202784 100644 --- a/sys/dev/hyperv/utilities/hv_util.h +++ b/sys/dev/hyperv/utilities/hv_util.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. diff --git a/sys/dev/hyperv/vmbus/amd64/hv_vector.S b/sys/dev/hyperv/vmbus/amd64/hv_vector.S new file mode 100644 index 0000000..2594483 --- /dev/null +++ b/sys/dev/hyperv/vmbus/amd64/hv_vector.S @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> + +#include "assym.s" + +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rdi + call hv_vector_handler + MEXITCOUNT + jmp doreti diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index bb777cc..6da0643 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/lock.h> #include <sys/mutex.h> +#include <sys/sysctl.h> #include <machine/bus.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -80,6 +81,90 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) } +static int +vmbus_channel_sysctl_monalloc(SYSCTL_HANDLER_ARGS) +{ + struct hv_vmbus_channel *chan = arg1; + int alloc = 0; + + if (chan->offer_msg.monitor_allocated) + alloc = 1; + return sysctl_handle_int(oidp, &alloc, 0, req); +} + +static void +vmbus_channel_sysctl_create(hv_vmbus_channel* channel) +{ + device_t dev; + struct sysctl_oid *devch_sysctl; + struct sysctl_oid *devch_id_sysctl, *devch_sub_sysctl; + struct sysctl_oid *devch_id_in_sysctl, *devch_id_out_sysctl; + struct sysctl_ctx_list *ctx; + uint32_t ch_id; + uint16_t sub_ch_id; + char name[16]; + + hv_vmbus_channel* primary_ch = channel->primary_channel; + + if (primary_ch == NULL) { + dev = channel->device->device; + ch_id = channel->offer_msg.child_rel_id; + } else { + dev = primary_ch->device->device; + ch_id = primary_ch->offer_msg.child_rel_id; + sub_ch_id = channel->offer_msg.offer.sub_channel_index; + } + ctx = device_get_sysctl_ctx(dev); + /* This creates dev.DEVNAME.DEVUNIT.channel tree */ + devch_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + /* This creates dev.DEVNAME.DEVUNIT.channel.CHANID tree */ + snprintf(name, sizeof(name), "%d", ch_id); + devch_id_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(devch_sysctl), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + if (primary_ch != NULL) { + devch_sub_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(devch_id_sysctl), + OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + snprintf(name, sizeof(name), "%d", sub_ch_id); + devch_id_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(devch_sub_sysctl), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(devch_id_sysctl), + OID_AUTO, "chanid", CTLFLAG_RD, + &channel->offer_msg.child_rel_id, 0, "channel id"); + } + SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(devch_id_sysctl), OID_AUTO, + "cpu", CTLFLAG_RD, &channel->target_cpu, 0, "owner CPU id"); + SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(devch_id_sysctl), OID_AUTO, + "monitor_allocated", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, + channel, 0, vmbus_channel_sysctl_monalloc, "I", + "is monitor allocated to this channel"); + + devch_id_in_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(devch_id_sysctl), + OID_AUTO, + "in", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + devch_id_out_sysctl = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(devch_id_sysctl), + OID_AUTO, + "out", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + hv_ring_buffer_stat(ctx, + SYSCTL_CHILDREN(devch_id_in_sysctl), + &(channel->inbound), + "inbound ring buffer stats"); + hv_ring_buffer_stat(ctx, + SYSCTL_CHILDREN(devch_id_out_sysctl), + &(channel->outbound), + "outbound ring buffer stats"); +} + /** * @brief Open the specified channel */ @@ -143,6 +228,9 @@ hv_vmbus_channel_open( in, recv_ring_buffer_size); + /* Create sysctl tree for this channel */ + vmbus_channel_sysctl_create(new_channel); + /** * Establish the gpadl for the ring buffer */ @@ -182,12 +270,12 @@ hv_vmbus_channel_open( if (user_data_len) memcpy(open_msg->user_data, user_data, user_data_len); - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_INSERT_TAIL( &hv_vmbus_g_connection.channel_msg_anchor, open_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); ret = hv_vmbus_post_message( open_msg, sizeof(hv_vmbus_channel_open_channel)); @@ -214,12 +302,12 @@ hv_vmbus_channel_open( } cleanup: - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_REMOVE( &hv_vmbus_g_connection.channel_msg_anchor, open_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); sema_destroy(&open_info->wait_sema); free(open_info, M_DEVBUF); @@ -384,17 +472,22 @@ hv_vmbus_channel_establish_gpadl( hv_vmbus_channel_msg_info* curr; uint32_t next_gpadl_handle; - next_gpadl_handle = hv_vmbus_g_connection.next_gpadl_handle; - atomic_add_int((int*) &hv_vmbus_g_connection.next_gpadl_handle, 1); + next_gpadl_handle = atomic_fetchadd_int( + &hv_vmbus_g_connection.next_gpadl_handle, 1); ret = vmbus_channel_create_gpadl_header( contig_buffer, size, &msg_info, &msg_count); - if(ret != 0) { /* if(allocation failed) return immediately */ - /* reverse atomic_add_int above */ - atomic_subtract_int((int*) - &hv_vmbus_g_connection.next_gpadl_handle, 1); - return ret; + if(ret != 0) { + /* + * XXX + * We can _not_ even revert the above incremental, + * if multiple GPADL establishments are running + * parallelly, decrement the global next_gpadl_handle + * is calling for _big_ trouble. A better solution + * is to have a 0-based GPADL id bitmap ... + */ + return ret; } sema_init(&msg_info->wait_sema, 0, "Open Info Sema"); @@ -403,13 +496,13 @@ hv_vmbus_channel_establish_gpadl( gpadl_msg->child_rel_id = channel->offer_msg.child_rel_id; gpadl_msg->gpadl = next_gpadl_handle; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_INSERT_TAIL( &hv_vmbus_g_connection.channel_msg_anchor, msg_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); ret = hv_vmbus_post_message( gpadl_msg, @@ -448,10 +541,10 @@ hv_vmbus_channel_establish_gpadl( cleanup: - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor, msg_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); sema_destroy(&msg_info->wait_sema); free(msg_info, M_DEVBUF); @@ -490,10 +583,10 @@ hv_vmbus_channel_teardown_gpdal( msg->child_rel_id = channel->offer_msg.child_rel_id; msg->gpadl = gpadl_handle; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_msg_anchor, info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_gpadl_teardown)); @@ -506,10 +599,10 @@ cleanup: /* * Received a torndown response */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor, info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); sema_destroy(&info->wait_sema); free(info, M_DEVBUF); @@ -525,20 +618,13 @@ hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) hv_vmbus_channel_msg_info* info; channel->state = HV_CHANNEL_OPEN_STATE; - channel->sc_creation_callback = NULL; /* * set rxq to NULL to avoid more requests be scheduled */ channel->rxq = NULL; taskqueue_drain(rxq, &channel->channel_task); - /* - * Grab the lock to prevent race condition when a packet received - * and unloading driver is in the process. - */ - mtx_lock(&channel->inbound_lock); channel->on_channel_callback = NULL; - mtx_unlock(&channel->inbound_lock); /** * Send a closing message @@ -857,7 +943,6 @@ hv_vmbus_channel_recv_packet_raw( { int ret; uint32_t packetLen; - uint32_t userLen; hv_vm_packet_descriptor desc; *buffer_actual_len = 0; @@ -871,8 +956,6 @@ hv_vmbus_channel_recv_packet_raw( return (0); packetLen = desc.length8 << 3; - userLen = packetLen - (desc.data_offset8 << 3); - *buffer_actual_len = packetLen; if (packetLen > buffer_len) @@ -915,12 +998,6 @@ VmbusProcessChannelEvent(void* context, int pending) * callback to NULL. This closes the window. */ - /* - * Disable the lock due to newly added WITNESS check in r277723. - * Will seek other way to avoid race condition. - * -- whu - */ - // mtx_lock(&channel->inbound_lock); if (channel->on_channel_callback != NULL) { arg = channel->channel_callback_context; is_batched_reading = channel->batched_reading; @@ -947,5 +1024,4 @@ VmbusProcessChannelEvent(void* context, int pending) bytes_to_read = 0; } while (is_batched_reading && (bytes_to_read != 0)); } - // mtx_unlock(&channel->inbound_lock); } diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index ab6e8ad..00b54ed 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -30,7 +30,10 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include "hv_vmbus_priv.h" @@ -95,6 +98,14 @@ typedef struct hv_work_item { void* context; } hv_work_item; +static struct mtx vmbus_chwait_lock; +MTX_SYSINIT(vmbus_chwait_lk, &vmbus_chwait_lock, "vmbus primarych wait lock", + MTX_DEF); +static uint32_t vmbus_chancnt; +static uint32_t vmbus_devcnt; + +#define VMBUS_CHANCNT_DONE 0x80000000 + /** * Implementation of the work abstraction. */ @@ -143,9 +154,7 @@ hv_vmbus_allocate_channel(void) M_DEVBUF, M_WAITOK | M_ZERO); - mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); - TAILQ_INIT(&channel->sc_list_anchor); return (channel); @@ -158,8 +167,6 @@ void hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { mtx_destroy(&channel->sc_lock); - mtx_destroy(&channel->inbound_lock); - free(channel, M_DEVBUF); } @@ -170,13 +177,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) static void vmbus_channel_process_offer(hv_vmbus_channel *new_channel) { - boolean_t f_new; hv_vmbus_channel* channel; int ret; uint32_t relid; - f_new = TRUE; - channel = NULL; relid = new_channel->offer_msg.child_rel_id; /* * Make sure this is a new offer @@ -185,31 +189,24 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) hv_vmbus_g_connection.channels[relid] = new_channel; TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, - list_entry) - { + list_entry) { if (memcmp(&channel->offer_msg.offer.interface_type, &new_channel->offer_msg.offer.interface_type, sizeof(hv_guid)) == 0 && memcmp(&channel->offer_msg.offer.interface_instance, &new_channel->offer_msg.offer.interface_instance, - sizeof(hv_guid)) == 0) { - f_new = FALSE; + sizeof(hv_guid)) == 0) break; - } } - if (f_new) { - /* Insert at tail */ - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + if (channel == NULL) { + /* Install the new primary channel */ + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); } mtx_unlock(&hv_vmbus_g_connection.channel_lock); - /*XXX add new channel to percpu_list */ - - if (!f_new) { + if (channel != NULL) { /* * Check if this is a sub channel. */ @@ -218,17 +215,20 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) * It is a sub channel offer, process it. */ new_channel->primary_channel = channel; + new_channel->device = channel->device; mtx_lock(&channel->sc_lock); - TAILQ_INSERT_TAIL( - &channel->sc_list_anchor, - new_channel, - sc_list_entry); + TAILQ_INSERT_TAIL(&channel->sc_list_anchor, + new_channel, sc_list_entry); mtx_unlock(&channel->sc_lock); + if (bootverbose) { + printf("VMBUS get multi-channel offer, " + "rel=%u, sub=%u\n", + new_channel->offer_msg.child_rel_id, + new_channel->offer_msg.offer.sub_channel_index); + } + /* Insert new channel into channel_anchor. */ - printf("VMBUS get multi-channel offer, rel=%u,sub=%u\n", - new_channel->offer_msg.child_rel_id, - new_channel->offer_msg.offer.sub_channel_index); mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, new_channel, list_entry); @@ -239,17 +239,25 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) "its primary channel is <%p>.\n", new_channel, new_channel->primary_channel); - /*XXX add it to percpu_list */ - new_channel->state = HV_CHANNEL_OPEN_STATE; - if (channel->sc_creation_callback != NULL) { - channel->sc_creation_callback(new_channel); - } + + /* + * Bump up sub-channel count and notify anyone that is + * interested in this sub-channel, after this sub-channel + * is setup. + */ + mtx_lock(&channel->sc_lock); + channel->subchan_cnt++; + mtx_unlock(&channel->sc_lock); + wakeup(channel); + return; } - hv_vmbus_free_vmbus_channel(new_channel); - return; + printf("VMBUS: duplicated primary channel%u\n", + new_channel->offer_msg.child_rel_id); + hv_vmbus_free_vmbus_channel(new_channel); + return; } new_channel->state = HV_CHANNEL_OPEN_STATE; @@ -271,13 +279,37 @@ vmbus_channel_process_offer(hv_vmbus_channel *new_channel) ret = hv_vmbus_child_device_register(new_channel->device); if (ret != 0) { mtx_lock(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + TAILQ_REMOVE(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); mtx_unlock(&hv_vmbus_g_connection.channel_lock); hv_vmbus_free_vmbus_channel(new_channel); } + + mtx_lock(&vmbus_chwait_lock); + vmbus_devcnt++; + mtx_unlock(&vmbus_chwait_lock); + wakeup(&vmbus_devcnt); +} + +void +vmbus_channel_cpu_set(struct hv_vmbus_channel *chan, int cpu) +{ + KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu)); + + if (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008 || + hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) { + /* Only cpu0 is supported */ + cpu = 0; + } + + chan->target_cpu = cpu; + chan->target_vcpu = hv_vmbus_g_context.hv_vcpu_index[cpu]; + + if (bootverbose) { + printf("vmbus_chan%u: assigned to cpu%u [vcpu%u]\n", + chan->offer_msg.child_rel_id, + chan->target_cpu, chan->target_vcpu); + } } /** @@ -312,11 +344,12 @@ static uint32_t next_vcpu; * distributed across all available CPUs. */ static void -vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) +vmbus_channel_select_defcpu(struct hv_vmbus_channel *channel) { uint32_t current_cpu; int i; boolean_t is_perf_channel = FALSE; + const hv_guid *guid = &channel->offer_msg.offer.interface_type; for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { if (memcmp(guid->data, high_perf_devices[i].data, @@ -326,24 +359,14 @@ vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) } } - if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || - (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || - (!is_perf_channel)) { - /* Host's view of guest cpu */ - channel->target_vcpu = 0; - /* Guest's own view of cpu */ - channel->target_cpu = 0; + if (!is_perf_channel) { + /* Stick to cpu0 */ + vmbus_channel_cpu_set(channel, 0); return; } /* mp_ncpus should have the number cpus currently online */ current_cpu = (++next_vcpu % mp_ncpus); - channel->target_cpu = current_cpu; - channel->target_vcpu = - hv_vmbus_g_context.hv_vcpu_index[current_cpu]; - if (bootverbose) - printf("VMBUS: Total online cpus %d, assign perf channel %d " - "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, - current_cpu); + vmbus_channel_cpu_set(channel, current_cpu); } /** @@ -362,12 +385,6 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) offer = (hv_vmbus_channel_offer_channel*) hdr; - hv_guid *guidType; - hv_guid *guidInstance; - - guidType = &offer->offer.interface_type; - guidInstance = &offer->offer.interface_instance; - // copy offer data copied = malloc(sizeof(*copied), M_DEVBUF, M_NOWAIT); if (copied == NULL) { @@ -377,6 +394,11 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) memcpy(copied, hdr, sizeof(*copied)); hv_queue_work_item(vmbus_channel_on_offer_internal, copied); + + mtx_lock(&vmbus_chwait_lock); + if ((vmbus_chancnt & VMBUS_CHANCNT_DONE) == 0) + vmbus_chancnt++; + mtx_unlock(&vmbus_chwait_lock); } static void @@ -414,17 +436,14 @@ vmbus_channel_on_offer_internal(void* context) offer->connection_id; } - /* - * Bind the channel to a chosen cpu. - */ - vmbus_channel_select_cpu(new_channel, - &offer->offer.interface_type); - memcpy(&new_channel->offer_msg, offer, sizeof(hv_vmbus_channel_offer_channel)); new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32; + /* Select default cpu for this channel. */ + vmbus_channel_select_defcpu(new_channel); + vmbus_channel_process_offer(new_channel); free(offer, M_DEVBUF); @@ -458,7 +477,10 @@ vmbus_channel_on_offer_rescind_internal(void *context) hv_vmbus_channel* channel; channel = (hv_vmbus_channel*)context; - hv_vmbus_child_device_unregister(channel->device); + if (HV_VMBUS_CHAN_ISPRIMARY(channel)) { + /* Only primary channel owns the hv_device */ + hv_vmbus_child_device_unregister(channel->device); + } } /** @@ -468,6 +490,11 @@ vmbus_channel_on_offer_rescind_internal(void *context) static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr) { + + mtx_lock(&vmbus_chwait_lock); + vmbus_chancnt |= VMBUS_CHANCNT_DONE; + mtx_unlock(&vmbus_chwait_lock); + wakeup(&vmbus_chancnt); } /** @@ -490,7 +517,7 @@ vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr) /* * Find the open msg, copy the result and signal/unblock the wait event */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor, msg_list_entry) { @@ -508,7 +535,7 @@ vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr) } } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); } @@ -532,7 +559,7 @@ vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr) /* Find the establish msg, copy the result and signal/unblock * the wait event */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor, msg_list_entry) { request_header = (hv_vmbus_channel_msg_header*) msg_info->msg; @@ -551,7 +578,7 @@ vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr) } } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); } /** @@ -576,7 +603,7 @@ vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr) * wait event. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor, msg_list_entry) { @@ -596,7 +623,7 @@ vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr) } } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); } /** @@ -616,7 +643,7 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr) versionResponse = (hv_vmbus_channel_version_response*)hdr; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor, msg_list_entry) { requestHeader = (hv_vmbus_channel_msg_header*) msg_info->msg; @@ -630,7 +657,7 @@ vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr) sema_post(&msg_info->wait_sema); } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); } @@ -679,7 +706,10 @@ hv_vmbus_release_unattached_channels(void) TAILQ_REMOVE(&hv_vmbus_g_connection.channel_anchor, channel, list_entry); - hv_vmbus_child_device_unregister(channel->device); + if (HV_VMBUS_CHAN_ISPRIMARY(channel)) { + /* Only primary channel owns the hv_device */ + hv_vmbus_child_device_unregister(channel->device); + } hv_vmbus_free_vmbus_channel(channel); } bzero(hv_vmbus_g_connection.channels, @@ -742,3 +772,56 @@ vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) return(outgoing_channel); } + +void +vmbus_scan(void) +{ + uint32_t chancnt; + + mtx_lock(&vmbus_chwait_lock); + while ((vmbus_chancnt & VMBUS_CHANCNT_DONE) == 0) + mtx_sleep(&vmbus_chancnt, &vmbus_chwait_lock, 0, "waitch", 0); + chancnt = vmbus_chancnt & ~VMBUS_CHANCNT_DONE; + + while (vmbus_devcnt != chancnt) + mtx_sleep(&vmbus_devcnt, &vmbus_chwait_lock, 0, "waitdev", 0); + mtx_unlock(&vmbus_chwait_lock); +} + +struct hv_vmbus_channel ** +vmbus_get_subchan(struct hv_vmbus_channel *pri_chan, int subchan_cnt) +{ + struct hv_vmbus_channel **ret, *chan; + int i; + + ret = malloc(subchan_cnt * sizeof(struct hv_vmbus_channel *), M_TEMP, + M_WAITOK); + + mtx_lock(&pri_chan->sc_lock); + + while (pri_chan->subchan_cnt < subchan_cnt) + mtx_sleep(pri_chan, &pri_chan->sc_lock, 0, "subch", 0); + + i = 0; + TAILQ_FOREACH(chan, &pri_chan->sc_list_anchor, sc_list_entry) { + /* TODO: refcnt chan */ + ret[i] = chan; + + ++i; + if (i == subchan_cnt) + break; + } + KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d", + pri_chan->subchan_cnt, subchan_cnt)); + + mtx_unlock(&pri_chan->sc_lock); + + return ret; +} + +void +vmbus_rel_subchan(struct hv_vmbus_channel **subchan, int subchan_cnt __unused) +{ + + free(subchan, M_TEMP); +} diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index fb1879d..0424b47 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -99,26 +99,26 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, * Add to list before we send the request since we may receive the * response before returning from this routine */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_INSERT_TAIL( &hv_vmbus_g_connection.channel_msg_anchor, msg_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); ret = hv_vmbus_post_message( msg, sizeof(hv_vmbus_channel_initiate_contact)); if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_REMOVE( &hv_vmbus_g_connection.channel_msg_anchor, msg_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); return (ret); } @@ -127,12 +127,12 @@ hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, */ ret = sema_timedwait(&msg_info->wait_sema, 5 * hz); /* KYS 5 seconds */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_lock(&hv_vmbus_g_connection.channel_msg_lock); TAILQ_REMOVE( &hv_vmbus_g_connection.channel_msg_anchor, msg_info, msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_msg_lock); /** * Check if successful @@ -169,7 +169,7 @@ hv_vmbus_connect(void) { TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor); mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg", - NULL, MTX_SPIN); + NULL, MTX_DEF); TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", @@ -308,14 +308,18 @@ hv_vmbus_on_events(int cpu) KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " "cpu out of range!")); + page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; + event = (hv_vmbus_synic_event_flags *) + page_addr + HV_VMBUS_MESSAGE_SINT; if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; /* * receive size is 1/2 page and divide that by 4 bytes */ - recv_interrupt_page = - hv_vmbus_g_connection.recv_interrupt_page; + if (synch_test_and_clear_bit(0, &event->flags32[0])) + recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; } else { /* * On Host with Win8 or above, the event page can be @@ -323,9 +327,6 @@ hv_vmbus_on_events(int cpu) * that has the pending interrupt. */ maxdword = HV_EVENT_FLAGS_DWORD_COUNT; - page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; - event = (hv_vmbus_synic_event_flags *) - page_addr + HV_VMBUS_MESSAGE_SINT; recv_interrupt_page = event->flags32; } @@ -367,31 +368,35 @@ hv_vmbus_on_events(int cpu) /** * Send a msg on the vmbus's message connection */ -int hv_vmbus_post_message(void *buffer, size_t bufferLen) { - int ret = 0; +int hv_vmbus_post_message(void *buffer, size_t bufferLen) +{ hv_vmbus_connection_id connId; - unsigned retries = 0; + sbintime_t time = SBT_1MS; + int retries; + int ret; - /* NetScaler delays from previous code were consolidated here */ - static int delayAmount[] = {100, 100, 100, 500, 500, 5000, 5000, 5000}; + connId.as_uint32_t = 0; + connId.u.id = HV_VMBUS_MESSAGE_CONNECTION_ID; - /* for(each entry in delayAmount) try to post message, - * delay a little bit before retrying + /* + * We retry to cope with transient failures caused by host side's + * insufficient resources. 20 times should suffice in practice. */ - for (retries = 0; - retries < sizeof(delayAmount)/sizeof(delayAmount[0]); retries++) { - connId.as_uint32_t = 0; - connId.u.id = HV_VMBUS_MESSAGE_CONNECTION_ID; - ret = hv_vmbus_post_msg_via_msg_ipc(connId, 1, buffer, bufferLen); - if (ret != HV_STATUS_INSUFFICIENT_BUFFERS) - break; - /* TODO: KYS We should use a blocking wait call */ - DELAY(delayAmount[retries]); + for (retries = 0; retries < 20; retries++) { + ret = hv_vmbus_post_msg_via_msg_ipc(connId, 1, buffer, + bufferLen); + if (ret == HV_STATUS_SUCCESS) + return (0); + + pause_sbt("pstmsg", time, 0, C_HARDCLOCK); + if (time < SBT_1S * 2) + time *= 2; } - KASSERT(ret == 0, ("Error VMBUS: Message Post Failed\n")); + KASSERT(ret == HV_STATUS_SUCCESS, + ("Error VMBUS: Message Post Failed, ret=%d\n", ret)); - return (ret); + return (EAGAIN); } /** diff --git a/sys/dev/hyperv/vmbus/hv_et.c b/sys/dev/hyperv/vmbus/hv_et.c index d961486..440b514 100644 --- a/sys/dev/hyperv/vmbus/hv_et.c +++ b/sys/dev/hyperv/vmbus/hv_et.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2015 Microsoft Corp. + * Copyright (c) 2015,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,6 +28,9 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/smp.h> @@ -40,8 +43,7 @@ __FBSDID("$FreeBSD$"); #define HV_MAX_DELTA_TICKS 0xffffffffLL #define HV_MIN_DELTA_TICKS 1LL -static struct eventtimer et; -static uint64_t periodticks[MAXCPU]; +static struct eventtimer *et; static inline uint64_t sbintime2tick(sbintime_t time) @@ -60,11 +62,7 @@ hv_et_start(struct eventtimer *et, sbintime_t firsttime, sbintime_t periodtime) timer_cfg.as_uint64 = 0; timer_cfg.auto_enable = 1; - timer_cfg.sintx = HV_VMBUS_MESSAGE_SINT; - - periodticks[curcpu] = sbintime2tick(periodtime); - if (firsttime == 0) - firsttime = periodtime; + timer_cfg.sintx = HV_VMBUS_TIMER_SINT; current = rdmsr(HV_X64_MSR_TIME_REF_COUNT); current += sbintime2tick(firsttime); @@ -87,45 +85,77 @@ hv_et_stop(struct eventtimer *et) void hv_et_intr(struct trapframe *frame) { - union hv_timer_config timer_cfg; struct trapframe *oldframe; struct thread *td; - if (periodticks[curcpu] != 0) { - uint64_t tick = sbintime2tick(periodticks[curcpu]); - timer_cfg.as_uint64 = rdmsr(HV_X64_MSR_STIMER0_CONFIG); - timer_cfg.enable = 0; - timer_cfg.auto_enable = 1; - timer_cfg.periodic = 1; - periodticks[curcpu] = 0; - - wrmsr(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); - wrmsr(HV_X64_MSR_STIMER0_COUNT, tick); - } - - if (et.et_active) { + if (et->et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; - et.et_event_cb(&et, et.et_arg); + et->et_event_cb(et, et->et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } } -void -hv_et_init(void) +static void +hv_et_identify(driver_t *driver, device_t parent) { - et.et_name = "HyperV"; - et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU | ET_FLAGS_PERIODIC; - et.et_quality = 1000; - et.et_frequency = HV_TIMER_FREQUENCY; - et.et_min_period = (1LL << 32) / HV_TIMER_FREQUENCY; - et.et_max_period = HV_MAX_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY); - et.et_start = hv_et_start; - et.et_stop = hv_et_stop; - et.et_priv = &et; - et_register(&et); + if (device_find_child(parent, "hv_et", -1) != NULL) + return; + + device_add_child(parent, "hv_et", -1); +} + +static int +hv_et_probe(device_t dev) +{ + device_set_desc(dev, "Hyper-V event timer"); + + return (BUS_PROBE_NOWILDCARD); } +static int +hv_et_attach(device_t dev) +{ + /* XXX: need allocate SINT and remove global et */ + et = device_get_softc(dev); + + et->et_name = "Hyper-V"; + et->et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; + et->et_quality = 1000; + et->et_frequency = HV_TIMER_FREQUENCY; + et->et_min_period = HV_MIN_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY); + et->et_max_period = HV_MAX_DELTA_TICKS * ((1LL << 32) / HV_TIMER_FREQUENCY); + et->et_start = hv_et_start; + et->et_stop = hv_et_stop; + et->et_priv = dev; + + return (et_register(et)); +} + +static int +hv_et_detach(device_t dev) +{ + return (et_deregister(et)); +} + +static device_method_t hv_et_methods[] = { + DEVMETHOD(device_identify, hv_et_identify), + DEVMETHOD(device_probe, hv_et_probe), + DEVMETHOD(device_attach, hv_et_attach), + DEVMETHOD(device_detach, hv_et_detach), + + DEVMETHOD_END +}; + +static driver_t hv_et_driver = { + "hv_et", + hv_et_methods, + sizeof(struct eventtimer) +}; + +static devclass_t hv_et_devclass; +DRIVER_MODULE(hv_et, vmbus, hv_et_driver, hv_et_devclass, NULL, 0); +MODULE_VERSION(hv_et, 1); diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index 6afc2b8..70a5608 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/kernel.h> #include <sys/malloc.h> #include <sys/pcpu.h> #include <sys/timetc.h> @@ -47,9 +48,16 @@ __FBSDID("$FreeBSD$"); #define HV_NANOSECONDS_PER_SEC 1000000000L +#define HYPERV_INTERFACE 0x31237648 /* HV#1 */ static u_int hv_get_timecount(struct timecounter *tc); +u_int hyperv_features; +u_int hyperv_recommends; + +static u_int hyperv_pm_features; +static u_int hyperv_features3; + /** * Globals */ @@ -70,47 +78,6 @@ hv_get_timecount(struct timecounter *tc) } /** - * @brief Query the cpuid for presence of windows hypervisor - */ -int -hv_vmbus_query_hypervisor_presence(void) -{ - if (vm_guest != VM_GUEST_HV) - return (0); - - return (hv_high >= HV_X64_CPUID_MIN && hv_high <= HV_X64_CPUID_MAX); -} - -/** - * @brief Get version of the windows hypervisor - */ -static int -hv_vmbus_get_hypervisor_version(void) -{ - u_int regs[4]; - unsigned int maxLeaf; - unsigned int op; - - /* - * Its assumed that this is called after confirming that - * Viridian is present - * Query id and revision. - */ - op = HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION; - do_cpuid(op, regs); - - maxLeaf = regs[0]; - op = HV_CPU_ID_FUNCTION_HV_INTERFACE; - do_cpuid(op, regs); - - if (maxLeaf >= HV_CPU_ID_FUNCTION_MS_HV_VERSION) { - op = HV_CPU_ID_FUNCTION_MS_HV_VERSION; - do_cpuid(op, regs); - } - return (maxLeaf); -} - -/** * @brief Invoke the specified hypercall */ static uint64_t @@ -159,9 +126,8 @@ hv_vmbus_do_hypercall(uint64_t control, void* input, void* output) int hv_vmbus_init(void) { - int max_leaf; hv_vmbus_x64_msr_hypercall_contents hypercall_msr; - void* virt_addr = 0; + void* virt_addr = NULL; memset( hv_vmbus_g_context.syn_ic_event_page, @@ -176,8 +142,6 @@ hv_vmbus_init(void) if (vm_guest != VM_GUEST_HV) goto cleanup; - max_leaf = hv_vmbus_get_hypervisor_version(); - /* * Write our OS info */ @@ -207,10 +171,6 @@ hv_vmbus_init(void) hv_vmbus_g_context.hypercall_page = virt_addr; - tc_init(&hv_timecounter); /* register virtual timecount */ - - hv_et_init(); - return (0); cleanup: @@ -368,6 +328,9 @@ hv_vmbus_synic_init(void *arg) wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); + wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT, + shared_sint.as_uint64_t); + /* Enable the global synic bit */ sctrl.as_uint64_t = rdmsr(HV_X64_MSR_SCONTROL); sctrl.u.enable = 1; @@ -404,12 +367,23 @@ void hv_vmbus_synic_cleanup(void *arg) shared_sint.u.masked = 1; /* - * Disable the interrupt + * Disable the interrupt 0 */ wrmsr( HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); + shared_sint.as_uint64_t = rdmsr( + HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT); + + shared_sint.u.masked = 1; + + /* + * Disable the interrupt 1 + */ + wrmsr( + HV_X64_MSR_SINT0 + HV_VMBUS_TIMER_SINT, + shared_sint.as_uint64_t); simp.as_uint64_t = rdmsr(HV_X64_MSR_SIMP); simp.u.simp_enabled = 0; simp.u.base_simp_gpa = 0; @@ -423,3 +397,117 @@ void hv_vmbus_synic_cleanup(void *arg) wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); } +static bool +hyperv_identify(void) +{ + u_int regs[4]; + unsigned int maxLeaf; + unsigned int op; + + if (vm_guest != VM_GUEST_HV) + return (false); + + op = HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION; + do_cpuid(op, regs); + maxLeaf = regs[0]; + if (maxLeaf < HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS) + return (false); + + op = HV_CPU_ID_FUNCTION_HV_INTERFACE; + do_cpuid(op, regs); + if (regs[0] != HYPERV_INTERFACE) + return (false); + + op = HV_CPU_ID_FUNCTION_MS_HV_FEATURES; + do_cpuid(op, regs); + if ((regs[0] & HV_FEATURE_MSR_HYPERCALL) == 0) { + /* + * Hyper-V w/o Hypercall is impossible; someone + * is faking Hyper-V. + */ + return (false); + } + hyperv_features = regs[0]; + hyperv_pm_features = regs[2]; + hyperv_features3 = regs[3]; + + op = HV_CPU_ID_FUNCTION_MS_HV_VERSION; + do_cpuid(op, regs); + printf("Hyper-V Version: %d.%d.%d [SP%d]\n", + regs[1] >> 16, regs[1] & 0xffff, regs[0], regs[2]); + + printf(" Features=0x%b\n", hyperv_features, + "\020" + "\001VPRUNTIME" /* MSR_VP_RUNTIME */ + "\002TMREFCNT" /* MSR_TIME_REF_COUNT */ + "\003SYNIC" /* MSRs for SynIC */ + "\004SYNTM" /* MSRs for SynTimer */ + "\005APIC" /* MSR_{EOI,ICR,TPR} */ + "\006HYPERCALL" /* MSR_{GUEST_OS_ID,HYPERCALL} */ + "\007VPINDEX" /* MSR_VP_INDEX */ + "\010RESET" /* MSR_RESET */ + "\011STATS" /* MSR_STATS_ */ + "\012REFTSC" /* MSR_REFERENCE_TSC */ + "\013IDLE" /* MSR_GUEST_IDLE */ + "\014TMFREQ" /* MSR_{TSC,APIC}_FREQUENCY */ + "\015DEBUG"); /* MSR_SYNTH_DEBUG_ */ + printf(" PM Features=max C%u, 0x%b\n", + HV_PM_FEATURE_CSTATE(hyperv_pm_features), + (hyperv_pm_features & ~HV_PM_FEATURE_CSTATE_MASK), + "\020" + "\005C3HPET"); /* HPET is required for C3 state */ + printf(" Features3=0x%b\n", hyperv_features3, + "\020" + "\001MWAIT" /* MWAIT */ + "\002DEBUG" /* guest debug support */ + "\003PERFMON" /* performance monitor */ + "\004PCPUDPE" /* physical CPU dynamic partition event */ + "\005XMMHC" /* hypercall input through XMM regs */ + "\006IDLE" /* guest idle support */ + "\007SLEEP" /* hypervisor sleep support */ + "\010NUMA" /* NUMA distance query support */ + "\011TMFREQ" /* timer frequency query (TSC, LAPIC) */ + "\012SYNCMC" /* inject synthetic machine checks */ + "\013CRASH" /* MSRs for guest crash */ + "\014DEBUGMSR" /* MSRs for guest debug */ + "\015NPIEP" /* NPIEP */ + "\016HVDIS"); /* disabling hypervisor */ + + op = HV_CPU_ID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION; + do_cpuid(op, regs); + hyperv_recommends = regs[0]; + if (bootverbose) + printf(" Recommends: %08x %08x\n", regs[0], regs[1]); + + op = HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS; + do_cpuid(op, regs); + if (bootverbose) { + printf(" Limits: Vcpu:%d Lcpu:%d Int:%d\n", + regs[0], regs[1], regs[2]); + } + + if (maxLeaf >= HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE) { + op = HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE; + do_cpuid(op, regs); + if (bootverbose) { + printf(" HW Features: %08x AMD: %08x\n", + regs[0], regs[3]); + } + } + + return (true); +} + +static void +hyperv_init(void *dummy __unused) +{ + if (!hyperv_identify()) + return; + + if (hyperv_features & HV_FEATURE_MSR_TIME_REFCNT) { + /* Register virtual timecount */ + tc_init(&hv_timecounter); + } +} +SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init, + NULL); diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c index 0e51ef7..cd82b27 100644 --- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c +++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/lock.h> #include <sys/mutex.h> +#include <sys/sysctl.h> #include "hv_vmbus_priv.h" @@ -39,6 +40,47 @@ __FBSDID("$FreeBSD$"); #define HV_BYTES_AVAIL_TO_WRITE(r, w, z) ((w) >= (r))? \ ((z) - ((w) - (r))):((r) - (w)) +static int +hv_rbi_sysctl_stats(SYSCTL_HANDLER_ARGS) +{ + hv_vmbus_ring_buffer_info* rbi; + uint32_t read_index, write_index, interrupt_mask, sz; + uint32_t read_avail, write_avail; + char rbi_stats[256]; + + rbi = (hv_vmbus_ring_buffer_info*)arg1; + read_index = rbi->ring_buffer->read_index; + write_index = rbi->ring_buffer->write_index; + interrupt_mask = rbi->ring_buffer->interrupt_mask; + sz = rbi->ring_data_size; + write_avail = HV_BYTES_AVAIL_TO_WRITE(read_index, + write_index, sz); + read_avail = sz - write_avail; + snprintf(rbi_stats, sizeof(rbi_stats), + "r_idx:%d " + "w_idx:%d " + "int_mask:%d " + "r_avail:%d " + "w_avail:%d", + read_index, write_index, interrupt_mask, + read_avail, write_avail); + + return (sysctl_handle_string(oidp, rbi_stats, + sizeof(rbi_stats), req)); +} + +void +hv_ring_buffer_stat( + struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *tree_node, + hv_vmbus_ring_buffer_info *rbi, + const char *desc) +{ + SYSCTL_ADD_PROC(ctx, tree_node, OID_AUTO, + "ring_buffer_stats", + CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, rbi, 0, + hv_rbi_sysctl_stats, "A", desc); +} /** * @brief Get number of bytes available to read and to write to * for the specified ring buffer diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index c8d6894..e274d59 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <sys/pcpu.h> #include <machine/apicvar.h> +#include <dev/hyperv/include/hyperv.h> #include "hv_vmbus_priv.h" #include <contrib/dev/acpica/include/acpi.h> @@ -75,7 +76,7 @@ static char *vmbus_ids[] = { "VMBUS", NULL }; * the hypervisor. */ static void -vmbus_msg_swintr(void *arg) +vmbus_msg_swintr(void *arg, int pending __unused) { int cpu; void* page_addr; @@ -116,8 +117,12 @@ handled: * message_pending and EOMing. Otherwise, the EOMing will * not deliver any more messages * since there is no empty slot + * + * NOTE: + * mb() is used here, since atomic_thread_fence_seq_cst() + * will become compiler fence on UP kernel. */ - wmb(); + mb(); if (msg->header.message_flags.u.message_pending) { /* @@ -140,7 +145,6 @@ hv_vmbus_isr(struct trapframe *frame) { int cpu; hv_vmbus_message* msg; - hv_vmbus_synic_event_flags* event; void* page_addr; cpu = PCPU_GET(cpuid); @@ -151,43 +155,31 @@ hv_vmbus_isr(struct trapframe *frame) * in Windows when running as a guest in Hyper-V */ - page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; - event = (hv_vmbus_synic_event_flags*) - page_addr + HV_VMBUS_MESSAGE_SINT; - - if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || - (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { - /* Since we are a child, we only need to check bit 0 */ - if (synch_test_and_clear_bit(0, &event->flags32[0])) { - hv_vmbus_on_events(cpu); - } - } else { - /* - * On host with Win8 or above, we can directly look at - * the event page. If bit n is set, we have an interrupt - * on the channel with id n. - * Directly schedule the event software interrupt on - * current cpu. - */ - hv_vmbus_on_events(cpu); - } + hv_vmbus_on_events(cpu); /* Check if there are actual msgs to be process */ page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; - msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; + msg = (hv_vmbus_message*) page_addr + HV_VMBUS_TIMER_SINT; /* we call eventtimer process the message */ if (msg->header.message_type == HV_MESSAGE_TIMER_EXPIRED) { msg->header.message_type = HV_MESSAGE_TYPE_NONE; + /* call intrrupt handler of event timer */ + hv_et_intr(frame); + /* * Make sure the write to message_type (ie set to * HV_MESSAGE_TYPE_NONE) happens before we read the * message_pending and EOMing. Otherwise, the EOMing will * not deliver any more messages * since there is no empty slot + * + * NOTE: + * mb() is used here, since atomic_thread_fence_seq_cst() + * will become compiler fence on UP kernel. */ - wmb(); + mb(); if (msg->header.message_flags.u.message_pending) { /* @@ -196,12 +188,12 @@ hv_vmbus_isr(struct trapframe *frame) */ wrmsr(HV_X64_MSR_EOM, 0); } - hv_et_intr(frame); - return (FILTER_HANDLED); } + msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { - swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); + taskqueue_enqueue(hv_vmbus_g_context.hv_msg_tq[cpu], + &hv_vmbus_g_context.hv_msg_task[cpu]); } return (FILTER_HANDLED); @@ -279,6 +271,26 @@ vmbus_write_ivar( return (ENOENT); } +static int +vmbus_child_pnpinfo_str(device_t dev, device_t child, char *buf, size_t buflen) +{ + char guidbuf[40]; + struct hv_device *dev_ctx = device_get_ivars(child); + + if (dev_ctx == NULL) + return (0); + + strlcat(buf, "classid=", buflen); + snprintf_hv_guid(guidbuf, sizeof(guidbuf), &dev_ctx->class_id); + strlcat(buf, guidbuf, buflen); + + strlcat(buf, " deviceid=", buflen); + snprintf_hv_guid(guidbuf, sizeof(guidbuf), &dev_ctx->device_id); + strlcat(buf, guidbuf, buflen); + + return (0); +} + struct hv_device* hv_vmbus_child_device_create( hv_guid type, @@ -300,34 +312,34 @@ hv_vmbus_child_device_create( return (child_dev); } -static void -print_dev_guid(struct hv_device *dev) +int +snprintf_hv_guid(char *buf, size_t sz, const hv_guid *guid) { - int i; - unsigned char guid_name[100]; - for (i = 0; i < 32; i += 2) - sprintf(&guid_name[i], "%02x", dev->class_id.data[i / 2]); - if(bootverbose) - printf("VMBUS: Class ID: %s\n", guid_name); + int cnt; + const unsigned char *d = guid->data; + + cnt = snprintf(buf, sz, + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + d[3], d[2], d[1], d[0], d[5], d[4], d[7], d[6], + d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]); + return (cnt); } int hv_vmbus_child_device_register(struct hv_device *child_dev) { device_t child; - int ret = 0; - - print_dev_guid(child_dev); + if (bootverbose) { + char name[40]; + snprintf_hv_guid(name, sizeof(name), &child_dev->class_id); + printf("VMBUS: Class ID: %s\n", name); + } child = device_add_child(vmbus_devp, NULL, -1); child_dev->device = child; device_set_ivars(child, child_dev); - mtx_lock(&Giant); - ret = device_probe_and_attach(child); - mtx_unlock(&Giant); - return (0); } @@ -356,7 +368,6 @@ vmbus_probe(device_t dev) { return (BUS_PROBE_DEFAULT); } -#ifdef HYPERV extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); /** @@ -416,21 +427,6 @@ vmbus_vector_free(int vector) setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } -#else /* HYPERV */ - -static int -vmbus_vector_alloc(void) -{ - return(0); -} - -static void -vmbus_vector_free(int vector) -{ -} - -#endif /* HYPERV */ - static void vmbus_cpuset_setthread_task(void *xmask, int pending __unused) { @@ -498,9 +494,6 @@ vmbus_bus_init(void) setup_args.vector = hv_vmbus_g_context.hv_cb_vector; CPU_FOREACH(j) { - hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; - hv_vmbus_g_context.msg_swintr[j] = NULL; - snprintf(buf, sizeof(buf), "cpu%d:hyperv", j); intrcnt_add(buf, &hv_vmbus_intr_cpu[j]); @@ -519,11 +512,6 @@ vmbus_bus_init(void) */ hv_vmbus_g_context.hv_event_queue[j] = taskqueue_create_fast("hyperv event", M_WAITOK, taskqueue_thread_enqueue, &hv_vmbus_g_context.hv_event_queue[j]); - if (hv_vmbus_g_context.hv_event_queue[j] == NULL) { - if (bootverbose) - printf("VMBUS: failed to setup taskqueue\n"); - goto cleanup1; - } taskqueue_start_threads(&hv_vmbus_g_context.hv_event_queue[j], 1, PI_NET, "hvevent%d", j); @@ -533,29 +521,20 @@ vmbus_bus_init(void) taskqueue_drain(hv_vmbus_g_context.hv_event_queue[j], &cpuset_task); /* - * Setup software interrupt thread and handler for msg handling. + * Setup per-cpu tasks and taskqueues to handle msg. */ - ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], - "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, - &hv_vmbus_g_context.msg_swintr[j]); - if (ret) { - if(bootverbose) - printf("VMBUS: failed to setup msg swi for " - "cpu %d\n", j); - goto cleanup1; - } + hv_vmbus_g_context.hv_msg_tq[j] = taskqueue_create_fast( + "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, + &hv_vmbus_g_context.hv_msg_tq[j]); + taskqueue_start_threads(&hv_vmbus_g_context.hv_msg_tq[j], 1, PI_NET, + "hvmsg%d", j); + TASK_INIT(&hv_vmbus_g_context.hv_msg_task[j], 0, + vmbus_msg_swintr, (void *)(long)j); - /* - * Bind the swi thread to the cpu. - */ - ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], - j); - if (ret) { - if(bootverbose) - printf("VMBUS: failed to bind msg swi thread " - "to cpu %d\n", j); - goto cleanup1; - } + CPU_SETOF(j, &cpu_mask); + TASK_INIT(&cpuset_task, 0, vmbus_cpuset_setthread_task, &cpu_mask); + taskqueue_enqueue(hv_vmbus_g_context.hv_msg_tq[j], &cpuset_task); + taskqueue_drain(hv_vmbus_g_context.hv_msg_tq[j], &cpuset_task); /* * Prepare the per cpu msg and event pages to be called on each cpu. @@ -581,6 +560,11 @@ vmbus_bus_init(void) goto cleanup1; hv_vmbus_request_channel_offers(); + + vmbus_scan(); + bus_generic_attach(vmbus_devp); + device_printf(vmbus_devp, "device scan, probe and attach done\n"); + return (ret); cleanup1: @@ -595,11 +579,10 @@ vmbus_bus_init(void) * remove swi and vmbus callback vector; */ CPU_FOREACH(j) { - if (hv_vmbus_g_context.hv_event_queue[j] != NULL) + if (hv_vmbus_g_context.hv_event_queue[j] != NULL) { taskqueue_free(hv_vmbus_g_context.hv_event_queue[j]); - if (hv_vmbus_g_context.msg_swintr[j] != NULL) - swi_remove(hv_vmbus_g_context.msg_swintr[j]); - hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.hv_event_queue[j] = NULL; + } } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); @@ -626,6 +609,7 @@ vmbus_attach(device_t dev) if (!cold) vmbus_bus_init(); + bus_generic_probe(dev); return (0); } @@ -656,7 +640,7 @@ vmbus_bus_exit(void) smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); for(i = 0; i < 2 * MAXCPU; i++) { - if (setup_args.page_buffers[i] != 0) + if (setup_args.page_buffers[i] != NULL) free(setup_args.page_buffers[i], M_DEVBUF); } @@ -664,11 +648,10 @@ vmbus_bus_exit(void) /* remove swi */ CPU_FOREACH(i) { - if (hv_vmbus_g_context.hv_event_queue[i] != NULL) + if (hv_vmbus_g_context.hv_event_queue[i] != NULL) { taskqueue_free(hv_vmbus_g_context.hv_event_queue[i]); - if (hv_vmbus_g_context.msg_swintr[i] != NULL) - swi_remove(hv_vmbus_g_context.msg_swintr[i]); - hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; + hv_vmbus_g_context.hv_event_queue[i] = NULL; + } } vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); @@ -733,6 +716,7 @@ static device_method_t vmbus_methods[] = { DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_read_ivar, vmbus_read_ivar), DEVMETHOD(bus_write_ivar, vmbus_write_ivar), + DEVMETHOD(bus_child_pnpinfo_str, vmbus_child_pnpinfo_str), { 0, 0 } }; diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 5f62072..f83102a 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. @@ -70,6 +70,7 @@ typedef uint16_t hv_vmbus_status; * You did not supply enough message buffers to send a message. */ +#define HV_STATUS_SUCCESS ((uint16_t)0) #define HV_STATUS_INSUFFICIENT_BUFFERS ((uint16_t)0x0013) typedef void (*hv_vmbus_channel_callback)(void *context); @@ -180,7 +181,8 @@ enum { HV_VMBUS_EVENT_PORT_ID = 2, HV_VMBUS_MONITOR_CONNECTION_ID = 3, HV_VMBUS_MONITOR_PORT_ID = 3, - HV_VMBUS_MESSAGE_SINT = 2 + HV_VMBUS_MESSAGE_SINT = 2, + HV_VMBUS_TIMER_SINT = 4, }; #define HV_PRESENT_BIT 0x80000000 @@ -203,8 +205,8 @@ typedef struct { * event and msg handling. */ struct taskqueue *hv_event_queue[MAXCPU]; - struct intr_event *hv_msg_intr_event[MAXCPU]; - void *msg_swintr[MAXCPU]; + struct taskqueue *hv_msg_tq[MAXCPU]; + struct task hv_msg_task[MAXCPU]; /* * Host use this vector to intrrupt guest for vmbus channel * event and msg. @@ -469,10 +471,28 @@ typedef enum { HV_CPU_ID_FUNCTION_MS_HV_VERSION = 0x40000002, HV_CPU_ID_FUNCTION_MS_HV_FEATURES = 0x40000003, HV_CPU_ID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION = 0x40000004, - HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS = 0x40000005 - + HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS = 0x40000005, + HV_CPU_ID_FUNCTION_MS_HV_HARDWARE_FEATURE = 0x40000006 } hv_vmbus_cpuid_function; +#define HV_FEATURE_MSR_TIME_REFCNT 0x0002 /* MSR_TIME_REF_COUNT */ +#define HV_FEATURE_MSR_SYNIC 0x0004 /* MSRs for SynIC */ +#define HV_FEATURE_MSR_SYNTIMER 0x0008 /* MSRs for SynTimer */ +#define HV_FEATURE_MSR_APIC 0x0010 /* MSR_{EOI,ICR,TPR} */ +#define HV_FEATURE_MSR_HYPERCALL 0x0020 /* MSR_{GUEST_OS_ID,HYPERCALL} */ +#define HV_FEATURE_MSR_GUEST_IDLE 0x0400 /* MSR_GUEST_IDLE */ + +#define HV_PM_FEATURE_CSTATE_MASK 0x000f +#define HV_PM_FEATURE_C3_HPET 0x0010 /* C3 requires HPET */ +#define HV_PM_FEATURE_CSTATE(f) ((f) & HV_PM_FEATURE_CSTATE_MASK) + +#define HV_FEATURE3_MWAIT 0x0001 /* MWAIT */ +#define HV_FEATURE3_XMM_HYPERCALL 0x0010 /* hypercall input through XMM regs */ +#define HV_FEATURE3_GUEST_IDLE 0x0020 /* guest idle support */ +#define HV_FEATURE3_NUMA 0x0080 /* NUMA distance query support */ +#define HV_FEATURE3_TIME_FREQ 0x0100 /* timer frequency query (TSC, LAPIC) */ +#define HV_FEATURE3_MSR_CRASH 0x0400 /* MSRs for guest crash */ + /* * Define the format of the SIMP register */ @@ -626,6 +646,9 @@ typedef enum { extern hv_vmbus_context hv_vmbus_g_context; extern hv_vmbus_connection hv_vmbus_g_connection; +extern u_int hyperv_features; +extern u_int hyperv_recommends; + typedef void (*vmbus_msg_handler)(hv_vmbus_channel_msg_header *msg); typedef struct hv_vmbus_channel_msg_table_entry { @@ -639,6 +662,14 @@ extern hv_vmbus_channel_msg_table_entry g_channel_message_table[]; /* * Private, VM Bus functions */ +struct sysctl_ctx_list; +struct sysctl_oid_list; + +void hv_ring_buffer_stat( + struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *tree_node, + hv_vmbus_ring_buffer_info *rbi, + const char *desc); int hv_vmbus_ring_buffer_init( hv_vmbus_ring_buffer_info *ring_info, @@ -694,7 +725,6 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc( uint16_t hv_vmbus_signal_event(void *con_id); void hv_vmbus_synic_init(void *irq_arg); void hv_vmbus_synic_cleanup(void *arg); -int hv_vmbus_query_hypervisor_presence(void); struct hv_device* hv_vmbus_child_device_create( hv_guid device_type, @@ -721,6 +751,9 @@ void hv_vmbus_on_events(int cpu); void hv_et_init(void); void hv_et_intr(struct trapframe*); +/* Wait for device creation */ +void vmbus_scan(void); + /* * The guest OS needs to register the guest ID with the hypervisor. * The guest ID is a 64 bit entity and the structure of this ID is diff --git a/sys/dev/hyperv/vmbus/i386/hv_vector.S b/sys/dev/hyperv/vmbus/i386/hv_vector.S new file mode 100644 index 0000000..55a2613 --- /dev/null +++ b/sys/dev/hyperv/vmbus/i386/hv_vector.S @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> + +#include "assym.s" + +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + SET_KERNEL_SREGS + cld + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call hv_vector_handler + add $4, %esp + MEXITCOUNT + jmp doreti |