Diffstat (limited to 'sys')
18 files changed, 12324 insertions, 0 deletions
diff --git a/sys/contrib/dev/hyperv/README b/sys/contrib/dev/hyperv/README
new file mode 100644
index 0000000..1c4488d
--- /dev/null
+++ b/sys/contrib/dev/hyperv/README
@@ -0,0 +1,34 @@
+***** Release rc2.3.0 4/27/2012 **************************************************
+
+New features/limitations-
+
+-Added Fast IDE
+-Massive code restructuring to meet FreeBSD style guidelines
+
+***** Release rc2.2.0 1/4/2012 ***************************************************
+
+New features/limitations-
+
+-Port of LIS 2.1 with FreeBSD support code from Citrix; drivers are linked with
+ the kernel (future drivers will be loadable); the port has not been refactored
+ to meet BSD coding standards
+
+-SCSI device driver functional, but support for scatter-gather lists is not
+ implemented; Fast IDE support has not been added, so emulated IDE support is
+ still used
+
+-Network storage device support has been added
+
+-While the storage and networking devices support multiple controllers, we're
+ waiting on a resolution from Microsoft to enable persistent and consistent
+ numbering between boots
+
+-Hyper-V bus has been ported with support code from Citrix to handle clock
+ synchronization between guest and host. Clock synchronization and heartbeat
+ logic have been moved to two separate drivers; this separation is part of
+ the initial steps for refactoring and restructuring the Hyper-V bus driver
+ from the LIS 2.1 codebase
+
+Bug fixes-
+
+*******************************************************************************
\ No newline at end of file diff --git a/sys/contrib/dev/hyperv/include/hyperv.h b/sys/contrib/dev/hyperv/include/hyperv.h new file mode 100644 index 0000000..3651269 --- /dev/null +++ b/sys/contrib/dev/hyperv/include/hyperv.h @@ -0,0 +1,796 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * HyperV definitions for messages that are sent between instances of the + * Channel Management Library in separate partitions, or in some cases, + * back to itself. + */ + +#ifndef __HYPERV_H__ +#define __HYPERV_H__ + +#include <sys/param.h> +#include <sys/mbuf.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kthread.h> +#include <sys/taskqueue.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/sema.h> +#include <sys/mutex.h> +#include <sys/bus.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <amd64/include/xen/synch_bitops.h> +#include <amd64/include/atomic.h> + +typedef uint8_t hv_bool_uint8_t; + +#define HV_S_OK 0x00000000 +#define HV_E_FAIL 0x80004005 +#define HV_ERROR_NOT_SUPPORTED 0x80070032 +#define HV_ERROR_MACHINE_LOCKED 0x800704F7 + +/* + * A revision number of vmbus that is used for ensuring both ends on a + * partition are using compatible versions. + */ + +#define HV_VMBUS_REVISION_NUMBER 13 + +/* + * Make maximum size of pipe payload of 16K + */ + +#define HV_MAX_PIPE_DATA_PAYLOAD (sizeof(BYTE) * 16384) + +/* + * Define pipe_mode values + */ + +#define HV_VMBUS_PIPE_TYPE_BYTE 0x00000000 +#define HV_VMBUS_PIPE_TYPE_MESSAGE 0x00000004 + +/* + * The size of the user defined data buffer for non-pipe offers + */ + +#define HV_MAX_USER_DEFINED_BYTES 120 + +/* + * The size of the user defined data buffer for pipe offers + */ + +#define HV_MAX_PIPE_USER_DEFINED_BYTES 116 + + +#define HV_MAX_PAGE_BUFFER_COUNT 16 +#define HV_MAX_MULTIPAGE_BUFFER_COUNT 32 + +#define HV_ALIGN_UP(value, align) \ + (((value) & (align-1)) ? 
\ + (((value) + (align-1)) & ~(align-1) ) : (value)) + +#define HV_ALIGN_DOWN(value, align) ( (value) & ~(align-1) ) + +#define HV_NUM_PAGES_SPANNED(addr, len) \ + ((HV_ALIGN_UP(addr+len, PAGE_SIZE) - \ + HV_ALIGN_DOWN(addr, PAGE_SIZE)) >> PAGE_SHIFT ) + +typedef struct hv_guid { + unsigned char data[16]; +} __packed hv_guid; + +/* + * At the center of the Channel Management library is + * the Channel Offer. This struct contains the + * fundamental information about an offer. + */ + +typedef struct hv_vmbus_channel_offer { + hv_guid interface_type; + hv_guid interface_instance; + uint64_t interrupt_latency_in_100ns_units; + uint32_t interface_revision; + uint32_t server_context_area_size; /* in bytes */ + uint16_t channel_flags; + uint16_t mmio_megabytes; /* in bytes * 1024 * 1024 */ + union + { + /* + * Non-pipes: The user has HV_MAX_USER_DEFINED_BYTES bytes. + */ + struct { + uint8_t user_defined[HV_MAX_USER_DEFINED_BYTES]; + } __packed standard; + + /* + * Pipes: The following structure is an integrated pipe protocol, which + * is implemented on top of standard user-defined data. pipe + * clients have HV_MAX_PIPE_USER_DEFINED_BYTES left for their + * own use. + */ + struct { + uint32_t pipe_mode; + uint8_t user_defined[HV_MAX_PIPE_USER_DEFINED_BYTES]; + } __packed pipe; + } u; + + uint32_t padding; + +} __packed hv_vmbus_channel_offer; + +typedef uint32_t hv_gpadl_handle; + +typedef struct { + uint16_t type; + uint16_t data_offset8; + uint16_t length8; + uint16_t flags; + uint64_t transaction_id; +} __packed hv_vm_packet_descriptor; + +typedef uint32_t hv_previous_packet_offset; + +typedef struct { + hv_previous_packet_offset previous_packet_start_offset; + hv_vm_packet_descriptor descriptor; +} __packed hv_vm_packet_header; + +typedef struct { + uint32_t byte_count; + uint32_t byte_offset; +} __packed hv_vm_transfer_page; + +typedef struct { + hv_vm_packet_descriptor d; + uint16_t transfer_page_set_id; + hv_bool_uint8_t sender_owns_set; + uint8_t reserved; + uint32_t range_count; + hv_vm_transfer_page ranges[1]; +} __packed hv_vm_transfer_page_packet_header; + +typedef struct { + hv_vm_packet_descriptor d; + uint32_t gpadl; + uint32_t reserved; +} __packed hv_vm_gpadl_packet_header; + +typedef struct { + hv_vm_packet_descriptor d; + uint32_t gpadl; + uint16_t transfer_page_set_id; + uint16_t reserved; +} __packed hv_vm_add_remove_transfer_page_set; + +/* + * This structure defines a range in guest + * physical space that can be made + * to look virtually contiguous. + */ + +typedef struct { + uint32_t byte_count; + uint32_t byte_offset; + uint64_t pfn_array[0]; +} __packed hv_gpa_range; + +/* + * This is the format for an Establish Gpadl packet, which contains a handle + * by which this GPADL will be known and a set of GPA ranges associated with + * it. This can be converted to a MDL by the guest OS. If there are multiple + * GPA ranges, then the resulting MDL will be "chained," representing multiple + * VA ranges. + */ + +typedef struct { + hv_vm_packet_descriptor d; + uint32_t gpadl; + uint32_t range_count; + hv_gpa_range range[1]; +} __packed hv_vm_establish_gpadl; + +/* + * This is the format for a Teardown Gpadl packet, which indicates that the + * GPADL handle in the Establish Gpadl packet will never be referenced again. 
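+ * A teardown therefore pairs with exactly one earlier Establish Gpadl
+ * packet that carried the same handle.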
+ */ + +typedef struct { + hv_vm_packet_descriptor d; + uint32_t gpadl; + /* for alignment to a 8-byte boundary */ + uint32_t reserved; +} __packed hv_vm_teardown_gpadl; + +/* + * This is the format for a GPA-Direct packet, which contains a set of GPA + * ranges, in addition to commands and/or data. + */ + +typedef struct { + hv_vm_packet_descriptor d; + uint32_t reserved; + uint32_t range_count; + hv_gpa_range range[1]; +} __packed hv_vm_data_gpa_direct; + +/* + * This is the format for a Additional data Packet. + */ +typedef struct { + hv_vm_packet_descriptor d; + uint64_t total_bytes; + uint32_t byte_offset; + uint32_t byte_count; + uint8_t data[1]; +} __packed hv_vm_additional_data; + +typedef union { + hv_vm_packet_descriptor simple_header; + hv_vm_transfer_page_packet_header transfer_page_header; + hv_vm_gpadl_packet_header gpadl_header; + hv_vm_add_remove_transfer_page_set add_remove_transfer_page_header; + hv_vm_establish_gpadl establish_gpadl_header; + hv_vm_teardown_gpadl teardown_gpadl_header; + hv_vm_data_gpa_direct data_gpa_direct_header; +} __packed hv_vm_packet_largest_possible_header; + +typedef enum { + HV_VMBUS_PACKET_TYPE_INVALID = 0x0, + HV_VMBUS_PACKET_TYPES_SYNCH = 0x1, + HV_VMBUS_PACKET_TYPE_ADD_TRANSFER_PAGE_SET = 0x2, + HV_VMBUS_PACKET_TYPE_REMOVE_TRANSFER_PAGE_SET = 0x3, + HV_VMBUS_PACKET_TYPE_ESTABLISH_GPADL = 0x4, + HV_VMBUS_PACKET_TYPE_TEAR_DOWN_GPADL = 0x5, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND = 0x6, + HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES = 0x7, + HV_VMBUS_PACKET_TYPE_DATA_USING_GPADL = 0x8, + HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT = 0x9, + HV_VMBUS_PACKET_TYPE_CANCEL_REQUEST = 0xa, + HV_VMBUS_PACKET_TYPE_COMPLETION = 0xb, + HV_VMBUS_PACKET_TYPE_DATA_USING_ADDITIONAL_PACKETS = 0xc, + HV_VMBUS_PACKET_TYPE_ADDITIONAL_DATA = 0xd +} hv_vmbus_packet_type; + +#define HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED 1 + +/* + * Version 1 messages + */ +typedef enum { + HV_CHANNEL_MESSAGE_INVALID = 0, + HV_CHANNEL_MESSAGE_OFFER_CHANNEL = 1, + HV_CHANNEL_MESSAGE_RESCIND_CHANNEL_OFFER = 2, + HV_CHANNEL_MESSAGE_REQUEST_OFFERS = 3, + HV_CHANNEL_MESSAGE_ALL_OFFERS_DELIVERED = 4, + HV_CHANNEL_MESSAGE_OPEN_CHANNEL = 5, + HV_CHANNEL_MESSAGE_OPEN_CHANNEL_RESULT = 6, + HV_CHANNEL_MESSAGE_CLOSE_CHANNEL = 7, + HV_CHANNEL_MESSAGEL_GPADL_HEADER = 8, + HV_CHANNEL_MESSAGE_GPADL_BODY = 9, + HV_CHANNEL_MESSAGE_GPADL_CREATED = 10, + HV_CHANNEL_MESSAGE_GPADL_TEARDOWN = 11, + HV_CHANNEL_MESSAGE_GPADL_TORNDOWN = 12, + HV_CHANNEL_MESSAGE_REL_ID_RELEASED = 13, + HV_CHANNEL_MESSAGE_INITIATED_CONTACT = 14, + HV_CHANNEL_MESSAGE_VERSION_RESPONSE = 15, + HV_CHANNEL_MESSAGE_UNLOAD = 16, + +#ifdef HV_VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD + HV_CHANNEL_MESSAGE_VIEW_RANGE_ADD = 17, + HV_CHANNEL_MESSAGE_VIEW_RANGE_REMOVE = 18, +#endif + HV_CHANNEL_MESSAGE_COUNT +} hv_vmbus_channel_msg_type; + +typedef struct { + hv_vmbus_channel_msg_type message_type; + uint32_t padding; +} __packed hv_vmbus_channel_msg_header; + +/* + * Query VMBus Version parameters + */ +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t version; +} __packed hv_vmbus_channel_query_vmbus_version; + +/* + * VMBus Version Supported parameters + */ +typedef struct { + hv_vmbus_channel_msg_header header; + hv_bool_uint8_t version_supported; +} __packed hv_vmbus_channel_version_supported; + +/* + * Channel Offer parameters + */ +typedef struct { + hv_vmbus_channel_msg_header header; + hv_vmbus_channel_offer offer; + uint32_t child_rel_id; + uint8_t monitor_id; + hv_bool_uint8_t 
monitor_allocated; +} __packed hv_vmbus_channel_offer_channel; + +/* + * Rescind Offer parameters + */ +typedef struct +{ + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; +} __packed hv_vmbus_channel_rescind_offer; + + +/* + * Request Offer -- no parameters, SynIC message contains the partition ID + * + * Set Snoop -- no parameters, SynIC message contains the partition ID + * + * Clear Snoop -- no parameters, SynIC message contains the partition ID + * + * All Offers Delivered -- no parameters, SynIC message contains the + * partition ID + * + * Flush Client -- no parameters, SynIC message contains the partition ID + */ + + +/* + * Open Channel parameters + */ +typedef struct +{ + hv_vmbus_channel_msg_header header; + + /* + * Identifies the specific VMBus channel that is being opened. + */ + uint32_t child_rel_id; + + /* + * ID making a particular open request at a channel offer unique. + */ + uint32_t open_id; + + /* + * GPADL for the channel's ring buffer. + */ + hv_gpadl_handle ring_buffer_gpadl_handle; + + /* + * GPADL for the channel's server context save area. + */ + hv_gpadl_handle server_context_area_gpadl_handle; + + /* + * The upstream ring buffer begins at offset zero in the memory described + * by ring_buffer_gpadl_handle. The downstream ring buffer follows it at + * this offset (in pages). + */ + uint32_t downstream_ring_buffer_page_offset; + + /* + * User-specific data to be passed along to the server endpoint. + */ + uint8_t user_data[HV_MAX_USER_DEFINED_BYTES]; + +} __packed hv_vmbus_channel_open_channel; + +typedef uint32_t hv_nt_status; + +/* + * Open Channel Result parameters + */ +typedef struct +{ + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; + uint32_t open_id; + hv_nt_status status; +} __packed hv_vmbus_channel_open_result; + +/* + * Close channel parameters + */ +typedef struct +{ + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; +} __packed hv_vmbus_channel_close_channel; + +/* + * Channel Message GPADL + */ +#define HV_GPADL_TYPE_RING_BUFFER 1 +#define HV_GPADL_TYPE_SERVER_SAVE_AREA 2 +#define HV_GPADL_TYPE_TRANSACTION 8 + +/* + * The number of PFNs in a GPADL message is defined by the number of pages + * that would be spanned by byte_count and byte_offset. 
If the implied number + * of PFNs won't fit in this packet, there will be a follow-up packet that + * contains more + */ + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; + uint32_t gpadl; + uint16_t range_buf_len; + uint16_t range_count; + hv_gpa_range range[0]; +} __packed hv_vmbus_channel_gpadl_header; + +/* + * This is the follow-up packet that contains more PFNs + */ +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t message_number; + uint32_t gpadl; + uint64_t pfn[0]; +} __packed hv_vmbus_channel_gpadl_body; + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; + uint32_t gpadl; + uint32_t creation_status; +} __packed hv_vmbus_channel_gpadl_created; + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; + uint32_t gpadl; +} __packed hv_vmbus_channel_gpadl_teardown; + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t gpadl; +} __packed hv_vmbus_channel_gpadl_torndown; + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t child_rel_id; +} __packed hv_vmbus_channel_relid_released; + +typedef struct { + hv_vmbus_channel_msg_header header; + uint32_t vmbus_version_requested; + uint32_t padding2; + uint64_t interrupt_page; + uint64_t monitor_page_1; + uint64_t monitor_page_2; +} __packed hv_vmbus_channel_initiate_contact; + +typedef struct { + hv_vmbus_channel_msg_header header; + hv_bool_uint8_t version_supported; +} __packed hv_vmbus_channel_version_response; + +typedef hv_vmbus_channel_msg_header hv_vmbus_channel_unload; + +#define HW_MACADDR_LEN 6 + +/* + * Fixme: Added to quiet "typeof" errors involving hv_vmbus.h when + * the including C file was compiled with "-std=c99". + */ +#ifndef typeof +#define typeof __typeof +#endif + +#ifndef NULL +#define NULL (void *)0 +#endif + +typedef void *hv_vmbus_handle; + +#ifndef CONTAINING_RECORD +#define CONTAINING_RECORD(address, type, field) ((type *)( \ + (uint8_t *)(address) - \ + (uint8_t *)(&((type *)0)->field))) +#endif /* CONTAINING_RECORD */ + + +#define container_of(ptr, type, member) ({ \ + __typeof__( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +enum { + HV_VMBUS_IVAR_TYPE, + HV_VMBUS_IVAR_INSTANCE, + HV_VMBUS_IVAR_NODE, + HV_VMBUS_IVAR_DEVCTX +}; + +#define HV_VMBUS_ACCESSOR(var, ivar, type) \ + __BUS_ACCESSOR(vmbus, var, HV_VMBUS, ivar, type) + +HV_VMBUS_ACCESSOR(type, TYPE, const char *) +HV_VMBUS_ACCESSOR(devctx, DEVCTX, struct hv_device *) + + +/* + * Common defines for Hyper-V ICs + */ +#define HV_ICMSGTYPE_NEGOTIATE 0 +#define HV_ICMSGTYPE_HEARTBEAT 1 +#define HV_ICMSGTYPE_KVPEXCHANGE 2 +#define HV_ICMSGTYPE_SHUTDOWN 3 +#define HV_ICMSGTYPE_TIMESYNC 4 +#define HV_ICMSGTYPE_VSS 5 + +#define HV_ICMSGHDRFLAG_TRANSACTION 1 +#define HV_ICMSGHDRFLAG_REQUEST 2 +#define HV_ICMSGHDRFLAG_RESPONSE 4 + +typedef struct hv_vmbus_pipe_hdr { + uint32_t flags; + uint32_t msgsize; +} __packed hv_vmbus_pipe_hdr; + +typedef struct hv_vmbus_ic_version { + uint16_t major; + uint16_t minor; +} __packed hv_vmbus_ic_version; + +typedef struct hv_vmbus_icmsg_hdr { + hv_vmbus_ic_version icverframe; + uint16_t icmsgtype; + hv_vmbus_ic_version icvermsg; + uint16_t icmsgsize; + uint32_t status; + uint8_t ictransaction_id; + uint8_t icflags; + uint8_t reserved[2]; +} __packed hv_vmbus_icmsg_hdr; + +typedef struct hv_vmbus_icmsg_negotiate { + uint16_t icframe_vercnt; + uint16_t icmsg_vercnt; + uint32_t reserved; + hv_vmbus_ic_version icversion_data[1]; /* any size 
array */ +} __packed hv_vmbus_icmsg_negotiate; + +typedef struct hv_vmbus_shutdown_msg_data { + uint32_t reason_code; + uint32_t timeout_seconds; + uint32_t flags; + uint8_t display_message[2048]; +} __packed hv_vmbus_shutdown_msg_data; + +typedef struct hv_vmbus_heartbeat_msg_data { + uint64_t seq_num; + uint32_t reserved[8]; +} __packed hv_vmbus_heartbeat_msg_data; + +typedef struct { + /* + * offset in bytes from the start of ring data below + */ + volatile uint32_t write_index; + /* + * offset in bytes from the start of ring data below + */ + volatile uint32_t read_index; + /* + * NOTE: The interrupt_mask field is used only for channels, but + * vmbus connection also uses this data structure + */ + volatile uint32_t interrupt_mask; + /* pad it to PAGE_SIZE so that data starts on a page */ + uint8_t reserved[4084]; + + /* + * WARNING: Ring data starts here + ring_data_start_offset + * !!! DO NOT place any fields below this !!! + */ + uint8_t buffer[0]; /* doubles as interrupt mask */ +} __packed hv_vmbus_ring_buffer; + +typedef struct { + int length; + int offset; + uint64_t pfn; +} __packed hv_vmbus_page_buffer; + +typedef struct { + int length; + int offset; + uint64_t pfn_array[HV_MAX_MULTIPAGE_BUFFER_COUNT]; +} __packed hv_vmbus_multipage_buffer; + +typedef struct { + hv_vmbus_ring_buffer* ring_buffer; + uint32_t ring_size; /* Include the shared header */ + struct mtx ring_lock; + uint32_t ring_data_size; /* ring_size */ + uint32_t ring_data_start_offset; +} hv_vmbus_ring_buffer_info; + +typedef void (*hv_vmbus_pfn_channel_callback)(void *context); + +typedef enum { + HV_CHANNEL_OFFER_STATE, + HV_CHANNEL_OPENING_STATE, + HV_CHANNEL_OPEN_STATE, + HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, +} hv_vmbus_channel_state; + +typedef struct hv_vmbus_channel { + TAILQ_ENTRY(hv_vmbus_channel) list_entry; + struct hv_device* device; + hv_vmbus_channel_state state; + hv_vmbus_channel_offer_channel offer_msg; + /* + * These are based on the offer_msg.monitor_id. + * Save it here for easy access. 
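+ * (In the LIS-derived channel management code, monitor_group is
+ * monitor_id / 32 and monitor_bit is monitor_id % 32.)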
+ */ + uint8_t monitor_group; + uint8_t monitor_bit; + + uint32_t ring_buffer_gpadl_handle; + /* + * Allocated memory for ring buffer + */ + void* ring_buffer_pages; + uint32_t ring_buffer_page_count; + /* + * send to parent + */ + hv_vmbus_ring_buffer_info outbound; + /* + * receive from parent + */ + hv_vmbus_ring_buffer_info inbound; + + struct mtx inbound_lock; + hv_vmbus_handle control_work_queue; + + hv_vmbus_pfn_channel_callback on_channel_callback; + void* channel_callback_context; + +} hv_vmbus_channel; + +typedef struct hv_device { + hv_guid class_id; + hv_guid device_id; + device_t device; + hv_vmbus_channel* channel; +} hv_device; + + + +int hv_vmbus_channel_recv_packet( + hv_vmbus_channel* channel, + void* buffer, + uint32_t buffer_len, + uint32_t* buffer_actual_len, + uint64_t* request_id); + +int hv_vmbus_channel_recv_packet_raw( + hv_vmbus_channel* channel, + void* buffer, + uint32_t buffer_len, + uint32_t* buffer_actual_len, + uint64_t* request_id); + +int hv_vmbus_channel_open( + hv_vmbus_channel* channel, + uint32_t send_ring_buffer_size, + uint32_t recv_ring_buffer_size, + void* user_data, + uint32_t user_data_len, + hv_vmbus_pfn_channel_callback + pfn_on_channel_callback, + void* context); + +void hv_vmbus_channel_close(hv_vmbus_channel *channel); + +int hv_vmbus_channel_send_packet( + hv_vmbus_channel* channel, + void* buffer, + uint32_t buffer_len, + uint64_t request_id, + hv_vmbus_packet_type type, + uint32_t flags); + +int hv_vmbus_channel_send_packet_pagebuffer( + hv_vmbus_channel* channel, + hv_vmbus_page_buffer page_buffers[], + uint32_t page_count, + void* buffer, + uint32_t buffer_len, + uint64_t request_id); + +int hv_vmbus_channel_send_packet_multipagebuffer( + hv_vmbus_channel* channel, + hv_vmbus_multipage_buffer* multi_page_buffer, + void* buffer, + uint32_t buffer_len, + uint64_t request_id); + +int hv_vmbus_channel_establish_gpadl( + hv_vmbus_channel* channel, + /* must be phys and virt contiguous */ + void* contig_buffer, + /* page-size multiple */ + uint32_t size, + uint32_t* gpadl_handle); + +int hv_vmbus_channel_teardown_gpdal( + hv_vmbus_channel* channel, + uint32_t gpadl_handle); + +/* + * Work abstraction defines + */ +typedef struct hv_work_queue { + struct taskqueue* queue; + struct proc* proc; + struct sema* work_sema; +} hv_work_queue; + +typedef struct hv_work_item { + struct task work; + void (*callback)(void *); + void* context; + hv_work_queue* wq; +} hv_work_item; + +struct hv_work_queue* hv_work_queue_create(char* name); + +void hv_work_queue_close(struct hv_work_queue* wq); + +int hv_queue_work_item( + hv_work_queue* wq, + void (*callback)(void *), + void* context); +/** + * @brief Get physical address from virtual + */ +static inline unsigned long +hv_get_phys_addr(void *virt) +{ + unsigned long ret; + ret = (vtophys(virt) | ((vm_offset_t) virt & PAGE_MASK)); + return (ret); +} + +#endif /* __HYPERV_H__ */ + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.c b/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.c new file mode 100644 index 0000000..aeee94d --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.c @@ -0,0 +1,1141 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * HyperV vmbus network VSC (virtual services client) module + * + */ + + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/lock.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <machine/bus.h> +#include <machine/atomic.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_net_vsc.h" +#include "hv_rndis.h" +#include "hv_rndis_filter.h" + + +/* + * Forward declarations + */ +static void hv_nv_on_channel_callback(void *context); +static int hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device); +static int hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device); +static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); +static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); +static int hv_nv_connect_to_vsp(struct hv_device *device); +static void hv_nv_on_send_completion(struct hv_device *device, + hv_vm_packet_descriptor *pkt); +static void hv_nv_on_receive(struct hv_device *device, + hv_vm_packet_descriptor *pkt); +static void hv_nv_send_receive_completion(struct hv_device *device, + uint64_t tid); + + +/* + * + */ +static inline netvsc_dev * +hv_nv_alloc_net_device(struct hv_device *device) +{ + netvsc_dev *net_dev; + hn_softc_t *sc = device_get_softc(device->device); + + net_dev = malloc(sizeof(netvsc_dev), M_DEVBUF, M_NOWAIT | M_ZERO); + if (net_dev == NULL) { + return (NULL); + } + + net_dev->dev = device; + net_dev->destroy = FALSE; + sc->net_dev = net_dev; + + return (net_dev); +} + +/* + * + */ +static inline netvsc_dev * +hv_nv_get_outbound_net_device(struct hv_device *device) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev;; + + if ((net_dev != NULL) && net_dev->destroy) { + return (NULL); + } + + return (net_dev); +} + +/* + * + */ +static inline netvsc_dev * +hv_nv_get_inbound_net_device(struct hv_device *device) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev;; + + if (net_dev == NULL) { + return (net_dev); + } + /* + * When the device is being destroyed; we only + * permit incoming packets if and only if there + * are outstanding sends. 
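+ * Those packets are the completions for the outstanding sends; once
+ * num_outstanding_sends drops to zero the device may be torn down.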
+ */ + if (net_dev->destroy && net_dev->num_outstanding_sends == 0) { + return (NULL); + } + + return (net_dev); +} + +/* + * Net VSC initialize receive buffer with net VSP + * + * Net VSP: Network virtual services client, also known as the + * Hyper-V extensible switch and the synthetic data path. + */ +static int +hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device) +{ + netvsc_dev *net_dev; + nvsp_msg *init_pkt; + int ret = 0; + + net_dev = hv_nv_get_outbound_net_device(device); + if (!net_dev) { + return (ENODEV); + } + + net_dev->rx_buf = contigmalloc(net_dev->rx_buf_size, M_DEVBUF, + M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); + if (net_dev->rx_buf == NULL) { + ret = ENOMEM; + goto cleanup; + } + + /* + * Establish the GPADL handle for this buffer on this channel. + * Note: This call uses the vmbus connection rather than the + * channel to establish the gpadl handle. + * GPADL: Guest physical address descriptor list. + */ + ret = hv_vmbus_channel_establish_gpadl( + device->channel, net_dev->rx_buf, + net_dev->rx_buf_size, &net_dev->rx_buf_gpadl_handle); + if (ret != 0) { + goto cleanup; + } + + /* sema_wait(&ext->channel_init_sema); KYS CHECK */ + + /* Notify the NetVsp of the gpadl handle */ + init_pkt = &net_dev->channel_init_packet; + + memset(init_pkt, 0, sizeof(nvsp_msg)); + + init_pkt->hdr.msg_type = nvsp_msg_1_type_send_rx_buf; + init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = + net_dev->rx_buf_gpadl_handle; + init_pkt->msgs.vers_1_msgs.send_rx_buf.id = + NETVSC_RECEIVE_BUFFER_ID; + + /* Send the gpadl notification request */ + + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) { + goto cleanup; + } + + sema_wait(&net_dev->channel_init_sema); + + /* Check the response */ + if (init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.status + != nvsp_status_success) { + ret = EINVAL; + goto cleanup; + } + + net_dev->rx_section_count = + init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; + + net_dev->rx_sections = malloc(net_dev->rx_section_count * + sizeof(nvsp_1_rx_buf_section), M_DEVBUF, M_NOWAIT); + if (net_dev->rx_sections == NULL) { + ret = EINVAL; + goto cleanup; + } + memcpy(net_dev->rx_sections, + init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, + net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); + + + /* + * For first release, there should only be 1 section that represents + * the entire receive buffer + */ + if (net_dev->rx_section_count != 1 + || net_dev->rx_sections->offset != 0) { + ret = EINVAL; + goto cleanup; + } + + goto exit; + +cleanup: + hv_nv_destroy_rx_buffer(net_dev); + +exit: + return (ret); +} + +/* + * Net VSC initialize send buffer with net VSP + */ +static int +hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device) +{ + netvsc_dev *net_dev; + nvsp_msg *init_pkt; + int ret = 0; + + net_dev = hv_nv_get_outbound_net_device(device); + if (!net_dev) { + return (ENODEV); + } + + net_dev->send_buf = contigmalloc(net_dev->send_buf_size, M_DEVBUF, + M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); + if (net_dev->send_buf == NULL) { + ret = ENOMEM; + goto cleanup; + } + + /* + * Establish the gpadl handle for this buffer on this channel. + * Note: This call uses the vmbus connection rather than the + * channel to establish the gpadl handle. 
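+ * The resulting handle is then passed to the NetVSP in the
+ * nvsp_msg_1_type_send_send_buf message below.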
+ */ + ret = hv_vmbus_channel_establish_gpadl(device->channel, + net_dev->send_buf, net_dev->send_buf_size, + &net_dev->send_buf_gpadl_handle); + if (ret != 0) { + goto cleanup; + } + + /* Notify the NetVsp of the gpadl handle */ + + init_pkt = &net_dev->channel_init_packet; + + memset(init_pkt, 0, sizeof(nvsp_msg)); + + init_pkt->hdr.msg_type = nvsp_msg_1_type_send_send_buf; + init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = + net_dev->send_buf_gpadl_handle; + init_pkt->msgs.vers_1_msgs.send_rx_buf.id = + NETVSC_SEND_BUFFER_ID; + + /* Send the gpadl notification request */ + + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) { + goto cleanup; + } + + sema_wait(&net_dev->channel_init_sema); + + /* Check the response */ + if (init_pkt->msgs.vers_1_msgs.send_send_buf_complete.status + != nvsp_status_success) { + ret = EINVAL; + goto cleanup; + } + + net_dev->send_section_size = + init_pkt->msgs.vers_1_msgs.send_send_buf_complete.section_size; + + goto exit; + +cleanup: + hv_nv_destroy_send_buffer(net_dev); + +exit: + return (ret); +} + +/* + * Net VSC destroy receive buffer + */ +static int +hv_nv_destroy_rx_buffer(netvsc_dev *net_dev) +{ + nvsp_msg *revoke_pkt; + int ret = 0; + + /* + * If we got a section count, it means we received a + * send_rx_buf_complete msg + * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, + * we need to send a revoke msg here + */ + if (net_dev->rx_section_count) { + /* Send the revoke receive buffer */ + revoke_pkt = &net_dev->revoke_packet; + memset(revoke_pkt, 0, sizeof(nvsp_msg)); + + revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_rx_buf; + revoke_pkt->msgs.vers_1_msgs.revoke_rx_buf.id = + NETVSC_RECEIVE_BUFFER_ID; + + ret = hv_vmbus_channel_send_packet(net_dev->dev->channel, + revoke_pkt, sizeof(nvsp_msg), + (uint64_t)revoke_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + + /* + * If we failed here, we might as well return and have a leak + * rather than continue and a bugchk + */ + if (ret != 0) { + return (ret); + } + } + + /* Tear down the gpadl on the vsp end */ + if (net_dev->rx_buf_gpadl_handle) { + ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel, + net_dev->rx_buf_gpadl_handle); + /* + * If we failed here, we might as well return and have a leak + * rather than continue and a bugchk + */ + if (ret != 0) { + return (ret); + } + net_dev->rx_buf_gpadl_handle = 0; + } + + if (net_dev->rx_buf) { + /* Free up the receive buffer */ + contigfree(net_dev->rx_buf, net_dev->rx_buf_size, M_DEVBUF); + net_dev->rx_buf = NULL; + } + + if (net_dev->rx_sections) { + free(net_dev->rx_sections, M_DEVBUF); + net_dev->rx_sections = NULL; + net_dev->rx_section_count = 0; + } + + return (ret); +} + +/* + * Net VSC destroy send buffer + */ +static int +hv_nv_destroy_send_buffer(netvsc_dev *net_dev) +{ + nvsp_msg *revoke_pkt; + int ret = 0; + + /* + * If we got a section count, it means we received a + * send_rx_buf_complete msg + * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, + * we need to send a revoke msg here + */ + if (net_dev->send_section_size) { + /* Send the revoke send buffer */ + revoke_pkt = &net_dev->revoke_packet; + memset(revoke_pkt, 0, sizeof(nvsp_msg)); + + revoke_pkt->hdr.msg_type = + nvsp_msg_1_type_revoke_send_buf; + revoke_pkt->msgs.vers_1_msgs.revoke_send_buf.id = + NETVSC_SEND_BUFFER_ID; + + ret = hv_vmbus_channel_send_packet(net_dev->dev->channel, + revoke_pkt, 
sizeof(nvsp_msg), + (uint64_t)revoke_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + /* + * If we failed here, we might as well return and have a leak + * rather than continue and a bugchk + */ + if (ret != 0) { + return (ret); + } + } + + /* Tear down the gpadl on the vsp end */ + if (net_dev->send_buf_gpadl_handle) { + ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel, + net_dev->send_buf_gpadl_handle); + + /* + * If we failed here, we might as well return and have a leak + * rather than continue and a bugchk + */ + if (ret != 0) { + return (ret); + } + net_dev->send_buf_gpadl_handle = 0; + } + + if (net_dev->send_buf) { + /* Free up the receive buffer */ + contigfree(net_dev->send_buf, net_dev->send_buf_size, M_DEVBUF); + net_dev->send_buf = NULL; + } + + return (ret); +} + + +/* + * Attempt to negotiate the caller-specified NVSP version + * + * For NVSP v2, Server 2008 R2 does not set + * init_pkt->msgs.init_msgs.init_compl.negotiated_prot_vers + * to the negotiated version, so we cannot rely on that. + */ +static int +hv_nv_negotiate_nvsp_protocol(struct hv_device *device, netvsc_dev *net_dev, + uint32_t nvsp_ver) +{ + nvsp_msg *init_pkt; + int ret; + + init_pkt = &net_dev->channel_init_packet; + memset(init_pkt, 0, sizeof(nvsp_msg)); + init_pkt->hdr.msg_type = nvsp_msg_type_init; + + /* + * Specify parameter as the only acceptable protocol version + */ + init_pkt->msgs.init_msgs.init.p1.protocol_version = nvsp_ver; + init_pkt->msgs.init_msgs.init.protocol_version_2 = nvsp_ver; + + /* Send the init request */ + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) + return (-1); + + sema_wait(&net_dev->channel_init_sema); + + if (init_pkt->msgs.init_msgs.init_compl.status != nvsp_status_success) + return (EINVAL); + + return (0); +} + +/* + * Send NDIS version 2 config packet containing MTU. + * + * Not valid for NDIS version 1. + */ +static int +hv_nv_send_ndis_config(struct hv_device *device, uint32_t mtu) +{ + netvsc_dev *net_dev; + nvsp_msg *init_pkt; + int ret; + + net_dev = hv_nv_get_outbound_net_device(device); + if (!net_dev) + return (-ENODEV); + + /* + * Set up configuration packet, write MTU + * Indicate we are capable of handling VLAN tags + */ + init_pkt = &net_dev->channel_init_packet; + memset(init_pkt, 0, sizeof(nvsp_msg)); + init_pkt->hdr.msg_type = nvsp_msg_2_type_send_ndis_config; + init_pkt->msgs.vers_2_msgs.send_ndis_config.mtu = mtu; + init_pkt-> + msgs.vers_2_msgs.send_ndis_config.capabilities.u1.u2.ieee8021q + = 1; + + /* Send the configuration packet */ + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + if (ret != 0) + return (-EINVAL); + + return (0); +} + +/* + * Net VSC connect to VSP + */ +static int +hv_nv_connect_to_vsp(struct hv_device *device) +{ + netvsc_dev *net_dev; + nvsp_msg *init_pkt; + uint32_t nvsp_vers; + uint32_t ndis_version; + int ret = 0; + device_t dev = device->device; + hn_softc_t *sc = device_get_softc(dev); + struct ifnet *ifp = sc->arpcom.ac_ifp; + + net_dev = hv_nv_get_outbound_net_device(device); + if (!net_dev) { + return (ENODEV); + } + + /* + * Negotiate the NVSP version. Try NVSP v2 first. 
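+ * If the host rejects v2, fall back to NVSP v1 before failing.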
+ */ + nvsp_vers = NVSP_PROTOCOL_VERSION_2; + ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers); + if (ret != 0) { + /* NVSP v2 failed, try NVSP v1 */ + nvsp_vers = NVSP_PROTOCOL_VERSION_1; + ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers); + if (ret != 0) { + /* NVSP v1 failed, return bad status */ + return (ret); + } + } + net_dev->nvsp_version = nvsp_vers; + + /* + * Set the MTU if supported by this NVSP protocol version + * This needs to be right after the NVSP init message per Haiyang + */ + if (nvsp_vers >= NVSP_PROTOCOL_VERSION_2) + ret = hv_nv_send_ndis_config(device, ifp->if_mtu); + + /* + * Send the NDIS version + */ + init_pkt = &net_dev->channel_init_packet; + + memset(init_pkt, 0, sizeof(nvsp_msg)); + + /* + * Updated to version 5.1, minimum, for VLAN per Haiyang + */ + ndis_version = NDIS_VERSION; + + init_pkt->hdr.msg_type = nvsp_msg_1_type_send_ndis_vers; + init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_major_vers = + (ndis_version & 0xFFFF0000) >> 16; + init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_minor_vers = + ndis_version & 0xFFFF; + + /* Send the init request */ + + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + if (ret != 0) { + goto cleanup; + } + /* + * TODO: BUGBUG - We have to wait for the above msg since the netvsp + * uses KMCL which acknowledges packet (completion packet) + * since our Vmbus always set the + * HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED flag + */ + /* sema_wait(&NetVscChannel->channel_init_sema); */ + + /* Post the big receive buffer to NetVSP */ + ret = hv_nv_init_rx_buffer_with_net_vsp(device); + if (ret == 0) + ret = hv_nv_init_send_buffer_with_net_vsp(device); + +cleanup: + return (ret); +} + +/* + * Net VSC disconnect from VSP + */ +static void +hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) +{ + hv_nv_destroy_rx_buffer(net_dev); + hv_nv_destroy_send_buffer(net_dev); +} + +/* + * Net VSC on device add + * + * Callback when the device belonging to this driver is added + */ +netvsc_dev * +hv_nv_on_device_add(struct hv_device *device, void *additional_info) +{ + netvsc_dev *net_dev; + netvsc_packet *packet; + netvsc_packet *next_packet; + int i, ret = 0; + + net_dev = hv_nv_alloc_net_device(device); + if (!net_dev) + goto cleanup; + + /* Initialize the NetVSC channel extension */ + net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; + mtx_init(&net_dev->rx_pkt_list_lock, "HV-RPL", NULL, + MTX_SPIN | MTX_RECURSE); + + net_dev->send_buf_size = NETVSC_SEND_BUFFER_SIZE; + + /* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */ + STAILQ_INIT(&net_dev->myrx_packet_list); + + /* + * malloc a sufficient number of netvsc_packet buffers to hold + * a packet list. Add them to the netvsc device packet queue. 
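+ * Each buffer reserves trailing space for NETVSC_RECEIVE_SG_COUNT
+ * page buffers (see the malloc below).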
+ */ + for (i=0; i < NETVSC_RECEIVE_PACKETLIST_COUNT; i++) { + packet = malloc(sizeof(netvsc_packet) + + (NETVSC_RECEIVE_SG_COUNT * sizeof(hv_vmbus_page_buffer)), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!packet) { + break; + } + STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet, + mylist_entry); + } + + sema_init(&net_dev->channel_init_sema, 0, "netdev_sema"); + + /* + * Open the channel + */ + ret = hv_vmbus_channel_open(device->channel, + NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, + NULL, 0, hv_nv_on_channel_callback, device); + if (ret != 0) + goto cleanup; + + /* + * Connect with the NetVsp + */ + ret = hv_nv_connect_to_vsp(device); + if (ret != 0) + goto close; + + return (net_dev); + +close: + /* Now, we can close the channel safely */ + + hv_vmbus_channel_close(device->channel); + +cleanup: + /* + * Free the packet buffers on the netvsc device packet queue. + * Release other resources. + */ + if (net_dev) { + sema_destroy(&net_dev->channel_init_sema); + + packet = STAILQ_FIRST(&net_dev->myrx_packet_list); + while (packet != NULL) { + next_packet = STAILQ_NEXT(packet, mylist_entry); + free(packet, M_DEVBUF); + packet = next_packet; + } + /* Reset the list to initial state */ + STAILQ_INIT(&net_dev->myrx_packet_list); + + mtx_destroy(&net_dev->rx_pkt_list_lock); + + free(net_dev, M_DEVBUF); + } + + return (NULL); +} + +/* + * Net VSC on device remove + */ +int +hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel) +{ + netvsc_packet *net_vsc_pkt; + netvsc_packet *next_net_vsc_pkt; + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev;; + + /* Stop outbound traffic ie sends and receives completions */ + mtx_lock(&device->channel->inbound_lock); + net_dev->destroy = TRUE; + mtx_unlock(&device->channel->inbound_lock); + + /* Wait for all send completions */ + while (net_dev->num_outstanding_sends) { + DELAY(100); + } + + hv_nv_disconnect_from_vsp(net_dev); + + /* At this point, no one should be accessing net_dev except in here */ + + /* Now, we can close the channel safely */ + + if (!destroy_channel) { + device->channel->state = + HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE; + } + + hv_vmbus_channel_close(device->channel); + + /* Release all resources */ + net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list); + while (net_vsc_pkt != NULL) { + next_net_vsc_pkt = STAILQ_NEXT(net_vsc_pkt, mylist_entry); + free(net_vsc_pkt, M_DEVBUF); + net_vsc_pkt = next_net_vsc_pkt; + } + + /* Reset the list to initial state */ + STAILQ_INIT(&net_dev->myrx_packet_list); + + mtx_destroy(&net_dev->rx_pkt_list_lock); + sema_destroy(&net_dev->channel_init_sema); + free(net_dev, M_DEVBUF); + + return (0); +} + +/* + * Net VSC on send completion + */ +static void +hv_nv_on_send_completion(struct hv_device *device, hv_vm_packet_descriptor *pkt) +{ + netvsc_dev *net_dev; + nvsp_msg *nvsp_msg_pkt; + netvsc_packet *net_vsc_pkt; + + net_dev = hv_nv_get_inbound_net_device(device); + if (!net_dev) { + return; + } + + nvsp_msg_pkt = + (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); + + if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_type_init_complete + || nvsp_msg_pkt->hdr.msg_type + == nvsp_msg_1_type_send_rx_buf_complete + || nvsp_msg_pkt->hdr.msg_type + == nvsp_msg_1_type_send_send_buf_complete) { + /* Copy the response back */ + memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, + sizeof(nvsp_msg)); + sema_post(&net_dev->channel_init_sema); + } else if (nvsp_msg_pkt->hdr.msg_type == + nvsp_msg_1_type_send_rndis_pkt_complete) { + 
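+ /*
+ * The transaction id of an RNDIS send completion is the
+ * netvsc_packet pointer that hv_nv_on_send() passed down.
+ */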
/* Get the send context */ + net_vsc_pkt = + (netvsc_packet *)(unsigned long)pkt->transaction_id; + + /* Notify the layer above us */ + net_vsc_pkt->compl.send.on_send_completion( + net_vsc_pkt->compl.send.send_completion_context); + + atomic_subtract_int(&net_dev->num_outstanding_sends, 1); + } +} + +/* + * Net VSC on send + * Sends a packet on the specified Hyper-V device. + * Returns 0 on success, non-zero on failure. + */ +int +hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt) +{ + netvsc_dev *net_dev; + nvsp_msg send_msg; + int ret; + + net_dev = hv_nv_get_outbound_net_device(device); + if (!net_dev) + return (ENODEV); + + send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt; + if (pkt->is_data_pkt) { + /* 0 is RMC_DATA */ + send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 0; + } else { + /* 1 is RMC_CONTROL */ + send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 1; + } + + /* Not using send buffer section */ + send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_idx = + 0xFFFFFFFF; + send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_size = 0; + + if (pkt->page_buf_count) { + ret = hv_vmbus_channel_send_packet_pagebuffer(device->channel, + pkt->page_buffers, pkt->page_buf_count, + &send_msg, sizeof(nvsp_msg), (uint64_t)pkt); + } else { + ret = hv_vmbus_channel_send_packet(device->channel, + &send_msg, sizeof(nvsp_msg), (uint64_t)pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + } + + /* Record outstanding send only if send_packet() succeeded */ + if (ret == 0) + atomic_add_int(&net_dev->num_outstanding_sends, 1); + + return (ret); +} + +/* + * Net VSC on receive + * + * In the FreeBSD Hyper-V virtual world, this function deals exclusively + * with virtual addresses. + */ +static void +hv_nv_on_receive(struct hv_device *device, hv_vm_packet_descriptor *pkt) +{ + netvsc_dev *net_dev; + hv_vm_transfer_page_packet_header *vm_xfer_page_pkt; + nvsp_msg *nvsp_msg_pkt; + netvsc_packet *net_vsc_pkt = NULL; + unsigned long start; + xfer_page_packet *xfer_page_pkt = NULL; + STAILQ_HEAD(PKT_LIST, netvsc_packet_) mylist_head = + STAILQ_HEAD_INITIALIZER(mylist_head); + int count = 0; + int i = 0; + + net_dev = hv_nv_get_inbound_net_device(device); + if (!net_dev) + return; + + /* + * All inbound packets other than send completion should be + * xfer page packet. + */ + if (pkt->type != HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES) + return; + + nvsp_msg_pkt = (nvsp_msg *)((unsigned long)pkt + + (pkt->data_offset8 << 3)); + + /* Make sure this is a valid nvsp packet */ + if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg_1_type_send_rndis_pkt) + return; + + vm_xfer_page_pkt = (hv_vm_transfer_page_packet_header *)pkt; + + if (vm_xfer_page_pkt->transfer_page_set_id + != NETVSC_RECEIVE_BUFFER_ID) { + return; + } + + STAILQ_INIT(&mylist_head); + + /* + * Grab free packets (range count + 1) to represent this xfer page + * packet. +1 to represent the xfer page packet itself. We grab it + * here so that we know exactly how many we can fulfill. 
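+ * If fewer than two can be grabbed, the transfer is completed back
+ * to the host immediately and its data is dropped (see below).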
+ */ + mtx_lock_spin(&net_dev->rx_pkt_list_lock); + while (!STAILQ_EMPTY(&net_dev->myrx_packet_list)) { + net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list); + STAILQ_REMOVE_HEAD(&net_dev->myrx_packet_list, mylist_entry); + + STAILQ_INSERT_TAIL(&mylist_head, net_vsc_pkt, mylist_entry); + + if (++count == vm_xfer_page_pkt->range_count + 1) + break; + } + + mtx_unlock_spin(&net_dev->rx_pkt_list_lock); + + /* + * We need at least 2 netvsc pkts (1 to represent the xfer page + * and at least 1 for the range) i.e. we can handle some of the + * xfer page packet ranges... + */ + if (count < 2) { + /* Return netvsc packet to the freelist */ + mtx_lock_spin(&net_dev->rx_pkt_list_lock); + for (i=count; i != 0; i--) { + net_vsc_pkt = STAILQ_FIRST(&mylist_head); + STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); + + STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, + net_vsc_pkt, mylist_entry); + } + mtx_unlock_spin(&net_dev->rx_pkt_list_lock); + + hv_nv_send_receive_completion(device, + vm_xfer_page_pkt->d.transaction_id); + + return; + } + + /* Take the first packet in the list */ + xfer_page_pkt = (xfer_page_packet *)STAILQ_FIRST(&mylist_head); + STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); + + /* This is how many data packets we can supply */ + xfer_page_pkt->count = count - 1; + + /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ + for (i=0; i < (count - 1); i++) { + net_vsc_pkt = STAILQ_FIRST(&mylist_head); + STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); + + /* + * Initialize the netvsc packet + */ + net_vsc_pkt->xfer_page_pkt = xfer_page_pkt; + net_vsc_pkt->compl.rx.rx_completion_context = net_vsc_pkt; + net_vsc_pkt->device = device; + /* Save this so that we can send it back */ + net_vsc_pkt->compl.rx.rx_completion_tid = + vm_xfer_page_pkt->d.transaction_id; + + net_vsc_pkt->tot_data_buf_len = + vm_xfer_page_pkt->ranges[i].byte_count; + net_vsc_pkt->page_buf_count = 1; + + net_vsc_pkt->page_buffers[0].length = + vm_xfer_page_pkt->ranges[i].byte_count; + + /* The virtual address of the packet in the receive buffer */ + start = ((unsigned long)net_dev->rx_buf + + vm_xfer_page_pkt->ranges[i].byte_offset); + start = ((unsigned long)start) & ~(PAGE_SIZE - 1); + + /* Page number of the virtual page containing packet start */ + net_vsc_pkt->page_buffers[0].pfn = start >> PAGE_SHIFT; + + /* Calculate the page relative offset */ + net_vsc_pkt->page_buffers[0].offset = + vm_xfer_page_pkt->ranges[i].byte_offset & (PAGE_SIZE - 1); + + /* + * In this implementation, we are dealing with virtual + * addresses exclusively. Since we aren't using physical + * addresses at all, we don't care if a packet crosses a + * page boundary. For this reason, the original code to + * check for and handle page crossings has been removed. + */ + + /* + * Pass it to the upper layer. The receive completion call + * has been moved into this function. + */ + hv_rf_on_receive(device, net_vsc_pkt); + + /* + * Moved completion call back here so that all received + * messages (not just data messages) will trigger a response + * message back to the host. 
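+ * hv_nv_on_receive_completion() returns the packet to the free list
+ * and, for the last packet of a transfer page set, sends the receive
+ * completion to the host.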
+ */ + hv_nv_on_receive_completion(net_vsc_pkt); + } +} + +/* + * Net VSC send receive completion + */ +static void +hv_nv_send_receive_completion(struct hv_device *device, uint64_t tid) +{ + nvsp_msg rx_comp_msg; + int retries = 0; + int ret = 0; + + rx_comp_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt_complete; + + /* Pass in the status */ + rx_comp_msg.msgs.vers_1_msgs.send_rndis_pkt_complete.status = + nvsp_status_success; + +retry_send_cmplt: + /* Send the completion */ + ret = hv_vmbus_channel_send_packet(device->channel, &rx_comp_msg, + sizeof(nvsp_msg), tid, HV_VMBUS_PACKET_TYPE_COMPLETION, 0); + if (ret == 0) { + /* success */ + /* no-op */ + } else if (ret == EAGAIN) { + /* no more room... wait a bit and attempt to retry 3 times */ + retries++; + + if (retries < 4) { + DELAY(100); + goto retry_send_cmplt; + } + } +} + +/* + * Net VSC on receive completion + * + * Send a receive completion packet to RNDIS device (ie NetVsp) + */ +void +hv_nv_on_receive_completion(void *context) +{ + netvsc_packet *packet = (netvsc_packet *)context; + struct hv_device *device = (struct hv_device *)packet->device; + netvsc_dev *net_dev; + uint64_t tid = 0; + boolean_t send_rx_completion = FALSE; + + /* + * Even though it seems logical to do a hv_nv_get_outbound_net_device() + * here to send out receive completion, we are using + * hv_nv_get_inbound_net_device() since we may have disabled + * outbound traffic already. + */ + net_dev = hv_nv_get_inbound_net_device(device); + if (net_dev == NULL) + return; + + /* Overloading use of the lock. */ + mtx_lock_spin(&net_dev->rx_pkt_list_lock); + + packet->xfer_page_pkt->count--; + + /* + * Last one in the line that represent 1 xfer page packet. + * Return the xfer page packet itself to the free list. + */ + if (packet->xfer_page_pkt->count == 0) { + send_rx_completion = TRUE; + tid = packet->compl.rx.rx_completion_tid; + STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, + (netvsc_packet *)(packet->xfer_page_pkt), mylist_entry); + } + + /* Put the packet back on the free list */ + STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet, mylist_entry); + mtx_unlock_spin(&net_dev->rx_pkt_list_lock); + + /* Send a receive completion for the xfer page packet */ + if (send_rx_completion) + hv_nv_send_receive_completion(device, tid); +} + +/* + * Net VSC on channel callback + */ +static void +hv_nv_on_channel_callback(void *context) +{ + /* Fixme: Magic number */ + const int net_pkt_size = 2048; + struct hv_device *device = (struct hv_device *)context; + netvsc_dev *net_dev; + uint32_t bytes_rxed; + uint64_t request_id; + uint8_t *packet; + hv_vm_packet_descriptor *desc; + uint8_t *buffer; + int bufferlen = net_pkt_size; + int ret = 0; + + packet = malloc(net_pkt_size * sizeof(uint8_t), M_DEVBUF, M_NOWAIT); + if (!packet) + return; + + buffer = packet; + + net_dev = hv_nv_get_inbound_net_device(device); + if (net_dev == NULL) + goto out; + + do { + ret = hv_vmbus_channel_recv_packet_raw(device->channel, + buffer, bufferlen, &bytes_rxed, &request_id); + if (ret == 0) { + if (bytes_rxed > 0) { + desc = (hv_vm_packet_descriptor *)buffer; + switch (desc->type) { + case HV_VMBUS_PACKET_TYPE_COMPLETION: + hv_nv_on_send_completion(device, desc); + break; + case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES: + hv_nv_on_receive(device, desc); + break; + default: + break; + } + } else { + break; + } + } else if (ret == ENOBUFS) { + /* Handle large packet */ + free(buffer, M_DEVBUF); + buffer = malloc(bytes_rxed, M_DEVBUF, M_NOWAIT); + if (buffer == NULL) { + break; + } + 
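+			/* Retry the receive with a buffer sized for the pending packet. */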
bufferlen = bytes_rxed; + } + } while (1); + +out: + free(buffer, M_DEVBUF); +} + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.h b/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.h new file mode 100644 index 0000000..f7e7d00 --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_net_vsc.h @@ -0,0 +1,995 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * HyperV vmbus (virtual machine bus) network VSC (virtual services client) + * header file + * + * (Updated from unencumbered NvspProtocol.h) + */ + +#ifndef __HV_NET_VSC_H__ +#define __HV_NET_VSC_H__ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/sx.h> + +#include <dev/hyperv/include/hyperv.h> + + +#define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF) + +#define NVSP_PROTOCOL_VERSION_1 2 +#define NVSP_PROTOCOL_VERSION_2 0x30002 +#define NVSP_MIN_PROTOCOL_VERSION (NVSP_PROTOCOL_VERSION_1) +#define NVSP_MAX_PROTOCOL_VERSION (NVSP_PROTOCOL_VERSION_2) + +#define NVSP_PROTOCOL_VERSION_CURRENT NVSP_PROTOCOL_VERSION_2 + +#define NVSP_OPERATIONAL_STATUS_OK (0x00000000) +#define NVSP_OPERATIONAL_STATUS_DEGRADED (0x00000001) +#define NVSP_OPERATIONAL_STATUS_NONRECOVERABLE (0x00000002) +#define NVSP_OPERATIONAL_STATUS_NO_CONTACT (0x00000003) +#define NVSP_OPERATIONAL_STATUS_LOST_COMMUNICATION (0x00000004) + +/* + * Maximun number of transfer pages (packets) the VSP will use on a receive + */ +#define NVSP_MAX_PACKETS_PER_RECEIVE 375 + + +typedef enum nvsp_msg_type_ { + nvsp_msg_type_none = 0, + + /* + * Init Messages + */ + nvsp_msg_type_init = 1, + nvsp_msg_type_init_complete = 2, + + nvsp_version_msg_start = 100, + + /* + * Version 1 Messages + */ + nvsp_msg_1_type_send_ndis_vers = nvsp_version_msg_start, + + nvsp_msg_1_type_send_rx_buf, + nvsp_msg_1_type_send_rx_buf_complete, + nvsp_msg_1_type_revoke_rx_buf, + + nvsp_msg_1_type_send_send_buf, + nvsp_msg_1_type_send_send_buf_complete, + nvsp_msg_1_type_revoke_send_buf, + + nvsp_msg_1_type_send_rndis_pkt, + nvsp_msg_1_type_send_rndis_pkt_complete, + + /* + * Version 2 Messages + */ + nvsp_msg_2_type_send_chimney_delegated_buf, + nvsp_msg_2_type_send_chimney_delegated_buf_complete, + 
nvsp_msg_2_type_revoke_chimney_delegated_buf, + + nvsp_msg_2_type_resume_chimney_rx_indication, + + nvsp_msg_2_type_terminate_chimney, + nvsp_msg_2_type_terminate_chimney_complete, + + nvsp_msg_2_type_indicate_chimney_event, + + nvsp_msg_2_type_send_chimney_packet, + nvsp_msg_2_type_send_chimney_packet_complete, + + nvsp_msg_2_type_post_chimney_rx_request, + nvsp_msg_2_type_post_chimney_rx_request_complete, + + nvsp_msg_2_type_alloc_rx_buf, + nvsp_msg_2_type_alloc_rx_buf_complete, + + nvsp_msg_2_type_free_rx_buf, + + nvsp_msg_2_send_vmq_rndis_pkt, + nvsp_msg_2_send_vmq_rndis_pkt_complete, + + nvsp_msg_2_type_send_ndis_config, + + nvsp_msg_2_type_alloc_chimney_handle, + nvsp_msg_2_type_alloc_chimney_handle_complete, +} nvsp_msg_type; + +typedef enum nvsp_status_ { + nvsp_status_none = 0, + nvsp_status_success, + nvsp_status_failure, + /* Deprecated */ + nvsp_status_prot_vers_range_too_new, + /* Deprecated */ + nvsp_status_prot_vers_range_too_old, + nvsp_status_invalid_rndis_pkt, + nvsp_status_busy, + nvsp_status_max, +} nvsp_status; + +typedef struct nvsp_msg_hdr_ { + uint32_t msg_type; +} __packed nvsp_msg_hdr; + +/* + * Init Messages + */ + +/* + * This message is used by the VSC to initialize the channel + * after the channels has been opened. This message should + * never include anything other then versioning (i.e. this + * message will be the same for ever). + * + * Forever is a long time. The values have been redefined + * in Win7 to indicate major and minor protocol version + * number. + */ +typedef struct nvsp_msg_init_ { + union { + struct { + uint16_t minor_protocol_version; + uint16_t major_protocol_version; + } s; + /* Formerly min_protocol_version */ + uint32_t protocol_version; + } p1; + /* Formerly max_protocol_version */ + uint32_t protocol_version_2; +} __packed nvsp_msg_init; + +/* + * This message is used by the VSP to complete the initialization + * of the channel. This message should never include anything other + * then versioning (i.e. this message will be the same forever). + */ +typedef struct nvsp_msg_init_complete_ { + /* Deprecated */ + uint32_t negotiated_prot_vers; + uint32_t max_mdl_chain_len; + uint32_t status; +} __packed nvsp_msg_init_complete; + +typedef union nvsp_msg_init_uber_ { + nvsp_msg_init init; + nvsp_msg_init_complete init_compl; +} __packed nvsp_msg_init_uber; + +/* + * Version 1 Messages + */ + +/* + * This message is used by the VSC to send the NDIS version + * to the VSP. The VSP can use this information when handling + * OIDs sent by the VSC. + */ +typedef struct nvsp_1_msg_send_ndis_version_ { + uint32_t ndis_major_vers; + /* Deprecated */ + uint32_t ndis_minor_vers; +} __packed nvsp_1_msg_send_ndis_version; + +/* + * This message is used by the VSC to send a receive buffer + * to the VSP. The VSP can then use the receive buffer to + * send data to the VSC. + */ +typedef struct nvsp_1_msg_send_rx_buf_ { + uint32_t gpadl_handle; + uint16_t id; +} __packed nvsp_1_msg_send_rx_buf; + +typedef struct nvsp_1_rx_buf_section_ { + uint32_t offset; + uint32_t sub_allocation_size; + uint32_t num_sub_allocations; + uint32_t end_offset; +} __packed nvsp_1_rx_buf_section; + +/* + * This message is used by the VSP to acknowledge a receive + * buffer send by the VSC. This message must be sent by the + * VSP before the VSP uses the receive buffer. 
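+ * The completion also reports how the VSP sectioned the buffer; see
+ * the sections[] array below.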
+ */ +typedef struct nvsp_1_msg_send_rx_buf_complete_ { + uint32_t status; + uint32_t num_sections; + + /* + * The receive buffer is split into two parts, a large + * suballocation section and a small suballocation + * section. These sections are then suballocated by a + * certain size. + * + * For example, the following break up of the receive + * buffer has 6 large suballocations and 10 small + * suballocations. + * + * | Large Section | | Small Section | + * ------------------------------------------------------------ + * | | | | | | | | | | | | | | | | | | + * | | + * LargeOffset SmallOffset + */ + nvsp_1_rx_buf_section sections[1]; + +} __packed nvsp_1_msg_send_rx_buf_complete; + +/* + * This message is sent by the VSC to revoke the receive buffer. + * After the VSP completes this transaction, the VSP should never + * use the receive buffer again. + */ +typedef struct nvsp_1_msg_revoke_rx_buf_ { + uint16_t id; +} __packed nvsp_1_msg_revoke_rx_buf; + +/* + * This message is used by the VSC to send a send buffer + * to the VSP. The VSC can then use the send buffer to + * send data to the VSP. + */ +typedef struct nvsp_1_msg_send_send_buf_ { + uint32_t gpadl_handle; + uint16_t id; +} __packed nvsp_1_msg_send_send_buf; + +/* + * This message is used by the VSP to acknowledge a send + * buffer sent by the VSC. This message must be sent by the + * VSP before the VSP uses the send buffer. + */ +typedef struct nvsp_1_msg_send_send_buf_complete_ { + uint32_t status; + + /* + * The VSC gets to choose the size of the send buffer and + * the VSP gets to choose the section size of the buffer. + * This was done to enable dynamic reconfigurations when + * the cost of GPA-direct buffers decreases. + */ + uint32_t section_size; +} __packed nvsp_1_msg_send_send_buf_complete; + +/* + * This message is sent by the VSC to revoke the send buffer. + * After the VSP completes this transaction, the VSP should never + * use the send buffer again. + */ +typedef struct nvsp_1_msg_revoke_send_buf_ { + uint16_t id; +} __packed nvsp_1_msg_revoke_send_buf; + +/* + * This message is used by both the VSP and the VSC to send + * an RNDIS message to the opposite channel endpoint. + */ +typedef struct nvsp_1_msg_send_rndis_pkt_ { + /* + * This field is specified by RNDIS, which assumes there are + * two different channels of communication. However, + * the Network VSP only has one. Therefore, the channel + * travels with the RNDIS packet. + */ + uint32_t chan_type; + + /* + * This field is used to send part or all of the data + * through a send buffer. This value specifies an + * index into the send buffer. If the index is + * 0xFFFFFFFF, then the send buffer is not being used + * and all of the data was sent through other VMBus + * mechanisms. + */ + uint32_t send_buf_section_idx; + uint32_t send_buf_section_size; +} __packed nvsp_1_msg_send_rndis_pkt; + +/* + * This message is used by both the VSP and the VSC to complete + * an RNDIS message to the opposite channel endpoint. At this + * point, the initiator of this message cannot use any resources + * associated with the original RNDIS packet. + */ +typedef struct nvsp_1_msg_send_rndis_pkt_complete_ { + uint32_t status; +} __packed nvsp_1_msg_send_rndis_pkt_complete; + + +/* + * Version 2 Messages + */ + +/* + * This message is used by the VSC to send its NDIS configuration + * (MTU and capabilities) to the VSP.
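As an illustration of how a sender might use the send_buf_section_idx field defined above, here is a minimal sketch; pick_section(), free_idx and the free-section bookkeeping are hypothetical stand-ins, not part of this header.

#include <stdint.h>

#define NVSP_INVALID_SECTION_IDX 0xFFFFFFFFu	/* "send buffer not used" */

struct send_desc {
	uint32_t idx;	/* becomes send_buf_section_idx */
	uint32_t size;	/* becomes send_buf_section_size */
};

struct send_desc
pick_section(uint32_t pkt_len, uint32_t section_size, uint32_t free_idx)
{
	struct send_desc d;

	if (pkt_len <= section_size && free_idx != NVSP_INVALID_SECTION_IDX) {
		/* Frame fits: copy it into the section, send its index. */
		d.idx = free_idx;
		d.size = pkt_len;
	} else {
		/* Too large or no section free: fall back to GPA lists. */
		d.idx = NVSP_INVALID_SECTION_IDX;
		d.size = 0;
	}
	return (d);
}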
+ */ +typedef struct nvsp_2_netvsc_capabilities_ { + union { + uint64_t as_uint64; + struct { + uint64_t vmq : 1; + uint64_t chimney : 1; + uint64_t sriov : 1; + uint64_t ieee8021q : 1; + uint64_t correlationid : 1; + uint64_t teaming : 1; + } u2; + } u1; +} __packed nvsp_2_netvsc_capabilities; + +typedef struct nvsp_2_msg_send_ndis_config_ { + uint32_t mtu; + uint32_t reserved; + nvsp_2_netvsc_capabilities capabilities; +} __packed nvsp_2_msg_send_ndis_config; + +/* + * NvspMessage2TypeSendChimneyDelegatedBuffer + */ +typedef struct nvsp_2_msg_send_chimney_buf_ +{ + /* + * On WIN7 beta, delegated_obj_max_size is defined as a uint32_t + * Since WIN7 RC, it was split into two uint16_t. To have the same + * struct layout, delegated_obj_max_size shall be the first field. + */ + uint16_t delegated_obj_max_size; + + /* + * The revision # of chimney protocol used between NVSC and NVSP. + * + * This revision is NOT related to the chimney revision between + * NDIS protocol and miniport drivers. + */ + uint16_t revision; + + uint32_t gpadl_handle; +} __packed nvsp_2_msg_send_chimney_buf; + + +/* Unsupported chimney revision 0 (only present in WIN7 beta) */ +#define NVSP_CHIMNEY_REVISION_0 0 + +/* WIN7 Beta Chimney QFE */ +#define NVSP_CHIMNEY_REVISION_1 1 + +/* The chimney revision since WIN7 RC */ +#define NVSP_CHIMNEY_REVISION_2 2 + + +/* + * NvspMessage2TypeSendChimneyDelegatedBufferComplete + */ +typedef struct nvsp_2_msg_send_chimney_buf_complete_ { + uint32_t status; + + /* + * Maximum number outstanding sends and pre-posted receives. + * + * NVSC should not post more than SendQuota/ReceiveQuota packets. + * Otherwise, it can block the non-chimney path for an indefinite + * amount of time. + * (since chimney sends/receives are affected by the remote peer). + * + * Note: NVSP enforces the quota restrictions on a per-VMBCHANNEL + * basis. It doesn't enforce the restriction separately for chimney + * send/receive. If NVSC doesn't voluntarily enforce "SendQuota", + * it may kill its own network connectivity. 
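+ * + * For example (hypothetical bookkeeping, illustrative values only): + * with send_quota == 128 a VSC would count chimney sends submitted + * but not yet completed, and hold the 129th send until a completion + * arrives, so the non-chimney path on the same channel is never + * starved.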
+ */ + uint32_t send_quota; + uint32_t rx_quota; +} __packed nvsp_2_msg_send_chimney_buf_complete; + +/* + * NvspMessage2TypeRevokeChimneyDelegatedBuffer + */ +typedef struct nvsp_2_msg_revoke_chimney_buf_ { + uint32_t gpadl_handle; +} __packed nvsp_2_msg_revoke_chimney_buf; + + +#define NVSP_CHIMNEY_OBJECT_TYPE_NEIGHBOR 0 +#define NVSP_CHIMNEY_OBJECT_TYPE_PATH4 1 +#define NVSP_CHIMNEY_OBJECT_TYPE_PATH6 2 +#define NVSP_CHIMNEY_OBJECT_TYPE_TCP 3 + +/* + * NvspMessage2TypeAllocateChimneyHandle + */ +typedef struct nvsp_2_msg_alloc_chimney_handle_ { + uint64_t vsc_context; + uint32_t object_type; +} __packed nvsp_2_msg_alloc_chimney_handle; + +/* + * NvspMessage2TypeAllocateChimneyHandleComplete + */ +typedef struct nvsp_2_msg_alloc_chimney_handle_complete_ { + uint32_t vsp_handle; +} __packed nvsp_2_msg_alloc_chimney_handle_complete; + + +/* + * NvspMessage2TypeResumeChimneyRXIndication + */ +typedef struct nvsp_2_msg_resume_chimney_rx_indication { + /* + * Handle identifying the offloaded connection + */ + uint32_t vsp_tcp_handle; +} __packed nvsp_2_msg_resume_chimney_rx_indication; + + +#define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_FIRST_STAGE (0x01u) +#define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_RESERVED (~(0x01u)) + +/* + * NvspMessage2TypeTerminateChimney + */ +typedef struct nvsp_2_msg_terminate_chimney_ { + /* + * Handle identifying the offloaded object + */ + uint32_t vsp_handle; + + /* + * Terminate Offload Flags + * Bit 0: + * When set to 0, terminate the offload at the destination NIC + * Bits 1-31: Reserved, shall be zero + */ + uint32_t flags; + + union { + /* + * This field is valid only when bit 0 of flags is clear. + * It specifies the index into the premapped delegated + * object buffer. The buffer was sent through the + * NvspMessage2TypeSendChimneyDelegatedBuffer + * message at initialization time. + * + * NVSP will write the delegated state into the delegated + * buffer upon upload completion. + */ + uint32_t index; + + /* + * This field is valid only when bit 0 of flags is set. + * + * The sequence number of the most recently accepted RX + * indication when VSC sets its TCP context into + * "terminating" state. + * + * This allows NVSP to determine if there are any in-flight + * RX indications for which the acceptance state is still + * undefined. + */ + uint64_t last_accepted_rx_seq_no; + } f0; +} __packed nvsp_2_msg_terminate_chimney; + + +#define NVSP_TERMINATE_CHIMNEY_COMPLETE_FLAG_DATA_CORRUPTED 0x0000001u + +/* + * NvspMessage2TypeTerminateChimneyComplete + */ +typedef struct nvsp_2_msg_terminate_chimney_complete_ { + uint64_t vsc_context; + uint32_t flags; +} __packed nvsp_2_msg_terminate_chimney_complete; + +/* + * NvspMessage2TypeIndicateChimneyEvent + */ +typedef struct nvsp_2_msg_indicate_chimney_event_ { + /* + * When VscTcpContext is 0, event_type is an NDIS_STATUS event code. + * Otherwise, EventType is a TCP connection event (defined in the + * NdisTcpOffloadEventHandler chimney DDK document). + */ + uint32_t event_type; + + /* + * When VscTcpContext is 0, EventType is an NDIS_STATUS event code. + * Otherwise, EventType is TCP connection event-specific information + * (defined in the NdisTcpOffloadEventHandler chimney DDK document). + */ + uint32_t event_specific_info; + + /* + * If not 0, the event is a per-TCP-connection event. This field + * contains the VSC's TCP context. + * If 0, the event indication is global.
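+ * + * For example (illustrative values only): a global event would be + * indicated with vsc_tcp_context == 0 and an NDIS_STATUS code in + * event_type, while a per-connection event would carry the VSC's + * TCP context here and a TCP connection event code in event_type.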
+ */ + uint64_t vsc_tcp_context; +} __packed nvsp_2_msg_indicate_chimney_event; + + +#define NVSP_1_CHIMNEY_SEND_INVALID_OOB_INDEX 0xffffu +#define NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX 0xffffu + +/* + * NvspMessage2TypeSendChimneyPacket + */ +typedef struct nvsp_2_msg_send_chimney_pkt_ { + /* + * Identify the TCP connection that this chimney send is for + */ + uint32_t vsp_tcp_handle; + + /* + * This field is used to send part or all of the data + * through a send buffer. This value specifies an + * index into the send buffer. If the index is + * 0xFFFF, then the send buffer is not being used + * and all of the data was sent through other VMBus + * mechanisms. + */ + uint16_t send_buf_section_index; + uint16_t send_buf_section_size; + + /* + * OOB Data Index + * This is an index into the OOB data buffer. If the index is 0xFFFF + * (NVSP_1_CHIMNEY_SEND_INVALID_OOB_INDEX), then there is no OOB data. + * + * This field shall always be 0xFFFF for now. It is reserved for + * the future. + */ + uint16_t oob_data_index; + + /* + * DisconnectFlags = 0 + * Normal chimney send. See MiniportTcpOffloadSend for details. + * + * DisconnectFlags = TCP_DISCONNECT_GRACEFUL_CLOSE (0x01) + * Graceful disconnect. See MiniportTcpOffloadDisconnect for details. + * + * DisconnectFlags = TCP_DISCONNECT_ABORTIVE_CLOSE (0x02) + * Abortive disconnect. See MiniportTcpOffloadDisconnect for details. + */ + uint16_t disconnect_flags; + + uint32_t seq_no; +} __packed nvsp_2_msg_send_chimney_pkt; + +/* + * NvspMessage2TypeSendChimneyPacketComplete + */ +typedef struct nvsp_2_msg_send_chimney_pkt_complete_ { + /* + * The NDIS_STATUS for the chimney send + */ + uint32_t status; + + /* + * Number of bytes that have been sent to the peer (and ACKed by the peer). + */ + uint32_t bytes_transferred; +} __packed nvsp_2_msg_send_chimney_pkt_complete; + + +#define NVSP_1_CHIMNEY_RECV_FLAG_NO_PUSH 0x0001u +#define NVSP_1_CHIMNEY_RECV_INVALID_OOB_INDEX 0xffffu + +/* + * NvspMessage2TypePostChimneyRecvRequest + */ +typedef struct nvsp_2_msg_post_chimney_rx_request_ { + /* + * Identify the TCP connection that this chimney receive request + * is for. + */ + uint32_t vsp_tcp_handle; + + /* + * OOB Data Index + * This is an index into the OOB data buffer. If the index is 0xFFFFFFFF, + * then there is no OOB data. + * + * This field shall always be 0xFFFFFFFF for now. It is reserved for + * the future. + */ + uint32_t oob_data_index; + + /* + * Bit 0 + * When it is set, this is a "no-push" receive. + * When it is clear, this is a "push" receive. + * + * Bits 1-15: Reserved and shall be zero + */ + uint16_t flags; + + /* + * For debugging and diagnostic purposes. + * The SeqNo is per TCP connection and starts from 0. + */ + uint32_t seq_no; +} __packed nvsp_2_msg_post_chimney_rx_request; + +/* + * NvspMessage2TypePostChimneyRecvRequestComplete + */ +typedef struct nvsp_2_msg_post_chimney_rx_request_complete_ { + /* + * The NDIS_STATUS for the chimney receive request + */ + uint32_t status; + + /* + * Number of bytes that have been sent to the peer (and ACKed by + * the peer).
+ */ + uint32_t bytes_xferred; +} __packed nvsp_2_msg_post_chimney_rx_request_complete; + +/* + * NvspMessage2TypeAllocateReceiveBuffer + */ +typedef struct nvsp_2_msg_alloc_rx_buf_ { + /* + * Allocation ID to match the allocation request and response + */ + uint32_t allocation_id; + + /* + * Length of the VM shared memory receive buffer that needs to + * be allocated + */ + uint32_t length; +} __packed nvsp_2_msg_alloc_rx_buf; + +/* + * NvspMessage2TypeAllocateReceiveBufferComplete + */ +typedef struct nvsp_2_msg_alloc_rx_buf_complete_ { + /* + * The NDIS_STATUS code for buffer allocation + */ + uint32_t status; + + /* + * Allocation ID from NVSP_2_MESSAGE_ALLOCATE_RECEIVE_BUFFER + */ + uint32_t allocation_id; + + /* + * GPADL handle for the allocated receive buffer + */ + uint32_t gpadl_handle; + + /* + * Receive buffer ID that is further used in + * NvspMessage2SendVmqRndisPacket + */ + uint64_t rx_buf_id; +} __packed nvsp_2_msg_alloc_rx_buf_complete; + +/* + * NvspMessage2TypeFreeReceiveBuffer + */ +typedef struct nvsp_2_msg_free_rx_buf_ { + /* + * Receive buffer ID previous returned in + * NvspMessage2TypeAllocateReceiveBufferComplete message + */ + uint64_t rx_buf_id; +} __packed nvsp_2_msg_free_rx_buf; + +/* + * This structure is used in defining the buffers in + * NVSP_2_MESSAGE_SEND_VMQ_RNDIS_PACKET structure + */ +typedef struct nvsp_xfer_page_range_ { + /* + * Specifies the ID of the receive buffer that has the buffer. This + * ID can be the general receive buffer ID specified in + * NvspMessage1TypeSendReceiveBuffer or it can be the shared memory + * receive buffer ID allocated by the VSC and specified in + * NvspMessage2TypeAllocateReceiveBufferComplete message + */ + uint64_t xfer_page_set_id; + + /* + * Number of bytes + */ + uint32_t byte_count; + + /* + * Offset in bytes from the beginning of the buffer + */ + uint32_t byte_offset; +} __packed nvsp_xfer_page_range; + +/* + * NvspMessage2SendVmqRndisPacket + */ +typedef struct nvsp_2_msg_send_vmq_rndis_pkt_ { + /* + * This field is specified by RNIDS. They assume there's + * two different channels of communication. However, + * the Network VSP only has one. Therefore, the channel + * travels with the RNDIS packet. It must be RMC_DATA + */ + uint32_t channel_type; + + /* + * Only the Range element corresponding to the RNDIS header of + * the first RNDIS message in the multiple RNDIS messages sent + * in one NVSP message. Information about the data portions as well + * as the subsequent RNDIS messages in the same NVSP message are + * embedded in the RNDIS header itself + */ + nvsp_xfer_page_range range; +} __packed nvsp_2_msg_send_vmq_rndis_pkt; + +/* + * This message is used by the VSC to complete + * a RNDIS VMQ message to the VSP. At this point, + * the initiator of this message can use any resources + * associated with the original RNDIS VMQ packet. 
+ */ +typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_ +{ + uint32_t status; +} __packed nvsp_2_msg_send_vmq_rndis_pkt_complete; + + +typedef union nvsp_1_msg_uber_ { + nvsp_1_msg_send_ndis_version send_ndis_vers; + + nvsp_1_msg_send_rx_buf send_rx_buf; + nvsp_1_msg_send_rx_buf_complete send_rx_buf_complete; + nvsp_1_msg_revoke_rx_buf revoke_rx_buf; + + nvsp_1_msg_send_send_buf send_send_buf; + nvsp_1_msg_send_send_buf_complete send_send_buf_complete; + nvsp_1_msg_revoke_send_buf revoke_send_buf; + + nvsp_1_msg_send_rndis_pkt send_rndis_pkt; + nvsp_1_msg_send_rndis_pkt_complete send_rndis_pkt_complete; +} __packed nvsp_1_msg_uber; + + +typedef union nvsp_2_msg_uber_ { + nvsp_2_msg_send_ndis_config send_ndis_config; + + nvsp_2_msg_send_chimney_buf send_chimney_buf; + nvsp_2_msg_send_chimney_buf_complete send_chimney_buf_complete; + nvsp_2_msg_revoke_chimney_buf revoke_chimney_buf; + + nvsp_2_msg_resume_chimney_rx_indication resume_chimney_rx_indication; + nvsp_2_msg_terminate_chimney terminate_chimney; + nvsp_2_msg_terminate_chimney_complete terminate_chimney_complete; + nvsp_2_msg_indicate_chimney_event indicate_chimney_event; + + nvsp_2_msg_send_chimney_pkt send_chimney_packet; + nvsp_2_msg_send_chimney_pkt_complete send_chimney_packet_complete; + nvsp_2_msg_post_chimney_rx_request post_chimney_rx_request; + nvsp_2_msg_post_chimney_rx_request_complete + post_chimney_rx_request_complete; + + nvsp_2_msg_alloc_rx_buf alloc_rx_buffer; + nvsp_2_msg_alloc_rx_buf_complete alloc_rx_buffer_complete; + nvsp_2_msg_free_rx_buf free_rx_buffer; + + nvsp_2_msg_send_vmq_rndis_pkt send_vmq_rndis_pkt; + nvsp_2_msg_send_vmq_rndis_pkt_complete send_vmq_rndis_pkt_complete; + nvsp_2_msg_alloc_chimney_handle alloc_chimney_handle; + nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete; +} __packed nvsp_2_msg_uber; + + +typedef union nvsp_all_msgs_ { + nvsp_msg_init_uber init_msgs; + nvsp_1_msg_uber vers_1_msgs; + nvsp_2_msg_uber vers_2_msgs; +} __packed nvsp_all_msgs; + +/* + * ALL Messages + */ +typedef struct nvsp_msg_ { + nvsp_msg_hdr hdr; + nvsp_all_msgs msgs; +} __packed nvsp_msg; + + +/* + * The following arguably belongs in a separate header file + */ + +/* + * Defines + */ + +#define NETVSC_SEND_BUFFER_SIZE (64*1024) /* 64K */ +#define NETVSC_SEND_BUFFER_ID 0xface + + +#define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024) /* 1MB */ + +#define NETVSC_RECEIVE_BUFFER_ID 0xcafe + +#define NETVSC_RECEIVE_SG_COUNT 1 + +/* Preallocated receive packets */ +#define NETVSC_RECEIVE_PACKETLIST_COUNT 256 + +/* + * Maximum MTU we permit to be configured for a netvsc interface. + * When the code was developed, a max MTU of 12232 was tested and + * proven to work. 9K is a reasonable maximum for an Ethernet. 
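The 9K ceiling is enforced in the ioctl path (see hn_ioctl later in this patch). As a sketch of that check, with MAX_MTU and validate_mtu() as illustrative stand-ins for the NETVSC_MAX_CONFIGURABLE_MTU definition that follows:

#include <errno.h>

#define MAX_MTU (9 * 1024)	/* stand-in for NETVSC_MAX_CONFIGURABLE_MTU */

int
validate_mtu(int mtu)
{
	if (mtu > MAX_MTU)
		return (EINVAL);	/* SIOCSIFMTU is rejected */
	return (0);			/* accepted; device is re-added */
}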
+ */ +#define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) + +/* + * Data types + */ + +/* + * Per netvsc channel-specific + */ +typedef struct netvsc_dev_ { + struct hv_device *dev; + int num_outstanding_sends; + + /* List of free preallocated NETVSC_PACKET to represent RX packet */ + STAILQ_HEAD(PQ, netvsc_packet_) myrx_packet_list; + struct mtx rx_pkt_list_lock; + + /* Send buffer allocated by us but manages by NetVSP */ + void *send_buf; + uint32_t send_buf_size; + uint32_t send_buf_gpadl_handle; + uint32_t send_section_size; + + /* Receive buffer allocated by us but managed by NetVSP */ + void *rx_buf; + uint32_t rx_buf_size; + uint32_t rx_buf_gpadl_handle; + uint32_t rx_section_count; + nvsp_1_rx_buf_section *rx_sections; + + /* Used for NetVSP initialization protocol */ + struct sema channel_init_sema; + nvsp_msg channel_init_packet; + + nvsp_msg revoke_packet; + /*uint8_t hw_mac_addr[HW_MACADDR_LEN];*/ + + /* Holds rndis device info */ + void *extension; + + hv_bool_uint8_t destroy; + /* Negotiated NVSP version */ + uint32_t nvsp_version; +} netvsc_dev; + + +typedef void (*pfn_on_send_rx_completion)(void *); + +#define NETVSC_DEVICE_RING_BUFFER_SIZE (64 * PAGE_SIZE) +#define NETVSC_PACKET_MAXPAGE 16 + + +typedef struct xfer_page_packet_ { + /* + * This needs to be here because the network RX code casts + * an instantiation of this structure to a netvsc_packet. + */ + STAILQ_ENTRY(netvsc_packet_) mylist_entry; + + uint32_t count; +} xfer_page_packet; + +typedef struct netvsc_packet_ { + /* + * List used when enqueued on &net_dev->rx_packet_list, + * and when enqueued within the netvsc code + */ + STAILQ_ENTRY(netvsc_packet_) mylist_entry; + struct hv_device *device; + hv_bool_uint8_t is_data_pkt; /* One byte */ + uint16_t vlan_tci; + xfer_page_packet *xfer_page_pkt; + + /* Completion */ + union { + struct { + uint64_t rx_completion_tid; + void *rx_completion_context; + /* This is no longer used */ + pfn_on_send_rx_completion on_rx_completion; + } rx; + struct { + uint64_t send_completion_tid; + void *send_completion_context; + /* Still used in netvsc and filter code */ + pfn_on_send_rx_completion on_send_completion; + } send; + } compl; + + void *extension; + uint32_t tot_data_buf_len; + uint32_t page_buf_count; + hv_vmbus_page_buffer page_buffers[NETVSC_PACKET_MAXPAGE]; +} netvsc_packet; + +typedef struct { + uint8_t mac_addr[6]; /* Assumption unsigned long */ + hv_bool_uint8_t link_state; +} netvsc_device_info; + +/* + * Device-specific softc structure + */ +typedef struct hn_softc { + struct ifnet *hn_ifp; + struct arpcom arpcom; + device_t hn_dev; + uint8_t hn_unit; + int hn_carrier; + int hn_if_flags; + struct mtx hn_lock; + int hn_initdone; + struct hv_device *hn_dev_obj; + netvsc_dev *net_dev; +} hn_softc_t; + + +/* + * Externs + */ +extern int hv_promisc_mode; + +extern void netvsc_linkstatus_callback(struct hv_device *device_obj, + uint32_t status); +extern int netvsc_recv(struct hv_device *device_obj, netvsc_packet *packet); +extern void netvsc_xmit_completion(void *context); + +extern void hv_nv_on_receive_completion(void *context); +extern netvsc_dev *hv_nv_on_device_add(struct hv_device *device, void *additional_info); +extern int hv_nv_on_device_remove(struct hv_device *device, + boolean_t destroy_channel); +extern int hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt); + +#endif /* __HV_NET_VSC_H__ */ + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/contrib/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c new file mode 100644 index 
0000000..47f48dd --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -0,0 +1,948 @@ +/*- + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/sx.h> + +#include <net/if.h> +#include <net/if_arp.h> +#include <net/ethernet.h> +#include <net/if_dl.h> +#include <net/if_media.h> + +#include <net/bpf.h> + +#include <net/if_types.h> +#include <net/if_vlan_var.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/if_ether.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/pmap.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/frame.h> +#include <machine/vmparam.h> + +#include <sys/bus.h> +#include <sys/rman.h> +#include <sys/mutex.h> +#include <sys/errno.h> +#include <sys/types.h> +#include <machine/atomic.h> + +#include <machine/intr_machdep.h> + +#include <dev/hyperv/include/hyperv.h> +#include "hv_net_vsc.h" +#include "hv_rndis.h" +#include "hv_rndis_filter.h" + + +/* Short for Hyper-V network interface */ +#define NETVSC_DEVNAME "hn" + +/* + * It looks like offset 0 of buf is reserved to hold the softc pointer. + * The sc pointer is evidently not needed, and is not presently populated. + * The packet offset is where the netvsc_packet starts in the buffer. + */ +#define HV_NV_SC_PTR_OFFSET_IN_BUF 0 +#define HV_NV_PACKET_OFFSET_IN_BUF 16 + + +/* + * Data types + */ + +struct hv_netvsc_driver_context { + uint32_t drv_inited; +}; + +/* + * Be aware that this sleepable mutex will exhibit WITNESS errors when + * certain TCP and ARP code paths are taken. This appears to be a + * well-known condition, as all other drivers checked use a sleeping + * mutex to protect their transmit paths. + * Also be aware that mutexes do not play well with semaphores, and there + * is a conflicting semaphore in a certain channel code path. + */ +#define NV_LOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) +#define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) +#define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) +#define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) +#define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) + + +/* + * Globals + */ + +int hv_promisc_mode = 0; /* normal mode by default */ + +/* The one and only one */ +static struct hv_netvsc_driver_context g_netvsc_drv; + + +/* + * Forward declarations + */ +static void hn_stop(hn_softc_t *sc); +static void hn_ifinit_locked(hn_softc_t *sc); +static void hn_ifinit(void *xsc); +static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); +static int hn_start_locked(struct ifnet *ifp); +static void hn_start(struct ifnet *ifp); + + +/* + * NetVsc driver initialization + * Note: Filter init is no longer required + */ +static int +netvsc_drv_init(void) +{ + return (0); +} + +/* + * NetVsc global initialization entry point + */ +static void +netvsc_init(void) +{ + printf("Netvsc initializing... "); + + /* + * XXXKYS: cleanup initialization + */ + if (!cold && !g_netvsc_drv.drv_inited) { + g_netvsc_drv.drv_inited = 1; + netvsc_drv_init(); + } else { + printf("Already initialized!\n"); + } +} + +/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ +static const hv_guid g_net_vsc_device_type = { + .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, + 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} +}; + +/* + * Standard probe entry point.
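The byte array in g_net_vsc_device_type looks shuffled relative to the canonical GUID string because the first three GUID fields are stored little-endian. A standalone sketch (illustrative only, not part of the driver) that reconstructs the canonical form:

#include <stdint.h>
#include <stdio.h>

static const uint8_t data[16] = {
	0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E
};

int
main(void)
{
	/* Prints f8615163-df3e-46c5-913f-f2d2f965ed0e. */
	printf("%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-"
	    "%02x%02x%02x%02x%02x%02x\n",
	    data[3], data[2], data[1], data[0],	/* Data1, little-endian */
	    data[5], data[4],			/* Data2, little-endian */
	    data[7], data[6],			/* Data3, little-endian */
	    data[8], data[9],			/* Data4, stored as-is */
	    data[10], data[11], data[12], data[13], data[14], data[15]);
	return (0);
}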
+ * + */ +static int +netvsc_probe(device_t dev) +{ + const char *p; + + p = vmbus_get_type(dev); + if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) { + device_set_desc(dev, "Synthetic Network Interface"); + printf("Netvsc probe... DONE \n"); + + return (0); + } + + return (ENXIO); +} + +/* + * Standard attach entry point. + * + * Called when the driver is loaded. It allocates needed resources, + * and initializes the "hardware" and software. + */ +static int +netvsc_attach(device_t dev) +{ + struct hv_device *device_ctx = vmbus_get_devctx(dev); + netvsc_device_info device_info; + hn_softc_t *sc; + int unit = device_get_unit(dev); + struct ifnet *ifp; + int ret; + + netvsc_init(); + + sc = device_get_softc(dev); + if (sc == NULL) { + return (ENOMEM); + } + + bzero(sc, sizeof(hn_softc_t)); + sc->hn_unit = unit; + sc->hn_dev = dev; + + NV_LOCK_INIT(sc, "NetVSCLock"); + + sc->hn_dev_obj = device_ctx; + + ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); + ifp->if_softc = sc; + + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + ifp->if_dunit = unit; + ifp->if_dname = NETVSC_DEVNAME; + + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_ioctl = hn_ioctl; + ifp->if_start = hn_start; + ifp->if_init = hn_ifinit; + /* needed by hv_rf_on_device_add() code */ + ifp->if_mtu = ETHERMTU; + IFQ_SET_MAXLEN(&ifp->if_snd, 512); + ifp->if_snd.ifq_drv_maxlen = 511; + IFQ_SET_READY(&ifp->if_snd); + + /* + * Tell upper layers that we support full VLAN capability. + */ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; + ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; + + ret = hv_rf_on_device_add(device_ctx, &device_info); + if (ret != 0) { + if_free(ifp); + + return (ret); + } + if (device_info.link_state == 0) { + sc->hn_carrier = 1; + } + + ether_ifattach(ifp, device_info.mac_addr); + + return (0); +} + +/* + * Standard detach entry point + */ +static int +netvsc_detach(device_t dev) +{ + struct hv_device *hv_device = vmbus_get_devctx(dev); + + printf("netvsc_detach\n"); + + /* + * XXXKYS: Need to clean up all our + * driver state; this is the driver + * unloading. + */ + + /* + * XXXKYS: Need to stop outgoing traffic and unregister + * the netdevice. + */ + + hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); + + return (0); +} + +/* + * Standard shutdown entry point + */ +static int +netvsc_shutdown(device_t dev) +{ + return (0); +} + +/* + * Send completion processing + * + * Note: It looks like offset 0 of buf is reserved to hold the softc + * pointer. The sc pointer is not currently needed in this function, and + * it is not presently populated by the TX function. 
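netvsc_xmit_completion() below recovers the allocation start by stepping back HV_NV_PACKET_OFFSET_IN_BUF bytes from the netvsc_packet. The layout it unwinds, built by hn_start_locked() further down, can be sketched in userland; the sizes here are invented stand-ins for the real sizeof() values.

#include <stddef.h>
#include <stdio.h>

#define SC_PTR_OFFSET	0	/* HV_NV_SC_PTR_OFFSET_IN_BUF */
#define PACKET_OFFSET	16	/* HV_NV_PACKET_OFFSET_IN_BUF */

int
main(void)
{
	size_t pkt_size = 160;	/* stand-in for sizeof(netvsc_packet) */
	size_t rppi_size = 24;	/* 0 unless a VLAN tag is inserted */
	size_t filt_size = 64;	/* stand-in for sizeof(rndis_filter_packet) */

	printf("reserved (sc ptr) at offset %d\n", SC_PTR_OFFSET);
	printf("netvsc_packet at offset %d\n", PACKET_OFFSET);
	printf("rppi/filter packet at offset %zu\n",
	    PACKET_OFFSET + pkt_size);
	printf("total allocation: %zu bytes\n",
	    PACKET_OFFSET + pkt_size + rppi_size + filt_size);
	return (0);
}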
+ */ +void +netvsc_xmit_completion(void *context) +{ + netvsc_packet *packet = (netvsc_packet *)context; + struct mbuf *mb; + uint8_t *buf; + + mb = (struct mbuf *)packet->compl.send.send_completion_tid; + buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF; + + free(buf, M_DEVBUF); + + if (mb != NULL) { + m_freem(mb); + } +} + +/* + * Start a transmit of one or more packets + */ +static int +hn_start_locked(struct ifnet *ifp) +{ + hn_softc_t *sc = ifp->if_softc; + struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + uint8_t *buf; + netvsc_packet *packet; + struct mbuf *m_head, *m; + struct mbuf *mc_head = NULL; + int i; + int num_frags; + int len; + int xlen; + int rppi_size; + int retries = 0; + int ret = 0; + + while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) { + IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head); + if (m_head == NULL) { + break; + } + + len = 0; + num_frags = 0; + xlen = 0; + + /* Walk the mbuf list computing total length and num frags */ + for (m = m_head; m != NULL; m = m->m_next) { + if (m->m_len != 0) { + num_frags++; + len += m->m_len; + } + } + + /* + * Reserve the number of pages requested. Currently, + * one page is reserved for the message in the RNDIS + * filter packet + */ + num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + + /* Bail if this exceeds the number of page_buffers in netvsc_packet */ + if (num_frags > NETVSC_PACKET_MAXPAGE) { + m_freem(m_head); + + return (EINVAL); + } + + rppi_size = 0; + if (m_head->m_flags & M_VLANTAG) { + rppi_size = sizeof(rndis_per_packet_info) + + sizeof(ndis_8021q_info); + } + + /* + * Allocate a buffer with space for a netvsc packet plus a + * number of reserved areas. First comes a 16-byte, currently + * unused, reserved data area. Second is + * the netvsc_packet, which includes (currently 4) page + * buffers. Third (optional) is an rndis_per_packet_info + * struct, but only if a VLAN tag should be inserted into the + * Ethernet frame by the Hyper-V infrastructure. Fourth is + * an area reserved for an rndis_filter_packet struct. + * Changed malloc to M_NOWAIT to avoid sleep under spin lock. + * No longer reserving extra space for page buffers, as they + * are already part of the netvsc_packet. + */ + buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF + + sizeof(netvsc_packet) + rppi_size + + sizeof(rndis_filter_packet), + M_DEVBUF, M_ZERO | M_NOWAIT); + if (buf == NULL) { + m_freem(m_head); + + return (ENOMEM); + } + + packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF); + *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF; + + /* + * extension points to the area reserved for the + * rndis_filter_packet, which is placed just after + * the netvsc_packet (and rppi struct, if present; + * length is updated later). + */ + packet->extension = packet + 1; + + /* Set up the rndis header */ + packet->page_buf_count = num_frags; + + /* Initialize it from the mbuf */ + packet->tot_data_buf_len = len; + + /* + * If the Hyper-V infrastructure needs to embed a VLAN tag, + * initialize netvsc_packet and rppi struct values as needed. + */ + if (rppi_size) { + /* Lower layers need the VLAN TCI */ + packet->vlan_tci = m_head->m_pkthdr.ether_vtag; + } + + /* + * Fill the page buffers with mbuf info starting at index + * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
+ */ + i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + for (m = m_head; m != NULL; m = m->m_next) { + if (m->m_len) { + vm_offset_t paddr = + vtophys(mtod(m, vm_offset_t)); + packet->page_buffers[i].pfn = + paddr >> PAGE_SHIFT; + packet->page_buffers[i].offset = + paddr & (PAGE_SIZE - 1); + packet->page_buffers[i].length = m->m_len; + i++; + } + } + + /* + * If bpf, copy the mbuf chain. This is less expensive than + * it appears; the mbuf clusters are not copied, only their + * reference counts are incremented. + * Needed to avoid a race condition where the completion + * callback is invoked, freeing the mbuf chain, before the + * bpf_mtap code has a chance to run. + */ + if (ifp->if_bpf) { + mc_head = m_copypacket(m_head, M_DONTWAIT); + } +retry_send: + /* Set the completion routine */ + packet->compl.send.on_send_completion = netvsc_xmit_completion; + packet->compl.send.send_completion_context = packet; + packet->compl.send.send_completion_tid = (uint64_t)m_head; + + /* Removed critical_enter(), does not appear necessary */ + ret = hv_rf_on_send(device_ctx, packet); + + if (ret == 0) { + ifp->if_opackets++; + /* if bpf && mc_head, call bpf_mtap code */ + if (mc_head) { + ETHER_BPF_MTAP(ifp, mc_head); + } + } else { + retries++; + if (retries < 4) { + goto retry_send; + } + + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + + /* + * Null the mbuf pointer so the completion function + * does not free the mbuf chain. We just pushed the + * mbuf chain back on the if_snd queue. + */ + packet->compl.send.send_completion_tid = 0; + + /* + * Release the resources since we will not get any + * send completion + */ + netvsc_xmit_completion(packet); + } + + /* if bpf && mc_head, free the mbuf chain copy */ + if (mc_head) { + m_freem(mc_head); + } + } + + return (ret); +} + +/* + * Link up/down notification + */ +void +netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status) +{ + hn_softc_t *sc = device_get_softc(device_obj->device); + + if (sc == NULL) { + return; + } + + if (status == 1) { + sc->hn_carrier = 1; + } else { + sc->hn_carrier = 0; + } +} + +/* + * Append the specified data to the indicated mbuf chain, + * Extend the mbuf chain if the new data does not fit in + * existing space. + * + * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. + * There should be an equivalent in the kernel mbuf code, + * but there does not appear to be one yet. + * + * Differs from m_append() in that additional mbufs are + * allocated with cluster size MJUMPAGESIZE, and filled + * accordingly. + * + * Return 1 if able to complete the job; otherwise 0. + */ +static int +hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) +{ + struct mbuf *m, *n; + int remainder, space; + + for (m = m0; m->m_next != NULL; m = m->m_next) + ; + remainder = len; + space = M_TRAILINGSPACE(m); + if (space > 0) { + /* + * Copy into available space. + */ + if (space > remainder) + space = remainder; + bcopy(cp, mtod(m, caddr_t) + m->m_len, space); + m->m_len += space; + cp += space; + remainder -= space; + } + while (remainder > 0) { + /* + * Allocate a new mbuf; could check space + * and allocate a cluster instead. 
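+ * + * Each pass below allocates one MJUMPAGESIZE cluster, copies + * min(MJUMPAGESIZE, remainder) bytes into it and links it at the + * chain tail, so remainder strictly decreases; on allocation + * failure the loop breaks and the shortfall is reported through + * the (remainder == 0) return value.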
+ */ + n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE); + if (n == NULL) + break; + n->m_len = min(MJUMPAGESIZE, remainder); + bcopy(cp, mtod(n, caddr_t), n->m_len); + cp += n->m_len; + remainder -= n->m_len; + m->m_next = n; + m = n; + } + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += len - remainder; + + return (remainder == 0); +} + + +/* + * Called when we receive a data packet from the "wire" on the + * specified device + * + * Note: This is no longer used as a callback + */ +int +netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet) +{ + hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device); + struct mbuf *m_new; + struct ifnet *ifp = sc->hn_ifp; + int size; + int i; + + if (sc == NULL) { + return (0); /* TODO: KYS how can this be! */ + } + + ifp = sc->arpcom.ac_ifp; + + if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + return (0); + } + + /* + * Bail out if packet contains more data than configured MTU. + */ + if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { + return (0); + } + + /* + * Get an mbuf with a cluster. For packets 2K or less, + * get a standard 2K cluster. For anything larger, get a + * 4K cluster. Any buffers larger than 4K can cause problems + * if looped around to the Hyper-V TX channel, so avoid them. + */ + size = MCLBYTES; + + if (packet->tot_data_buf_len > MCLBYTES) { + /* 4096 */ + size = MJUMPAGESIZE; + } + + m_new = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size); + + if (m_new == NULL) + return (0); + + /* + * Remove trailing junk from RX data buffer. + * Fixme: This will not work for multiple Hyper-V RX buffers. + * Fortunately, the channel gathers all RX data into one buffer. + * + * L2 frame length, with L2 header, not including CRC + */ + packet->page_buffers[0].length = packet->tot_data_buf_len; + + /* + * Copy the received packet to one or more mbufs. + * The copy is required since the memory pointed to by netvsc_packet + * cannot be deallocated + */ + for (i=0; i < packet->page_buf_count; i++) { + /* Shift virtual page number to form virtual page address */ + uint8_t *vaddr = (uint8_t *) + (packet->page_buffers[i].pfn << PAGE_SHIFT); + + hv_m_append(m_new, packet->page_buffers[i].length, + vaddr + packet->page_buffers[i].offset); + } + + m_new->m_pkthdr.rcvif = ifp; + + if ((packet->vlan_tci != 0) && + (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { + m_new->m_pkthdr.ether_vtag = packet->vlan_tci; + m_new->m_flags |= M_VLANTAG; + } + + /* + * Note: Moved RX completion back to hv_nv_on_receive() so all + * messages (not just data messages) will trigger a response. + */ + + ifp->if_ipackets++; + + /* We're not holding the lock here, so don't release it */ + (*ifp->if_input)(ifp, m_new); + + return (0); +} + +/* + * Standard ioctl entry point. Called when the user wants to configure + * the interface. + */ +static int +hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + hn_softc_t *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + netvsc_device_info device_info; + struct hv_device *hn_dev; + int mask, error = 0; + + switch(cmd) { + + case SIOCSIFADDR: + case SIOCGIFADDR: + error = ether_ioctl(ifp, cmd, data); + break; + case SIOCSIFMTU: + hn_dev = vmbus_get_devctx(sc->hn_dev); + + NV_LOCK(sc); + + if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { + error = EINVAL; + NV_UNLOCK(sc); + break; + } + /* Obtain and record requested MTU */ + ifp->if_mtu = ifr->ifr_mtu; + + /* + * We must remove and add back the device to cause the new + * MTU to take effect. 
This includes tearing down, but not + * deleting the channel, then bringing it back up. + */ + error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL); + if (error) { + NV_UNLOCK(sc); + break; + } + error = hv_rf_on_device_add(hn_dev, &device_info); + if (error) { + NV_UNLOCK(sc); + break; + } + + hn_ifinit_locked(sc); + + NV_UNLOCK(sc); + break; + case SIOCSIFFLAGS: + NV_LOCK(sc); + if (ifp->if_flags & IFF_UP) { + /* + * If only the state of the PROMISC flag changed, + * then just use the 'set promisc mode' command + * instead of reinitializing the entire NIC. Doing + * a full re-init means reloading the firmware and + * waiting for it to start up, which may take a + * second or two. + */ +#ifdef notyet + /* Fixme: Promiscuous mode? */ + /* No promiscuous mode with Xen */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING && + ifp->if_flags & IFF_PROMISC && + !(sc->hn_if_flags & IFF_PROMISC)) { + /* do something here for Hyper-V */ + ; +/* XN_SETBIT(sc, XN_RX_MODE, */ +/* XN_RXMODE_RX_PROMISC); */ + } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && + !(ifp->if_flags & IFF_PROMISC) && + sc->hn_if_flags & IFF_PROMISC) { + /* do something here for Hyper-V */ + ; +/* XN_CLRBIT(sc, XN_RX_MODE, */ +/* XN_RXMODE_RX_PROMISC); */ + } else +#endif + hn_ifinit_locked(sc); + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + hn_stop(sc); + } + } + sc->hn_if_flags = ifp->if_flags; + NV_UNLOCK(sc); + error = 0; + break; + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + if (mask & IFCAP_HWCSUM) { + if (IFCAP_HWCSUM & ifp->if_capenable) { + ifp->if_capenable &= ~IFCAP_HWCSUM; + } else { + ifp->if_capenable |= IFCAP_HWCSUM; + } + } + error = 0; + break; + case SIOCADDMULTI: + case SIOCDELMULTI: +#ifdef notyet + /* Fixme: Multicast mode? */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + NV_LOCK(sc); + netvsc_setmulti(sc); + NV_UNLOCK(sc); + error = 0; + } +#endif + /* FALLTHROUGH */ + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = EINVAL; + break; + default: + error = ether_ioctl(ifp, cmd, data); + break; + } + + return (error); +} + +/* + * + */ +static void +hn_stop(hn_softc_t *sc) +{ + struct ifnet *ifp; + int ret; + struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + + NV_LOCK_ASSERT(sc); + ifp = sc->hn_ifp; + + printf(" Closing Device ...\n"); + + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + sc->hn_initdone = 0; + + ret = hv_rf_on_close(device_ctx); +} + +/* + * FreeBSD transmit entry point + */ +static void +hn_start(struct ifnet *ifp) +{ + hn_softc_t *sc; + + sc = ifp->if_softc; + NV_LOCK(sc); + hn_start_locked(ifp); + NV_UNLOCK(sc); +} + +/* + * + */ +static void +hn_ifinit_locked(hn_softc_t *sc) +{ + struct ifnet *ifp; + struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); + int ret; + + NV_LOCK_ASSERT(sc); + + ifp = sc->hn_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + return; + } + + hv_promisc_mode = 1; + + ret = hv_rf_on_open(device_ctx); + if (ret != 0) { + return; + } else { + sc->hn_initdone = 1; + } + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; +} + +/* + * + */ +static void +hn_ifinit(void *xsc) +{ + hn_softc_t *sc = xsc; + + NV_LOCK(sc); + hn_ifinit_locked(sc); + NV_UNLOCK(sc); +} + +#ifdef LATER +/* + * + */ +static void +hn_watchdog(struct ifnet *ifp) +{ + hn_softc_t *sc; + sc = ifp->if_softc; + + printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit); + hn_ifinit(sc); /*???*/ + ifp->if_oerrors++; +} +#endif + +static device_method_t netvsc_methods[] = { + /* Device interface */ + 
DEVMETHOD(device_probe, netvsc_probe), + DEVMETHOD(device_attach, netvsc_attach), + DEVMETHOD(device_detach, netvsc_detach), + DEVMETHOD(device_shutdown, netvsc_shutdown), + + { 0, 0 } +}; + +static driver_t netvsc_driver = { + NETVSC_DEVNAME, + netvsc_methods, + sizeof(hn_softc_t) +}; + +static devclass_t netvsc_devclass; + +DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); +MODULE_VERSION(hn, 1); +MODULE_DEPEND(hn, vmbus, 1, 1, 1); +SYSINIT(netvsc_initx, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE + 1, netvsc_init, + NULL); + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_rndis.h b/sys/contrib/dev/hyperv/netvsc/hv_rndis.h new file mode 100644 index 0000000..819cab5 --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_rndis.h @@ -0,0 +1,911 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __HV_RNDIS_H__ +#define __HV_RNDIS_H__ + + +/* + * NDIS protocol version numbers + */ +#define NDIS_VERSION_5_0 0x00050000 +#define NDIS_VERSION_5_1 0x00050001 +#define NDIS_VERSION_6_0 0x00060000 +#define NDIS_VERSION (NDIS_VERSION_5_1) + +/* + * Status codes + */ + +#define STATUS_SUCCESS (0x00000000L) +#define STATUS_UNSUCCESSFUL (0xC0000001L) +#define STATUS_PENDING (0x00000103L) +#define STATUS_INSUFFICIENT_RESOURCES (0xC000009AL) +#define STATUS_BUFFER_OVERFLOW (0x80000005L) +#define STATUS_NOT_SUPPORTED (0xC00000BBL) + +#define RNDIS_STATUS_SUCCESS (STATUS_SUCCESS) +#define RNDIS_STATUS_PENDING (STATUS_PENDING) +#define RNDIS_STATUS_NOT_RECOGNIZED (0x00010001L) +#define RNDIS_STATUS_NOT_COPIED (0x00010002L) +#define RNDIS_STATUS_NOT_ACCEPTED (0x00010003L) +#define RNDIS_STATUS_CALL_ACTIVE (0x00010007L) + +#define RNDIS_STATUS_ONLINE (0x40010003L) +#define RNDIS_STATUS_RESET_START (0x40010004L) +#define RNDIS_STATUS_RESET_END (0x40010005L) +#define RNDIS_STATUS_RING_STATUS (0x40010006L) +#define RNDIS_STATUS_CLOSED (0x40010007L) +#define RNDIS_STATUS_WAN_LINE_UP (0x40010008L) +#define RNDIS_STATUS_WAN_LINE_DOWN (0x40010009L) +#define RNDIS_STATUS_WAN_FRAGMENT (0x4001000AL) +#define RNDIS_STATUS_MEDIA_CONNECT (0x4001000BL) +#define RNDIS_STATUS_MEDIA_DISCONNECT (0x4001000CL) +#define RNDIS_STATUS_HARDWARE_LINE_UP (0x4001000DL) +#define RNDIS_STATUS_HARDWARE_LINE_DOWN (0x4001000EL) +#define RNDIS_STATUS_INTERFACE_UP (0x4001000FL) +#define RNDIS_STATUS_INTERFACE_DOWN (0x40010010L) +#define RNDIS_STATUS_MEDIA_BUSY (0x40010011L) +#define RNDIS_STATUS_MEDIA_SPECIFIC_INDICATION (0x40010012L) +#define RNDIS_STATUS_WW_INDICATION RNDIS_STATUS_MEDIA_SPECIFIC_INDICATION +#define RNDIS_STATUS_LINK_SPEED_CHANGE (0x40010013L) + +#define RNDIS_STATUS_NOT_RESETTABLE (0x80010001L) +#define RNDIS_STATUS_SOFT_ERRORS (0x80010003L) +#define RNDIS_STATUS_HARD_ERRORS (0x80010004L) +#define RNDIS_STATUS_BUFFER_OVERFLOW (STATUS_BUFFER_OVERFLOW) + +#define RNDIS_STATUS_FAILURE (STATUS_UNSUCCESSFUL) +#define RNDIS_STATUS_RESOURCES (STATUS_INSUFFICIENT_RESOURCES) +#define RNDIS_STATUS_CLOSING (0xC0010002L) +#define RNDIS_STATUS_BAD_VERSION (0xC0010004L) +#define RNDIS_STATUS_BAD_CHARACTERISTICS (0xC0010005L) +#define RNDIS_STATUS_ADAPTER_NOT_FOUND (0xC0010006L) +#define RNDIS_STATUS_OPEN_FAILED (0xC0010007L) +#define RNDIS_STATUS_DEVICE_FAILED (0xC0010008L) +#define RNDIS_STATUS_MULTICAST_FULL (0xC0010009L) +#define RNDIS_STATUS_MULTICAST_EXISTS (0xC001000AL) +#define RNDIS_STATUS_MULTICAST_NOT_FOUND (0xC001000BL) +#define RNDIS_STATUS_REQUEST_ABORTED (0xC001000CL) +#define RNDIS_STATUS_RESET_IN_PROGRESS (0xC001000DL) +#define RNDIS_STATUS_CLOSING_INDICATING (0xC001000EL) +#define RNDIS_STATUS_NOT_SUPPORTED (STATUS_NOT_SUPPORTED) +#define RNDIS_STATUS_INVALID_PACKET (0xC001000FL) +#define RNDIS_STATUS_OPEN_LIST_FULL (0xC0010010L) +#define RNDIS_STATUS_ADAPTER_NOT_READY (0xC0010011L) +#define RNDIS_STATUS_ADAPTER_NOT_OPEN (0xC0010012L) +#define RNDIS_STATUS_NOT_INDICATING (0xC0010013L) +#define RNDIS_STATUS_INVALID_LENGTH (0xC0010014L) +#define RNDIS_STATUS_INVALID_DATA (0xC0010015L) +#define RNDIS_STATUS_BUFFER_TOO_SHORT (0xC0010016L) +#define RNDIS_STATUS_INVALID_OID (0xC0010017L) +#define RNDIS_STATUS_ADAPTER_REMOVED (0xC0010018L) +#define RNDIS_STATUS_UNSUPPORTED_MEDIA (0xC0010019L) +#define RNDIS_STATUS_GROUP_ADDRESS_IN_USE (0xC001001AL) +#define RNDIS_STATUS_FILE_NOT_FOUND (0xC001001BL) +#define RNDIS_STATUS_ERROR_READING_FILE (0xC001001CL) +#define RNDIS_STATUS_ALREADY_MAPPED (0xC001001DL) 
+#define RNDIS_STATUS_RESOURCE_CONFLICT (0xC001001EL) +#define RNDIS_STATUS_NO_CABLE (0xC001001FL) + +#define RNDIS_STATUS_INVALID_SAP (0xC0010020L) +#define RNDIS_STATUS_SAP_IN_USE (0xC0010021L) +#define RNDIS_STATUS_INVALID_ADDRESS (0xC0010022L) +#define RNDIS_STATUS_VC_NOT_ACTIVATED (0xC0010023L) +#define RNDIS_STATUS_DEST_OUT_OF_ORDER (0xC0010024L) +#define RNDIS_STATUS_VC_NOT_AVAILABLE (0xC0010025L) +#define RNDIS_STATUS_CELLRATE_NOT_AVAILABLE (0xC0010026L) +#define RNDIS_STATUS_INCOMPATABLE_QOS (0xC0010027L) +#define RNDIS_STATUS_AAL_PARAMS_UNSUPPORTED (0xC0010028L) +#define RNDIS_STATUS_NO_ROUTE_TO_DESTINATION (0xC0010029L) + +#define RNDIS_STATUS_TOKEN_RING_OPEN_ERROR (0xC0011000L) + + +/* + * Object Identifiers used by NdisRequest Query/Set Information + */ + +/* + * General Objects + */ + +#define RNDIS_OID_GEN_SUPPORTED_LIST 0x00010101 +#define RNDIS_OID_GEN_HARDWARE_STATUS 0x00010102 +#define RNDIS_OID_GEN_MEDIA_SUPPORTED 0x00010103 +#define RNDIS_OID_GEN_MEDIA_IN_USE 0x00010104 +#define RNDIS_OID_GEN_MAXIMUM_LOOKAHEAD 0x00010105 +#define RNDIS_OID_GEN_MAXIMUM_FRAME_SIZE 0x00010106 +#define RNDIS_OID_GEN_LINK_SPEED 0x00010107 +#define RNDIS_OID_GEN_TRANSMIT_BUFFER_SPACE 0x00010108 +#define RNDIS_OID_GEN_RECEIVE_BUFFER_SPACE 0x00010109 +#define RNDIS_OID_GEN_TRANSMIT_BLOCK_SIZE 0x0001010A +#define RNDIS_OID_GEN_RECEIVE_BLOCK_SIZE 0x0001010B +#define RNDIS_OID_GEN_VENDOR_ID 0x0001010C +#define RNDIS_OID_GEN_VENDOR_DESCRIPTION 0x0001010D +#define RNDIS_OID_GEN_CURRENT_PACKET_FILTER 0x0001010E +#define RNDIS_OID_GEN_CURRENT_LOOKAHEAD 0x0001010F +#define RNDIS_OID_GEN_DRIVER_VERSION 0x00010110 +#define RNDIS_OID_GEN_MAXIMUM_TOTAL_SIZE 0x00010111 +#define RNDIS_OID_GEN_PROTOCOL_OPTIONS 0x00010112 +#define RNDIS_OID_GEN_MAC_OPTIONS 0x00010113 +#define RNDIS_OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 +#define RNDIS_OID_GEN_MAXIMUM_SEND_PACKETS 0x00010115 +#define RNDIS_OID_GEN_VENDOR_DRIVER_VERSION 0x00010116 +#define RNDIS_OID_GEN_NETWORK_LAYER_ADDRESSES 0x00010118 +#define RNDIS_OID_GEN_TRANSPORT_HEADER_OFFSET 0x00010119 +#define RNDIS_OID_GEN_MACHINE_NAME 0x0001021A +#define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B + +#define RNDIS_OID_GEN_XMIT_OK 0x00020101 +#define RNDIS_OID_GEN_RCV_OK 0x00020102 +#define RNDIS_OID_GEN_XMIT_ERROR 0x00020103 +#define RNDIS_OID_GEN_RCV_ERROR 0x00020104 +#define RNDIS_OID_GEN_RCV_NO_BUFFER 0x00020105 + +#define RNDIS_OID_GEN_DIRECTED_BYTES_XMIT 0x00020201 +#define RNDIS_OID_GEN_DIRECTED_FRAMES_XMIT 0x00020202 +#define RNDIS_OID_GEN_MULTICAST_BYTES_XMIT 0x00020203 +#define RNDIS_OID_GEN_MULTICAST_FRAMES_XMIT 0x00020204 +#define RNDIS_OID_GEN_BROADCAST_BYTES_XMIT 0x00020205 +#define RNDIS_OID_GEN_BROADCAST_FRAMES_XMIT 0x00020206 +#define RNDIS_OID_GEN_DIRECTED_BYTES_RCV 0x00020207 +#define RNDIS_OID_GEN_DIRECTED_FRAMES_RCV 0x00020208 +#define RNDIS_OID_GEN_MULTICAST_BYTES_RCV 0x00020209 +#define RNDIS_OID_GEN_MULTICAST_FRAMES_RCV 0x0002020A +#define RNDIS_OID_GEN_BROADCAST_BYTES_RCV 0x0002020B +#define RNDIS_OID_GEN_BROADCAST_FRAMES_RCV 0x0002020C + +#define RNDIS_OID_GEN_RCV_CRC_ERROR 0x0002020D +#define RNDIS_OID_GEN_TRANSMIT_QUEUE_LENGTH 0x0002020E + +#define RNDIS_OID_GEN_GET_TIME_CAPS 0x0002020F +#define RNDIS_OID_GEN_GET_NETCARD_TIME 0x00020210 + +/* + * These are connection-oriented general OIDs. + * These replace the above OIDs for connection-oriented media. 
+ */ +#define RNDIS_OID_GEN_CO_SUPPORTED_LIST 0x00010101 +#define RNDIS_OID_GEN_CO_HARDWARE_STATUS 0x00010102 +#define RNDIS_OID_GEN_CO_MEDIA_SUPPORTED 0x00010103 +#define RNDIS_OID_GEN_CO_MEDIA_IN_USE 0x00010104 +#define RNDIS_OID_GEN_CO_LINK_SPEED 0x00010105 +#define RNDIS_OID_GEN_CO_VENDOR_ID 0x00010106 +#define RNDIS_OID_GEN_CO_VENDOR_DESCRIPTION 0x00010107 +#define RNDIS_OID_GEN_CO_DRIVER_VERSION 0x00010108 +#define RNDIS_OID_GEN_CO_PROTOCOL_OPTIONS 0x00010109 +#define RNDIS_OID_GEN_CO_MAC_OPTIONS 0x0001010A +#define RNDIS_OID_GEN_CO_MEDIA_CONNECT_STATUS 0x0001010B +#define RNDIS_OID_GEN_CO_VENDOR_DRIVER_VERSION 0x0001010C +#define RNDIS_OID_GEN_CO_MINIMUM_LINK_SPEED 0x0001010D + +#define RNDIS_OID_GEN_CO_GET_TIME_CAPS 0x00010201 +#define RNDIS_OID_GEN_CO_GET_NETCARD_TIME 0x00010202 + +/* + * These are connection-oriented statistics OIDs. + */ +#define RNDIS_OID_GEN_CO_XMIT_PDUS_OK 0x00020101 +#define RNDIS_OID_GEN_CO_RCV_PDUS_OK 0x00020102 +#define RNDIS_OID_GEN_CO_XMIT_PDUS_ERROR 0x00020103 +#define RNDIS_OID_GEN_CO_RCV_PDUS_ERROR 0x00020104 +#define RNDIS_OID_GEN_CO_RCV_PDUS_NO_BUFFER 0x00020105 + + +#define RNDIS_OID_GEN_CO_RCV_CRC_ERROR 0x00020201 +#define RNDIS_OID_GEN_CO_TRANSMIT_QUEUE_LENGTH 0x00020202 +#define RNDIS_OID_GEN_CO_BYTES_XMIT 0x00020203 +#define RNDIS_OID_GEN_CO_BYTES_RCV 0x00020204 +#define RNDIS_OID_GEN_CO_BYTES_XMIT_OUTSTANDING 0x00020205 +#define RNDIS_OID_GEN_CO_NETCARD_LOAD 0x00020206 + +/* + * These are objects for Connection-oriented media call-managers. + */ +#define RNDIS_OID_CO_ADD_PVC 0xFF000001 +#define RNDIS_OID_CO_DELETE_PVC 0xFF000002 +#define RNDIS_OID_CO_GET_CALL_INFORMATION 0xFF000003 +#define RNDIS_OID_CO_ADD_ADDRESS 0xFF000004 +#define RNDIS_OID_CO_DELETE_ADDRESS 0xFF000005 +#define RNDIS_OID_CO_GET_ADDRESSES 0xFF000006 +#define RNDIS_OID_CO_ADDRESS_CHANGE 0xFF000007 +#define RNDIS_OID_CO_SIGNALING_ENABLED 0xFF000008 +#define RNDIS_OID_CO_SIGNALING_DISABLED 0xFF000009 + + +/* + * 802.3 Objects (Ethernet) + */ + +#define RNDIS_OID_802_3_PERMANENT_ADDRESS 0x01010101 +#define RNDIS_OID_802_3_CURRENT_ADDRESS 0x01010102 +#define RNDIS_OID_802_3_MULTICAST_LIST 0x01010103 +#define RNDIS_OID_802_3_MAXIMUM_LIST_SIZE 0x01010104 +#define RNDIS_OID_802_3_MAC_OPTIONS 0x01010105 + +/* + * + */ +#define NDIS_802_3_MAC_OPTION_PRIORITY 0x00000001 + +#define RNDIS_OID_802_3_RCV_ERROR_ALIGNMENT 0x01020101 +#define RNDIS_OID_802_3_XMIT_ONE_COLLISION 0x01020102 +#define RNDIS_OID_802_3_XMIT_MORE_COLLISIONS 0x01020103 + +#define RNDIS_OID_802_3_XMIT_DEFERRED 0x01020201 +#define RNDIS_OID_802_3_XMIT_MAX_COLLISIONS 0x01020202 +#define RNDIS_OID_802_3_RCV_OVERRUN 0x01020203 +#define RNDIS_OID_802_3_XMIT_UNDERRUN 0x01020204 +#define RNDIS_OID_802_3_XMIT_HEARTBEAT_FAILURE 0x01020205 +#define RNDIS_OID_802_3_XMIT_TIMES_CRS_LOST 0x01020206 +#define RNDIS_OID_802_3_XMIT_LATE_COLLISIONS 0x01020207 + + +/* + * RNDIS MP custom OID for test + */ +#define OID_RNDISMP_GET_RECEIVE_BUFFERS 0xFFA0C90D // Query only + + +/* + * Remote NDIS message types + */ +#define REMOTE_NDIS_PACKET_MSG 0x00000001 +#define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 +#define REMOTE_NDIS_HALT_MSG 0x00000003 +#define REMOTE_NDIS_QUERY_MSG 0x00000004 +#define REMOTE_NDIS_SET_MSG 0x00000005 +#define REMOTE_NDIS_RESET_MSG 0x00000006 +#define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 +#define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 + +#define REMOTE_CONDIS_MP_CREATE_VC_MSG 0x00008001 +#define REMOTE_CONDIS_MP_DELETE_VC_MSG 0x00008002 +#define REMOTE_CONDIS_MP_ACTIVATE_VC_MSG 0x00008005 +#define 
REMOTE_CONDIS_MP_DEACTIVATE_VC_MSG 0x00008006 +#define REMOTE_CONDIS_INDICATE_STATUS_MSG 0x00008007 + +/* + * Remote NDIS message completion types + */ +#define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 +#define REMOTE_NDIS_QUERY_CMPLT 0x80000004 +#define REMOTE_NDIS_SET_CMPLT 0x80000005 +#define REMOTE_NDIS_RESET_CMPLT 0x80000006 +#define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 + +#define REMOTE_CONDIS_MP_CREATE_VC_CMPLT 0x80008001 +#define REMOTE_CONDIS_MP_DELETE_VC_CMPLT 0x80008002 +#define REMOTE_CONDIS_MP_ACTIVATE_VC_CMPLT 0x80008005 +#define REMOTE_CONDIS_MP_DEACTIVATE_VC_CMPLT 0x80008006 + +/* + * Reserved message type for private communication between lower-layer + * host driver and remote device, if necessary. + */ +#define REMOTE_NDIS_BUS_MSG 0xff000001 + +/* + * Defines for DeviceFlags in rndis_initialize_complete + */ +#define RNDIS_DF_CONNECTIONLESS 0x00000001 +#define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 +#define RNDIS_DF_RAW_DATA 0x00000004 + +/* + * Remote NDIS medium types. + */ +#define RNDIS_MEDIUM_802_3 0x00000000 +#define RNDIS_MEDIUM_802_5 0x00000001 +#define RNDIS_MEDIUM_FDDI 0x00000002 +#define RNDIS_MEDIUM_WAN 0x00000003 +#define RNDIS_MEDIUM_LOCAL_TALK 0x00000004 +#define RNDIS_MEDIUM_ARCNET_RAW 0x00000006 +#define RNDIS_MEDIUM_ARCNET_878_2 0x00000007 +#define RNDIS_MEDIUM_ATM 0x00000008 +#define RNDIS_MEDIUM_WIRELESS_WAN 0x00000009 +#define RNDIS_MEDIUM_IRDA 0x0000000a +#define RNDIS_MEDIUM_CO_WAN 0x0000000b +/* Not a real medium, defined as an upper bound */ +#define RNDIS_MEDIUM_MAX 0x0000000d + +/* + * Remote NDIS medium connection states. + */ +#define RNDIS_MEDIA_STATE_CONNECTED 0x00000000 +#define RNDIS_MEDIA_STATE_DISCONNECTED 0x00000001 + +/* + * Remote NDIS version numbers + */ +#define RNDIS_MAJOR_VERSION 0x00000001 +#define RNDIS_MINOR_VERSION 0x00000000 + +/* + * NdisInitialize message + */ +typedef struct rndis_initialize_request_ { + /* RNDIS request ID */ + uint32_t request_id; + uint32_t major_version; + uint32_t minor_version; + uint32_t max_xfer_size; +} rndis_initialize_request; + +/* + * Response to NdisInitialize + */ +typedef struct rndis_initialize_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; + uint32_t major_version; + uint32_t minor_version; + uint32_t device_flags; + /* RNDIS medium */ + uint32_t medium; + uint32_t max_pkts_per_msg; + uint32_t max_xfer_size; + uint32_t pkt_align_factor; + uint32_t af_list_offset; + uint32_t af_list_size; +} rndis_initialize_complete; + +/* + * Call manager devices only: Information about an address family + * supported by the device is appended to the response to NdisInitialize. 
+ */ +typedef struct rndis_co_address_family_ { + /* RNDIS AF */ + uint32_t address_family; + uint32_t major_version; + uint32_t minor_version; +} rndis_co_address_family; + +/* + * NdisHalt message + */ +typedef struct rndis_halt_request_ { + /* RNDIS request ID */ + uint32_t request_id; +} rndis_halt_request; + +/* + * NdisQueryRequest message + */ +typedef struct rndis_query_request_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS OID */ + uint32_t oid; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; + /* RNDIS handle */ + uint32_t device_vc_handle; +} rndis_query_request; + +/* + * Response to NdisQueryRequest + */ +typedef struct rndis_query_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; +} rndis_query_complete; + +/* + * NdisSetRequest message + */ +typedef struct rndis_set_request_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS OID */ + uint32_t oid; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; + /* RNDIS handle */ + uint32_t device_vc_handle; +} rndis_set_request; + +/* + * Response to NdisSetRequest + */ +typedef struct rndis_set_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +} rndis_set_complete; + +/* + * NdisReset message + */ +typedef struct rndis_reset_request_ { + uint32_t reserved; +} rndis_reset_request; + +/* + * Response to NdisReset + */ +typedef struct rndis_reset_complete_ { + /* RNDIS status */ + uint32_t status; + uint32_t addressing_reset; +} rndis_reset_complete; + +/* + * NdisMIndicateStatus message + */ +typedef struct rndis_indicate_status_ { + /* RNDIS status */ + uint32_t status; + uint32_t status_buf_length; + uint32_t status_buf_offset; +} rndis_indicate_status; + +/* + * Diagnostic information passed as the status buffer in + * rndis_indicate_status messages signifying error conditions. + */ +typedef struct rndis_diagnostic_info_ { + /* RNDIS status */ + uint32_t diag_status; + uint32_t error_offset; +} rndis_diagnostic_info; + +/* + * NdisKeepAlive message + */ +typedef struct rndis_keepalive_request_ { + /* RNDIS request ID */ + uint32_t request_id; +} rndis_keepalive_request; + +/* + * Response to NdisKeepAlive + */ +typedef struct rndis_keepalive_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +} rndis_keepalive_complete; + +/* + * Data message. All offset fields contain byte offsets from the beginning + * of the rndis_packet structure. All length fields are in bytes. + * VcHandle is set to 0 for connectionless data, otherwise it + * contains the VC handle. 
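+ *
+ * For illustration, a minimal connectionless message carrying a
+ * 1514-byte Ethernet frame with no OOB or per-packet data would be
+ * laid out as follows (values hypothetical):
+ *
+ *	data_offset		= sizeof(rndis_packet)
+ *	data_length		= 1514
+ *	oob_data_offset/length	= 0, num_oob_data_elements = 0
+ *	per_pkt_info_offset/length = 0
+ *	vc_handle		= 0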
+ */ +typedef struct rndis_packet_ { + uint32_t data_offset; + uint32_t data_length; + uint32_t oob_data_offset; + uint32_t oob_data_length; + uint32_t num_oob_data_elements; + uint32_t per_pkt_info_offset; + uint32_t per_pkt_info_length; + /* RNDIS handle */ + uint32_t vc_handle; + uint32_t reserved; +} rndis_packet; + +typedef struct rndis_packet_ex_ { + uint32_t data_offset; + uint32_t data_length; + uint32_t oob_data_offset; + uint32_t oob_data_length; + uint32_t num_oob_data_elements; + uint32_t per_pkt_info_offset; + uint32_t per_pkt_info_length; + /* RNDIS handle */ + uint32_t vc_handle; + uint32_t reserved; + uint64_t data_buf_id; + uint32_t data_buf_offset; + uint64_t next_header_buf_id; + uint32_t next_header_byte_offset; + uint32_t next_header_byte_count; +} rndis_packet_ex; + +/* + * Optional Out of Band data associated with a Data message. + */ +typedef struct rndis_oobd_ { + uint32_t size; + /* RNDIS class ID */ + uint32_t type; + uint32_t class_info_offset; +} rndis_oobd; + +/* + * Packet extension field contents associated with a Data message. + */ +typedef struct rndis_per_packet_info_ { + uint32_t size; + uint32_t type; + uint32_t per_packet_info_offset; +} rndis_per_packet_info; + +typedef enum ndis_per_pkt_infotype_ { + tcpip_chksum_info, + ipsec_info, + tcp_large_send_info, + classification_handle_info, + ndis_reserved, + sgl_info, + ieee_8021q_info, + original_pkt_info, + pkt_cancel_id, + original_netbuf_list, + cached_netbuf_list, + short_pkt_padding_info, + max_perpkt_info +} ndis_per_pkt_infotype; + +typedef struct ndis_8021q_info_ { + union { + struct { + uint32_t user_pri : 3; /* User Priority */ + uint32_t cfi : 1; /* Canonical Format ID */ + uint32_t vlan_id : 12; + uint32_t reserved : 16; + } s1; + uint32_t value; + } u1; +} ndis_8021q_info; + +/* + * Format of Information buffer passed in a SetRequest for the OID + * OID_GEN_RNDIS_CONFIG_PARAMETER. + */ +typedef struct rndis_config_parameter_info_ { + uint32_t parameter_name_offset; + uint32_t parameter_name_length; + uint32_t parameter_type; + uint32_t parameter_value_offset; + uint32_t parameter_value_length; +} rndis_config_parameter_info; + +/* + * Values for ParameterType in rndis_config_parameter_info + */ +#define RNDIS_CONFIG_PARAM_TYPE_INTEGER 0 +#define RNDIS_CONFIG_PARAM_TYPE_STRING 2 + + +/* + * CONDIS Miniport messages for connection oriented devices + * that do not implement a call manager. 
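+ *
+ * (These connection-oriented messages target media such as ATM; the
+ * Ethernet data path used by Hyper-V appears to rely only on the
+ * connectionless messages above, but the full set is kept for
+ * protocol completeness.)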
+ */ + +/* + * CoNdisMiniportCreateVc message + */ +typedef struct rcondis_mp_create_vc_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS handle */ + uint32_t ndis_vc_handle; +} rcondis_mp_create_vc; + +/* + * Response to CoNdisMiniportCreateVc + */ +typedef struct rcondis_mp_create_vc_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS handle */ + uint32_t device_vc_handle; + /* RNDIS status */ + uint32_t status; +} rcondis_mp_create_vc_complete; + +/* + * CoNdisMiniportDeleteVc message + */ +typedef struct rcondis_mp_delete_vc_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS handle */ + uint32_t device_vc_handle; +} rcondis_mp_delete_vc; + +/* + * Response to CoNdisMiniportDeleteVc + */ +typedef struct rcondis_mp_delete_vc_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +} rcondis_mp_delete_vc_complete; + +/* + * CoNdisMiniportQueryRequest message + */ +typedef struct rcondis_mp_query_request_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS request type */ + uint32_t request_type; + /* RNDIS OID */ + uint32_t oid; + /* RNDIS handle */ + uint32_t device_vc_handle; + uint32_t info_buf_length; + uint32_t info_buf_offset; +} rcondis_mp_query_request; + +/* + * CoNdisMiniportSetRequest message + */ +typedef struct rcondis_mp_set_request_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS request type */ + uint32_t request_type; + /* RNDIS OID */ + uint32_t oid; + /* RNDIS handle */ + uint32_t device_vc_handle; + uint32_t info_buf_length; + uint32_t info_buf_offset; +} rcondis_mp_set_request; + +/* + * CoNdisIndicateStatus message + */ +typedef struct rcondis_indicate_status_ { + /* RNDIS handle */ + uint32_t ndis_vc_handle; + /* RNDIS status */ + uint32_t status; + uint32_t status_buf_length; + uint32_t status_buf_offset; +} rcondis_indicate_status; + +/* + * CONDIS Call/VC parameters + */ + +typedef struct rcondis_specific_parameters_ { + uint32_t parameter_type; + uint32_t parameter_length; + uint32_t parameter_offset; +} rcondis_specific_parameters; + +typedef struct rcondis_media_parameters_ { + uint32_t flags; + uint32_t reserved1; + uint32_t reserved2; + rcondis_specific_parameters media_specific; +} rcondis_media_parameters; + +typedef struct rndis_flowspec_ { + uint32_t token_rate; + uint32_t token_bucket_size; + uint32_t peak_bandwidth; + uint32_t latency; + uint32_t delay_variation; + uint32_t service_type; + uint32_t max_sdu_size; + uint32_t minimum_policed_size; +} rndis_flowspec; + +typedef struct rcondis_call_manager_parameters_ { + rndis_flowspec transmit; + rndis_flowspec receive; + rcondis_specific_parameters call_mgr_specific; +} rcondis_call_manager_parameters; + +/* + * CoNdisMiniportActivateVc message + */ +typedef struct rcondis_mp_activate_vc_request_ { + /* RNDIS request ID */ + uint32_t request_id; + uint32_t flags; + /* RNDIS handle */ + uint32_t device_vc_handle; + uint32_t media_params_offset; + uint32_t media_params_length; + uint32_t call_mgr_params_offset; + uint32_t call_mgr_params_length; +} rcondis_mp_activate_vc_request; + +/* + * Response to CoNdisMiniportActivateVc + */ +typedef struct rcondis_mp_activate_vc_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +} rcondis_mp_activate_vc_complete; + +/* + * CoNdisMiniportDeactivateVc message + */ +typedef struct rcondis_mp_deactivate_vc_request_ { + /* RNDIS request ID */ + uint32_t request_id; + uint32_t flags; + /* RNDIS handle */ + uint32_t 
device_vc_handle; +} rcondis_mp_deactivate_vc_request; + +/* + * Response to CoNdisMiniportDeactivateVc + */ +typedef struct rcondis_mp_deactivate_vc_complete_ { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +} rcondis_mp_deactivate_vc_complete; + +/* + * union with all of the RNDIS messages + */ +typedef union rndis_msg_container_ { + rndis_packet packet; + rndis_initialize_request init_request; + rndis_halt_request halt_request; + rndis_query_request query_request; + rndis_set_request set_request; + rndis_reset_request reset_request; + rndis_keepalive_request keepalive_request; + rndis_indicate_status indicate_status; + rndis_initialize_complete init_complete; + rndis_query_complete query_complete; + rndis_set_complete set_complete; + rndis_reset_complete reset_complete; + rndis_keepalive_complete keepalive_complete; + rcondis_mp_create_vc co_miniport_create_vc; + rcondis_mp_delete_vc co_miniport_delete_vc; + rcondis_indicate_status co_miniport_status; + rcondis_mp_activate_vc_request co_miniport_activate_vc; + rcondis_mp_deactivate_vc_request co_miniport_deactivate_vc; + rcondis_mp_create_vc_complete co_miniport_create_vc_complete; + rcondis_mp_delete_vc_complete co_miniport_delete_vc_complete; + rcondis_mp_activate_vc_complete co_miniport_activate_vc_complete; + rcondis_mp_deactivate_vc_complete co_miniport_deactivate_vc_complete; + rndis_packet_ex packet_ex; +} rndis_msg_container; + +/* + * Remote NDIS message format + */ +typedef struct rndis_msg_ { + uint32_t ndis_msg_type; + + /* + * Total length of this message, from the beginning + * of the rndis_msg struct, in bytes. + */ + uint32_t msg_len; + + /* Actual message */ + rndis_msg_container msg; +} rndis_msg; + + +/* + * Handy macros + */ + +/* + * get the size of an RNDIS message. 
Pass in the message type, + * rndis_set_request, rndis_packet for example + */ +#define RNDIS_MESSAGE_SIZE(message) \ + (sizeof(message) + (sizeof(rndis_msg) - sizeof(rndis_msg_container))) + +/* + * get pointer to info buffer with message pointer + */ +#define MESSAGE_TO_INFO_BUFFER(message) \ + (((PUCHAR)(message)) + message->InformationBufferOffset) + +/* + * get pointer to status buffer with message pointer + */ +#define MESSAGE_TO_STATUS_BUFFER(message) \ + (((PUCHAR)(message)) + message->StatusBufferOffset) + +/* + * get pointer to OOBD buffer with message pointer + */ +#define MESSAGE_TO_OOBD_BUFFER(message) \ + (((PUCHAR)(message)) + message->OOBDataOffset) + +/* + * get pointer to data buffer with message pointer + */ +#define MESSAGE_TO_DATA_BUFFER(message) \ + (((PUCHAR)(message)) + message->PerPacketInfoOffset) + +/* + * get pointer to contained message from NDIS_MESSAGE pointer + */ +#define RNDIS_MESSAGE_PTR_TO_MESSAGE_PTR(rndis_message) \ + ((void *) &rndis_message->Message) + +/* + * get pointer to contained message from NDIS_MESSAGE pointer + */ +#define RNDIS_MESSAGE_RAW_PTR_TO_MESSAGE_PTR(rndis_message) \ + ((void *) rndis_message) + + + +/* + * Structures used in OID_RNDISMP_GET_RECEIVE_BUFFERS + */ + +#define RNDISMP_RECEIVE_BUFFER_ELEM_FLAG_VMQ_RECEIVE_BUFFER 0x00000001 + +typedef struct rndismp_rx_buf_elem_ { + uint32_t flags; + uint32_t length; + uint64_t rx_buf_id; + uint32_t gpadl_handle; + void *rx_buf; +} rndismp_rx_buf_elem; + +typedef struct rndismp_rx_bufs_info_ { + uint32_t num_rx_bufs; + rndismp_rx_buf_elem rx_buf_elems[1]; +} rndismp_rx_bufs_info; + + + +#define RNDIS_HEADER_SIZE (sizeof(rndis_msg) - sizeof(rndis_msg_container)) + +#define NDIS_PACKET_TYPE_DIRECTED 0x00000001 +#define NDIS_PACKET_TYPE_MULTICAST 0x00000002 +#define NDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 +#define NDIS_PACKET_TYPE_BROADCAST 0x00000008 +#define NDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 +#define NDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 +#define NDIS_PACKET_TYPE_SMT 0x00000040 +#define NDIS_PACKET_TYPE_ALL_LOCAL 0x00000080 +#define NDIS_PACKET_TYPE_GROUP 0x00000100 +#define NDIS_PACKET_TYPE_ALL_FUNCTIONAL 0x00000200 +#define NDIS_PACKET_TYPE_FUNCTIONAL 0x00000400 +#define NDIS_PACKET_TYPE_MAC_FRAME 0x00000800 + + +#endif /* __HV_RNDIS_H__ */ + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.c new file mode 100644 index 0000000..691cf7e --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.c @@ -0,0 +1,929 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <net/if_arp.h>
+#include <net/ethernet.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+#include <sys/sema.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include "hv_net_vsc.h"
+#include "hv_rndis.h"
+#include "hv_rndis_filter.h"
+
+
+/*
+ * Forward declarations
+ */
+static int  hv_rf_send_request(rndis_device *device, rndis_request *request,
+	uint32_t message_type);
+static void hv_rf_receive_response(rndis_device *device, rndis_msg *response);
+static void hv_rf_receive_indicate_status(rndis_device *device,
+	rndis_msg *response);
+static void hv_rf_receive_data(rndis_device *device, rndis_msg *message,
+	netvsc_packet *pkt);
+static int  hv_rf_query_device(rndis_device *device, uint32_t oid,
+	void *result, uint32_t *result_size);
+static inline int hv_rf_query_device_mac(rndis_device *device);
+static inline int hv_rf_query_device_link_status(rndis_device *device);
+static int  hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter);
+static int  hv_rf_init_device(rndis_device *device);
+static int  hv_rf_open_device(rndis_device *device);
+static int  hv_rf_close_device(rndis_device *device);
+static void hv_rf_on_send_completion(void *context);
+static void hv_rf_on_send_request_completion(void *context);
+static void hv_rf_on_send_request_halt_completion(void *context);
+
+
+/*
+ * Allocate and initialize a new RNDIS device instance.
+ */
+static inline rndis_device *
+hv_get_rndis_device(void)
+{
+	rndis_device *device;
+
+	device = malloc(sizeof(rndis_device), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (device == NULL) {
+		return (NULL);
+	}
+
+	mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_SPIN | MTX_RECURSE);
+
+	/* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */
+	STAILQ_INIT(&device->myrequest_list);
+
+	device->state = RNDIS_DEV_UNINITIALIZED;
+
+	return (device);
+}
+
+/*
+ * Tear down and free an RNDIS device instance.
+ */
+static inline void
+hv_put_rndis_device(rndis_device *device)
+{
+	mtx_destroy(&device->req_lock);
+	free(device, M_DEVBUF);
+}
+
+/*
+ * Allocate an RNDIS request and queue it on the device's request list.
+ */
+static inline rndis_request *
+hv_rndis_request(rndis_device *device, uint32_t message_type,
+    uint32_t message_length)
+{
+	rndis_request *request;
+	rndis_msg *rndis_mesg;
+	rndis_set_request *set;
+
+	request = malloc(sizeof(rndis_request), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (request == NULL) {
+		return (NULL);
+	}
+
+	sema_init(&request->wait_sema, 0, "rndis sema");
+
+	rndis_mesg = &request->request_msg;
+	rndis_mesg->ndis_msg_type = message_type;
+	rndis_mesg->msg_len = message_length;
+
+	/*
+	 * Set the request id. This field is always after the rndis header
+	 * for request/response packet types so we just use the set_request
+	 * as a template.
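+	 *
+	 * This works because every request variant sent this way
+	 * (rndis_initialize_request, rndis_query_request,
+	 * rndis_set_request, rndis_halt_request,
+	 * rndis_keepalive_request) begins with a uint32_t request_id,
+	 * so writing through set_request stores the id at the right
+	 * offset for all of them.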
+ */ + set = &rndis_mesg->msg.set_request; + set->request_id = atomic_fetchadd_int(&device->new_request_id, 1); + /* Increment to get the new value (call above returns old value) */ + set->request_id += 1; + + /* Add to the request list */ + mtx_lock_spin(&device->req_lock); + STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry); + mtx_unlock_spin(&device->req_lock); + + return (request); +} + +/* + * + */ +static inline void +hv_put_rndis_request(rndis_device *device, rndis_request *request) +{ + mtx_lock_spin(&device->req_lock); + /* Fixme: Has O(n) performance */ + /* + * XXXKYS: Use Doubly linked lists. + */ + STAILQ_REMOVE(&device->myrequest_list, request, rndis_request_, + mylist_entry); + mtx_unlock_spin(&device->req_lock); + + sema_destroy(&request->wait_sema); + free(request, M_DEVBUF); +} + +/* + * + */ +static int +hv_rf_send_request(rndis_device *device, rndis_request *request, + uint32_t message_type) +{ + int ret; + netvsc_packet *packet; + + /* Set up the packet to send it */ + packet = &request->pkt; + + packet->is_data_pkt = FALSE; + packet->tot_data_buf_len = request->request_msg.msg_len; + packet->page_buf_count = 1; + + packet->page_buffers[0].pfn = + hv_get_phys_addr(&request->request_msg) >> PAGE_SHIFT; + packet->page_buffers[0].length = request->request_msg.msg_len; + packet->page_buffers[0].offset = + (unsigned long)&request->request_msg & (PAGE_SIZE - 1); + + packet->compl.send.send_completion_context = request; /* packet */ + if (message_type != REMOTE_NDIS_HALT_MSG) { + packet->compl.send.on_send_completion = + hv_rf_on_send_request_completion; + } else { + packet->compl.send.on_send_completion = + hv_rf_on_send_request_halt_completion; + } + packet->compl.send.send_completion_tid = (unsigned long)device; + + ret = hv_nv_on_send(device->net_dev->dev, packet); + + return (ret); +} + +/* + * RNDIS filter receive response + */ +static void +hv_rf_receive_response(rndis_device *device, rndis_msg *response) +{ + rndis_request *request = NULL; + rndis_request *next_request; + boolean_t found = FALSE; + + mtx_lock_spin(&device->req_lock); + request = STAILQ_FIRST(&device->myrequest_list); + while (request != NULL) { + /* + * All request/response message contains request_id as the + * first field + */ + if (request->request_msg.msg.init_request.request_id == + response->msg.init_complete.request_id) { + found = TRUE; + break; + } + next_request = STAILQ_NEXT(request, mylist_entry); + request = next_request; + } + mtx_unlock_spin(&device->req_lock); + + if (found) { + if (response->msg_len <= sizeof(rndis_msg)) { + memcpy(&request->response_msg, response, + response->msg_len); + } else { + if (response->ndis_msg_type == REMOTE_NDIS_RESET_CMPLT) { + /* Does not have a request id field */ + request->response_msg.msg.reset_complete.status = + STATUS_BUFFER_OVERFLOW; + } else { + request->response_msg.msg.init_complete.status = + STATUS_BUFFER_OVERFLOW; + } + } + + sema_post(&request->wait_sema); + } +} + +/* + * RNDIS filter receive indicate status + */ +static void +hv_rf_receive_indicate_status(rndis_device *device, rndis_msg *response) +{ + rndis_indicate_status *indicate = &response->msg.indicate_status; + + if (indicate->status == RNDIS_STATUS_MEDIA_CONNECT) { + netvsc_linkstatus_callback(device->net_dev->dev, 1); + } else if (indicate->status == RNDIS_STATUS_MEDIA_DISCONNECT) { + netvsc_linkstatus_callback(device->net_dev->dev, 0); + } else { + /* TODO: */ + } +} + +/* + * RNDIS filter receive data + */ +static void +hv_rf_receive_data(rndis_device 
*device, rndis_msg *message, netvsc_packet *pkt) +{ + rndis_packet *rndis_pkt; + rndis_per_packet_info *rppi; + ndis_8021q_info *rppi_vlan_info; + uint32_t data_offset; + + rndis_pkt = &message->msg.packet; + + /* + * Fixme: Handle multiple rndis pkt msgs that may be enclosed in this + * netvsc packet (ie tot_data_buf_len != message_length) + */ + + /* Remove rndis header, then pass data packet up the stack */ + data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; + + /* L2 frame length, with L2 header, not including CRC */ + pkt->tot_data_buf_len = rndis_pkt->data_length; + pkt->page_buffers[0].offset += data_offset; + /* Buffer length now L2 frame length plus trailing junk */ + pkt->page_buffers[0].length -= data_offset; + + pkt->is_data_pkt = TRUE; + + pkt->vlan_tci = 0; + + /* + * Read the VLAN ID if supplied by the Hyper-V infrastructure. + * Let higher-level driver code decide if it wants to use it. + * Ignore CFI, priority for now as FreeBSD does not support these. + */ + if (rndis_pkt->per_pkt_info_offset != 0) { + /* rppi struct exists; compute its address */ + rppi = (rndis_per_packet_info *)((uint8_t *)rndis_pkt + + rndis_pkt->per_pkt_info_offset); + /* if VLAN ppi struct, get the VLAN ID */ + if (rppi->type == ieee_8021q_info) { + rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi + + rppi->per_packet_info_offset); + pkt->vlan_tci = rppi_vlan_info->u1.s1.vlan_id; + } + } + + netvsc_recv(device->net_dev->dev, pkt); +} + +/* + * RNDIS filter on receive + */ +int +hv_rf_on_receive(struct hv_device *device, netvsc_packet *pkt) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev; + rndis_device *rndis_dev; + rndis_msg rndis_mesg; + rndis_msg *rndis_hdr; + + /* Make sure the rndis device state is initialized */ + if (net_dev->extension == NULL) + return (ENODEV); + + rndis_dev = (rndis_device *)net_dev->extension; + if (rndis_dev->state == RNDIS_DEV_UNINITIALIZED) + return (EINVAL); + + /* Shift virtual page number to form virtual page address */ + rndis_hdr = (rndis_msg *)(pkt->page_buffers[0].pfn << PAGE_SHIFT); + + rndis_hdr = (void *)((unsigned long)rndis_hdr + + pkt->page_buffers[0].offset); + + /* + * Make sure we got a valid rndis message + * Fixme: There seems to be a bug in set completion msg where + * its msg_len is 16 bytes but the byte_count field in the + * xfer page range shows 52 bytes + */ +#if 0 + if (pkt->tot_data_buf_len != rndis_hdr->msg_len) { + DPRINT_ERR(NETVSC, "invalid rndis message? (expected %u " + "bytes got %u)... dropping this message!", + rndis_hdr->msg_len, pkt->tot_data_buf_len); + DPRINT_EXIT(NETVSC); + + return (-1); + } +#endif + + memcpy(&rndis_mesg, rndis_hdr, + (rndis_hdr->msg_len > sizeof(rndis_msg)) ? 
+ sizeof(rndis_msg) : rndis_hdr->msg_len); + + switch (rndis_mesg.ndis_msg_type) { + + /* data message */ + case REMOTE_NDIS_PACKET_MSG: + hv_rf_receive_data(rndis_dev, &rndis_mesg, pkt); + break; + /* completion messages */ + case REMOTE_NDIS_INITIALIZE_CMPLT: + case REMOTE_NDIS_QUERY_CMPLT: + case REMOTE_NDIS_SET_CMPLT: + case REMOTE_NDIS_RESET_CMPLT: + case REMOTE_NDIS_KEEPALIVE_CMPLT: + hv_rf_receive_response(rndis_dev, &rndis_mesg); + break; + /* notification message */ + case REMOTE_NDIS_INDICATE_STATUS_MSG: + hv_rf_receive_indicate_status(rndis_dev, &rndis_mesg); + break; + default: + printf("hv_rf_on_receive(): Unknown msg_type 0x%x\n", + rndis_mesg.ndis_msg_type); + break; + } + + return (0); +} + +/* + * RNDIS filter query device + */ +static int +hv_rf_query_device(rndis_device *device, uint32_t oid, void *result, + uint32_t *result_size) +{ + rndis_request *request; + uint32_t in_result_size = *result_size; + rndis_query_request *query; + rndis_query_complete *query_complete; + int ret = 0; + + *result_size = 0; + request = hv_rndis_request(device, REMOTE_NDIS_QUERY_MSG, + RNDIS_MESSAGE_SIZE(rndis_query_request)); + if (request == NULL) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis query */ + query = &request->request_msg.msg.query_request; + query->oid = oid; + query->info_buffer_offset = sizeof(rndis_query_request); + query->info_buffer_length = 0; + query->device_vc_handle = 0; + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_QUERY_MSG); + if (ret != 0) { + /* Fixme: printf added */ + printf("RNDISFILTER request failed to Send!\n"); + goto cleanup; + } + + sema_wait(&request->wait_sema); + + /* Copy the response back */ + query_complete = &request->response_msg.msg.query_complete; + + if (query_complete->info_buffer_length > in_result_size) { + ret = EINVAL; + goto cleanup; + } + + memcpy(result, (void *)((unsigned long)query_complete + + query_complete->info_buffer_offset), + query_complete->info_buffer_length); + + *result_size = query_complete->info_buffer_length; + +cleanup: + if (request != NULL) + hv_put_rndis_request(device, request); + + return (ret); +} + +/* + * RNDIS filter query device MAC address + */ +static inline int +hv_rf_query_device_mac(rndis_device *device) +{ + uint32_t size = HW_MACADDR_LEN; + + return (hv_rf_query_device(device, + RNDIS_OID_802_3_PERMANENT_ADDRESS, device->hw_mac_addr, &size)); +} + +/* + * RNDIS filter query device link status + */ +static inline int +hv_rf_query_device_link_status(rndis_device *device) +{ + uint32_t size = sizeof(uint32_t); + + return (hv_rf_query_device(device, + RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, &device->link_status, &size)); +} + +/* + * RNDIS filter set packet filter + * Sends an rndis request with the new filter, then waits for a response + * from the host. + * Returns zero on success, non-zero on failure. 
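+ *
+ * For example, hv_rf_open_device() below selects the normal receive
+ * mode with:
+ *
+ *	hv_rf_set_packet_filter(device, NDIS_PACKET_TYPE_BROADCAST |
+ *	    NDIS_PACKET_TYPE_ALL_MULTICAST | NDIS_PACKET_TYPE_DIRECTED);
+ *
+ * and passes NDIS_PACKET_TYPE_PROMISCUOUS instead when promiscuous
+ * mode is requested.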
+ */ +static int +hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter) +{ + rndis_request *request; + rndis_set_request *set; + rndis_set_complete *set_complete; + uint32_t status; + int ret; + + request = hv_rndis_request(device, REMOTE_NDIS_SET_MSG, + RNDIS_MESSAGE_SIZE(rndis_set_request) + sizeof(uint32_t)); + if (request == NULL) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis set */ + set = &request->request_msg.msg.set_request; + set->oid = RNDIS_OID_GEN_CURRENT_PACKET_FILTER; + set->info_buffer_length = sizeof(uint32_t); + set->info_buffer_offset = sizeof(rndis_set_request); + + memcpy((void *)((unsigned long)set + sizeof(rndis_set_request)), + &new_filter, sizeof(uint32_t)); + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG); + if (ret != 0) { + goto cleanup; + } + + /* + * Wait for the response from the host. Another thread will signal + * us when the response has arrived. In the failure case, + * sema_timedwait() returns a non-zero status after waiting 5 seconds. + */ + ret = sema_timedwait(&request->wait_sema, 500); + if (ret == 0) { + /* Response received, check status */ + set_complete = &request->response_msg.msg.set_complete; + status = set_complete->status; + if (status != RNDIS_STATUS_SUCCESS) { + /* Bad response status, return error */ + ret = -2; + } + } else { + /* + * We cannot deallocate the request since we may still + * receive a send completion for it. + */ + goto exit; + } + +cleanup: + if (request != NULL) { + hv_put_rndis_request(device, request); + } +exit: + return (ret); +} + +/* + * RNDIS filter init device + */ +static int +hv_rf_init_device(rndis_device *device) +{ + rndis_request *request; + rndis_initialize_request *init; + rndis_initialize_complete *init_complete; + uint32_t status; + int ret; + + request = hv_rndis_request(device, REMOTE_NDIS_INITIALIZE_MSG, + RNDIS_MESSAGE_SIZE(rndis_initialize_request)); + if (!request) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis set */ + init = &request->request_msg.msg.init_request; + init->major_version = RNDIS_MAJOR_VERSION; + init->minor_version = RNDIS_MINOR_VERSION; + /* + * Per the RNDIS document, this should be set to the max MTU + * plus the header size. However, 2048 works fine, so leaving + * it as is. 
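+	 * (As a sanity check: a standard 1500-byte MTU plus the
+	 * 14-byte Ethernet header and the RNDIS message header comes
+	 * to well under 2048 bytes; only jumbo frames would need more.)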
+ */ + init->max_xfer_size = 2048; + + device->state = RNDIS_DEV_INITIALIZING; + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_INITIALIZE_MSG); + if (ret != 0) { + device->state = RNDIS_DEV_UNINITIALIZED; + goto cleanup; + } + + sema_wait(&request->wait_sema); + + init_complete = &request->response_msg.msg.init_complete; + status = init_complete->status; + if (status == RNDIS_STATUS_SUCCESS) { + device->state = RNDIS_DEV_INITIALIZED; + ret = 0; + } else { + device->state = RNDIS_DEV_UNINITIALIZED; + ret = -1; + } + +cleanup: + if (request) { + hv_put_rndis_request(device, request); + } + + return (ret); +} + +#define HALT_COMPLETION_WAIT_COUNT 25 + +/* + * RNDIS filter halt device + */ +static int +hv_rf_halt_device(rndis_device *device) +{ + rndis_request *request; + rndis_halt_request *halt; + int i, ret; + + /* Attempt to do a rndis device halt */ + request = hv_rndis_request(device, REMOTE_NDIS_HALT_MSG, + RNDIS_MESSAGE_SIZE(rndis_halt_request)); + if (request == NULL) { + return (-1); + } + + /* initialize "poor man's semaphore" */ + request->halt_complete_flag = 0; + + /* Set up the rndis set */ + halt = &request->request_msg.msg.halt_request; + halt->request_id = atomic_fetchadd_int(&device->new_request_id, 1); + /* Increment to get the new value (call above returns old value) */ + halt->request_id += 1; + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_HALT_MSG); + if (ret != 0) { + return (-1); + } + + /* + * Wait for halt response from halt callback. We must wait for + * the transaction response before freeing the request and other + * resources. + */ + for (i=HALT_COMPLETION_WAIT_COUNT; i > 0; i--) { + if (request->halt_complete_flag != 0) { + break; + } + DELAY(400); + } + if (i == 0) { + return (-1); + } + + device->state = RNDIS_DEV_UNINITIALIZED; + + if (request != NULL) { + hv_put_rndis_request(device, request); + } + + return (0); +} + +/* + * RNDIS filter open device + */ +static int +hv_rf_open_device(rndis_device *device) +{ + int ret; + + if (device->state != RNDIS_DEV_INITIALIZED) { + return (0); + } + + if (hv_promisc_mode != 1) { + ret = hv_rf_set_packet_filter(device, + NDIS_PACKET_TYPE_BROADCAST | + NDIS_PACKET_TYPE_ALL_MULTICAST | + NDIS_PACKET_TYPE_DIRECTED); + } else { + ret = hv_rf_set_packet_filter(device, + NDIS_PACKET_TYPE_PROMISCUOUS); + } + + if (ret == 0) { + device->state = RNDIS_DEV_DATAINITIALIZED; + } + + return (ret); +} + +/* + * RNDIS filter close device + */ +static int +hv_rf_close_device(rndis_device *device) +{ + int ret; + + if (device->state != RNDIS_DEV_DATAINITIALIZED) { + return (0); + } + + ret = hv_rf_set_packet_filter(device, 0); + if (ret == 0) { + device->state = RNDIS_DEV_INITIALIZED; + } + + return (ret); +} + +/* + * RNDIS filter on device add + */ +int +hv_rf_on_device_add(struct hv_device *device, void *additl_info) +{ + int ret; + netvsc_dev *net_dev; + rndis_device *rndis_dev; + netvsc_device_info *dev_info = (netvsc_device_info *)additl_info; + + rndis_dev = hv_get_rndis_device(); + if (rndis_dev == NULL) { + return (ENOMEM); + } + + /* + * Let the inner driver handle this first to create the netvsc channel + * NOTE! Once the channel is created, we may get a receive callback + * (hv_rf_on_receive()) before this call is completed. + * Note: Earlier code used a function pointer here. 
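+	 *
+	 * The remainder of this function performs the RNDIS bring-up
+	 * over that channel: REMOTE_NDIS_INITIALIZE_MSG, then a query
+	 * of RNDIS_OID_802_3_PERMANENT_ADDRESS for the MAC address,
+	 * then a query of RNDIS_OID_GEN_MEDIA_CONNECT_STATUS for the
+	 * link state.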
+ */ + net_dev = hv_nv_on_device_add(device, additl_info); + if (!net_dev) { + hv_put_rndis_device(rndis_dev); + + return (ENOMEM); + } + + /* + * Initialize the rndis device + */ + + net_dev->extension = rndis_dev; + rndis_dev->net_dev = net_dev; + + /* Send the rndis initialization message */ + ret = hv_rf_init_device(rndis_dev); + if (ret != 0) { + /* + * TODO: If rndis init failed, we will need to shut down + * the channel + */ + } + + /* Get the mac address */ + ret = hv_rf_query_device_mac(rndis_dev); + if (ret != 0) { + /* TODO: shut down rndis device and the channel */ + } + + memcpy(dev_info->mac_addr, rndis_dev->hw_mac_addr, HW_MACADDR_LEN); + + hv_rf_query_device_link_status(rndis_dev); + + dev_info->link_state = rndis_dev->link_status; + + return (ret); +} + +/* + * RNDIS filter on device remove + */ +int +hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev; + rndis_device *rndis_dev = (rndis_device *)net_dev->extension; + int ret; + + /* Halt and release the rndis device */ + ret = hv_rf_halt_device(rndis_dev); + + hv_put_rndis_device(rndis_dev); + net_dev->extension = NULL; + + /* Pass control to inner driver to remove the device */ + ret |= hv_nv_on_device_remove(device, destroy_channel); + + return (ret); +} + +/* + * RNDIS filter on open + */ +int +hv_rf_on_open(struct hv_device *device) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev; + + return (hv_rf_open_device((rndis_device *)net_dev->extension)); +} + +/* + * RNDIS filter on close + */ +int +hv_rf_on_close(struct hv_device *device) +{ + hn_softc_t *sc = device_get_softc(device->device); + netvsc_dev *net_dev = sc->net_dev; + + return (hv_rf_close_device((rndis_device *)net_dev->extension)); +} + +/* + * RNDIS filter on send + */ +int +hv_rf_on_send(struct hv_device *device, netvsc_packet *pkt) +{ + rndis_filter_packet *filter_pkt; + rndis_msg *rndis_mesg; + rndis_packet *rndis_pkt; + rndis_per_packet_info *rppi; + ndis_8021q_info *rppi_vlan_info; + uint32_t rndis_msg_size; + int ret = 0; + + /* Add the rndis header */ + filter_pkt = (rndis_filter_packet *)pkt->extension; + + memset(filter_pkt, 0, sizeof(rndis_filter_packet)); + + rndis_mesg = &filter_pkt->message; + rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); + + if (pkt->vlan_tci != 0) { + rndis_msg_size += sizeof(rndis_per_packet_info) + + sizeof(ndis_8021q_info); + } + + rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; + rndis_mesg->msg_len = pkt->tot_data_buf_len + rndis_msg_size; + + rndis_pkt = &rndis_mesg->msg.packet; + rndis_pkt->data_offset = sizeof(rndis_packet); + rndis_pkt->data_length = pkt->tot_data_buf_len; + + pkt->is_data_pkt = TRUE; + pkt->page_buffers[0].pfn = hv_get_phys_addr(rndis_mesg) >> PAGE_SHIFT; + pkt->page_buffers[0].offset = + (unsigned long)rndis_mesg & (PAGE_SIZE - 1); + pkt->page_buffers[0].length = rndis_msg_size; + + /* Save the packet context */ + filter_pkt->completion_context = + pkt->compl.send.send_completion_context; + + /* Use ours */ + pkt->compl.send.on_send_completion = hv_rf_on_send_completion; + pkt->compl.send.send_completion_context = filter_pkt; + + /* + * If there is a VLAN tag, we need to set up some additional + * fields so the Hyper-V infrastructure will stuff the VLAN tag + * into the frame. 
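+	 *
+	 * The first page buffer then carries, in order:
+	 *
+	 *	rndis_msg header | rndis_packet | rndis_per_packet_info |
+	 *	ndis_8021q_info  | frame data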
+ */ + if (pkt->vlan_tci != 0) { + /* Move data offset past end of rppi + VLAN structs */ + rndis_pkt->data_offset += sizeof(rndis_per_packet_info) + + sizeof(ndis_8021q_info); + + /* must be set when we have rppi, VLAN info */ + rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); + rndis_pkt->per_pkt_info_length = sizeof(rndis_per_packet_info) + + sizeof(ndis_8021q_info); + + /* rppi immediately follows rndis_pkt */ + rppi = (rndis_per_packet_info *)(rndis_pkt + 1); + rppi->size = sizeof(rndis_per_packet_info) + + sizeof(ndis_8021q_info); + rppi->type = ieee_8021q_info; + rppi->per_packet_info_offset = sizeof(rndis_per_packet_info); + + /* VLAN info immediately follows rppi struct */ + rppi_vlan_info = (ndis_8021q_info *)(rppi + 1); + /* FreeBSD does not support CFI or priority */ + rppi_vlan_info->u1.s1.vlan_id = pkt->vlan_tci & 0xfff; + } + + /* + * Invoke netvsc send. If return status is bad, the caller now + * resets the context pointers before retrying. + */ + ret = hv_nv_on_send(device, pkt); + + return (ret); +} + +/* + * RNDIS filter on send completion callback + */ +static void +hv_rf_on_send_completion(void *context) +{ + rndis_filter_packet *filter_pkt = (rndis_filter_packet *)context; + + /* Pass it back to the original handler */ + netvsc_xmit_completion(filter_pkt->completion_context); +} + +/* + * RNDIS filter on send request completion callback + */ +static void +hv_rf_on_send_request_completion(void *context) +{ +} + +/* + * RNDIS filter on send request (halt only) completion callback + */ +static void +hv_rf_on_send_request_halt_completion(void *context) +{ + rndis_request *request = context; + + /* + * Notify hv_rf_halt_device() about halt completion. + * The halt code must wait for completion before freeing + * the transaction resources. + */ + request->halt_complete_flag = 1; +} + diff --git a/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.h new file mode 100644 index 0000000..edbb347 --- /dev/null +++ b/sys/contrib/dev/hyperv/netvsc/hv_rndis_filter.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __HV_RNDIS_FILTER_H__ +#define __HV_RNDIS_FILTER_H__ + + +/* + * Defines + */ + +/* Destroy or preserve channel on filter/netvsc teardown */ +#define HV_RF_NV_DESTROY_CHANNEL TRUE +#define HV_RF_NV_RETAIN_CHANNEL FALSE + +/* + * Number of page buffers to reserve for the RNDIS filter packet in the + * transmitted message. + */ +#define HV_RF_NUM_TX_RESERVED_PAGE_BUFS 1 + + +/* + * Data types + */ + +typedef enum { + RNDIS_DEV_UNINITIALIZED = 0, + RNDIS_DEV_INITIALIZING, + RNDIS_DEV_INITIALIZED, + RNDIS_DEV_DATAINITIALIZED, +} rndis_device_state; + +typedef struct rndis_request_ { + STAILQ_ENTRY(rndis_request_) mylist_entry; + struct sema wait_sema; + + /* + * Fixme: We assumed a fixed size response here. If we do ever + * need to handle a bigger response, we can either define a max + * response message or add a response buffer variable above this field + */ + rndis_msg response_msg; + + /* Simplify allocation by having a netvsc packet inline */ + netvsc_packet pkt; + hv_vmbus_page_buffer buffer; + /* Fixme: We assumed a fixed size request here. */ + rndis_msg request_msg; + /* Fixme: Poor man's semaphore. */ + uint32_t halt_complete_flag; +} rndis_request; + +typedef struct rndis_device_ { + netvsc_dev *net_dev; + + rndis_device_state state; + uint32_t link_status; + uint32_t new_request_id; + + struct mtx req_lock; + + STAILQ_HEAD(RQ, rndis_request_) myrequest_list; + + uint8_t hw_mac_addr[HW_MACADDR_LEN]; +} rndis_device; + +typedef struct rndis_filter_packet_ { + void *completion_context; + /* No longer used */ + pfn_on_send_rx_completion on_completion; + + rndis_msg message; +} rndis_filter_packet; + + +/* + * Externs + */ + +extern int hv_rf_on_receive(struct hv_device *device, netvsc_packet *pkt); +extern int hv_rf_on_device_add(struct hv_device *device, void *additl_info); +extern int hv_rf_on_device_remove(struct hv_device *device, + boolean_t destroy_channel); +extern int hv_rf_on_open(struct hv_device *device); +extern int hv_rf_on_close(struct hv_device *device); +extern int hv_rf_on_send(struct hv_device *device, netvsc_packet *pkt); + + +#endif /* __HV_RNDIS_FILTER_H__ */ + diff --git a/sys/contrib/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/contrib/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c new file mode 100644 index 0000000..657cedf --- /dev/null +++ b/sys/contrib/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -0,0 +1,1470 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * StorVSC driver for Hyper-V. This driver presents a SCSI HBA interface
+ * to the Common Access Method (CAM) layer. CAM control blocks (CCBs) are
+ * converted into VSCSI protocol messages which are delivered to the parent
+ * partition StorVSP driver over the Hyper-V VMBUS.
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/bus.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <sys/sema.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_periph.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_xpt_internal.h>
+#include <cam/cam_debug.h>
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_message.h>
+
+
+#include <dev/hyperv/include/hyperv.h>
+#include "hv_vstorage.h"
+
+#define STORVSC_RINGBUFFER_SIZE		(20*PAGE_SIZE)
+#define STORVSC_MAX_LUNS_PER_TARGET	(64)
+#define STORVSC_MAX_IO_REQUESTS		(STORVSC_MAX_LUNS_PER_TARGET * 2)
+#define BLKVSC_MAX_IDE_DISKS_PER_TARGET	(1)
+#define BLKVSC_MAX_IO_REQUESTS		STORVSC_MAX_IO_REQUESTS
+#define STORVSC_MAX_TARGETS		(1)
+
+struct storvsc_softc;
+
+enum storvsc_request_type {
+	WRITE_TYPE,
+	READ_TYPE,
+	UNKNOWN_TYPE
+};
+
+struct hv_storvsc_request {
+	LIST_ENTRY(hv_storvsc_request) link;
+	struct vstor_packet vstor_packet;
+	hv_vmbus_multipage_buffer data_buf;
+	void *sense_data;
+	uint8_t sense_info_len;
+	uint8_t retries;
+	union ccb *ccb;
+	struct storvsc_softc *softc;
+	struct callout callout;
+	struct sema synch_sema; /* Synchronize the request/response if needed */
+};
+
+struct storvsc_softc {
+	struct hv_device *hs_dev;
+	LIST_HEAD(, hv_storvsc_request) hs_free_list;
+	struct mtx hs_lock;
+	struct storvsc_driver_props *hs_drv_props;
+	int hs_unit;
+	uint32_t hs_frozen;
+	struct cam_sim *hs_sim;
+	struct cam_path *hs_path;
+	uint32_t hs_num_out_reqs;
+	boolean_t hs_destroy;
+	boolean_t hs_drain_notify;
+	struct sema hs_drain_sema;
+	struct hv_storvsc_request hs_init_req;
+	struct hv_storvsc_request hs_reset_req;
+};
+
+
+/**
+ * HyperV storvsc timeout testing cases:
+ * a. IO returned after first timeout;
+ * b. IO returned after second timeout and queue freeze;
+ * c. IO returned while timer handler is running.
+ * The first can be tested by "sg_senddiag -vv /dev/daX",
+ * and the second and third can be done by
+ * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX".
+ */
+#define HVS_TIMEOUT_TEST 0
+
+/*
+ * Bus/adapter reset functionality on the Hyper-V host is
+ * buggy, so it is disabled until it can be further tested.
+ */
+#define HVS_HOST_RESET 0
+
+struct storvsc_driver_props {
+	char *drv_name;
+	char *drv_desc;
+	uint8_t drv_max_luns_per_target;
+	uint8_t drv_max_ios_per_target;
+	uint32_t drv_ringbuffer_size;
+};
+
+enum hv_storage_type {
+	DRIVER_BLKVSC,
+	DRIVER_STORVSC,
+	DRIVER_UNKNOWN
+};
+
+#define HS_MAX_ADAPTERS 10
+
+/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */
+static const hv_guid gStorVscDeviceType = {
+	.data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d,
+		 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f}
+};
+
+/* {32412632-86cb-44a2-9b5c-50d1417354f5} */
+static const hv_guid gBlkVscDeviceType = {
+	.data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44,
+		 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5}
+};
+
+static struct storvsc_driver_props g_drv_props_table[] = {
+	{"blkvsc", "Hyper-V IDE Storage Interface",
+	 BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS,
+	 STORVSC_RINGBUFFER_SIZE},
+	{"storvsc", "Hyper-V SCSI Storage Interface",
+	 STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS,
+	 STORVSC_RINGBUFFER_SIZE}
+};
+
+static struct storvsc_softc *hs_softc[HS_MAX_ADAPTERS];
+
+/* static functions */
+static int storvsc_probe(device_t dev);
+static int storvsc_attach(device_t dev);
+static int storvsc_detach(device_t dev);
+static void storvsc_poll(struct cam_sim *sim);
+static void storvsc_action(struct cam_sim *sim, union ccb *ccb);
+static void scan_for_luns(struct storvsc_softc *sc);
+static void create_storvsc_request(union ccb *ccb,
+	struct hv_storvsc_request *reqp);
+static void storvsc_free_request(struct storvsc_softc *sc,
+	struct hv_storvsc_request *reqp);
+static enum hv_storage_type storvsc_get_storage_type(device_t dev);
+static void hv_storvsc_on_channel_callback(void *context);
+static void hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
+					struct vstor_packet *vstor_packet,
+					struct hv_storvsc_request *request);
+static int hv_storvsc_connect_vsp(struct hv_device *device);
+static void storvsc_io_done(struct hv_storvsc_request *reqp);
+
+static device_method_t storvsc_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		storvsc_probe),
+	DEVMETHOD(device_attach,	storvsc_attach),
+	DEVMETHOD(device_detach,	storvsc_detach),
+	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
+	{ 0, 0 }
+};
+
+static driver_t storvsc_driver = {
+	"storvsc", storvsc_methods, sizeof(struct storvsc_softc),
+};
+
+static devclass_t storvsc_devclass;
+DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0);
+MODULE_VERSION(storvsc, 1);
+MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);
+
+extern int ata_disk_enable;
+
+/**
+ * The host is capable of sending messages to us that are
+ * completely unsolicited. So, we need to address the race
+ * condition where we may be in the process of unloading the
+ * driver when the host may send us an unsolicited message.
+ * We address this issue by implementing a sequentially
+ * consistent protocol:
+ *
+ * 1. The channel callback is invoked while holding the channel lock,
+ *    and an unloading driver will reset the channel callback under
+ *    the protection of this channel lock.
+ *
+ * 2. To ensure a bounded wait time for unloading a driver, we don't
+ *    permit outgoing traffic once the device is marked as being
+ *    destroyed.
+ *
+ * 3. Once the device is marked as being destroyed, we only
+ *    permit incoming traffic to properly account for
+ *    packets already sent out.
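+ *
+ * get_stor_device() below implements rules 2 and 3: outbound lookups
+ * fail as soon as hs_destroy is set, while inbound lookups continue
+ * to succeed until hs_num_out_reqs drains to zero.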
+ */ +static inline struct storvsc_softc * +get_stor_device(struct hv_device *device, + boolean_t outbound) +{ + struct storvsc_softc *sc; + + sc = device_get_softc(device->device); + if (sc == NULL) { + return NULL; + } + + if (outbound) { + /* + * Here we permit outgoing I/O only + * if the device is not being destroyed. + */ + + if (sc->hs_destroy) { + sc = NULL; + } + } else { + /* + * inbound case; if being destroyed + * only permit to account for + * messages already sent out. + */ + if (sc->hs_destroy && (sc->hs_num_out_reqs == 0)) { + sc = NULL; + } + } + return sc; +} + +/** + * @brief initialize channel connection to parent partition + * + * @param dev a Hyper-V device pointer + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_channel_init(struct hv_device *dev) +{ + int ret = 0; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + struct storvsc_softc *sc; + + sc = get_stor_device(dev, TRUE); + if (sc == NULL) { + return ENODEV; + } + + request = &sc->hs_init_req; + memset(request, 0, sizeof(struct hv_storvsc_request)); + vstor_packet = &request->vstor_packet; + request->softc = sc; + + /** + * Initiate the vsc/vsp initialization protocol on the open channel + */ + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + if (ret != 0) { + goto cleanup; + } + + ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + + if (ret != 0) { + goto cleanup; + } + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + + /* reuse the packet for version range supported */ + + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + vstor_packet->version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; + + /* revision is only significant for Windows guests */ + vstor_packet->version.revision = 0; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + if (ret != 0) { + goto cleanup; + } + + ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + + if (ret) { + goto cleanup; + } + + /* TODO: Check returned version */ + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + + /** + * Query channel properties + */ + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + if ( ret != 0) { + goto cleanup; + } + + ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + + if (ret != 0) { + goto cleanup; + } + + /* TODO: Check returned version */ + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + + 
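+	/*
+	 * Finally, tell the host that initialization is complete; after
+	 * this round trip the channel is ready to carry SCSI requests.
+	 */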
memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + if (ret != 0) { + goto cleanup; + } + + ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + + if (ret != 0) { + goto cleanup; + } + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + +cleanup: + sema_destroy(&request->synch_sema); + return (ret); +} + +/** + * @brief Open channel connection to paraent partition StorVSP driver + * + * Open and initialize channel connection to parent partition StorVSP driver. + * + * @param pointer to a Hyper-V device + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_connect_vsp(struct hv_device *dev) +{ + int ret = 0; + struct vmstor_chan_props props; + struct storvsc_softc *sc; + + sc = device_get_softc(dev->device); + + memset(&props, 0, sizeof(struct vmstor_chan_props)); + + /* + * Open the channel + */ + + ret = hv_vmbus_channel_open( + dev->channel, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, + dev); + + + if (ret != 0) { + return ret; + } + + ret = hv_storvsc_channel_init(dev); + + return (ret); +} + +#if HVS_HOST_RESET +static int +hv_storvsc_host_reset(struct hv_device *dev) +{ + int ret = 0; + struct storvsc_softc *sc; + + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + + sc = get_stor_device(dev, TRUE); + if (sc == NULL) { + return ENODEV; + } + + request = &sc->hs_reset_req; + request->softc = sc; + vstor_packet = &request->vstor_packet; + + sema_init(&request->synch_sema, 0, "stor synch sema"); + + vstor_packet->operation = VSTOR_OPERATION_RESETBUS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = hv_vmbus_channel_send_packet(dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)&sc->hs_reset_req, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + if (ret != 0) { + goto cleanup; + } + + ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + + if (ret) { + goto cleanup; + } + + + /* + * At this point, all outstanding requests in the adapter + * should have been flushed out and return to us + */ + +cleanup: + sema_destroy(&request->synch_sema); + return (ret); +} +#endif /* HVS_HOST_RESET */ + +/** + * @brief Function to initiate an I/O request + * + * @param device Hyper-V device pointer + * @param request pointer to a request structure + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_io_request(struct hv_device *device, + struct hv_storvsc_request *request) +{ + struct storvsc_softc *sc; + struct vstor_packet *vstor_packet = &request->vstor_packet; + int ret = 0; + + sc = get_stor_device(device, TRUE); + + if (sc == NULL) { + return ENODEV; + } + + vstor_packet->flags |= REQUEST_COMPLETION_FLAG; + + vstor_packet->vm_srb.length = sizeof(struct vmscsi_req); + + vstor_packet->vm_srb.sense_info_len = SENSE_BUFFER_SIZE; + + vstor_packet->vm_srb.transfer_len = request->data_buf.length; + + vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; + + + mtx_unlock(&request->softc->hs_lock); + if 
(request->data_buf.length) { + ret = hv_vmbus_channel_send_packet_multipagebuffer( + device->channel, + &request->data_buf, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request); + + } else { + ret = hv_vmbus_channel_send_packet( + device->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + } + mtx_lock(&request->softc->hs_lock); + + if (ret != 0) { + printf("Unable to send packet %p ret %d", vstor_packet, ret); + } else { + atomic_add_int(&sc->hs_num_out_reqs, 1); + } + + return (ret); +} + + +/** + * Process IO_COMPLETION_OPERATION and ready + * the result to be completed for upper layer + * processing by the CAM layer. + */ +static void +hv_storvsc_on_iocompletion(struct storvsc_softc *sc, + struct vstor_packet *vstor_packet, + struct hv_storvsc_request *request) +{ + struct vmscsi_req *vm_srb; + + vm_srb = &vstor_packet->vm_srb; + + request->sense_info_len = 0; + if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && + (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) { + /* Autosense data available */ + + KASSERT(vm_srb->sense_info_len <= request->sense_info_len, + ("vm_srb->sense_info_len <= " + "request->sense_info_len")); + + memcpy(request->sense_data, vm_srb->sense_data, + vm_srb->sense_info_len); + + request->sense_info_len = vm_srb->sense_info_len; + } + + /* Complete request by passing to the CAM layer */ + storvsc_io_done(request); + atomic_subtract_int(&sc->hs_num_out_reqs, 1); + if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) { + sema_post(&sc->hs_drain_sema); + } +} + +static void +hv_storvsc_on_channel_callback(void *context) +{ + int ret = 0; + struct hv_device *device = (struct hv_device *)context; + struct storvsc_softc *sc; + uint32_t bytes_recvd; + uint64_t request_id; + uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)]; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + + sc = get_stor_device(device, FALSE); + if (sc == NULL) { + return; + } + + KASSERT(device, ("device")); + + ret = hv_vmbus_channel_recv_packet( + device->channel, + packet, + roundup2(sizeof(struct vstor_packet), 8), + &bytes_recvd, + &request_id); + + while ((ret == 0) && (bytes_recvd > 0)) { + request = (struct hv_storvsc_request *)request_id; + KASSERT(request, ("request")); + + if ((request == &sc->hs_init_req) || + (request == &sc->hs_reset_req)) { + memcpy(&request->vstor_packet, packet, + sizeof(struct vstor_packet)); + sema_post(&request->synch_sema); + } else { + vstor_packet = (struct vstor_packet *)packet; + switch(vstor_packet->operation) { + case VSTOR_OPERATION_COMPLETEIO: + hv_storvsc_on_iocompletion(sc, + vstor_packet, request); + break; + case VSTOR_OPERATION_REMOVEDEVICE: + /* TODO: implement */ + break; + default: + break; + } + } + ret = hv_vmbus_channel_recv_packet( + device->channel, + packet, + roundup2(sizeof(struct vstor_packet), 8), + &bytes_recvd, + &request_id); + } +} + +/** + * @brief callback function for completing a single LUN scan + * + * This function is responsible for waking up the executer of + * the scan LUN CCB action (cam_periph_runccb.) cam_periph_ccbwait + * sleeps on the mutex being signaled. 
+ *
+ * @param periph a pointer to a CAM peripheral
+ * @param done_ccb pointer to CAM control block
+ */
+static void
+storvsc_xptdone(struct cam_periph *periph, union ccb *done_ccb)
+{
+ wakeup(&done_ccb->ccb_h.cbfcnp);
+}
+
+/**
+ * @brief scan for attached logical unit numbers (LUNs)
+ *
+ * In Hyper-V there is no backend changed device operation which
+ * presents FreeBSD with a list of devices to connect. The result is
+ * that we have to scan for a list of LUNs in the storvsc_attach()
+ * routine. There is only one SCSI target, so scan for the maximum
+ * number of LUNs.
+ *
+ * @param sc pointer to a softc
+ */
+static void
+scan_for_luns(struct storvsc_softc *sc)
+{
+ union ccb *request_ccb;
+ struct cam_path *path = sc->hs_path;
+ struct cam_path *my_path = NULL;
+ cam_status status;
+ int lun_nb = 0;
+ int error;
+
+ request_ccb = malloc(sizeof(union ccb), M_CAMXPT, M_WAITOK);
+ my_path = malloc(sizeof(*my_path), M_CAMXPT, M_WAITOK);
+
+ mtx_lock(&sc->hs_lock);
+ do {
+ /*
+ * Scan the next LUN. Reuse path and ccb structs.
+ */
+ bzero(my_path, sizeof(*my_path));
+ bzero(request_ccb, sizeof(*request_ccb));
+ status = xpt_compile_path(my_path,
+ xpt_periph,
+ path->bus->path_id,
+ 0,
+ lun_nb);
+
+ if (status != CAM_REQ_CMP) {
+ mtx_unlock(&sc->hs_lock);
+ xpt_print(path, "scan_for_luns: can't compile"
+ " path, 0x%p, can't continue\n",
+ sc->hs_path);
+ free(request_ccb, M_CAMXPT);
+ free(my_path, M_CAMXPT);
+ return;
+ }
+
+ xpt_setup_ccb(&request_ccb->ccb_h, my_path, 5);
+ request_ccb->ccb_h.func_code = XPT_SCAN_LUN;
+ request_ccb->ccb_h.cbfcnp = storvsc_xptdone;
+ request_ccb->crcn.flags = CAM_FLAG_NONE;
+
+ error = cam_periph_runccb(request_ccb, NULL,
+ CAM_FLAG_NONE, 0, NULL);
+ KASSERT(error == 0, ("cam_periph_runccb failed %d\n", error));
+ xpt_release_path(my_path);
+ } while (++lun_nb < sc->hs_drv_props->drv_max_luns_per_target);
+ mtx_unlock(&sc->hs_lock);
+ free(request_ccb, M_CAMXPT);
+ free(my_path, M_CAMXPT);
+}
+
+/**
+ * @brief StorVSC probe function
+ *
+ * Device probe function. Returns 0 if the input device is a StorVSC
+ * device; otherwise, ENXIO is returned. If the input device is a
+ * BlkVSC (paravirtual IDE) device and that support is disabled in
+ * favor of the emulated ATA/IDE device, ENXIO is returned as well.
+ *
+ * @param dev a device
+ * @returns 0 on success, ENXIO if not a matching StorVSC device
+ */
+static int
+storvsc_probe(device_t dev)
+{
+ int ret = ENXIO;
+
+ switch (storvsc_get_storage_type(dev)) {
+ case DRIVER_BLKVSC:
+ if (ata_disk_enable == 0) {
+ ret = 0;
+ }
+ break;
+ case DRIVER_STORVSC:
+ ret = 0;
+ break;
+ default:
+ ret = ENXIO;
+ }
+ return (ret);
+}
+
+/**
+ * @brief StorVSC attach function
+ *
+ * Function responsible for allocating per-device structures,
+ * setting up CAM interfaces and scanning for available LUNs to
+ * be used for SCSI device peripherals.
+ *
+ * @param dev a device
+ * @returns 0 on success or an error on failure
+ */
+static int
+storvsc_attach(device_t dev)
+{
+ struct hv_device *hv_dev = vmbus_get_devctx(dev);
+ enum hv_storage_type stor_type;
+ struct storvsc_softc *sc;
+ struct cam_devq *devq;
+ int ret, i;
+ struct hv_storvsc_request *reqp;
+ struct root_hold_token *root_mount_token = NULL;
+
+ /*
+ * We need to serialize storvsc attach calls.
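+ * Holding a root mount token keeps the root file system mount from
+ * proceeding until the token is released, so the LUN scan below can
+ * finish before a potential boot disk is first needed.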
+ */
+ root_mount_token = root_mount_hold("storvsc");
+
+ sc = device_get_softc(dev);
+ if (sc == NULL) {
+ ret = ENOMEM;
+ goto cleanup;
+ }
+
+ stor_type = storvsc_get_storage_type(dev);
+
+ if (stor_type == DRIVER_UNKNOWN) {
+ ret = ENODEV;
+ goto cleanup;
+ }
+
+ bzero(sc, sizeof(struct storvsc_softc));
+
+ /* fill in driver specific properties */
+ sc->hs_drv_props = &g_drv_props_table[stor_type];
+
+ /* fill in device specific properties */
+ sc->hs_unit = device_get_unit(dev);
+ sc->hs_dev = hv_dev;
+ device_set_desc(dev, g_drv_props_table[stor_type].drv_desc);
+
+ LIST_INIT(&sc->hs_free_list);
+ mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF);
+
+ for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) {
+ reqp = malloc(sizeof(struct hv_storvsc_request),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ reqp->softc = sc;
+
+ LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+ }
+
+ sc->hs_destroy = FALSE;
+ sc->hs_drain_notify = FALSE;
+ sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");
+
+ ret = hv_storvsc_connect_vsp(hv_dev);
+ if (ret != 0) {
+ goto cleanup;
+ }
+
+ /*
+ * Create the device queue.
+ * Hyper-V maps each target to one SCSI HBA
+ */
+ devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target);
+ if (devq == NULL) {
+ device_printf(dev, "Failed to alloc device queue\n");
+ ret = ENOMEM;
+ goto cleanup;
+ }
+
+ sc->hs_sim = cam_sim_alloc(storvsc_action,
+ storvsc_poll,
+ sc->hs_drv_props->drv_name,
+ sc,
+ sc->hs_unit,
+ &sc->hs_lock, 1,
+ sc->hs_drv_props->drv_max_ios_per_target,
+ devq);
+
+ if (sc->hs_sim == NULL) {
+ device_printf(dev, "Failed to alloc sim\n");
+ cam_simq_free(devq);
+ ret = ENOMEM;
+ goto cleanup;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ /* bus_id is set to 0, need to get it from VMBUS channel query? */
+ if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) {
+ cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+ mtx_unlock(&sc->hs_lock);
+ device_printf(dev, "Unable to register SCSI bus\n");
+ ret = ENXIO;
+ goto cleanup;
+ }
+
+ if (xpt_create_path(&sc->hs_path, /*periph*/NULL,
+ cam_sim_path(sc->hs_sim),
+ CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+ xpt_bus_deregister(cam_sim_path(sc->hs_sim));
+ cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+ mtx_unlock(&sc->hs_lock);
+ device_printf(dev, "Unable to create path\n");
+ ret = ENXIO;
+ goto cleanup;
+ }
+
+ mtx_unlock(&sc->hs_lock);
+ scan_for_luns(sc);
+ /* Find a free slot; check the index before dereferencing. */
+ for (i = 0; (i < HS_MAX_ADAPTERS) && (hs_softc[i] != NULL); i++);
+ KASSERT(i < HS_MAX_ADAPTERS, ("storvsc_attach: hs_softc full\n"));
+ hs_softc[i] = sc;
+
+ root_mount_rel(root_mount_token);
+ return (0);
+
+cleanup:
+ root_mount_rel(root_mount_token);
+ while (!LIST_EMPTY(&sc->hs_free_list)) {
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+ free(reqp, M_DEVBUF);
+ }
+ return (ret);
+}
+
+/**
+ * @brief StorVSC device detach function
+ *
+ * This function is responsible for safely detaching a
+ * StorVSC device. This includes waiting for inbound responses
+ * to complete and freeing associated per-device structures.
+ *
+ * @param dev a device
+ * @returns 0 on success
+ */
+static int
+storvsc_detach(device_t dev)
+{
+ struct storvsc_softc *sc = device_get_softc(dev);
+ struct hv_storvsc_request *reqp = NULL;
+ struct hv_device *hv_device = vmbus_get_devctx(dev);
+
+ mtx_lock(&hv_device->channel->inbound_lock);
+ sc->hs_destroy = TRUE;
+ mtx_unlock(&hv_device->channel->inbound_lock);
+
+ /*
+ * At this point, all outbound traffic should be disabled. We
+ * only allow inbound traffic (responses) to proceed so that
+ * outstanding requests can be completed.
+ */
+
+ sc->hs_drain_notify = TRUE;
+ sema_wait(&sc->hs_drain_sema);
+ sc->hs_drain_notify = FALSE;
+
+ /*
+ * Since we have already drained, we don't need to busy wait.
+ * The call to close the channel will reset the callback
+ * under the protection of the inbound channel lock.
+ */
+
+ hv_vmbus_channel_close(hv_device->channel);
+
+ mtx_lock(&sc->hs_lock);
+ while (!LIST_EMPTY(&sc->hs_free_list)) {
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+
+ free(reqp, M_DEVBUF);
+ }
+ mtx_unlock(&sc->hs_lock);
+ return (0);
+}
+
+#if HVS_TIMEOUT_TEST
+/**
+ * @brief unit test for timed out operations
+ *
+ * This function provides unit testing capability to simulate
+ * timed out operations. Recompilation with HVS_TIMEOUT_TEST=1
+ * is required.
+ *
+ * @param reqp pointer to a request structure
+ * @param opcode SCSI operation being performed
+ * @param wait if 1, wait for I/O to complete
+ */
+static void
+storvsc_timeout_test(struct hv_storvsc_request *reqp,
+ uint8_t opcode, int wait)
+{
+ int ret;
+ union ccb *ccb = reqp->ccb;
+ struct storvsc_softc *sc = reqp->softc;
+
+ if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) {
+ return;
+ }
+
+ if (wait) {
+ mtx_lock(&reqp->event.mtx);
+ }
+ ret = hv_storvsc_io_request(sc->hs_dev, reqp);
+ if (ret != 0) {
+ if (wait) {
+ mtx_unlock(&reqp->event.mtx);
+ }
+ printf("%s: io_request failed with %d.\n",
+ __func__, ret);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ mtx_lock(&sc->hs_lock);
+ storvsc_free_request(sc, reqp);
+ xpt_done(ccb);
+ mtx_unlock(&sc->hs_lock);
+ return;
+ }
+
+ if (wait) {
+ xpt_print(ccb->ccb_h.path,
+ "%u: %s: waiting for IO return.\n",
+ ticks, __func__);
+ ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz);
+ mtx_unlock(&reqp->event.mtx);
+ xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n",
+ ticks, __func__, (ret == 0)?
+ "IO return detected" :
+ "IO return not detected");
+ /*
+ * Now both the timer handler and io done may be running
+ * simultaneously. We want to confirm that io done always
+ * finishes after the timer handler exits, so that reqp as
+ * used by the timer handler is never freed or stale. Busy
+ * loop for another 1/10 second to make sure io done does
+ * wait for the timer handler to complete.
+ */
+ DELAY(100*1000);
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: %s: finishing, queue frozen %d, "
+ "ccb status 0x%x scsi_status 0x%x.\n",
+ ticks, __func__, sc->hs_frozen,
+ ccb->ccb_h.status,
+ ccb->csio.scsi_status);
+ mtx_unlock(&sc->hs_lock);
+ }
+}
+#endif /* HVS_TIMEOUT_TEST */
+
+/**
+ * @brief timeout handler for requests
+ *
+ * This function is called as a result of a callout expiring.
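+ * The first expiry only logs the stalled request and re-arms the
+ * callout; if the request is still outstanding when the callout fires
+ * a second time, the SIM queue is frozen until the I/O finally returns.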
+ *
+ * @param arg pointer to a request
+ */
+static void
+storvsc_timeout(void *arg)
+{
+ struct hv_storvsc_request *reqp = arg;
+ struct storvsc_softc *sc = reqp->softc;
+ union ccb *ccb = reqp->ccb;
+
+ if (reqp->retries == 0) {
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO timed out (req=0x%p), waiting another %u secs.\n",
+ ticks, reqp, ccb->ccb_h.timeout / 1000);
+ cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL);
+ mtx_unlock(&sc->hs_lock);
+
+ reqp->retries++;
+ callout_reset(&reqp->callout,
+ (ccb->ccb_h.timeout * hz) / 1000,
+ storvsc_timeout, reqp);
+#if HVS_TIMEOUT_TEST
+ storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0);
+#endif
+ return;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n",
+ ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000,
+ (sc->hs_frozen == 0)?
+ "freezing the queue" : "the queue is already frozen");
+ if (sc->hs_frozen == 0) {
+ sc->hs_frozen = 1;
+ xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1);
+ }
+ mtx_unlock(&sc->hs_lock);
+
+#if HVS_TIMEOUT_TEST
+ storvsc_timeout_test(reqp, MODE_SELECT_10, 1);
+#endif
+}
+
+/**
+ * @brief StorVSC device poll function
+ *
+ * This function is responsible for servicing requests when
+ * interrupts are disabled (i.e., when we are dumping core).
+ *
+ * @param sim a pointer to a CAM SCSI interface module
+ */
+static void
+storvsc_poll(struct cam_sim *sim)
+{
+ struct storvsc_softc *sc = cam_sim_softc(sim);
+
+ mtx_assert(&sc->hs_lock, MA_OWNED);
+ mtx_unlock(&sc->hs_lock);
+ hv_storvsc_on_channel_callback(sc->hs_dev);
+ mtx_lock(&sc->hs_lock);
+}
+
+/**
+ * @brief StorVSC device action function
+ *
+ * This function is responsible for handling SCSI operations which
+ * are passed from the CAM layer. The requests are in the form of
+ * CAM control blocks which indicate the action being performed.
+ * Not all actions require converting the request to a VSCSI protocol
+ * message; such actions can be responded to directly by this driver.
+ * Requests which are destined for a backend storage device are converted
+ * to a VSCSI protocol message and sent on the channel connection associated
+ * with this device.
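+ * For example, XPT_PATH_INQ and XPT_GET_TRAN_SETTINGS below are answered
+ * directly from the softc, while XPT_SCSI_IO is translated by
+ * create_storvsc_request() and handed to hv_storvsc_io_request().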
+ *
+ * @param sim pointer to a CAM SCSI interface module
+ * @param ccb pointer to a CAM control block
+ */
+static void
+storvsc_action(struct cam_sim *sim, union ccb *ccb)
+{
+ struct storvsc_softc *sc = cam_sim_softc(sim);
+ int res;
+
+ mtx_assert(&sc->hs_lock, MA_OWNED);
+ switch (ccb->ccb_h.func_code) {
+ case XPT_PATH_INQ: {
+ struct ccb_pathinq *cpi = &ccb->cpi;
+
+ cpi->version_num = 1;
+ cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE;
+ cpi->target_sprt = 0;
+ cpi->hba_misc = PIM_NOBUSRESET;
+ cpi->hba_eng_cnt = 0;
+ cpi->max_target = STORVSC_MAX_TARGETS;
+ cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target;
+ cpi->initiator_id = 0;
+ cpi->bus_id = cam_sim_bus(sim);
+ cpi->base_transfer_speed = 300000;
+ cpi->transport = XPORT_SAS;
+ cpi->transport_version = 0;
+ cpi->protocol = PROTO_SCSI;
+ cpi->protocol_version = SCSI_REV_SPC2;
+ strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+ strncpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN);
+ strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+ cpi->unit_number = cam_sim_unit(sim);
+
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_GET_TRAN_SETTINGS: {
+ struct ccb_trans_settings *cts = &ccb->cts;
+
+ cts->transport = XPORT_SAS;
+ cts->transport_version = 0;
+ cts->protocol = PROTO_SCSI;
+ cts->protocol_version = SCSI_REV_SPC2;
+
+ /* enable tag queuing and disconnected mode */
+ cts->proto_specific.valid = CTS_SCSI_VALID_TQ;
+ cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ;
+ cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB;
+ cts->xport_specific.valid = CTS_SPI_VALID_DISC;
+ cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB;
+
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_SET_TRAN_SETTINGS: {
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_CALC_GEOMETRY: {
+ cam_calc_geometry(&ccb->ccg, 1);
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_RESET_BUS:
+ case XPT_RESET_DEV: {
+#if HVS_HOST_RESET
+ if ((res = hv_storvsc_host_reset(sc->hs_dev)) != 0) {
+ xpt_print(ccb->ccb_h.path,
+ "hv_storvsc_host_reset failed with %d\n", res);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ xpt_done(ccb);
+ return;
+ }
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+#else
+ xpt_print(ccb->ccb_h.path,
+ "%s reset not supported.\n",
+ (ccb->ccb_h.func_code == XPT_RESET_BUS)?
+ "bus" : "dev");
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+#endif /* HVS_HOST_RESET */
+ }
+ case XPT_SCSI_IO:
+ case XPT_IMMED_NOTIFY: {
+ struct hv_storvsc_request *reqp = NULL;
+
+ if (ccb->csio.cdb_len == 0) {
+ panic("cdb_len is 0\n");
+ }
+
+ if (LIST_EMPTY(&sc->hs_free_list)) {
+ ccb->ccb_h.status = CAM_REQUEUE_REQ;
+ if (sc->hs_frozen == 0) {
+ sc->hs_frozen = 1;
+ xpt_freeze_simq(sim, /* count*/1);
+ }
+ xpt_done(ccb);
+ return;
+ }
+
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+
+ bzero(reqp, sizeof(struct hv_storvsc_request));
+ reqp->softc = sc;
+
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ create_storvsc_request(ccb, reqp);
+
+ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+ callout_init(&reqp->callout, CALLOUT_MPSAFE);
+ callout_reset(&reqp->callout,
+ (ccb->ccb_h.timeout * hz) / 1000,
+ storvsc_timeout, reqp);
+#if HVS_TIMEOUT_TEST
+ cv_init(&reqp->event.cv, "storvsc timeout cv");
+ mtx_init(&reqp->event.mtx, "storvsc timeout mutex",
+ NULL, MTX_DEF);
+ switch (reqp->vstor_packet.vm_srb.cdb[0]) {
+ case MODE_SELECT_10:
+ case SEND_DIAGNOSTIC:
+ /* To have timer send the request. */
+ return;
+ default:
+ break;
+ }
+#endif /* HVS_TIMEOUT_TEST */
+ }
+
+ if ((res = hv_storvsc_io_request(sc->hs_dev, reqp)) != 0) {
+ xpt_print(ccb->ccb_h.path,
+ "hv_storvsc_io_request failed with %d\n", res);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ storvsc_free_request(sc, reqp);
+ xpt_done(ccb);
+ return;
+ }
+ return;
+ }
+
+ default:
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+ }
+}
+
+/**
+ * @brief Fill in a request structure based on a CAM control block
+ *
+ * Fills in a request structure based on the contents of a CAM control
+ * block. The request structure holds the payload information for
+ * a VSCSI protocol request.
+ *
+ * @param ccb pointer to a CAM control block
+ * @param reqp pointer to a request structure
+ */
+static void
+create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
+{
+ struct ccb_scsiio *csio = &ccb->csio;
+ uint64_t phys_addr;
+ uint32_t bytes_to_copy = 0;
+ uint32_t pfn_num = 0;
+ uint32_t pfn;
+
+ /* refer to struct vmscsi_req for meanings of these two fields */
+ reqp->vstor_packet.vm_srb.port =
+ cam_sim_unit(xpt_path_sim(ccb->ccb_h.path));
+ reqp->vstor_packet.vm_srb.path_id =
+ cam_sim_bus(xpt_path_sim(ccb->ccb_h.path));
+
+ reqp->vstor_packet.vm_srb.target_id = ccb->ccb_h.target_id;
+ reqp->vstor_packet.vm_srb.lun = ccb->ccb_h.target_lun;
+
+ reqp->vstor_packet.vm_srb.cdb_len = csio->cdb_len;
+ if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
+ memcpy(&reqp->vstor_packet.vm_srb.cdb, csio->cdb_io.cdb_ptr,
+ csio->cdb_len);
+ } else {
+ memcpy(&reqp->vstor_packet.vm_srb.cdb, csio->cdb_io.cdb_bytes,
+ csio->cdb_len);
+ }
+
+ switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
+ case CAM_DIR_OUT:
+ reqp->vstor_packet.vm_srb.data_in = WRITE_TYPE;
+ break;
+ case CAM_DIR_IN:
+ reqp->vstor_packet.vm_srb.data_in = READ_TYPE;
+ break;
+ case CAM_DIR_NONE:
+ reqp->vstor_packet.vm_srb.data_in = UNKNOWN_TYPE;
+ break;
+ default:
+ reqp->vstor_packet.vm_srb.data_in = UNKNOWN_TYPE;
+ break;
+ }
+
+ reqp->sense_data = &csio->sense_data;
+ reqp->sense_info_len = csio->sense_len;
+
+ reqp->ccb = ccb;
+ /*
+ KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0,
+ ("ccb is scatter gather valid\n"));
+ */
+ if (csio->dxfer_len != 0) {
+ reqp->data_buf.length = csio->dxfer_len;
+ bytes_to_copy = csio->dxfer_len;
+ phys_addr = vtophys(csio->data_ptr);
+ reqp->data_buf.offset = phys_addr - trunc_page(phys_addr);
+ }
+
+ /* Record the PFN of each physical page spanned by the data buffer. */
+ while (bytes_to_copy != 0) {
+ int bytes, page_offset;
+ phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length -
+ bytes_to_copy]);
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[pfn_num] = pfn;
+ page_offset = phys_addr - trunc_page(phys_addr);
+
+ bytes = min(PAGE_SIZE - page_offset, bytes_to_copy);
+
+ bytes_to_copy -= bytes;
+ pfn_num++;
+ }
+}
+
+/**
+ * @brief completion function before returning to CAM
+ *
+ * The I/O has been completed and the result needs
+ * to be passed to the CAM layer.
+ * Free resources related to this request.
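+ * This runs from the channel callback path via
+ * hv_storvsc_on_iocompletion(); it stops any pending timeout callout,
+ * updates the CCB status and completes the CCB with xpt_done().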
+ *
+ * @param reqp pointer to a request structure
+ */
+static void
+storvsc_io_done(struct hv_storvsc_request *reqp)
+{
+ union ccb *ccb = reqp->ccb;
+ struct ccb_scsiio *csio = &ccb->csio;
+ struct storvsc_softc *sc = reqp->softc;
+ struct vmscsi_req *vm_srb = &reqp->vstor_packet.vm_srb;
+
+ if (reqp->retries > 0) {
+ mtx_lock(&sc->hs_lock);
+#if HVS_TIMEOUT_TEST
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO returned after timeout, "
+ "waking up timer handler if any.\n", ticks);
+ mtx_lock(&reqp->event.mtx);
+ cv_signal(&reqp->event.cv);
+ mtx_unlock(&reqp->event.mtx);
+#endif
+ reqp->retries = 0;
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO returned after timeout, "
+ "stopping timer if any.\n", ticks);
+ mtx_unlock(&sc->hs_lock);
+ }
+
+ /*
+ * callout_drain() waits for the timer handler to finish if it
+ * is running, so no extra lock is needed to synchronize this
+ * routine with the timer handler. Note that reqp must not be
+ * freed while the timer handler is using, or will use, it.
+ */
+ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+ callout_drain(&reqp->callout);
+ }
+
+ ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
+ ccb->ccb_h.status &= ~CAM_STATUS_MASK;
+ if (vm_srb->scsi_status == SCSI_STATUS_OK) {
+ ccb->ccb_h.status |= CAM_REQ_CMP;
+ } else {
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "storvsc scsi_status = %d\n",
+ vm_srb->scsi_status);
+ mtx_unlock(&sc->hs_lock);
+ ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
+ }
+
+ ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF);
+ ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len;
+
+ if (reqp->sense_info_len != 0) {
+ csio->sense_resid = csio->sense_len - reqp->sense_info_len;
+ ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ if (reqp->softc->hs_frozen == 1) {
+ xpt_print(ccb->ccb_h.path,
+ "%u: storvsc unfreezing softc 0x%p.\n",
+ ticks, reqp->softc);
+ ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
+ reqp->softc->hs_frozen = 0;
+ }
+ storvsc_free_request(sc, reqp);
+ xpt_done(ccb);
+ mtx_unlock(&sc->hs_lock);
+}
+
+/**
+ * @brief Free a request structure
+ *
+ * Free a request structure by returning it to the free list
+ *
+ * @param sc pointer to a softc
+ * @param reqp pointer to a request structure
+ */
+static void
+storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp)
+{
+
+ LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+}
+
+/**
+ * @brief Determine type of storage device from GUID
+ *
+ * Using the type GUID, determine if this is a StorVSC (paravirtual
+ * SCSI) or BlkVSC (paravirtual IDE) device.
+ *
+ * @param dev a device
+ * @returns an enum hv_storage_type value
+ */
+static enum hv_storage_type
+storvsc_get_storage_type(device_t dev)
+{
+ const char *p = vmbus_get_type(dev);
+
+ if (!memcmp(p, &gBlkVscDeviceType, sizeof(hv_guid))) {
+ return (DRIVER_BLKVSC);
+ } else if (!memcmp(p, &gStorVscDeviceType, sizeof(hv_guid))) {
+ return (DRIVER_STORVSC);
+ }
+ return (DRIVER_UNKNOWN);
+}
+
diff --git a/sys/contrib/dev/hyperv/storvsc/hv_vstorage.h b/sys/contrib/dev/hyperv/storvsc/hv_vstorage.h
new file mode 100644
index 0000000..d01d084
--- /dev/null
+++ b/sys/contrib/dev/hyperv/storvsc/hv_vstorage.h
@@ -0,0 +1,231 @@
+/*-
+ * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HV_VSTORAGE_H__
+#define __HV_VSTORAGE_H__
+
+/*
+ * Major/minor macros. Minor version is in LSB, meaning that earlier flat
+ * version numbers will be interpreted as "0.x" (i.e., 1 becomes 0.1).
+ */
+
+#define VMSTOR_PROTOCOL_MAJOR(VERSION_) (((VERSION_) >> 8) & 0xff)
+#define VMSTOR_PROTOCOL_MINOR(VERSION_) (((VERSION_) ) & 0xff)
+#define VMSTOR_PROTOCOL_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \
+ (((MINOR_) & 0xff) ))
+
+/*
+ * Invalid version.
+ */
+#define VMSTOR_INVALID_PROTOCOL_VERSION -1
+
+/*
+ * Version history:
+ * V1 Beta 0.1
+ * V1 RC < 2008/1/31 1.0
+ * V1 RC > 2008/1/31 2.0
+ */
+
+#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0)
+
+/**
+ * Packet structure ops describing virtual storage requests.
+ */
+enum vstor_packet_ops {
+ VSTOR_OPERATION_COMPLETEIO = 1,
+ VSTOR_OPERATION_REMOVEDEVICE = 2,
+ VSTOR_OPERATION_EXECUTESRB = 3,
+ VSTOR_OPERATION_RESETLUN = 4,
+ VSTOR_OPERATION_RESETADAPTER = 5,
+ VSTOR_OPERATION_RESETBUS = 6,
+ VSTOR_OPERATION_BEGININITIALIZATION = 7,
+ VSTOR_OPERATION_ENDINITIALIZATION = 8,
+ VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9,
+ VSTOR_OPERATION_QUERYPROPERTIES = 10,
+ VSTOR_OPERATION_MAXIMUM = 10
+};
+
+
+/*
+ * Platform neutral description of a scsi request -
+ * this remains the same across the wire regardless of 32/64 bit
+ * note: it's patterned off the Windows DDK SCSI_PASS_THROUGH structure
+ */
+
+#define CDB16GENERIC_LENGTH 0x10
+#define SENSE_BUFFER_SIZE 0x12
+#define MAX_DATA_BUFFER_LENGTH_WITH_PADDING 0x14
+
+struct vmscsi_req {
+ uint16_t length;
+ uint8_t srb_status;
+ uint8_t scsi_status;
+
+ /* HBA number, set to the order number detected by initiator. */
+ uint8_t port;
+ /* SCSI bus number or bus_id, different from CAM's path_id. */
+ uint8_t path_id;
+
+ uint8_t target_id;
+ uint8_t lun;
+
+ uint8_t cdb_len;
+ uint8_t sense_info_len;
+ uint8_t data_in;
+ uint8_t reserved;
+
+ uint32_t transfer_len;
+
+ union {
+ uint8_t cdb[CDB16GENERIC_LENGTH];
+
+ uint8_t sense_data[SENSE_BUFFER_SIZE];
+
+ uint8_t reserved_array[MAX_DATA_BUFFER_LENGTH_WITH_PADDING];
+ };
+
+} __packed;
+
+/**
+ * This structure is sent during the initialization phase to get the different
+ * properties of the channel.
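+ * It is expected to arrive as the payload of a
+ * VSTOR_OPERATION_QUERYPROPERTIES completion from the host.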
+ */
+
+struct vmstor_chan_props {
+ uint16_t proto_ver;
+ uint8_t path_id;
+ uint8_t target_id;
+
+ /**
+ * Note: port number is only really known on the client side
+ */
+ uint32_t port;
+ uint32_t flags;
+ uint32_t max_transfer_bytes;
+
+ /**
+ * This id is unique for each channel and will correspond with
+ * vendor specific data in the inquiry data
+ */
+ uint64_t unique_id;
+
+} __packed;
+
+/**
+ * This structure is sent during the storage protocol negotiations.
+ */
+
+struct vmstor_proto_ver
+{
+ /**
+ * Major (MSW) and minor (LSW) version numbers.
+ */
+ uint16_t major_minor;
+
+ uint16_t revision; /* always zero */
+} __packed;
+
+/**
+ * Channel Property Flags
+ */
+
+#define STORAGE_CHANNEL_REMOVABLE_FLAG 0x1
+#define STORAGE_CHANNEL_EMULATED_IDE_FLAG 0x2
+
+
+struct vstor_packet {
+ /**
+ * Requested operation type
+ */
+ enum vstor_packet_ops operation;
+
+ /*
+ * Flags - see below for values
+ */
+ uint32_t flags;
+
+ /**
+ * Status of the request returned from the server side.
+ */
+ uint32_t status;
+
+ union
+ {
+ /**
+ * Structure used to forward SCSI commands from the client to
+ * the server.
+ */
+ struct vmscsi_req vm_srb;
+
+ /**
+ * Structure used to query channel properties.
+ */
+ struct vmstor_chan_props chan_props;
+
+ /**
+ * Used during version negotiations.
+ */
+ struct vmstor_proto_ver version;
+ };
+
+} __packed;
+
+
+/**
+ * SRB (SCSI Request Block) Status Codes
+ */
+#define SRB_STATUS_PENDING 0x00
+#define SRB_STATUS_SUCCESS 0x01
+#define SRB_STATUS_ABORTED 0x02
+#define SRB_STATUS_ABORT_FAILED 0x03
+#define SRB_STATUS_ERROR 0x04
+#define SRB_STATUS_BUSY 0x05
+
+/**
+ * SRB Status Masks (can be combined with above status codes)
+ */
+#define SRB_STATUS_QUEUE_FROZEN 0x40
+#define SRB_STATUS_AUTOSENSE_VALID 0x80
+
+
+/**
+ * Packet flags
+ */
+
+/**
+ * This flag indicates that the server should send back a completion for this
+ * packet.
+ */
+#define REQUEST_COMPLETION_FLAG 0x1
+
+/**
+ * This is the set of flags that the vsc can set in any packets it sends
+ */
+#define VSC_LEGAL_FLAGS (REQUEST_COMPLETION_FLAG)
+
+#endif /* __HV_VSTORAGE_H__ */
diff --git a/sys/contrib/dev/hyperv/utilities/hv_util.c b/sys/contrib/dev/hyperv/utilities/hv_util.c
new file mode 100644
index 0000000..9ad4370
--- /dev/null
+++ b/sys/contrib/dev/hyperv/utilities/hv_util.c
@@ -0,0 +1,492 @@
+/*-
+ * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * A common driver for all Hyper-V util services.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/timetc.h>
+
+#include <dev/hyperv/include/hyperv.h>
+
+#define HV_SHUT_DOWN 0
+#define HV_TIME_SYNCH 1
+#define HV_HEART_BEAT 2
+#define HV_KVP 3
+#define HV_MAX_UTIL_SERVICES 4
+
+#define HV_NANO_SEC 1000000000L /* 10^9 nanoseconds = 1 sec */
+
+#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */
+#define HV_ICTIMESYNCFLAG_PROBE 0
+#define HV_ICTIMESYNCFLAG_SYNC 1
+#define HV_ICTIMESYNCFLAG_SAMPLE 2
+
+typedef struct hv_vmbus_service {
+ hv_guid guid; /* Hyper-V GUID */
+ char* name; /* name of service */
+ boolean_t enabled; /* service enabled */
+ hv_work_queue* work_queue; /* background work queue */
+ /*
+ * function to initialize service
+ */
+ int (*init)(struct hv_vmbus_service *);
+ /*
+ * function to process Hyper-V messages
+ */
+ void (*callback)(void *);
+} hv_vmbus_service;
+
+static void hv_shutdown_cb(void *context);
+static void hv_heartbeat_cb(void *context);
+static void hv_timesync_cb(void *context);
+static void hv_kvp_cb(void *context);
+
+static int hv_timesync_init(hv_vmbus_service *serv);
+
+/**
+ * Note: GUID codes below are predefined by the host hypervisor
+ * (Hyper-V and Azure) interface and are required for correct operation.
+ */
+static hv_vmbus_service service_table[] = {
+ /* Shutdown Service */
+ { .guid.data = {0x31, 0x60, 0x0B, 0x0E, 0x13, 0x52, 0x34, 0x49,
+ 0x81, 0x8B, 0x38, 0xD9, 0x0C, 0xED, 0x39, 0xDB},
+ .name = "Hyper-V Shutdown Service",
+ .enabled = TRUE,
+ .callback = hv_shutdown_cb,
+ },
+
+ /* Time Synch Service */
+ { .guid.data = {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49,
+ 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf},
+ .name = "Hyper-V Time Synch Service",
+ .enabled = TRUE,
+ .init = hv_timesync_init,
+ .callback = hv_timesync_cb,
+ },
+
+ /* Heartbeat Service */
+ { .guid.data = {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e,
+ 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d},
+ .name = "Hyper-V Heartbeat Service",
+ .enabled = TRUE,
+ .callback = hv_heartbeat_cb,
+
+ },
+
+ /* KVP (Key Value Pair) Service */
+ { .guid.data = {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d,
+ 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6},
+ .name = "Hyper-V KVP Service",
+ .enabled = FALSE,
+ .callback = hv_kvp_cb,
+ },
+};
+
+/**
+ * Receive buffer pointers; there is one buffer per utility service. The
+ * buffer is allocated during attach().
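+ * Buffers are indexed by the HV_SHUT_DOWN..HV_KVP service constants,
+ * are one PAGE_SIZE each, and are freed again in hv_util_detach().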
+ */
+static uint8_t* receive_buffer[HV_MAX_UTIL_SERVICES];
+
+struct hv_ictimesync_data {
+ uint64_t parenttime;
+ uint64_t childtime;
+ uint64_t roundtriptime;
+ uint8_t flags;
+} __packed;
+
+static int hv_timesync_init(hv_vmbus_service *serv)
+{
+ serv->work_queue = hv_work_queue_create("Time Sync");
+ if (serv->work_queue == NULL)
+ return (ENOMEM);
+ return (0);
+}
+
+static void
+hv_negotiate_version(
+ struct hv_vmbus_icmsg_hdr* icmsghdrp,
+ struct hv_vmbus_icmsg_negotiate* negop,
+ uint8_t* buf)
+{
+ icmsghdrp->icmsgsize = 0x10;
+
+ negop = (struct hv_vmbus_icmsg_negotiate *)&buf[
+ sizeof(struct hv_vmbus_pipe_hdr) +
+ sizeof(struct hv_vmbus_icmsg_hdr)];
+
+ if (negop->icframe_vercnt == 2 &&
+ negop->icversion_data[1].major == 3) {
+ negop->icversion_data[0].major = 3;
+ negop->icversion_data[0].minor = 0;
+ negop->icversion_data[1].major = 3;
+ negop->icversion_data[1].minor = 0;
+ } else {
+ negop->icversion_data[0].major = 1;
+ negop->icversion_data[0].minor = 0;
+ negop->icversion_data[1].major = 1;
+ negop->icversion_data[1].minor = 0;
+ }
+
+ negop->icframe_vercnt = 1;
+ negop->icmsg_vercnt = 1;
+}
+
+static void hv_kvp_cb(void *context)
+{
+}
+
+/**
+ * Set the guest's system time from a time sync message sent by the host
+ */
+static void
+hv_set_host_time(void *context)
+{
+ uint64_t hosttime = (uint64_t)context;
+ struct timespec ts, host_ts;
+ int64_t tns, host_tns, tmp, tsec;
+
+ nanotime(&ts);
+ tns = ts.tv_sec * HV_NANO_SEC + ts.tv_nsec;
+ host_tns = (hosttime - HV_WLTIMEDELTA) * 100;
+
+ tmp = host_tns;
+ tsec = tmp / HV_NANO_SEC;
+ host_ts.tv_nsec = (long) (tmp - (tsec * HV_NANO_SEC));
+ host_ts.tv_sec = tsec;
+
+ /* force time sync with host after reboot, restore, etc. */
+ mtx_lock(&Giant);
+ tc_setclock(&host_ts);
+ resettodr();
+ mtx_unlock(&Giant);
+}
+
+/**
+ * @brief Synchronize time with host after reboot, restore, etc.
+ *
+ * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM.
+ * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time
+ * message after the timesync channel is opened. Since the hv_utils module is
+ * loaded after hv_vmbus, the first message is usually missed. In addition,
+ * the system time is automatically set from the emulated hardware clock,
+ * which may not be UTC or in the expected time zone. To override these
+ * effects, we use the first 50 time samples for initial system time setting.
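+ *
+ * Host timestamps arrive in 100ns units counted from January 1, 1601
+ * (the Windows epoch); HV_WLTIMEDELTA is that epoch's offset from the
+ * Unix epoch in the same units, so hv_set_host_time() converts with
+ * nsec_since_1970 = (hosttime - HV_WLTIMEDELTA) * 100.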
+ */ +static inline +void hv_adj_guesttime(uint64_t hosttime, uint8_t flags) +{ + static int scnt = 50; + + if ((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) { + hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, + hv_set_host_time, (void *) hosttime); + return; + } + + if ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0 && scnt > 0) { + scnt--; + hv_queue_work_item(service_table[HV_TIME_SYNCH].work_queue, + hv_set_host_time, (void *) hosttime); + } +} + +/** + * Time Sync Channel message handler + */ +static void +hv_timesync_cb(void *context) +{ + hv_vmbus_channel* channel = context; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recvlen; + uint64_t requestId; + int ret; + uint8_t* time_buf; + struct hv_ictimesync_data* timedatap; + + time_buf = receive_buffer[HV_TIME_SYNCH]; + + ret = hv_vmbus_channel_recv_packet(channel, time_buf, + PAGE_SIZE, &recvlen, &requestId); + + if ((ret == 0) && recvlen > 0) { + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, time_buf); + } else { + timedatap = (struct hv_ictimesync_data *) &time_buf[ + sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + hv_adj_guesttime(timedatap->parenttime, timedatap->flags); + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION + | HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, time_buf, + recvlen, requestId, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + +/** + * Shutdown + */ +static void +hv_shutdown_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel = context; + uint8_t execute_shutdown = 0; + hv_vmbus_icmsg_hdr* icmsghdrp; + uint32_t recv_len; + uint64_t request_id; + int ret; + hv_vmbus_shutdown_msg_data* shutdown_msg; + + buf = receive_buffer[HV_SHUT_DOWN]; + + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, + &recv_len, &request_id); + + if ((ret == 0) && recv_len > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + shutdown_msg = + (struct hv_vmbus_shutdown_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + switch (shutdown_msg->flags) { + case 0: + case 1: + icmsghdrp->status = HV_S_OK; + execute_shutdown = 1; + if(bootverbose) + printf("Shutdown request received -" + " graceful shutdown initiated\n"); + break; + default: + icmsghdrp->status = HV_E_FAIL; + execute_shutdown = 0; + printf("Shutdown request received -" + " Invalid request\n"); + break; + } + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, + recv_len, request_id, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } + + if (execute_shutdown) + shutdown_nice(RB_POWEROFF); +} + +/** + * Process heartbeat message + */ +static void +hv_heartbeat_cb(void *context) +{ + uint8_t* buf; + hv_vmbus_channel* channel = context; + uint32_t recvlen; + uint64_t requestid; + int ret; + + struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; + struct hv_vmbus_icmsg_hdr* icmsghdrp; + + buf = receive_buffer[HV_HEART_BEAT]; + + ret = hv_vmbus_channel_recv_packet(channel, buf, PAGE_SIZE, &recvlen, + &requestid); + + if ((ret == 0) && recvlen > 0) { + + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + 
hv_negotiate_version(icmsghdrp, NULL, buf); + + } else { + heartbeat_msg = + (struct hv_vmbus_heartbeat_msg_data *) + &buf[sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct hv_vmbus_icmsg_hdr)]; + + heartbeat_msg->seq_num += 1; + } + + icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + hv_vmbus_channel_send_packet(channel, buf, recvlen, requestid, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); + } +} + + +static int +hv_util_probe(device_t dev) +{ + int i; + int rtn_value = ENXIO; + + for (i = 0; i < HV_MAX_UTIL_SERVICES; i++) { + const char *p = vmbus_get_type(dev); + if (service_table[i].enabled && !memcmp(p, &service_table[i].guid, sizeof(hv_guid))) { + device_set_softc(dev, (void *) (&service_table[i])); + rtn_value = 0; + } + } + + return rtn_value; +} + +static int +hv_util_attach(device_t dev) +{ + struct hv_device* hv_dev; + struct hv_vmbus_service* service; + int ret; + size_t receive_buffer_offset; + + hv_dev = vmbus_get_devctx(dev); + service = device_get_softc(dev); + receive_buffer_offset = service - &service_table[0]; + device_printf(dev, "Hyper-V Service attaching: %s\n", service->name); + receive_buffer[receive_buffer_offset] = + malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); + + if (service->init != NULL) { + ret = service->init(service); + if (ret) { + ret = ENODEV; + goto error0; + } + } + + ret = hv_vmbus_channel_open(hv_dev->channel, 2 * PAGE_SIZE, + 2 * PAGE_SIZE, NULL, 0, + service->callback, hv_dev->channel); + + if (ret) + goto error0; + + return (0); + + error0: + + free(receive_buffer[receive_buffer_offset], M_DEVBUF); + receive_buffer[receive_buffer_offset] = NULL; + + return (ret); +} + +static int +hv_util_detach(device_t dev) +{ + struct hv_device* hv_dev; + struct hv_vmbus_service* service; + size_t receive_buffer_offset; + + hv_dev = vmbus_get_devctx(dev); + + hv_vmbus_channel_close(hv_dev->channel); + service = device_get_softc(dev); + receive_buffer_offset = service - &service_table[0]; + + if (service->work_queue != NULL) + hv_work_queue_close(service->work_queue); + + free(receive_buffer[receive_buffer_offset], M_DEVBUF); + receive_buffer[receive_buffer_offset] = NULL; + + return (0); +} + +static void hv_util_init(void) +{ +} + +static int hv_util_modevent(module_t mod, int event, void *arg) +{ + switch (event) { + case MOD_LOAD: + break; + case MOD_UNLOAD: + break; + default: + break; + } + return (0); +} + +static device_method_t util_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_util_probe), + DEVMETHOD(device_attach, hv_util_attach), + DEVMETHOD(device_detach, hv_util_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + { 0, 0 } } +; + +static driver_t util_driver = { "hyperv-utils", util_methods, 0 }; + +static devclass_t util_devclass; + +DRIVER_MODULE(hv_utils, vmbus, util_driver, util_devclass, hv_util_modevent, 0); +MODULE_VERSION(hv_utils, 1); +MODULE_DEPEND(hv_utils, vmbus, 1, 1, 1); + +SYSINIT(hv_util_initx, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE + 1, + hv_util_init, NULL); diff --git a/sys/contrib/dev/hyperv/vmbus/hv_channel.c b/sys/contrib/dev/hyperv/vmbus/hv_channel.c new file mode 100644 index 0000000..17dfd76 --- /dev/null +++ b/sys/contrib/dev/hyperv/vmbus/hv_channel.c @@ -0,0 +1,842 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <machine/bus.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include "hv_vmbus_priv.h" + +static int vmbus_channel_create_gpadl_header( + /* must be phys and virt contiguous*/ + void* contig_buffer, + /* page-size multiple */ + uint32_t size, + hv_vmbus_channel_msg_info** msg_info, + uint32_t* message_count); + +static void vmbus_channel_set_event(hv_vmbus_channel* channel); + +/** + * @brief Trigger an event notification on the specified channel + */ +static void +vmbus_channel_set_event(hv_vmbus_channel *channel) +{ + hv_vmbus_monitor_page *monitor_page; + + if (channel->offer_msg.monitor_allocated) { + /* Each uint32_t represents 32 channels */ + synch_set_bit((channel->offer_msg.child_rel_id & 31), + ((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + + ((channel->offer_msg.child_rel_id >> 5)))); + + monitor_page = (hv_vmbus_monitor_page *) + hv_vmbus_g_connection.monitor_pages; + + monitor_page++; /* Get the child to parent monitor page */ + + synch_set_bit(channel->monitor_bit, + (uint32_t *)&monitor_page-> + trigger_group[channel->monitor_group].pending); + } else { + hv_vmbus_set_event(channel->offer_msg.child_rel_id); + } + +} + +/** + * @brief Open the specified channel + */ +int +hv_vmbus_channel_open( + hv_vmbus_channel* new_channel, + uint32_t send_ring_buffer_size, + uint32_t recv_ring_buffer_size, + void* user_data, + uint32_t user_data_len, + hv_vmbus_pfn_channel_callback pfn_on_channel_callback, + void* context) +{ + + int ret = 0; + void *in, *out; + hv_vmbus_channel_open_channel* open_msg; + hv_vmbus_channel_msg_info* open_info; + + new_channel->on_channel_callback = pfn_on_channel_callback; + new_channel->channel_callback_context = context; + + /* Allocate the ring buffer */ + out = contigmalloc((send_ring_buffer_size + recv_ring_buffer_size), + M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); + KASSERT(out != NULL, + ("Error VMBUS: contigmalloc failed to allocate Ring Buffer!")); + if (out == NULL) + return (ENOMEM); + + in = ((uint8_t *) out + send_ring_buffer_size); + + new_channel->ring_buffer_pages = out; + 
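/*
+ * A single contiguous allocation backs both ring buffers: the first
+ * send_ring_buffer_size bytes form the outbound ring and the remainder
+ * the inbound ring, so the one GPADL established below covers both.
+ */
+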
new_channel->ring_buffer_page_count = (send_ring_buffer_size + + recv_ring_buffer_size) >> PAGE_SHIFT; + + hv_vmbus_ring_buffer_init( + &new_channel->outbound, + out, + send_ring_buffer_size); + + hv_vmbus_ring_buffer_init( + &new_channel->inbound, + in, + recv_ring_buffer_size); + + /** + * Establish the gpadl for the ring buffer + */ + new_channel->ring_buffer_gpadl_handle = 0; + + ret = hv_vmbus_channel_establish_gpadl(new_channel, + new_channel->outbound.ring_buffer, + send_ring_buffer_size + recv_ring_buffer_size, + &new_channel->ring_buffer_gpadl_handle); + + /** + * Create and init the channel open message + */ + open_info = (hv_vmbus_channel_msg_info*) malloc( + sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_open_channel), + M_DEVBUF, + M_NOWAIT); + KASSERT(open_info != NULL, + ("Error VMBUS: malloc failed to allocate Open Channel message!")); + + if (open_info == NULL) + return (ENOMEM); + + sema_init(&open_info->wait_sema, 0, "Open Info Sema"); + + open_msg = (hv_vmbus_channel_open_channel*) open_info->msg; + open_msg->header.message_type = HV_CHANNEL_MESSAGE_OPEN_CHANNEL; + open_msg->open_id = new_channel->offer_msg.child_rel_id; + open_msg->child_rel_id = new_channel->offer_msg.child_rel_id; + open_msg->ring_buffer_gpadl_handle = + new_channel->ring_buffer_gpadl_handle; + open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size + >> PAGE_SHIFT; + open_msg->server_context_area_gpadl_handle = 0; + + if (user_data_len) + memcpy(open_msg->user_data, user_data, user_data_len); + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + open_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + open_msg, sizeof(hv_vmbus_channel_open_channel)); + + if (ret != 0) + goto cleanup; + + ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ + + if (ret) + goto cleanup; + + if (open_info->response.open_result.status == 0) { + if(bootverbose) + printf("VMBUS: channel <%p> open success.\n", new_channel); + } else { + if(bootverbose) + printf("Error VMBUS: channel <%p> open failed - %d!\n", + new_channel, open_info->response.open_result.status); + } + + cleanup: + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + open_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + sema_destroy(&open_info->wait_sema); + free(open_info, M_DEVBUF); + + return (ret); +} + +/** + * @brief Create a gpadl for the specified buffer + */ +static int +vmbus_channel_create_gpadl_header( + void* contig_buffer, + uint32_t size, /* page-size multiple */ + hv_vmbus_channel_msg_info** msg_info, + uint32_t* message_count) +{ + int i; + int page_count; + unsigned long long pfn; + uint32_t msg_size; + hv_vmbus_channel_gpadl_header* gpa_header; + hv_vmbus_channel_gpadl_body* gpadl_body; + hv_vmbus_channel_msg_info* msg_header; + hv_vmbus_channel_msg_info* msg_body; + + int pfnSum, pfnCount, pfnLeft, pfnCurr, pfnSize; + + page_count = size >> PAGE_SHIFT; + pfn = hv_get_phys_addr(contig_buffer) >> PAGE_SHIFT; + + /*do we need a gpadl body msg */ + pfnSize = HV_MAX_SIZE_CHANNEL_MESSAGE + - sizeof(hv_vmbus_channel_gpadl_header) + - sizeof(hv_gpa_range); + pfnCount = pfnSize / sizeof(uint64_t); + + if (page_count > pfnCount) { /* if(we need a gpadl body) */ + /* fill in the header */ + msg_size = sizeof(hv_vmbus_channel_msg_info) + + 
sizeof(hv_vmbus_channel_gpadl_header) + + sizeof(hv_gpa_range) + + pfnCount * sizeof(uint64_t); + msg_header = malloc(msg_size, M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT( + msg_header != NULL, + ("Error VMBUS: malloc failed to allocate Gpadl Message!")); + if (msg_header == NULL) + return (ENOMEM); + + TAILQ_INIT(&msg_header->sub_msg_list_anchor); + msg_header->message_size = msg_size; + + gpa_header = (hv_vmbus_channel_gpadl_header*) msg_header->msg; + gpa_header->range_count = 1; + gpa_header->range_buf_len = sizeof(hv_gpa_range) + + page_count * sizeof(uint64_t); + gpa_header->range[0].byte_offset = 0; + gpa_header->range[0].byte_count = size; + for (i = 0; i < pfnCount; i++) { + gpa_header->range[0].pfn_array[i] = pfn + i; + } + *msg_info = msg_header; + *message_count = 1; + + pfnSum = pfnCount; + pfnLeft = page_count - pfnCount; + + /* + * figure out how many pfns we can fit + */ + pfnSize = HV_MAX_SIZE_CHANNEL_MESSAGE + - sizeof(hv_vmbus_channel_gpadl_body); + pfnCount = pfnSize / sizeof(uint64_t); + + /* + * fill in the body + */ + while (pfnLeft) { + if (pfnLeft > pfnCount) { + pfnCurr = pfnCount; + } else { + pfnCurr = pfnLeft; + } + + msg_size = sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_gpadl_body) + + pfnCurr * sizeof(uint64_t); + msg_body = malloc(msg_size, M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT( + msg_body != NULL, + ("Error VMBUS: malloc failed to allocate Gpadl msg_body!")); + if (msg_body == NULL) + return (ENOMEM); + + msg_body->message_size = msg_size; + (*message_count)++; + gpadl_body = + (hv_vmbus_channel_gpadl_body*) msg_body->msg; + /* + * gpadl_body->gpadl = kbuffer; + */ + for (i = 0; i < pfnCurr; i++) { + gpadl_body->pfn[i] = pfn + pfnSum + i; + } + + TAILQ_INSERT_TAIL( + &msg_header->sub_msg_list_anchor, + msg_body, + msg_list_entry); + pfnSum += pfnCurr; + pfnLeft -= pfnCurr; + } + } else { /* else everything fits in a header */ + + msg_size = sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_gpadl_header) + + sizeof(hv_gpa_range) + + page_count * sizeof(uint64_t); + msg_header = malloc(msg_size, M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT( + msg_header != NULL, + ("Error VMBUS: malloc failed to allocate Gpadl Message!")); + if (msg_header == NULL) + return (ENOMEM); + + msg_header->message_size = msg_size; + + gpa_header = (hv_vmbus_channel_gpadl_header*) msg_header->msg; + gpa_header->range_count = 1; + gpa_header->range_buf_len = sizeof(hv_gpa_range) + + page_count * sizeof(uint64_t); + gpa_header->range[0].byte_offset = 0; + gpa_header->range[0].byte_count = size; + for (i = 0; i < page_count; i++) { + gpa_header->range[0].pfn_array[i] = pfn + i; + } + + *msg_info = msg_header; + *message_count = 1; + } + + return (0); +} + +/** + * @brief Establish a GPADL for the specified buffer + */ +int +hv_vmbus_channel_establish_gpadl( + hv_vmbus_channel* channel, + void* contig_buffer, + uint32_t size, /* page-size multiple */ + uint32_t* gpadl_handle) + +{ + int ret = 0; + hv_vmbus_channel_gpadl_header* gpadl_msg; + hv_vmbus_channel_gpadl_body* gpadl_body; + hv_vmbus_channel_msg_info* msg_info; + hv_vmbus_channel_msg_info* sub_msg_info; + uint32_t msg_count; + hv_vmbus_channel_msg_info* curr; + uint32_t next_gpadl_handle; + + next_gpadl_handle = hv_vmbus_g_connection.next_gpadl_handle; + atomic_add_int((int*) &hv_vmbus_g_connection.next_gpadl_handle, 1); + + ret = vmbus_channel_create_gpadl_header( + contig_buffer, size, &msg_info, &msg_count); + + if(ret != 0) { /* if(allocation failed) return immediately */ + /* reverse atomic_add_int 
above */ + atomic_subtract_int((int*) + &hv_vmbus_g_connection.next_gpadl_handle, 1); + return ret; + } + + sema_init(&msg_info->wait_sema, 0, "Open Info Sema"); + gpadl_msg = (hv_vmbus_channel_gpadl_header*) msg_info->msg; + gpadl_msg->header.message_type = HV_CHANNEL_MESSAGEL_GPADL_HEADER; + gpadl_msg->child_rel_id = channel->offer_msg.child_rel_id; + gpadl_msg->gpadl = next_gpadl_handle; + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + gpadl_msg, + msg_info->message_size - + (uint32_t) sizeof(hv_vmbus_channel_msg_info)); + + if (ret != 0) + goto cleanup; + + if (msg_count > 1) { + TAILQ_FOREACH(curr, + &msg_info->sub_msg_list_anchor, msg_list_entry) { + sub_msg_info = curr; + gpadl_body = + (hv_vmbus_channel_gpadl_body*) sub_msg_info->msg; + + gpadl_body->header.message_type = + HV_CHANNEL_MESSAGE_GPADL_BODY; + gpadl_body->gpadl = next_gpadl_handle; + + ret = hv_vmbus_post_message( + gpadl_body, + sub_msg_info->message_size + - (uint32_t) sizeof(hv_vmbus_channel_msg_info)); + /* if (the post message failed) give up and clean up */ + if(ret != 0) + goto cleanup; + } + } + + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds*/ + if (ret != 0) + goto cleanup; + + *gpadl_handle = gpadl_msg->gpadl; + +cleanup: + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor, + msg_info, msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + sema_destroy(&msg_info->wait_sema); + free(msg_info, M_DEVBUF); + + return (ret); +} + +/** + * @brief Teardown the specified GPADL handle + */ +int +hv_vmbus_channel_teardown_gpdal( + hv_vmbus_channel* channel, + uint32_t gpadl_handle) +{ + int ret = 0; + hv_vmbus_channel_gpadl_teardown* msg; + hv_vmbus_channel_msg_info* info; + + info = (hv_vmbus_channel_msg_info *) + malloc( sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_gpadl_teardown), + M_DEVBUF, M_NOWAIT); + KASSERT(info != NULL, + ("Error VMBUS: malloc failed to allocate Gpadl Teardown Msg!")); + if (info == NULL) { + ret = ENOMEM; + goto cleanup; + } + + sema_init(&info->wait_sema, 0, "Open Info Sema"); + + msg = (hv_vmbus_channel_gpadl_teardown*) info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_GPADL_TEARDOWN; + msg->child_rel_id = channel->offer_msg.child_rel_id; + msg->gpadl = gpadl_handle; + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_msg_anchor, + info, msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message(msg, + sizeof(hv_vmbus_channel_gpadl_teardown)); + if (ret != 0) + goto cleanup; + + ret = sema_timedwait(&info->wait_sema, 500); /* KYS 5 seconds */ + +cleanup: + /* + * Received a torndown response + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE(&hv_vmbus_g_connection.channel_msg_anchor, + info, msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + sema_destroy(&info->wait_sema); + free(info, M_DEVBUF); + + return (ret); +} + +/** + * @brief Close the specified channel + */ +void +hv_vmbus_channel_close(hv_vmbus_channel *channel) +{ + int ret = 0; + hv_vmbus_channel_close_channel* msg; + hv_vmbus_channel_msg_info* info; + + mtx_lock(&channel->inbound_lock); + channel->on_channel_callback = NULL; + 
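/*
+ * Clearing the callback pointer under the inbound lock guarantees
+ * that no further channel callbacks are delivered once teardown
+ * of the channel begins.
+ */
+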
mtx_unlock(&channel->inbound_lock);
+
+ /**
+ * Send a closing message
+ */
+ info = (hv_vmbus_channel_msg_info *)
+ malloc( sizeof(hv_vmbus_channel_msg_info) +
+ sizeof(hv_vmbus_channel_close_channel),
+ M_DEVBUF, M_NOWAIT);
+ KASSERT(info != NULL, ("VMBUS: malloc failed hv_vmbus_channel_close!"));
+ if (info == NULL)
+ return;
+
+ msg = (hv_vmbus_channel_close_channel*) info->msg;
+ msg->header.message_type = HV_CHANNEL_MESSAGE_CLOSE_CHANNEL;
+ msg->child_rel_id = channel->offer_msg.child_rel_id;
+
+ ret = hv_vmbus_post_message(
+ msg, sizeof(hv_vmbus_channel_close_channel));
+
+ /* Tear down the gpadl for the channel's ring buffer */
+ if (channel->ring_buffer_gpadl_handle) {
+ hv_vmbus_channel_teardown_gpdal(channel,
+ channel->ring_buffer_gpadl_handle);
+ }
+
+ /* TODO: Send a msg to release the childRelId */
+
+ /* cleanup the ring buffers for this channel */
+ hv_ring_buffer_cleanup(&channel->outbound);
+ hv_ring_buffer_cleanup(&channel->inbound);
+
+ /* contigfree() takes the allocation size in bytes */
+ contigfree(
+ channel->ring_buffer_pages,
+ channel->ring_buffer_page_count * PAGE_SIZE,
+ M_DEVBUF);
+
+ free(info, M_DEVBUF);
+
+ /*
+ * If we are closing the channel during an error path in
+ * opening the channel, don't free the channel
+ * since the caller will free the channel
+ */
+ if (channel->state == HV_CHANNEL_OPEN_STATE) {
+ mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_anchor,
+ channel,
+ list_entry);
+ mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+
+ hv_vmbus_free_vmbus_channel(channel);
+ }
+
+}
+
+/**
+ * @brief Send the specified buffer on the given channel
+ */
+int
+hv_vmbus_channel_send_packet(
+ hv_vmbus_channel* channel,
+ void* buffer,
+ uint32_t buffer_len,
+ uint64_t request_id,
+ hv_vmbus_packet_type type,
+ uint32_t flags)
+{
+ int ret = 0;
+ hv_vm_packet_descriptor desc;
+ uint32_t packet_len;
+ uint64_t aligned_data;
+ uint32_t packet_len_aligned;
+ hv_vmbus_sg_buffer_list buffer_list[3];
+
+ packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len;
+ packet_len_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t));
+ aligned_data = 0;
+
+ /* Setup the descriptor */
+ desc.type = type; /* HV_VMBUS_PACKET_TYPE_DATA_IN_BAND; */
+ desc.flags = flags; /* HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED */
+ /* in 8-bytes granularity */
+ desc.data_offset8 = sizeof(hv_vm_packet_descriptor) >> 3;
+ desc.length8 = (uint16_t) (packet_len_aligned >> 3);
+ desc.transaction_id = request_id;
+
+ buffer_list[0].data = &desc;
+ buffer_list[0].length = sizeof(hv_vm_packet_descriptor);
+
+ buffer_list[1].data = buffer;
+ buffer_list[1].length = buffer_len;
+
+ buffer_list[2].data = &aligned_data;
+ buffer_list[2].length = packet_len_aligned - packet_len;
+
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+
+ /* TODO: We should determine if this is optional */
+ if (ret == 0
+ && !hv_vmbus_get_ring_buffer_interrupt_mask(
+ &channel->outbound)) {
+ vmbus_channel_set_event(channel);
+ }
+
+ return (ret);
+}
+
+/**
+ * @brief Send a range of single-page buffer packets using
+ * a GPADL Direct packet type
+ */
+int
+hv_vmbus_channel_send_packet_pagebuffer(
+ hv_vmbus_channel* channel,
+ hv_vmbus_page_buffer page_buffers[],
+ uint32_t page_count,
+ void* buffer,
+ uint32_t buffer_len,
+ uint64_t request_id)
+{
+
+ int ret = 0;
+ int i = 0;
+ uint32_t packet_len;
+ uint32_t packetLen_aligned;
+ hv_vmbus_sg_buffer_list buffer_list[3];
+ hv_vmbus_channel_packet_page_buffer desc;
+ uint32_t descSize;
+ uint64_t alignedData = 0;
+
+ if (page_count >
HV_MAX_PAGE_BUFFER_COUNT) + return (EINVAL); + + /* + * Adjust the size down since hv_vmbus_channel_packet_page_buffer + * is the largest size we support + */ + descSize = sizeof(hv_vmbus_channel_packet_page_buffer) - + ((HV_MAX_PAGE_BUFFER_COUNT - page_count) * + sizeof(hv_vmbus_page_buffer)); + packet_len = descSize + buffer_len; + packetLen_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); + + /* Setup the descriptor */ + desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT; + desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; + desc.data_offset8 = descSize >> 3; /* in 8-bytes granularity */ + desc.length8 = (uint16_t) (packetLen_aligned >> 3); + desc.transaction_id = request_id; + desc.range_count = page_count; + + for (i = 0; i < page_count; i++) { + desc.range[i].length = page_buffers[i].length; + desc.range[i].offset = page_buffers[i].offset; + desc.range[i].pfn = page_buffers[i].pfn; + } + + buffer_list[0].data = &desc; + buffer_list[0].length = descSize; + + buffer_list[1].data = buffer; + buffer_list[1].length = buffer_len; + + buffer_list[2].data = &alignedData; + buffer_list[2].length = packetLen_aligned - packet_len; + + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + + /* TODO: We should determine if this is optional */ + if (ret == 0 && + !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + vmbus_channel_set_event(channel); + } + + return (ret); +} + +/** + * @brief Send a multi-page buffer packet using a GPADL Direct packet type + */ +int +hv_vmbus_channel_send_packet_multipagebuffer( + hv_vmbus_channel* channel, + hv_vmbus_multipage_buffer* multi_page_buffer, + void* buffer, + uint32_t buffer_len, + uint64_t request_id) +{ + + int ret = 0; + uint32_t desc_size; + uint32_t packet_len; + uint32_t packet_len_aligned; + uint32_t pfn_count; + uint64_t aligned_data = 0; + hv_vmbus_sg_buffer_list buffer_list[3]; + hv_vmbus_channel_packet_multipage_buffer desc; + + pfn_count = + HV_NUM_PAGES_SPANNED( + multi_page_buffer->offset, + multi_page_buffer->length); + + if ((pfn_count == 0) || (pfn_count > HV_MAX_MULTIPAGE_BUFFER_COUNT)) + return (EINVAL); + /* + * Adjust the size down since hv_vmbus_channel_packet_multipage_buffer + * is the largest size we support + */ + desc_size = + sizeof(hv_vmbus_channel_packet_multipage_buffer) - + ((HV_MAX_MULTIPAGE_BUFFER_COUNT - pfn_count) * + sizeof(uint64_t)); + packet_len = desc_size + buffer_len; + packet_len_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); + + /* + * Setup the descriptor + */ + desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT; + desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; + desc.data_offset8 = desc_size >> 3; /* in 8-bytes granularity */ + desc.length8 = (uint16_t) (packet_len_aligned >> 3); + desc.transaction_id = request_id; + desc.range_count = 1; + + desc.range.length = multi_page_buffer->length; + desc.range.offset = multi_page_buffer->offset; + + memcpy(desc.range.pfn_array, multi_page_buffer->pfn_array, + pfn_count * sizeof(uint64_t)); + + buffer_list[0].data = &desc; + buffer_list[0].length = desc_size; + + buffer_list[1].data = buffer; + buffer_list[1].length = buffer_len; + + buffer_list[2].data = &aligned_data; + buffer_list[2].length = packet_len_aligned - packet_len; + + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + + /* TODO: We should determine if this is optional */ + if (ret == 0 && + !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + vmbus_channel_set_event(channel); + } + + return (ret); +} + +/** + 
* @brief Retrieve the user packet on the specified channel + */ +int +hv_vmbus_channel_recv_packet( + hv_vmbus_channel* channel, + void* Buffer, + uint32_t buffer_len, + uint32_t* buffer_actual_len, + uint64_t* request_id) +{ + int ret; + uint32_t user_len; + uint32_t packet_len; + hv_vm_packet_descriptor desc; + + *buffer_actual_len = 0; + *request_id = 0; + + ret = hv_ring_buffer_peek(&channel->inbound, &desc, + sizeof(hv_vm_packet_descriptor)); + if (ret != 0) + return (0); + + packet_len = desc.length8 << 3; + user_len = packet_len - (desc.data_offset8 << 3); + + *buffer_actual_len = user_len; + + if (user_len > buffer_len) + return (EINVAL); + + *request_id = desc.transaction_id; + + /* Copy over the packet to the user buffer */ + ret = hv_ring_buffer_read(&channel->inbound, Buffer, user_len, + (desc.data_offset8 << 3)); + + return (0); +} + +/** + * @brief Retrieve the raw packet on the specified channel + */ +int +hv_vmbus_channel_recv_packet_raw( + hv_vmbus_channel* channel, + void* buffer, + uint32_t buffer_len, + uint32_t* buffer_actual_len, + uint64_t* request_id) +{ + int ret; + uint32_t packetLen; + uint32_t userLen; + hv_vm_packet_descriptor desc; + + *buffer_actual_len = 0; + *request_id = 0; + + ret = hv_ring_buffer_peek( + &channel->inbound, &desc, + sizeof(hv_vm_packet_descriptor)); + + if (ret != 0) + return (0); + + packetLen = desc.length8 << 3; + userLen = packetLen - (desc.data_offset8 << 3); + + *buffer_actual_len = packetLen; + + if (packetLen > buffer_len) + return (ENOBUFS); + + *request_id = desc.transaction_id; + + /* Copy over the entire packet to the user buffer */ + ret = hv_ring_buffer_read(&channel->inbound, buffer, packetLen, 0); + + return (0); +} diff --git a/sys/contrib/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/contrib/dev/hyperv/vmbus/hv_channel_mgmt.c new file mode 100644 index 0000000..011e305 --- /dev/null +++ b/sys/contrib/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -0,0 +1,680 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+#include "hv_vmbus_priv.h"
+
+typedef void (*hv_pfn_channel_msg_handler)(hv_vmbus_channel_msg_header* msg);
+
+typedef struct hv_vmbus_channel_msg_table_entry {
+	hv_vmbus_channel_msg_type	message_type;
+	hv_pfn_channel_msg_handler	message_handler;
+} hv_vmbus_channel_msg_table_entry;
+
+/*
+ * Internal functions
+ */
+
+static void vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr);
+static void vmbus_channel_process_offer(void *context);
+
+/**
+ * Channel message dispatch table
+ */
+hv_vmbus_channel_msg_table_entry
+    g_channel_message_table[HV_CHANNEL_MESSAGE_COUNT] = {
+	{ HV_CHANNEL_MESSAGE_INVALID, NULL },
+	{ HV_CHANNEL_MESSAGE_OFFER_CHANNEL, vmbus_channel_on_offer },
+	{ HV_CHANNEL_MESSAGE_RESCIND_CHANNEL_OFFER,
+		vmbus_channel_on_offer_rescind },
+	{ HV_CHANNEL_MESSAGE_REQUEST_OFFERS, NULL },
+	{ HV_CHANNEL_MESSAGE_ALL_OFFERS_DELIVERED,
+		vmbus_channel_on_offers_delivered },
+	{ HV_CHANNEL_MESSAGE_OPEN_CHANNEL, NULL },
+	{ HV_CHANNEL_MESSAGE_OPEN_CHANNEL_RESULT,
+		vmbus_channel_on_open_result },
+	{ HV_CHANNEL_MESSAGE_CLOSE_CHANNEL, NULL },
+	{ HV_CHANNEL_MESSAGEL_GPADL_HEADER, NULL },
+	{ HV_CHANNEL_MESSAGE_GPADL_BODY, NULL },
+	{ HV_CHANNEL_MESSAGE_GPADL_CREATED,
+		vmbus_channel_on_gpadl_created },
+	{ HV_CHANNEL_MESSAGE_GPADL_TEARDOWN, NULL },
+	{ HV_CHANNEL_MESSAGE_GPADL_TORNDOWN,
+		vmbus_channel_on_gpadl_torndown },
+	{ HV_CHANNEL_MESSAGE_REL_ID_RELEASED, NULL },
+	{ HV_CHANNEL_MESSAGE_INITIATED_CONTACT, NULL },
+	{ HV_CHANNEL_MESSAGE_VERSION_RESPONSE,
+		vmbus_channel_on_version_response },
+	{ HV_CHANNEL_MESSAGE_UNLOAD, NULL }
+};
+
+/**
+ * Implementation of the work abstraction.
+ */
+static void
+work_item_callback(void *work, int pending)
+{
+	struct hv_work_item *w = (struct hv_work_item *)work;
+
+	/*
+	 * Serialize work execution.
+	 */
+	if (w->wq->work_sema != NULL) {
+		sema_wait(w->wq->work_sema);
+	}
+
+	w->callback(w->context);
+
+	if (w->wq->work_sema != NULL) {
+		sema_post(w->wq->work_sema);
+	}
+
+	free(w, M_DEVBUF);
+}
+
+struct hv_work_queue*
+hv_work_queue_create(char* name)
+{
+	static unsigned int qid = 0;
+	char qname[64];
+	int pri;
+	struct hv_work_queue* wq;
+
+	wq = malloc(sizeof(struct hv_work_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
+	KASSERT(wq != NULL, ("Error VMBUS: Failed to allocate work_queue\n"));
+	if (wq == NULL)
+	    return (NULL);
+
+	/*
+	 * We use the work abstraction to handle messages coming from the
+	 * host; these are typically offers. Some FreeBSD drivers appear
+	 * to have a concurrency issue where probe/attach needs to be
+	 * serialized. We ensure that by having only one thread process
+	 * the work elements in a specific queue, serializing work
+	 * execution.
+	 */
+	if (strcmp(name, "vmbusQ") == 0) {
+	    pri = PI_DISK;
+	} else { /* control */
+	    pri = PI_NET;
+	    /*
+	     * Initialize the semaphore for this queue by pointing
+	     * to the global semaphore used for synchronizing all
+	     * control messages.
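+	     * Because every "control" queue shares this one global
+	     * semaphore, offer processing (and hence device probe/attach)
+	     * is effectively single-threaded across all control queues.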
+ */ + wq->work_sema = &hv_vmbus_g_connection.control_sema; + } + + sprintf(qname, "hv_%s_%u", name, qid); + + /* + * Fixme: FreeBSD 8.2 has a different prototype for + * taskqueue_create(), and for certain other taskqueue functions. + * We need to research the implications of these changes. + * Fixme: Not sure when the changes were introduced. + */ + wq->queue = taskqueue_create(qname, M_NOWAIT, taskqueue_thread_enqueue, + &wq->queue + #if __FreeBSD_version < 800000 + , &wq->proc + #endif + ); + + if (wq->queue == NULL) { + free(wq, M_DEVBUF); + return (NULL); + } + + if (taskqueue_start_threads(&wq->queue, 1, pri, "%s taskq", qname)) { + taskqueue_free(wq->queue); + free(wq, M_DEVBUF); + return (NULL); + } + + qid++; + + return (wq); +} + +void +hv_work_queue_close(struct hv_work_queue *wq) +{ + /* + * KYS: Need to drain the taskqueue + * before we close the hv_work_queue. + */ + /*KYS: taskqueue_drain(wq->tq, ); */ + taskqueue_free(wq->queue); + free(wq, M_DEVBUF); +} + +/** + * @brief Create work item + */ +int +hv_queue_work_item( + struct hv_work_queue *wq, + void (*callback)(void *), void *context) +{ + struct hv_work_item *w = malloc(sizeof(struct hv_work_item), + M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT(w != NULL, ("Error VMBUS: Failed to allocate WorkItem\n")); + if (w == NULL) + return (ENOMEM); + + w->callback = callback; + w->context = context; + w->wq = wq; + + TASK_INIT(&w->work, 0, work_item_callback, w); + + return (taskqueue_enqueue(wq->queue, &w->work)); +} + +/** + * @brief Rescind the offer by initiating a device removal + */ +static void +vmbus_channel_process_rescind_offer(void *context) +{ + hv_vmbus_channel* channel = (hv_vmbus_channel*) context; + hv_vmbus_child_device_unregister(channel->device); +} + +/** + * @brief Allocate and initialize a vmbus channel object + */ +hv_vmbus_channel* +hv_vmbus_allocate_channel(void) +{ + hv_vmbus_channel* channel; + + channel = (hv_vmbus_channel*) malloc( + sizeof(hv_vmbus_channel), + M_DEVBUF, + M_NOWAIT | M_ZERO); + KASSERT(channel != NULL, ("Error VMBUS: Failed to allocate channel!")); + if (channel == NULL) + return (NULL); + + mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); + + channel->control_work_queue = hv_work_queue_create("control"); + + if (channel->control_work_queue == NULL) { + mtx_destroy(&channel->inbound_lock); + free(channel, M_DEVBUF); + return (NULL); + } + + return (channel); +} + +/** + * @brief Release the vmbus channel object itself + */ +static inline void +ReleaseVmbusChannel(void *context) +{ + hv_vmbus_channel* channel = (hv_vmbus_channel*) context; + hv_work_queue_close(channel->control_work_queue); + free(channel, M_DEVBUF); +} + +/** + * @brief Release the resources used by the vmbus channel object + */ +void +hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) +{ + mtx_destroy(&channel->inbound_lock); + /* + * We have to release the channel's workqueue/thread in + * the vmbus's workqueue/thread context + * ie we can't destroy ourselves + */ + hv_queue_work_item(hv_vmbus_g_connection.work_queue, + ReleaseVmbusChannel, (void *) channel); +} + +/** + * @brief Process the offer by creating a channel/device + * associated with this offer + */ +static void +vmbus_channel_process_offer(void *context) +{ + int ret; + hv_vmbus_channel* new_channel; + boolean_t f_new; + hv_vmbus_channel* channel; + + new_channel = (hv_vmbus_channel*) context; + f_new = TRUE; + channel = NULL; + + /* + * Make sure this is a new offer + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + + 
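+	/*
+	 * An offer is a duplicate when both its interface type GUID and
+	 * its interface instance GUID match a channel already on the
+	 * global list; duplicates are freed instead of being re-added.
+	 */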
TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, + list_entry) + { + if (!memcmp( + &channel->offer_msg.offer.interface_type, + &new_channel->offer_msg.offer.interface_type, + sizeof(hv_guid)) + && !memcmp( + &channel->offer_msg.offer.interface_instance, + &new_channel->offer_msg.offer.interface_instance, + sizeof(hv_guid))) { + f_new = FALSE; + break; + } + } + + if (f_new) { + /* Insert at tail */ + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + } + mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + + if (!f_new) { + hv_vmbus_free_vmbus_channel(new_channel); + return; + } + + /* + * Start the process of binding this offer to the driver + * (We need to set the device field before calling + * hv_vmbus_child_device_add()) + */ + new_channel->device = hv_vmbus_child_device_create( + new_channel->offer_msg.offer.interface_type, + new_channel->offer_msg.offer.interface_instance, new_channel); + + /* + * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below + * but in the "open" channel request. The ret != 0 logic below + * doesn't take into account that a channel + * may have been opened successfully + */ + + /* + * Add the new device to the bus. This will kick off device-driver + * binding which eventually invokes the device driver's AddDevice() + * method. + */ + ret = hv_vmbus_child_device_register(new_channel->device); + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_free_vmbus_channel(new_channel); + } else { + /* + * This state is used to indicate a successful open + * so that when we do close the channel normally, + * we can clean up properly + */ + new_channel->state = HV_CHANNEL_OPEN_STATE; + + } +} + +/** + * @brief Handler for channel offers from Hyper-V/Azure + * + * Handler for channel offers from vmbus in parent partition. We ignore + * all offers except network and storage offers. For each network and storage + * offers, we create a channel object and queue a work item to the channel + * object to process the offer synchronously + */ +static void +vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) +{ + hv_vmbus_channel_offer_channel* offer; + hv_vmbus_channel* new_channel; + + offer = (hv_vmbus_channel_offer_channel*) hdr; + + hv_guid *guidType; + hv_guid *guidInstance; + + guidType = &offer->offer.interface_type; + guidInstance = &offer->offer.interface_instance; + + /* Allocate the channel object and save this offer */ + new_channel = hv_vmbus_allocate_channel(); + if (new_channel == NULL) + return; + + memcpy(&new_channel->offer_msg, offer, + sizeof(hv_vmbus_channel_offer_channel)); + new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; + new_channel->monitor_bit = (uint8_t) offer->monitor_id % 32; + + /* TODO: Make sure the offer comes from our parent partition */ + hv_queue_work_item( + new_channel->control_work_queue, + vmbus_channel_process_offer, + new_channel); +} + +/** + * @brief Rescind offer handler. 
+ *
+ * We queue a work item to process this offer
+ * synchronously.
+ */
+static void
+vmbus_channel_on_offer_rescind(hv_vmbus_channel_msg_header* hdr)
+{
+	hv_vmbus_channel_rescind_offer*	rescind;
+	hv_vmbus_channel*		channel;
+
+	rescind = (hv_vmbus_channel_rescind_offer*) hdr;
+
+	channel = hv_vmbus_get_channel_from_rel_id(rescind->child_rel_id);
+	if (channel == NULL)
+	    return;
+
+	hv_queue_work_item(channel->control_work_queue,
+	    vmbus_channel_process_rescind_offer, channel);
+}
+
+/**
+ * @brief Invoked when all offers have been delivered.
+ */
+static void
+vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr)
+{
+}
+
+/**
+ * @brief Open result handler.
+ *
+ * This is invoked when we receive a response to our channel open
+ * request. Find the matching request, copy the response, and signal
+ * the requesting thread.
+ */
+static void
+vmbus_channel_on_open_result(hv_vmbus_channel_msg_header* hdr)
+{
+	hv_vmbus_channel_open_result*	result;
+	hv_vmbus_channel_msg_info*	msg_info;
+	hv_vmbus_channel_msg_header*	request_header;
+	hv_vmbus_channel_open_channel*	open_msg;
+
+	result = (hv_vmbus_channel_open_result*) hdr;
+
+	/*
+	 * Find the open msg, copy the result and signal/unblock the wait event
+	 */
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
+	    msg_list_entry) {
+	    request_header = (hv_vmbus_channel_msg_header*) msg_info->msg;
+
+	    if (request_header->message_type ==
+			    HV_CHANNEL_MESSAGE_OPEN_CHANNEL) {
+		    open_msg = (hv_vmbus_channel_open_channel*) msg_info->msg;
+		    if (open_msg->child_rel_id == result->child_rel_id
+			&& open_msg->open_id == result->open_id) {
+			    memcpy(&msg_info->response.open_result, result,
+				sizeof(hv_vmbus_channel_open_result));
+			    sema_post(&msg_info->wait_sema);
+			    break;
+		    }
+	    }
+	}
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+}
+
+/**
+ * @brief GPADL created handler.
+ *
+ * This is invoked when we receive a response to our gpadl create
+ * request. Find the matching request, copy the response, and signal
+ * the requesting thread.
+ */
+static void
+vmbus_channel_on_gpadl_created(hv_vmbus_channel_msg_header* hdr)
+{
+	hv_vmbus_channel_gpadl_created*	gpadl_created;
+	hv_vmbus_channel_msg_info*	msg_info;
+	hv_vmbus_channel_msg_header*	request_header;
+	hv_vmbus_channel_gpadl_header*	gpadl_header;
+
+	gpadl_created = (hv_vmbus_channel_gpadl_created*) hdr;
+
+	/*
+	 * Find the establish msg, copy the result and signal/unblock
+	 * the wait event
+	 */
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
+		msg_list_entry) {
+	    request_header = (hv_vmbus_channel_msg_header*) msg_info->msg;
+	    if (request_header->message_type ==
+		    HV_CHANNEL_MESSAGEL_GPADL_HEADER) {
+		    gpadl_header =
+			(hv_vmbus_channel_gpadl_header*) request_header;
+
+		    if ((gpadl_created->child_rel_id ==
+			    gpadl_header->child_rel_id)
+			&& (gpadl_created->gpadl == gpadl_header->gpadl)) {
+			    memcpy(&msg_info->response.gpadl_created,
+				gpadl_created,
+				sizeof(hv_vmbus_channel_gpadl_created));
+			    sema_post(&msg_info->wait_sema);
+			    break;
+		    }
+	    }
+	}
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+}
+
+/**
+ * @brief GPADL torndown handler.
+ *
+ * This is invoked when we receive a response to our gpadl teardown
+ * request. Find the matching request, copy the response, and signal
+ * the requesting thread.
+ */
+static void
+vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr)
+{
+	hv_vmbus_channel_gpadl_torndown*	gpadl_torndown;
+	hv_vmbus_channel_msg_info*		msg_info;
+	hv_vmbus_channel_msg_header*		request_header;
+	hv_vmbus_channel_gpadl_teardown*	gpadl_teardown;
+
+	gpadl_torndown = (hv_vmbus_channel_gpadl_torndown*)hdr;
+
+	/*
+	 * Find the teardown msg, copy the result and signal/unblock the
+	 * wait event.
+	 */
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
+		msg_list_entry) {
+	    request_header = (hv_vmbus_channel_msg_header*) msg_info->msg;
+
+	    if (request_header->message_type
+		    == HV_CHANNEL_MESSAGE_GPADL_TEARDOWN) {
+		    gpadl_teardown =
+			(hv_vmbus_channel_gpadl_teardown*) request_header;
+
+		    if (gpadl_torndown->gpadl == gpadl_teardown->gpadl) {
+			    memcpy(&msg_info->response.gpadl_torndown,
+				gpadl_torndown,
+				sizeof(hv_vmbus_channel_gpadl_torndown));
+			    sema_post(&msg_info->wait_sema);
+			    break;
+		    }
+	    }
+	}
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+}
+
+/**
+ * @brief Version response handler.
+ *
+ * This is invoked when we receive a response to our initiate contact
+ * request. Find the matching request, copy the response, and signal
+ * the requesting thread.
+ */
+static void
+vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr)
+{
+	hv_vmbus_channel_msg_info*		msg_info;
+	hv_vmbus_channel_msg_header*		request_header;
+	hv_vmbus_channel_version_response*	version_response;
+
+	version_response = (hv_vmbus_channel_version_response*)hdr;
+
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	TAILQ_FOREACH(msg_info, &hv_vmbus_g_connection.channel_msg_anchor,
+	    msg_list_entry) {
+	    request_header = (hv_vmbus_channel_msg_header*) msg_info->msg;
+	    if (request_header->message_type
+		    == HV_CHANNEL_MESSAGE_INITIATED_CONTACT) {
+		    memcpy(&msg_info->response.version_response,
+			version_response,
+			sizeof(hv_vmbus_channel_version_response));
+		    sema_post(&msg_info->wait_sema);
+	    }
+	}
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+}
+
+/**
+ * @brief Handler for channel protocol messages.
+ *
+ * This is invoked in the vmbus worker thread context.
+ */
+void
+hv_vmbus_on_channel_message(void *context)
+{
+	hv_vmbus_message*		msg;
+	hv_vmbus_channel_msg_header*	hdr;
+	int				size;
+
+	msg = (hv_vmbus_message*) context;
+	hdr = (hv_vmbus_channel_msg_header*) msg->u.payload;
+	size = msg->header.payload_size;
+
+	if (hdr->message_type >= HV_CHANNEL_MESSAGE_COUNT) {
+	    free(msg, M_DEVBUF);
+	    return;
+	}
+
+	if (g_channel_message_table[hdr->message_type].message_handler) {
+	    g_channel_message_table[hdr->message_type].message_handler(hdr);
+	}
+
+	/* Free the msg that was allocated in VmbusOnMsgDPC() */
+	free(msg, M_DEVBUF);
+}
+
+/**
+ * @brief Send a request to get all our pending offers.
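+ *
+ * The offers themselves arrive asynchronously as
+ * HV_CHANNEL_MESSAGE_OFFER_CHANNEL messages and are dispatched through
+ * g_channel_message_table to vmbus_channel_on_offer().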
+ */ +int +hv_vmbus_request_channel_offers(void) +{ + int ret; + hv_vmbus_channel_msg_header* msg; + hv_vmbus_channel_msg_info* msg_info; + + msg_info = (hv_vmbus_channel_msg_info *) + malloc(sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_msg_header), M_DEVBUF, M_NOWAIT); + + if (msg_info == NULL) { + if(bootverbose) + printf("Error VMBUS: malloc failed for Request Offers\n"); + return (ENOMEM); + } + + msg = (hv_vmbus_channel_msg_header*) msg_info->msg; + msg->message_type = HV_CHANNEL_MESSAGE_REQUEST_OFFERS; + + ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_msg_header)); + + if (msg_info) + free(msg_info, M_DEVBUF); + + return (ret); +} + +/** + * @brief Release channels that are unattached/unconnected (i.e., no drivers associated) + */ +void +hv_vmbus_release_unattached_channels(void) +{ + hv_vmbus_channel *channel; + + mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + + while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) { + channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor); + TAILQ_REMOVE(&hv_vmbus_g_connection.channel_anchor, + channel, list_entry); + + hv_vmbus_child_device_unregister(channel->device); + hv_vmbus_free_vmbus_channel(channel); + } + mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); +} diff --git a/sys/contrib/dev/hyperv/vmbus/hv_connection.c b/sys/contrib/dev/hyperv/vmbus/hv_connection.c new file mode 100644 index 0000000..c8e0b48 --- /dev/null +++ b/sys/contrib/dev/hyperv/vmbus/hv_connection.c @@ -0,0 +1,431 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <machine/bus.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include "hv_vmbus_priv.h" + +/* + * Globals + */ +hv_vmbus_connection hv_vmbus_g_connection = + { .connect_state = HV_DISCONNECTED, + .next_gpadl_handle = 0xE1E10, }; + +/** + * Send a connect request on the partition service connection + */ +int +hv_vmbus_connect(void) { + int ret = 0; + hv_vmbus_channel_msg_info* msg_info = NULL; + hv_vmbus_channel_initiate_contact* msg; + + /** + * Make sure we are not connecting or connected + */ + if (hv_vmbus_g_connection.connect_state != HV_DISCONNECTED) { + return (-1); + } + + /** + * Initialize the vmbus connection + */ + hv_vmbus_g_connection.connect_state = HV_CONNECTING; + hv_vmbus_g_connection.work_queue = hv_work_queue_create("vmbusQ"); + sema_init(&hv_vmbus_g_connection.control_sema, 1, "control_sema"); + + TAILQ_INIT(&hv_vmbus_g_connection.channel_msg_anchor); + mtx_init(&hv_vmbus_g_connection.channel_msg_lock, "vmbus channel msg", + NULL, MTX_SPIN); + + TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); + mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", + NULL, MTX_SPIN); + + /** + * Setup the vmbus event connection for channel interrupt abstraction + * stuff + */ + hv_vmbus_g_connection.interrupt_page = contigmalloc( + PAGE_SIZE, M_DEVBUF, + M_NOWAIT | M_ZERO, 0UL, + BUS_SPACE_MAXADDR, + PAGE_SIZE, 0); + KASSERT(hv_vmbus_g_connection.interrupt_page != NULL, + ("Error VMBUS: malloc failed to allocate Channel" + " Request Event message!")); + if (hv_vmbus_g_connection.interrupt_page == NULL) { + ret = ENOMEM; + goto cleanup; + } + + hv_vmbus_g_connection.recv_interrupt_page = + hv_vmbus_g_connection.interrupt_page; + + hv_vmbus_g_connection.send_interrupt_page = + ((uint8_t *) hv_vmbus_g_connection.interrupt_page + + (PAGE_SIZE >> 1)); + + /** + * Set up the monitor notification facility. 
The 1st page for + * parent->child and the 2nd page for child->parent + */ + hv_vmbus_g_connection.monitor_pages = contigmalloc( + 2 * PAGE_SIZE, + M_DEVBUF, + M_NOWAIT | M_ZERO, + 0UL, + BUS_SPACE_MAXADDR, + PAGE_SIZE, + 0); + KASSERT(hv_vmbus_g_connection.monitor_pages != NULL, + ("Error VMBUS: malloc failed to allocate Monitor Pages!")); + if (hv_vmbus_g_connection.monitor_pages == NULL) { + ret = ENOMEM; + goto cleanup; + } + + msg_info = (hv_vmbus_channel_msg_info*) + malloc(sizeof(hv_vmbus_channel_msg_info) + + sizeof(hv_vmbus_channel_initiate_contact), + M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT(msg_info != NULL, + ("Error VMBUS: malloc failed for Initiate Contact message!")); + if (msg_info == NULL) { + ret = ENOMEM; + goto cleanup; + } + + sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); + msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; + msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; + + msg->interrupt_page = hv_get_phys_addr( + hv_vmbus_g_connection.interrupt_page); + + msg->monitor_page_1 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_pages); + + msg->monitor_page_2 = + hv_get_phys_addr( + ((uint8_t *) hv_vmbus_g_connection.monitor_pages + + PAGE_SIZE)); + + /** + * Add to list before we send the request since we may receive the + * response before returning from this routine + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + msg, + sizeof(hv_vmbus_channel_initiate_contact)); + + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + goto cleanup; + } + + /** + * Wait for the connection response + */ + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + /** + * Check if successful + */ + if (msg_info->response.version_response.version_supported) { + hv_vmbus_g_connection.connect_state = HV_CONNECTED; + } else { + ret = ECONNREFUSED; + goto cleanup; + } + + sema_destroy(&msg_info->wait_sema); + free(msg_info, M_DEVBUF); + + return (0); + + /* + * Cleanup after failure! 
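+	 * Everything set up above (work queue, control semaphore, locks,
+	 * interrupt and monitor pages, and the message buffer) is torn
+	 * down here, and connect_state returns to HV_DISCONNECTED so a
+	 * later hv_vmbus_connect() attempt starts from a clean slate.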
+ */ + cleanup: + + hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; + + hv_work_queue_close(hv_vmbus_g_connection.work_queue); + sema_destroy(&hv_vmbus_g_connection.control_sema); + mtx_destroy(&hv_vmbus_g_connection.channel_lock); + mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); + + if (hv_vmbus_g_connection.interrupt_page != NULL) { + contigfree( + hv_vmbus_g_connection.interrupt_page, + PAGE_SIZE, + M_DEVBUF); + hv_vmbus_g_connection.interrupt_page = NULL; + } + + if (hv_vmbus_g_connection.monitor_pages != NULL) { + contigfree( + hv_vmbus_g_connection.monitor_pages, + 2 * PAGE_SIZE, + M_DEVBUF); + hv_vmbus_g_connection.monitor_pages = NULL; + } + + if (msg_info) { + sema_destroy(&msg_info->wait_sema); + free(msg_info, M_DEVBUF); + } + + return (ret); +} + +/** + * Send a disconnect request on the partition service connection + */ +int +hv_vmbus_disconnect(void) { + int ret = 0; + hv_vmbus_channel_unload* msg; + + msg = malloc(sizeof(hv_vmbus_channel_unload), + M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT(msg != NULL, + ("Error VMBUS: malloc failed to allocate Channel Unload Msg!")); + if (msg == NULL) + return (ENOMEM); + + msg->message_type = HV_CHANNEL_MESSAGE_UNLOAD; + + ret = hv_vmbus_post_message(msg, sizeof(hv_vmbus_channel_unload)); + + + contigfree(hv_vmbus_g_connection.interrupt_page, PAGE_SIZE, M_DEVBUF); + + mtx_destroy(&hv_vmbus_g_connection.channel_msg_lock); + + hv_work_queue_close(hv_vmbus_g_connection.work_queue); + sema_destroy(&hv_vmbus_g_connection.control_sema); + + hv_vmbus_g_connection.connect_state = HV_DISCONNECTED; + + free(msg, M_DEVBUF); + + return (ret); +} + +/** + * Get the channel object given its child relative id (ie channel id) + */ +hv_vmbus_channel* +hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { + + hv_vmbus_channel* channel; + hv_vmbus_channel* foundChannel = NULL; + + /* + * TODO: + * Consider optimization where relids are stored in a fixed size array + * and channels are accessed without the need to take this lock or search + * the list. + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + TAILQ_FOREACH(channel, + &hv_vmbus_g_connection.channel_anchor, list_entry) { + + if (channel->offer_msg.child_rel_id == rel_id) { + foundChannel = channel; + break; + } + } + mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + + return (foundChannel); +} + +/** + * Process a channel event notification + */ +static void +VmbusProcessChannelEvent(uint32_t relid) +{ + hv_vmbus_channel* channel; + + /** + * Find the channel based on this relid and invokes + * the channel callback to process the event + */ + + channel = hv_vmbus_get_channel_from_rel_id(relid); + + if (channel == NULL) { + return; + } + /** + * To deal with the race condition where we might + * receive a packet while the relevant driver is + * being unloaded, dispatch the callback while + * holding the channel lock. The unloading driver + * will acquire the same channel lock to set the + * callback to NULL. This closes the window. 
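+	 *
+	 * hv_vmbus_channel_close() is the other half of this protocol:
+	 * it clears on_channel_callback while holding the same
+	 * inbound_lock before tearing the channel down.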
+ */ + + mtx_lock(&channel->inbound_lock); + if (channel->on_channel_callback != NULL) { + channel->on_channel_callback(channel->channel_callback_context); + } + mtx_unlock(&channel->inbound_lock); +} + +/** + * Handler for events + */ +void +hv_vmbus_on_events(void *arg) +{ + int dword; + int bit; + int rel_id; + int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + /* int maxdword = PAGE_SIZE >> 3; */ + + /* + * receive size is 1/2 page and divide that by 4 bytes + */ + + uint32_t* recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; + + /* + * Check events + */ + if (recv_interrupt_page != NULL) { + for (dword = 0; dword < maxdword; dword++) { + if (recv_interrupt_page[dword]) { + for (bit = 0; bit < 32; bit++) { + if (synch_test_and_clear_bit(bit, + (uint32_t *) &recv_interrupt_page[dword])) { + rel_id = (dword << 5) + bit; + if (rel_id == 0) { + /* + * Special case - + * vmbus channel protocol msg. + */ + continue; + } else { + VmbusProcessChannelEvent(rel_id); + + } + } + } + } + } + } + + return; +} + +/** + * Send a msg on the vmbus's message connection + */ +int hv_vmbus_post_message(void *buffer, size_t bufferLen) { + int ret = 0; + hv_vmbus_connection_id connId; + unsigned retries = 0; + + /* NetScaler delays from previous code were consolidated here */ + static int delayAmount[] = {100, 100, 100, 500, 500, 5000, 5000, 5000}; + + /* for(each entry in delayAmount) try to post message, + * delay a little bit before retrying + */ + for (retries = 0; + retries < sizeof(delayAmount)/sizeof(delayAmount[0]); retries++) { + connId.as_uint32_t = 0; + connId.u.id = HV_VMBUS_MESSAGE_CONNECTION_ID; + ret = hv_vmbus_post_msg_via_msg_ipc(connId, 1, buffer, bufferLen); + if (ret != HV_STATUS_INSUFFICIENT_BUFFERS) + break; + /* TODO: KYS We should use a blocking wait call */ + DELAY(delayAmount[retries]); + } + + KASSERT(ret == 0, ("Error VMBUS: Message Post Failed\n")); + + return (ret); +} + +/** + * Send an event notification to the parent + */ +int +hv_vmbus_set_event(uint32_t child_rel_id) { + int ret = 0; + + /* Each uint32_t represents 32 channels */ + + synch_set_bit(child_rel_id & 31, + (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + + (child_rel_id >> 5)))); + ret = hv_vmbus_signal_event(); + + return (ret); +} + diff --git a/sys/contrib/dev/hyperv/vmbus/hv_hv.c b/sys/contrib/dev/hyperv/vmbus/hv_hv.c new file mode 100644 index 0000000..0e73bdc --- /dev/null +++ b/sys/contrib/dev/hyperv/vmbus/hv_hv.c @@ -0,0 +1,515 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Implements low-level interactions with Hyper-V/Azure
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/timetc.h>
+#include <machine/bus.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include "hv_vmbus_priv.h"
+
+#define HV_X64_MSR_GUEST_OS_ID		0x40000000
+#define HV_X64_CPUID_MIN		0x40000005
+#define HV_X64_CPUID_MAX		0x4000ffff
+#define HV_X64_MSR_TIME_REF_COUNT	0x40000020
+
+#define HV_NANOSECONDS_PER_SEC		1000000000L
+
+static u_int hv_get_timecount(struct timecounter *tc);
+
+static inline void do_cpuid_inline(unsigned int op, unsigned int *eax,
+	unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
+	/*
+	 * Seed %ecx with the caller's value (output operand 2), not with
+	 * the pointer itself.
+	 */
+	__asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx),
+		"=d" (*edx) : "0" (op), "2" (*ecx));
+}
+
+/**
+ * Globals
+ */
+hv_vmbus_context hv_vmbus_g_context = {
+	.syn_ic_initialized = FALSE,
+	.hypercall_page = NULL,
+	.signal_event_param = NULL,
+	.signal_event_buffer = NULL,
+};
+
+static struct timecounter hv_timecounter = {
+	hv_get_timecount, 0, ~0u, HV_NANOSECONDS_PER_SEC/100, "Hyper-V",
+	HV_NANOSECONDS_PER_SEC/100
+};
+
+static u_int
+hv_get_timecount(struct timecounter *tc)
+{
+	/* The 64-bit reference counter is truncated to the counter width */
+	u_int now = hv_vmbus_read_msr(HV_X64_MSR_TIME_REF_COUNT);
+	return (now);
+}
+
+/**
+ * @brief Query the cpuid for presence of windows hypervisor
+ */
+int
+hv_vmbus_query_hypervisor_presence(void)
+{
+	u_int regs[4];
+	int hyper_v_detected = 0;
+
+	do_cpuid(1, regs);
+	if (regs[2] & 0x80000000) { /* a hypervisor is present */
+	    /*
+	     * Make sure this really is Hyper-V: CPUID leaf 0x40000000
+	     * (numerically the same as the GUEST_OS_ID MSR) returns the
+	     * hypervisor vendor signature.
+	     */
+	    do_cpuid(HV_X64_MSR_GUEST_OS_ID, regs);
+	    hyper_v_detected =
+		regs[0] >= HV_X64_CPUID_MIN &&
+		regs[0] <= HV_X64_CPUID_MAX &&
+		!memcmp("Microsoft Hv", &regs[1], 12);
+	}
+	return (hyper_v_detected);
+}
+
+/**
+ * @brief Get version of the windows hypervisor
+ */
+static int
+hv_vmbus_get_hypervisor_version(void)
+{
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+	unsigned int max_leaf;
+	unsigned int op;
+
+	/*
+	 * It's assumed that this is called after confirming that
+	 * Viridian is present.
+	 * Query id and revision.
+	 */
+	eax = 0;
+	ebx = 0;
+	ecx = 0;
+	edx = 0;
+	op = HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION;
+	do_cpuid_inline(op, &eax, &ebx, &ecx, &edx);
+
+	max_leaf = eax;
+	eax = 0;
+	ebx = 0;
+	ecx = 0;
+	edx = 0;
+	op = HV_CPU_ID_FUNCTION_HV_INTERFACE;
+	do_cpuid_inline(op, &eax, &ebx, &ecx, &edx);
+
+	if (max_leaf >= HV_CPU_ID_FUNCTION_MS_HV_VERSION) {
+	    eax = 0;
+	    ebx = 0;
+	    ecx = 0;
+	    edx = 0;
+	    op = HV_CPU_ID_FUNCTION_MS_HV_VERSION;
+	    do_cpuid_inline(op, &eax, &ebx, &ecx, &edx);
+	}
+	return (max_leaf);
+}
+
+/**
+ * @brief Invoke the specified hypercall
+ */
+static uint64_t
+hv_vmbus_do_hypercall(uint64_t control, void* input, void* output)
+{
+#ifdef __x86_64__
+	uint64_t hv_status = 0;
+	uint64_t input_address = (input) ?
hv_get_phys_addr(input) : 0; + uint64_t output_address = (output) ? hv_get_phys_addr(output) : 0; + volatile void* hypercall_page = hv_vmbus_g_context.hypercall_page; + + __asm__ __volatile__ ("mov %0, %%r8" : : "r" (output_address): "r8"); + __asm__ __volatile__ ("call *%3" : "=a"(hv_status): + "c" (control), "d" (input_address), + "m" (hypercall_page)); + return (hv_status); +#else + uint32_t control_high = control >> 32; + uint32_t control_low = control & 0xFFFFFFFF; + uint32_t hv_status_high = 1; + uint32_t hv_status_low = 1; + uint64_t input_address = (input) ? hv_get_phys_addr(input) : 0; + uint32_t input_address_high = input_address >> 32; + uint32_t input_address_low = input_address & 0xFFFFFFFF; + uint64_t output_address = (output) ? hv_get_phys_addr(output) : 0; + uint32_t output_address_high = output_address >> 32; + uint32_t output_address_low = output_address & 0xFFFFFFFF; + volatile void* hypercall_page = hv_vmbus_g_context.hypercall_page; + + __asm__ __volatile__ ("call *%8" : "=d"(hv_status_high), + "=a"(hv_status_low) : "d" (control_high), + "a" (control_low), "b" (input_address_high), + "c" (input_address_low), + "D"(output_address_high), + "S"(output_address_low), "m" (hypercall_page)); + return (hv_status_low | ((uint64_t)hv_status_high << 32)); +#endif /* __x86_64__ */ +} + +/** + * @brief Main initialization routine. + * + * This routine must be called + * before any other routines in here are called + */ +int +hv_vmbus_init(void) +{ + int max_leaf; + hv_vmbus_x64_msr_hypercall_contents hypercall_msr; + void* virt_addr = 0; + + memset( + hv_vmbus_g_context.syn_ic_event_page, + 0, + sizeof(hv_vmbus_handle) * MAXCPU); + + memset( + hv_vmbus_g_context.syn_ic_msg_page, + 0, + sizeof(hv_vmbus_handle) * MAXCPU); + + if (!hv_vmbus_query_hypervisor_presence()) + goto cleanup; + + max_leaf = hv_vmbus_get_hypervisor_version(); + + /* + * Write our OS info + */ + uint64_t os_guest_info = HV_FREEBSD_GUEST_ID; + hv_vmbus_write_msr(HV_X64_MSR_GUEST_OS_ID, os_guest_info); + hv_vmbus_g_context.guest_id = os_guest_info; + + /* + * See if the hypercall page is already set + */ + hypercall_msr.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_HYPERCALL); + virt_addr = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); + KASSERT(virt_addr != NULL, + ("Error VMBUS: malloc failed to allocate page during init!")); + if (virt_addr == NULL) + goto cleanup; + + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = + (hv_get_phys_addr(virt_addr) >> PAGE_SHIFT); + hv_vmbus_write_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64_t); + + /* + * Confirm that hypercall page did get set up + */ + hypercall_msr.as_uint64_t = 0; + hypercall_msr.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_HYPERCALL); + + if (!hypercall_msr.enable) + goto cleanup; + + hv_vmbus_g_context.hypercall_page = virt_addr; + + /* + * Setup the global signal event param for the signal event hypercall + */ + hv_vmbus_g_context.signal_event_buffer = + malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, + M_ZERO | M_NOWAIT); + KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, + ("Error VMBUS: Failed to allocate signal_event_buffer\n")); + if (hv_vmbus_g_context.signal_event_buffer == NULL) + goto cleanup; + + hv_vmbus_g_context.signal_event_param = + (hv_vmbus_input_signal_event*) + (HV_ALIGN_UP((unsigned long) + hv_vmbus_g_context.signal_event_buffer, + HV_HYPERCALL_PARAM_ALIGN)); + hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; + hv_vmbus_g_context.signal_event_param->connection_id.u.id = + 
HV_VMBUS_EVENT_CONNECTION_ID; + hv_vmbus_g_context.signal_event_param->flag_number = 0; + hv_vmbus_g_context.signal_event_param->rsvd_z = 0; + + tc_init(&hv_timecounter); /* register virtual timecount */ + + return (0); + + cleanup: + if (virt_addr != NULL) { + if (hypercall_msr.enable) { + hypercall_msr.as_uint64_t = 0; + hv_vmbus_write_msr(HV_X64_MSR_HYPERCALL, + hypercall_msr.as_uint64_t); + } + + free(virt_addr, M_DEVBUF); + } + return (ENOTSUP); +} + +/** + * @brief Cleanup routine, called normally during driver unloading or exiting + */ +void +hv_vmbus_cleanup(void) +{ + hv_vmbus_x64_msr_hypercall_contents hypercall_msr; + + if (hv_vmbus_g_context.signal_event_buffer != NULL) { + free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); + hv_vmbus_g_context.signal_event_buffer = NULL; + hv_vmbus_g_context.signal_event_param = NULL; + } + + if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { + if (hv_vmbus_g_context.hypercall_page != NULL) { + hypercall_msr.as_uint64_t = 0; + hv_vmbus_write_msr(HV_X64_MSR_HYPERCALL, + hypercall_msr.as_uint64_t); + free(hv_vmbus_g_context.hypercall_page, M_DEVBUF); + hv_vmbus_g_context.hypercall_page = NULL; + } + } +} + +/** + * @brief Post a message using the hypervisor message IPC. + * (This involves a hypercall.) + */ +hv_vmbus_status +hv_vmbus_post_msg_via_msg_ipc( + hv_vmbus_connection_id connection_id, + hv_vmbus_msg_type message_type, + void* payload, + size_t payload_size) +{ + struct alignedinput { + uint64_t alignment8; + hv_vmbus_input_post_message msg; + }; + + hv_vmbus_input_post_message* aligned_msg; + hv_vmbus_status status; + size_t addr; + + if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return (EMSGSIZE); + + addr = (size_t) malloc(sizeof(struct alignedinput), M_DEVBUF, + M_ZERO | M_NOWAIT); + KASSERT(addr != 0, + ("Error VMBUS: malloc failed to allocate message buffer!")); + if (addr == 0) + return (ENOMEM); + + aligned_msg = (hv_vmbus_input_post_message*) + (HV_ALIGN_UP(addr, HV_HYPERCALL_PARAM_ALIGN)); + + aligned_msg->connection_id = connection_id; + aligned_msg->message_type = message_type; + aligned_msg->payload_size = payload_size; + memcpy((void*) aligned_msg->payload, payload, payload_size); + + status = hv_vmbus_do_hypercall( + HV_CALL_POST_MESSAGE, aligned_msg, 0) & 0xFFFF; + + free((void *) addr, M_DEVBUF); + return (status); +} + +/** + * @brief Signal an event on the specified connection using the hypervisor + * event IPC. (This involves a hypercall.) + */ +hv_vmbus_status +hv_vmbus_signal_event() +{ + hv_vmbus_status status; + + status = hv_vmbus_do_hypercall( + HV_CALL_SIGNAL_EVENT, + hv_vmbus_g_context.signal_event_param, + 0) & 0xFFFF; + + return (status); +} + +/** + * @brief hv_vmbus_synic_init + */ +void +hv_vmbus_synic_init(void *irq_arg) + +{ + int cpu; + uint32_t irq_vector; + hv_vmbus_synic_simp simp; + hv_vmbus_synic_siefp siefp; + hv_vmbus_synic_scontrol sctrl; + hv_vmbus_synic_sint shared_sint; + uint64_t version; + + irq_vector = *((uint32_t *) (irq_arg)); + cpu = PCPU_GET(cpuid); + + if (hv_vmbus_g_context.hypercall_page == NULL) + return; + + /* + * KYS: Looks like we can only initialize on cpu0; don't we support + * SMP guests? 
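+	 *
+	 * Note: the SIMP/SIEFP/SINT MSRs written below are per-CPU state,
+	 * so a complete implementation would have to run this routine on
+	 * every processor, not just CPU 0.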
+ *
+ * TODO: Need to add SMP support for FreeBSD V9
+ */
+
+	if (cpu != 0)
+	    return;
+
+	/*
+	 * TODO: Check the version
+	 */
+	version = hv_vmbus_read_msr(HV_X64_MSR_SVERSION);
+
+	hv_vmbus_g_context.syn_ic_msg_page[cpu] =
+	    malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
+	KASSERT(hv_vmbus_g_context.syn_ic_msg_page[cpu] != NULL,
+	    ("Error VMBUS: malloc failed for allocating page!"));
+	if (hv_vmbus_g_context.syn_ic_msg_page[cpu] == NULL)
+	    goto cleanup;
+
+	hv_vmbus_g_context.syn_ic_event_page[cpu] =
+	    malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
+	KASSERT(hv_vmbus_g_context.syn_ic_event_page[cpu] != NULL,
+	    ("Error VMBUS: malloc failed to allocate page!"));
+	if (hv_vmbus_g_context.syn_ic_event_page[cpu] == NULL)
+	    goto cleanup;
+
+	/*
+	 * Setup the Synic's message page
+	 */
+	simp.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_SIMP);
+	simp.simp_enabled = 1;
+	simp.base_simp_gpa = ((hv_get_phys_addr(
+	    hv_vmbus_g_context.syn_ic_msg_page[cpu])) >> PAGE_SHIFT);
+
+	hv_vmbus_write_msr(HV_X64_MSR_SIMP, simp.as_uint64_t);
+
+	/*
+	 * Setup the Synic's event page
+	 */
+	siefp.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_SIEFP);
+	siefp.siefp_enabled = 1;
+	siefp.base_siefp_gpa = ((hv_get_phys_addr(
+	    hv_vmbus_g_context.syn_ic_event_page[cpu])) >> PAGE_SHIFT);
+
+	hv_vmbus_write_msr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
+
+	/* Zero the reserved fields before programming the shared SINT */
+	shared_sint.as_uint64_t = 0;
+	shared_sint.vector = irq_vector; /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */
+	shared_sint.masked = FALSE;
+	shared_sint.auto_eoi = FALSE;
+
+	hv_vmbus_write_msr(
+	    HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
+	    shared_sint.as_uint64_t);
+
+	/* Enable the global synic bit */
+	sctrl.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_SCONTROL);
+	sctrl.enable = 1;
+
+	hv_vmbus_write_msr(HV_X64_MSR_SCONTROL, sctrl.as_uint64_t);
+
+	hv_vmbus_g_context.syn_ic_initialized = TRUE;
+
+	return;
+
+	cleanup:
+	/*
+	 * Free both pages, not the message page twice; free(9) tolerates
+	 * NULL, so the page whose allocation failed is handled correctly.
+	 */
+	free(hv_vmbus_g_context.syn_ic_msg_page[cpu], M_DEVBUF);
+	free(hv_vmbus_g_context.syn_ic_event_page[cpu], M_DEVBUF);
+}
+
+/**
+ * @brief Cleanup routine for hv_vmbus_synic_init()
+ */
+void hv_vmbus_synic_cleanup(void *arg)
+{
+	hv_vmbus_synic_sint	shared_sint;
+	hv_vmbus_synic_simp	simp;
+	hv_vmbus_synic_siefp	siefp;
+	int			cpu = PCPU_GET(cpuid);
+
+	if (!hv_vmbus_g_context.syn_ic_initialized)
+	    return;
+
+	if (cpu != 0)
+	    return; /* TODO: XXXKYS: SMP? */
+
+	shared_sint.as_uint64_t = hv_vmbus_read_msr(
+	    HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT);
+
+	shared_sint.masked = 1;
+
+	/*
+	 * Disable the interrupt
+	 */
+	hv_vmbus_write_msr(
+	    HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
+	    shared_sint.as_uint64_t);
+
+	simp.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_SIMP);
+	simp.simp_enabled = 0;
+	simp.base_simp_gpa = 0;
+
+	hv_vmbus_write_msr(HV_X64_MSR_SIMP, simp.as_uint64_t);
+
+	siefp.as_uint64_t = hv_vmbus_read_msr(HV_X64_MSR_SIEFP);
+	siefp.siefp_enabled = 0;
+	siefp.base_siefp_gpa = 0;
+
+	hv_vmbus_write_msr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
+
+	/*
+	 * The SynIC pages were allocated with malloc(9) above, so they
+	 * must be released with free(9), not contigfree(9).
+	 */
+	free(hv_vmbus_g_context.syn_ic_msg_page[cpu], M_DEVBUF);
+	free(hv_vmbus_g_context.syn_ic_event_page[cpu], M_DEVBUF);
+}
+
diff --git a/sys/contrib/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/contrib/dev/hyperv/vmbus/hv_ring_buffer.c
new file mode 100644
index 0000000..f7c1965
--- /dev/null
+++ b/sys/contrib/dev/hyperv/vmbus/hv_ring_buffer.c
@@ -0,0 +1,440 @@
+/*-
+ * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include "hv_vmbus_priv.h"
+
+/* Amount of space to write to (fully parenthesized for macro safety) */
+#define HV_BYTES_AVAIL_TO_WRITE(r, w, z)	\
+	(((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w)))
+
+/**
+ * @brief Get number of bytes available to read and to write to
+ * for the specified ring buffer
+ */
+static inline void
+get_ring_buffer_avail_bytes(
+	    hv_vmbus_ring_buffer_info*	rbi,
+	    uint32_t*			read,
+	    uint32_t*			write)
+{
+	uint32_t read_loc, write_loc;
+
+	/*
+	 * Capture the read/write indices before they change
+	 */
+	read_loc = rbi->ring_buffer->read_index;
+	write_loc = rbi->ring_buffer->write_index;
+
+	*write = HV_BYTES_AVAIL_TO_WRITE(
+		read_loc, write_loc, rbi->ring_data_size);
+	*read = rbi->ring_data_size - *write;
+}
+
+/**
+ * @brief Get the next write location for the specified ring buffer
+ */
+static inline uint32_t
+get_next_write_location(hv_vmbus_ring_buffer_info* ring_info)
+{
+	uint32_t next = ring_info->ring_buffer->write_index;
+	return (next);
+}
+
+/**
+ * @brief Set the next write location for the specified ring buffer
+ */
+static inline void
+set_next_write_location(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	uint32_t			next_write_location)
+{
+	ring_info->ring_buffer->write_index = next_write_location;
+}
+
+/**
+ * @brief Get the next read location for the specified ring buffer
+ */
+static inline uint32_t
+get_next_read_location(hv_vmbus_ring_buffer_info* ring_info)
+{
+	uint32_t next = ring_info->ring_buffer->read_index;
+	return (next);
+}
+
+/**
+ * @brief Get the next read location + offset for the specified ring buffer.
+ * This allows the caller to skip.
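+ * The skip is used to step over bytes that have already been consumed:
+ * hv_vmbus_channel_recv_packet(), for example, passes
+ * (desc.data_offset8 << 3) so that only the user payload is copied out.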
+ */
+static inline uint32_t
+get_next_read_location_with_offset(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	uint32_t			offset)
+{
+	uint32_t next = ring_info->ring_buffer->read_index;
+	next += offset;
+	next %= ring_info->ring_data_size;
+	return (next);
+}
+
+/**
+ * @brief Set the next read location for the specified ring buffer
+ */
+static inline void
+set_next_read_location(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	uint32_t			next_read_location)
+{
+	ring_info->ring_buffer->read_index = next_read_location;
+}
+
+/**
+ * @brief Get the start of the ring buffer
+ */
+static inline void *
+get_ring_buffer(hv_vmbus_ring_buffer_info* ring_info)
+{
+	return (void *) ring_info->ring_buffer->buffer;
+}
+
+/**
+ * @brief Get the size of the ring buffer.
+ */
+static inline uint32_t
+get_ring_buffer_size(hv_vmbus_ring_buffer_info* ring_info)
+{
+	return ring_info->ring_data_size;
+}
+
+/**
+ * Get the write index of the specified ring buffer, shifted into the
+ * high 32 bits of a uint64_t; the low 32 bits are left zero. This value
+ * is appended to each packet as the "previous indices".
+ */
+static inline uint64_t
+get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info)
+{
+	return (uint64_t) ring_info->ring_buffer->write_index << 32;
+}
+
+static uint32_t copy_to_ring_buffer(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	uint32_t			start_write_offset,
+	char*				src,
+	uint32_t			src_len);
+
+static uint32_t copy_from_ring_buffer(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	char*				dest,
+	uint32_t			dest_len,
+	uint32_t			start_read_offset);
+
+/**
+ * @brief Get the interrupt mask for the specified ring buffer.
+ */
+uint32_t
+hv_vmbus_get_ring_buffer_interrupt_mask(hv_vmbus_ring_buffer_info *rbi)
+{
+	return rbi->ring_buffer->interrupt_mask;
+}
+
+/**
+ * @brief Initialize the ring buffer.
+ */
+int
+hv_vmbus_ring_buffer_init(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	void*				buffer,
+	uint32_t			buffer_len)
+{
+	memset(ring_info, 0, sizeof(hv_vmbus_ring_buffer_info));
+
+	ring_info->ring_buffer = (hv_vmbus_ring_buffer*) buffer;
+	ring_info->ring_buffer->read_index =
+	    ring_info->ring_buffer->write_index = 0;
+
+	ring_info->ring_size = buffer_len;
+	ring_info->ring_data_size = buffer_len - sizeof(hv_vmbus_ring_buffer);
+
+	mtx_init(&ring_info->ring_lock, "vmbus ring buffer", NULL, MTX_SPIN);
+
+	return (0);
+}
+
+/**
+ * @brief Cleanup the ring buffer.
+ */
+void hv_ring_buffer_cleanup(hv_vmbus_ring_buffer_info* ring_info)
+{
+	mtx_destroy(&ring_info->ring_lock);
+}
+
+/**
+ * @brief Write to the ring buffer.
+ */
+int
+hv_ring_buffer_write(
+	hv_vmbus_ring_buffer_info*	out_ring_info,
+	hv_vmbus_sg_buffer_list		sg_buffers[],
+	uint32_t			sg_buffer_count)
+{
+	int i = 0;
+	uint32_t byte_avail_to_write;
+	uint32_t byte_avail_to_read;
+	uint32_t total_bytes_to_write = 0;
+
+	volatile uint32_t next_write_location;
+	uint64_t prev_indices = 0;
+
+	for (i = 0; i < sg_buffer_count; i++) {
+	    total_bytes_to_write += sg_buffers[i].length;
+	}
+
+	total_bytes_to_write += sizeof(uint64_t);
+
+	mtx_lock_spin(&out_ring_info->ring_lock);
+
+	get_ring_buffer_avail_bytes(out_ring_info, &byte_avail_to_read,
+	    &byte_avail_to_write);
+
+	/*
+	 * If there is only room for the packet, assume it is full.
+ * Otherwise, the next time around, we think the ring buffer + * is empty since the read index == write index + */ + + if (byte_avail_to_write <= total_bytes_to_write) { + + mtx_unlock_spin(&out_ring_info->ring_lock); + return (EAGAIN); + } + + /* + * Write to the ring buffer + */ + next_write_location = get_next_write_location(out_ring_info); + + for (i = 0; i < sg_buffer_count; i++) { + next_write_location = copy_to_ring_buffer(out_ring_info, + next_write_location, (char *) sg_buffers[i].data, + sg_buffers[i].length); + } + + /* + * Set previous packet start + */ + prev_indices = get_ring_buffer_indices(out_ring_info); + + next_write_location = copy_to_ring_buffer( + out_ring_info, next_write_location, + (char *) &prev_indices, sizeof(uint64_t)); + + /* + * Make sure we flush all writes before updating the writeIndex + */ + wmb(); + + /* + * Now, update the write location + */ + set_next_write_location(out_ring_info, next_write_location); + + mtx_unlock_spin(&out_ring_info->ring_lock); + + return (0); +} + +/** + * @brief Read without advancing the read index. + */ +int +hv_ring_buffer_peek( + hv_vmbus_ring_buffer_info* in_ring_info, + void* buffer, + uint32_t buffer_len) +{ + uint32_t bytesAvailToWrite; + uint32_t bytesAvailToRead; + uint32_t nextReadLocation = 0; + + mtx_lock_spin(&in_ring_info->ring_lock); + + get_ring_buffer_avail_bytes(in_ring_info, &bytesAvailToRead, + &bytesAvailToWrite); + + /* + * Make sure there is something to read + */ + if (bytesAvailToRead < buffer_len) { + mtx_unlock_spin(&in_ring_info->ring_lock); + return (EAGAIN); + } + + /* + * Convert to byte offset + */ + nextReadLocation = get_next_read_location(in_ring_info); + + nextReadLocation = copy_from_ring_buffer( + in_ring_info, (char *)buffer, buffer_len, nextReadLocation); + + mtx_unlock_spin(&in_ring_info->ring_lock); + + return (0); +} + +/** + * @brief Read and advance the read index. + */ +int +hv_ring_buffer_read( + hv_vmbus_ring_buffer_info* in_ring_info, + void* buffer, + uint32_t buffer_len, + uint32_t offset) +{ + uint32_t bytes_avail_to_write; + uint32_t bytes_avail_to_read; + uint32_t next_read_location = 0; + uint64_t prev_indices = 0; + + if (buffer_len <= 0) + return (EINVAL); + + mtx_lock_spin(&in_ring_info->ring_lock); + + get_ring_buffer_avail_bytes( + in_ring_info, &bytes_avail_to_read, + &bytes_avail_to_write); + + /* + * Make sure there is something to read + */ + if (bytes_avail_to_read < buffer_len) { + mtx_unlock_spin(&in_ring_info->ring_lock); + return (EAGAIN); + } + + next_read_location = get_next_read_location_with_offset( + in_ring_info, + offset); + + next_read_location = copy_from_ring_buffer( + in_ring_info, + (char *) buffer, + buffer_len, + next_read_location); + + next_read_location = copy_from_ring_buffer( + in_ring_info, + (char *) &prev_indices, + sizeof(uint64_t), + next_read_location); + + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + wmb(); + + /* + * Update the read index + */ + set_next_read_location(in_ring_info, next_read_location); + + mtx_unlock_spin(&in_ring_info->ring_lock); + + return (0); +} + +/** + * @brief Helper routine to copy from source to ring buffer. + * + * Assume there is enough room. Handles wrap-around in dest case only! 
+ */
+static uint32_t
+copy_to_ring_buffer(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	uint32_t			start_write_offset,
+	char*				src,
+	uint32_t			src_len)
+{
+	char *ring_buffer = get_ring_buffer(ring_info);
+	uint32_t ring_buffer_size = get_ring_buffer_size(ring_info);
+	uint32_t frag_len;
+
+	if (src_len > ring_buffer_size - start_write_offset) {
+		/* wrap-around detected! */
+		frag_len = ring_buffer_size - start_write_offset;
+		memcpy(ring_buffer + start_write_offset, src, frag_len);
+		memcpy(ring_buffer, src + frag_len, src_len - frag_len);
+	} else {
+		memcpy(ring_buffer + start_write_offset, src, src_len);
+	}
+
+	start_write_offset += src_len;
+	start_write_offset %= ring_buffer_size;
+
+	return (start_write_offset);
+}
+
+/**
+ * @brief Helper routine to copy from the ring buffer to the destination.
+ *
+ * Assume there is enough room. Handles wrap-around in the src case only!
+ */
+static uint32_t
+copy_from_ring_buffer(
+	hv_vmbus_ring_buffer_info*	ring_info,
+	char*				dest,
+	uint32_t			dest_len,
+	uint32_t			start_read_offset)
+{
+	uint32_t frag_len;
+	char *ring_buffer = get_ring_buffer(ring_info);
+	uint32_t ring_buffer_size = get_ring_buffer_size(ring_info);
+
+	if (dest_len > ring_buffer_size - start_read_offset) {
+		/* wrap-around detected at the src */
+		frag_len = ring_buffer_size - start_read_offset;
+		memcpy(dest, ring_buffer + start_read_offset, frag_len);
+		memcpy(dest + frag_len, ring_buffer, dest_len - frag_len);
+	} else {
+		memcpy(dest, ring_buffer + start_read_offset, dest_len);
+	}
+
+	start_read_offset += dest_len;
+	start_read_offset %= ring_buffer_size;
+
+	return (start_read_offset);
+}
+
diff --git a/sys/contrib/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/contrib/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
new file mode 100644
index 0000000..4dfddd3
--- /dev/null
+++ b/sys/contrib/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -0,0 +1,583 @@
+/*-
+ * Copyright (c) 2009-2012 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * VM Bus Driver Implementation
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/rtprio.h>
+#include <sys/interrupt.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+
+#include <machine/resource.h>
+#include <sys/rman.h>
+
+#include <machine/stdarg.h>
+#include <machine/intr_machdep.h>
+#include <sys/pcpu.h>
+
+#include "hv_vmbus_priv.h"
+
+
+#define VMBUS_IRQ	0x5
+
+static struct intr_event	*hv_msg_intr_event;
+static struct intr_event	*hv_event_intr_event;
+static void			*msg_swintr;
+static void			*event_swintr;
+static device_t			vmbus_devp;
+static void			*vmbus_cookiep;
+static int			vmbus_rid;
+struct resource			*intr_res;
+static int			vmbus_irq = VMBUS_IRQ;
+static int			vmbus_inited;
+
+/**
+ * @brief Software interrupt thread routine to handle channel messages from
+ * the hypervisor.
+ */
+static void
+vmbus_msg_swintr(void *dummy)
+{
+	int			cpu;
+	void*			page_addr;
+	hv_vmbus_message*	msg;
+	hv_vmbus_message*	copied;
+
+	cpu = PCPU_GET(cpuid);
+	page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
+	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
+
+	for (;;) {
+		if (msg->header.message_type == HV_MESSAGE_TYPE_NONE) {
+			break; /* no message */
+		} else {
+			copied = malloc(sizeof(hv_vmbus_message),
+					M_DEVBUF, M_NOWAIT);
+			KASSERT(copied != NULL,
+				("Error VMBUS: malloc failed to allocate"
+					" hv_vmbus_message!"));
+			if (copied == NULL)
+				continue;
+			memcpy(copied, msg, sizeof(hv_vmbus_message));
+			hv_queue_work_item(hv_vmbus_g_connection.work_queue,
+			    hv_vmbus_on_channel_message, copied);
+		}
+
+		msg->header.message_type = HV_MESSAGE_TYPE_NONE;
+
+		/*
+		 * Make sure the write to message_type (i.e. setting it to
+		 * HV_MESSAGE_TYPE_NONE) happens before we read
+		 * message_pending and EOM. Otherwise, the EOM will not
+		 * deliver any more messages, since there is no empty slot.
+		 */
+		wmb();
+
+		if (msg->header.message_flags.message_pending) {
+			/*
+			 * This will cause a message queue rescan to possibly
+			 * deliver another message from the hypervisor
+			 */
+			hv_vmbus_write_msr(HV_X64_MSR_EOM, 0);
+		}
+	}
+}
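+/*
+ * Layout note (an illustrative sketch, not driver code): the SynIC message
+ * page is an array of 256-byte hv_vmbus_message slots, one per SINT. With
+ * HV_VMBUS_MESSAGE_SINT == 2, the pointer arithmetic above,
+ *
+ *	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
+ *
+ * resolves to byte offset 2 * sizeof(hv_vmbus_message) = 512 into the
+ * per-CPU SIMP page, i.e. the slot the hypervisor fills for our SINT.
+ */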
+/**
+ * @brief Interrupt filter routine for VMBUS.
+ *
+ * The purpose of this routine is to determine the type of VMBUS protocol
+ * message to process - an event or a channel message.
+ * As this is an interrupt filter routine, the function runs in a very
+ * restricted environment. From the manpage for bus_setup_intr(9):
+ *
+ * In this restricted environment, care must be taken to account for all
+ * races. A careful analysis of races should be done as well. It is
+ * generally cheaper to take an extra interrupt, for example, than to
+ * protect variables with spinlocks. Read, modify, write cycles of
+ * hardware registers need to be carefully analyzed if other threads are
+ * accessing the same registers.
+ */
+static int
+hv_vmbus_isr(void *unused)
+{
+	int				cpu;
+	hv_vmbus_message*		msg;
+	hv_vmbus_synic_event_flags*	event;
+	void*				page_addr;
+
+	cpu = PCPU_GET(cpuid);
+	/* (Temporary limit) */
+	KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero"));
+
+	/*
+	 * The Windows team has advised that we check for events
+	 * before checking for messages. This is the way they do it
+	 * in Windows when running as a guest in Hyper-V.
+	 */
+
+	page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
+	event = (hv_vmbus_synic_event_flags*)
+		    page_addr + HV_VMBUS_MESSAGE_SINT;
+
+	/* Since we are a child, we only need to check bit 0 */
+	if (synch_test_and_clear_bit(0, &event->flags32[0])) {
+		swi_sched(event_swintr, 0);
+	}
+
+	/* Check if there are actual messages to be processed */
+	page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
+	msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
+
+	if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) {
+		swi_sched(msg_swintr, 0);
+	}
+
+	return (FILTER_HANDLED);
+}
+
+static int
+vmbus_read_ivar(
+	device_t	dev,
+	device_t	child,
+	int		index,
+	uintptr_t*	result)
+{
+	struct hv_device *child_dev_ctx = device_get_ivars(child);
+
+	switch (index) {
+
+	case HV_VMBUS_IVAR_TYPE:
+		*result = (uintptr_t) &child_dev_ctx->class_id;
+		return (0);
+	case HV_VMBUS_IVAR_INSTANCE:
+		*result = (uintptr_t) &child_dev_ctx->device_id;
+		return (0);
+	case HV_VMBUS_IVAR_DEVCTX:
+		*result = (uintptr_t) child_dev_ctx;
+		return (0);
+	case HV_VMBUS_IVAR_NODE:
+		*result = (uintptr_t) child_dev_ctx->device;
+		return (0);
+	}
+	return (ENOENT);
+}
+
+static int
+vmbus_write_ivar(
+	device_t	dev,
+	device_t	child,
+	int		index,
+	uintptr_t	value)
+{
+	switch (index) {
+
+	case HV_VMBUS_IVAR_TYPE:
+	case HV_VMBUS_IVAR_INSTANCE:
+	case HV_VMBUS_IVAR_DEVCTX:
+	case HV_VMBUS_IVAR_NODE:
+		/* read-only */
+		return (EINVAL);
+	}
+	return (ENOENT);
+}
+
+struct hv_device*
+hv_vmbus_child_device_create(
+	hv_guid			type,
+	hv_guid			instance,
+	hv_vmbus_channel*	channel)
+{
+	hv_device* child_dev;
+
+	/*
+	 * Allocate the new child device
+	 */
+	child_dev = malloc(sizeof(hv_device), M_DEVBUF,
+			M_NOWAIT | M_ZERO);
+	KASSERT(child_dev != NULL,
+	    ("Error VMBUS: malloc failed to allocate hv_device!"));
+
+	if (child_dev == NULL)
+		return (NULL);
+
+	child_dev->channel = channel;
+	memcpy(&child_dev->class_id, &type, sizeof(hv_guid));
+	memcpy(&child_dev->device_id, &instance, sizeof(hv_guid));
+
+	return (child_dev);
+}
+
+static void
+print_dev_guid(struct hv_device *dev)
+{
+	int	i;
+	char	guid_name[100];
+
+	for (i = 0; i < 32; i += 2)
+		sprintf(&guid_name[i], "%02x", dev->class_id.data[i / 2]);
+	if (bootverbose)
+		printf("VMBUS: Class ID: %s\n", guid_name);
+}
+
+int
+hv_vmbus_child_device_register(struct hv_device *child_dev)
+{
+	device_t	child;
+	int		ret = 0;
+
+	print_dev_guid(child_dev);
+
+	child = device_add_child(vmbus_devp, NULL, -1);
+	child_dev->device = child;
+	device_set_ivars(child, child_dev);
+
+	mtx_lock(&Giant);
+	ret = device_probe_and_attach(child);
+	mtx_unlock(&Giant);
+
+	return (ret);
+}
+
+int
+hv_vmbus_child_device_unregister(struct hv_device *child_dev)
+{
+	int ret = 0;
+	/*
+	 * XXXKYS: Ensure that this is the opposite of
+	 * device_add_child()
+	 */
+	mtx_lock(&Giant);
+	ret = device_delete_child(vmbus_devp, child_dev->device);
+	mtx_unlock(&Giant);
+	return (ret);
+}
+
+static void
+vmbus_identify(driver_t *driver, device_t parent)
+{
+	/* Add the vmbus child only if it is not already present */
+	if (device_find_child(parent, "vmbus", 0) == NULL) {
+		BUS_ADD_CHILD(parent, 0, "vmbus", 0);
+	}
+}
+
+static int
+vmbus_probe(device_t dev)
+{
+	if (bootverbose)
+		device_printf(dev, "VMBUS: probe\n");
+
+	if (!hv_vmbus_query_hypervisor_presence())
+		return (ENXIO);
+
+	device_set_desc(dev, "Vmbus Devices");
+
+	return (0);
+}
+
+/**
+ * @brief Main vmbus driver initialization routine.
+ * + * Here, we + * - initialize the vmbus driver context + * - setup various driver entry points + * - invoke the vmbus hv main init routine + * - get the irq resource + * - invoke the vmbus to add the vmbus root device + * - setup the vmbus root device + * - retrieve the channel offers + */ +static int +vmbus_bus_init(void) +{ + struct ioapic_intsrc { + struct intsrc io_intsrc; + u_int io_irq; + u_int io_intpin:8; + u_int io_vector:8; + u_int io_cpu:8; + u_int io_activehi:1; + u_int io_edgetrigger:1; + u_int io_masked:1; + int io_bus:4; + uint32_t io_lowreg; + }; + + int ret; + unsigned int vector = 0; + struct intsrc *isrc; + struct ioapic_intsrc *intpin; + + if (vmbus_inited) + return (0); + + vmbus_inited = 1; + + ret = hv_vmbus_init(); + + if (ret) { + if(bootverbose) + printf("Error VMBUS: Hypervisor Initialization Failed!\n"); + return (ret); + } + + ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, + NULL, SWI_CLOCK, 0, &msg_swintr); + + if (ret) + goto cleanup; + + /* + * Message SW interrupt handler checks a per-CPU page and + * thus the thread needs to be bound to CPU-0 - which is where + * all interrupts are processed. + */ + ret = intr_event_bind(hv_msg_intr_event, 0); + + if (ret) + goto cleanup1; + + ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, + NULL, SWI_CLOCK, 0, &event_swintr); + + if (ret) + goto cleanup1; + + intr_res = bus_alloc_resource(vmbus_devp, + SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); + + if (intr_res == NULL) { + ret = ENOMEM; /* XXXKYS: Need a better errno */ + goto cleanup2; + } + + /* + * Setup interrupt filter handler + */ + ret = bus_setup_intr(vmbus_devp, intr_res, + INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, + NULL, &vmbus_cookiep); + + if (ret != 0) + goto cleanup3; + + ret = bus_bind_intr(vmbus_devp, intr_res, 0); + if (ret != 0) + goto cleanup4; + + isrc = intr_lookup_source(vmbus_irq); + if ((isrc == NULL) || (isrc->is_event == NULL)) { + ret = EINVAL; + goto cleanup4; + } + + /* vector = isrc->is_event->ie_vector; */ + intpin = (struct ioapic_intsrc *)isrc; + vector = intpin->io_vector; + + if(bootverbose) + printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); + + /** + * Notify the hypervisor of our irq. + */ + + smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &vector); + + /** + * Connect to VMBus in the root partition + */ + ret = hv_vmbus_connect(); + + if (ret) + goto cleanup4; + + hv_vmbus_request_channel_offers(); + return (ret); + + cleanup4: + + /* + * remove swi, bus and intr resource + */ + bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + + cleanup3: + + bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); + + cleanup2: + swi_remove(event_swintr); + + cleanup1: + swi_remove(msg_swintr); + + cleanup: + hv_vmbus_cleanup(); + + return (ret); +} + +static int +vmbus_attach(device_t dev) +{ + if(bootverbose) + device_printf(dev, "VMBUS: attach dev: %p\n", dev); + vmbus_devp = dev; + + /* + * If the system has already booted and thread + * scheduling is possible indicated by the global + * cold set to zero, we just call the driver + * initialization directly. + */ + if (!cold) + vmbus_bus_init(); + + return (0); +} + +static void +vmbus_init(void) +{ + /* + * If the system has already booted and thread + * scheduling is possible indicated by the global + * cold set to zero, we just call the driver + * initialization directly. 
+ */ + if (!cold) + vmbus_bus_init(); +} + +static void +vmbus_bus_exit(void) +{ + hv_vmbus_release_unattached_channels(); + hv_vmbus_disconnect(); + + smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); + + hv_vmbus_cleanup(); + + /* remove swi, bus and intr resource */ + bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + + bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); + + swi_remove(msg_swintr); + swi_remove(event_swintr); + + return; +} + +static void +vmbus_exit(void) +{ + vmbus_bus_exit(); +} + +static int +vmbus_detach(device_t dev) +{ + vmbus_exit(); + return (0); +} + +static void +vmbus_mod_load(void) +{ + if(bootverbose) + printf("VMBUS: load\n"); +} + +static void +vmbus_mod_unload(void) +{ + if(bootverbose) + printf("VMBUS: unload\n"); +} + +static int +vmbus_modevent(module_t mod, int what, void *arg) +{ + switch (what) { + + case MOD_LOAD: + vmbus_mod_load(); + break; + case MOD_UNLOAD: + vmbus_mod_unload(); + break; + } + + return (0); +} + +static device_method_t vmbus_methods[] = { + /** Device interface */ + DEVMETHOD(device_identify, vmbus_identify), + DEVMETHOD(device_probe, vmbus_probe), + DEVMETHOD(device_attach, vmbus_attach), + DEVMETHOD(device_detach, vmbus_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /** Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_read_ivar, vmbus_read_ivar), + DEVMETHOD(bus_write_ivar, vmbus_write_ivar), + + { 0, 0 } }; + +static char driver_name[] = "vmbus"; +static driver_t vmbus_driver = { driver_name, vmbus_methods,0, }; + + +devclass_t vmbus_devclass; + +DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); +MODULE_VERSION(vmbus,1); + +/* TODO: We want to be earlier than SI_SUB_VFS */ +SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); + diff --git a/sys/contrib/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/contrib/dev/hyperv/vmbus/hv_vmbus_priv.h new file mode 100644 index 0000000..739acb1 --- /dev/null +++ b/sys/contrib/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -0,0 +1,770 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HYPERV_PRIV_H__ +#define __HYPERV_PRIV_H__ + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sema.h> + +#include <dev/hyperv/include/hyperv.h> + + +/* + * Status codes for hypervisor operations. + */ + +typedef uint16_t hv_vmbus_status; + +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) +#define HV_ANY_VP (0xFFFFFFFF) + +/* + * Synthetic interrupt controller flag constants. + */ + +#define HV_EVENT_FLAGS_COUNT (256 * 8) +#define HV_EVENT_FLAGS_BYTE_COUNT (256) +#define HV_EVENT_FLAGS_DWORD_COUNT (256 / sizeof(uint32_t)) + +/* + * MessageId: HV_STATUS_INSUFFICIENT_BUFFERS + * MessageText: + * You did not supply enough message buffers to send a message. + */ + +#define HV_STATUS_INSUFFICIENT_BUFFERS ((uint16_t)0x0013) + +typedef void (*hv_vmbus_channel_callback)(void *context); + +typedef struct { + void* data; + uint32_t length; +} hv_vmbus_sg_buffer_list; + +typedef struct { + uint32_t current_interrupt_mask; + uint32_t current_read_index; + uint32_t current_write_index; + uint32_t bytes_avail_to_read; + uint32_t bytes_avail_to_write; +} hv_vmbus_ring_buffer_debug_info; + +typedef struct { + uint32_t rel_id; + hv_vmbus_channel_state state; + hv_guid interface_type; + hv_guid interface_instance; + uint32_t monitor_id; + uint32_t server_monitor_pending; + uint32_t server_monitor_latency; + uint32_t server_monitor_connection_id; + uint32_t client_monitor_pending; + uint32_t client_monitor_latency; + uint32_t client_monitor_connection_id; + hv_vmbus_ring_buffer_debug_info inbound; + hv_vmbus_ring_buffer_debug_info outbound; +} hv_vmbus_channel_debug_info; + +typedef union { + hv_vmbus_channel_version_supported version_supported; + hv_vmbus_channel_open_result open_result; + hv_vmbus_channel_gpadl_torndown gpadl_torndown; + hv_vmbus_channel_gpadl_created gpadl_created; + hv_vmbus_channel_version_response version_response; +} hv_vmbus_channel_msg_response; + +/* + * Represents each channel msg on the vmbus connection + * This is a variable-size data structure depending on + * the msg type itself + */ +typedef struct hv_vmbus_channel_msg_info { + /* + * Bookkeeping stuff + */ + TAILQ_ENTRY(hv_vmbus_channel_msg_info) msg_list_entry; + /* + * So far, this is only used to handle + * gpadl body message + */ + TAILQ_HEAD(, hv_vmbus_channel_msg_info) sub_msg_list_anchor; + /* + * Synchronize the request/response if + * needed. + * KYS: Use a semaphore for now. + * Not perf critical. + */ + struct sema wait_sema; + hv_vmbus_channel_msg_response response; + uint32_t message_size; + /** + * The channel message that goes out on + * the "wire". It will contain at + * minimum the + * hv_vmbus_channel_msg_header + * header. 
+ */
+	unsigned char		msg[0];
+} hv_vmbus_channel_msg_info;
+
+/*
+ * The format must be the same as hv_vm_data_gpa_direct
+ */
+typedef struct hv_vmbus_channel_packet_page_buffer {
+	uint16_t		type;
+	uint16_t		data_offset8;
+	uint16_t		length8;
+	uint16_t		flags;
+	uint64_t		transaction_id;
+	uint32_t		reserved;
+	uint32_t		range_count;
+	hv_vmbus_page_buffer	range[HV_MAX_PAGE_BUFFER_COUNT];
+} __packed hv_vmbus_channel_packet_page_buffer;
+
+/*
+ * The format must be the same as hv_vm_data_gpa_direct
+ */
+typedef struct hv_vmbus_channel_packet_multipage_buffer {
+	uint16_t			type;
+	uint16_t			data_offset8;
+	uint16_t			length8;
+	uint16_t			flags;
+	uint64_t			transaction_id;
+	uint32_t			reserved;
+	uint32_t			range_count; /* Always 1 in this case */
+	hv_vmbus_multipage_buffer	range;
+} __packed hv_vmbus_channel_packet_multipage_buffer;
+
+enum {
+	HV_VMBUS_MESSAGE_CONNECTION_ID	= 1,
+	HV_VMBUS_MESSAGE_PORT_ID	= 1,
+	HV_VMBUS_EVENT_CONNECTION_ID	= 2,
+	HV_VMBUS_EVENT_PORT_ID		= 2,
+	HV_VMBUS_MONITOR_CONNECTION_ID	= 3,
+	HV_VMBUS_MONITOR_PORT_ID	= 3,
+	HV_VMBUS_MESSAGE_SINT		= 2
+};
+
+#define HV_PRESENT_BIT			0x80000000
+
+#define HV_HYPERCALL_PARAM_ALIGN	sizeof(uint64_t)
+
+/*
+ * Connection identifier type
+ */
+typedef union {
+	uint32_t		as_uint32_t;
+	struct {
+		uint32_t	id:24;
+		uint32_t	reserved:8;
+	} u;
+} __packed hv_vmbus_connection_id;
+
+/*
+ * Definition of the hv_vmbus_signal_event hypercall input structure
+ */
+typedef struct {
+	hv_vmbus_connection_id	connection_id;
+	uint16_t		flag_number;
+	uint16_t		rsvd_z;
+} __packed hv_vmbus_input_signal_event;
+
+typedef struct {
+	uint64_t			align8;
+	hv_vmbus_input_signal_event	event;
+} __packed hv_vmbus_input_signal_event_buffer;
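+/*
+ * Allocation pattern for the variable-size hv_vmbus_channel_msg_info
+ * defined above (a hedged sketch with a hypothetical payload_size
+ * variable, not an actual call site in this driver):
+ *
+ *	hv_vmbus_channel_msg_info *info;
+ *
+ *	info = malloc(sizeof(hv_vmbus_channel_msg_info) + payload_size,
+ *	    M_DEVBUF, M_NOWAIT | M_ZERO);
+ *	if (info != NULL)
+ *		memcpy(info->msg, payload, payload_size);
+ *
+ * The zero-length msg[0] member contributes nothing to sizeof(); the
+ * extra bytes requested from malloc(9) become the channel message that
+ * goes out on the "wire".
+ */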
+typedef struct {
+	uint64_t	guest_id;
+	void*		hypercall_page;
+	hv_bool_uint8_t	syn_ic_initialized;
+	/*
+	 * This is used as an input param to the HV_CALL_SIGNAL_EVENT
+	 * hypercall. The input param is immutable in our usage and
+	 * must be dynamic memory (vs. stack or global).
+	 */
+	hv_vmbus_input_signal_event_buffer	*signal_event_buffer;
+	/*
+	 * Pointer to the 8-byte aligned event within the buffer above
+	 */
+	hv_vmbus_input_signal_event		*signal_event_param;
+
+	hv_vmbus_handle	syn_ic_msg_page[MAXCPU];
+	hv_vmbus_handle	syn_ic_event_page[MAXCPU];
+} hv_vmbus_context;
+
+/*
+ * Define hypervisor message types
+ */
+typedef enum {
+
+	HV_MESSAGE_TYPE_NONE				= 0x00000000,
+
+	/*
+	 * Memory access messages
+	 */
+	HV_MESSAGE_TYPE_UNMAPPED_GPA			= 0x80000000,
+	HV_MESSAGE_TYPE_GPA_INTERCEPT			= 0x80000001,
+
+	/*
+	 * Timer notification messages
+	 */
+	HV_MESSAGE_TIMER_EXPIRED			= 0x80000010,
+
+	/*
+	 * Error messages
+	 */
+	HV_MESSAGE_TYPE_INVALID_VP_REGISTER_VALUE	= 0x80000020,
+	HV_MESSAGE_TYPE_UNRECOVERABLE_EXCEPTION		= 0x80000021,
+	HV_MESSAGE_TYPE_UNSUPPORTED_FEATURE		= 0x80000022,
+
+	/*
+	 * Trace buffer complete messages
+	 */
+	HV_MESSAGE_TYPE_EVENT_LOG_BUFFER_COMPLETE	= 0x80000040,
+
+	/*
+	 * Platform-specific processor intercept messages
+	 */
+	HV_MESSAGE_TYPE_X64_IO_PORT_INTERCEPT		= 0x80010000,
+	HV_MESSAGE_TYPE_X64_MSR_INTERCEPT		= 0x80010001,
+	HV_MESSAGE_TYPE_X64_CPU_INTERCEPT		= 0x80010002,
+	HV_MESSAGE_TYPE_X64_EXCEPTION_INTERCEPT		= 0x80010003,
+	HV_MESSAGE_TYPE_X64_APIC_EOI			= 0x80010004,
+	HV_MESSAGE_TYPE_X64_LEGACY_FP_ERROR		= 0x80010005
+
+} hv_vmbus_msg_type;
+
+/*
+ * Define port identifier type
+ */
+typedef union _hv_vmbus_port_id {
+	uint32_t	as_uint32_t;
+	struct {
+		uint32_t	id:24;
+		uint32_t	reserved:8;
+	} u;
+} hv_vmbus_port_id;
+
+/*
+ * Define synthetic interrupt controller message flag
+ */
+typedef union {
+	uint8_t	as_uint8_t;
+	struct {
+		uint8_t	message_pending:1;
+		uint8_t	reserved:7;
+	};
+} hv_vmbus_msg_flags;
+
+typedef uint64_t hv_vmbus_partition_id;
+
+/*
+ * Define synthetic interrupt controller message header
+ */
+typedef struct {
+	hv_vmbus_msg_type	message_type;
+	uint8_t			payload_size;
+	hv_vmbus_msg_flags	message_flags;
+	uint8_t			reserved[2];
+	union {
+		hv_vmbus_partition_id	sender;
+		hv_vmbus_port_id	port;
+	} u;
+} hv_vmbus_msg_header;
+
+/*
+ * Define synthetic interrupt controller message format
+ */
+typedef struct {
+	hv_vmbus_msg_header	header;
+	union {
+		uint64_t	payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+	} u;
+} hv_vmbus_message;
+
+/*
+ * The maximum number of channels is determined by the size of the
+ * interrupt page, which is PAGE_SIZE. Half of PAGE_SIZE is used for
+ * send-endpoint interrupts and the other half for receive-endpoint
+ * interrupts.
+ *
+ * Note: with a 4K PAGE_SIZE, ((PAGE_SIZE >> 1) << 3) yields 16384 channels
+ */
+#define HV_MAX_NUM_CHANNELS		((PAGE_SIZE >> 1) << 3)
+
+/*
+ * (The value here must be a multiple of 32)
+ */
+#define HV_MAX_NUM_CHANNELS_SUPPORTED	256
+
+/*
+ * VM Bus connection states
+ */
+typedef enum {
+	HV_DISCONNECTED,
+	HV_CONNECTING,
+	HV_CONNECTED,
+	HV_DISCONNECTING
+} hv_vmbus_connect_state;
+
+#define HV_MAX_SIZE_CHANNEL_MESSAGE	HV_MESSAGE_PAYLOAD_BYTE_COUNT
+
+
+typedef struct {
+	hv_vmbus_connect_state	connect_state;
+	uint32_t		next_gpadl_handle;
+	/**
+	 * Represents channel interrupts. Each bit position
+	 * represents a channel.
+	 * When a channel sends an interrupt via VMBUS, it
+	 * finds its bit in the send_interrupt_page, sets it and
+	 * calls Hv to generate a port event.
The other end + * receives the port event and parse the + * recv_interrupt_page to see which bit is set + */ + void *interrupt_page; + void *send_interrupt_page; + void *recv_interrupt_page; + /* + * 2 pages - 1st page for parent->child + * notification and 2nd is child->parent + * notification + */ + void *monitor_pages; + TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; + struct mtx channel_msg_lock; + /** + * List of channels + */ + TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; + struct mtx channel_lock; + + hv_vmbus_handle work_queue; + struct sema control_sema; +} hv_vmbus_connection; + +/* + * Declare the MSR used to identify the guest OS + */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +typedef union { + uint64_t as_uint64_t; + struct { + uint64_t build_number : 16; + uint64_t service_version : 8; /* Service Pack, etc. */ + uint64_t minor_version : 8; + uint64_t major_version : 8; + /* + * HV_GUEST_OS_MICROSOFT_IDS (If Vendor=MS) + * HV_GUEST_OS_VENDOR + */ + uint64_t os_id : 8; + uint64_t vendor_id : 16; + }; +} hv_vmbus_x64_msr_guest_os_id_contents; + +/* + * Declare the MSR used to setup pages used to communicate with the hypervisor + */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +typedef union { + uint64_t as_uint64_t; + struct { + uint64_t enable :1; + uint64_t reserved :11; + uint64_t guest_physical_address :52; + }; +} hv_vmbus_x64_msr_hypercall_contents; + +typedef union { + uint32_t as_uint32_t; + struct { + uint32_t group_enable :4; + uint32_t rsvd_z :28; + }; +} hv_vmbus_monitor_trigger_state; + +typedef union { + uint64_t as_uint64_t; + struct { + uint32_t pending; + uint32_t armed; + }; +} hv_vmbus_monitor_trigger_group; + +typedef struct { + hv_vmbus_connection_id connection_id; + uint16_t flag_number; + uint16_t rsvd_z; +} hv_vmbus_monitor_parameter; + +/* + * hv_vmbus_monitor_page Layout + * ------------------------------------------------------ + * | 0 | trigger_state (4 bytes) | Rsvd1 (4 bytes) | + * | 8 | trigger_group[0] | + * | 10 | trigger_group[1] | + * | 18 | trigger_group[2] | + * | 20 | trigger_group[3] | + * | 28 | Rsvd2[0] | + * | 30 | Rsvd2[1] | + * | 38 | Rsvd2[2] | + * | 40 | next_check_time[0][0] | next_check_time[0][1] | + * | ... | + * | 240 | latency[0][0..3] | + * | 340 | Rsvz3[0] | + * | 440 | parameter[0][0] | + * | 448 | parameter[0][1] | + * | ... | + * | 840 | Rsvd4[0] | + * ------------------------------------------------------ + */ + +typedef struct { + hv_vmbus_monitor_trigger_state trigger_state; + uint32_t rsvd_z1; + + hv_vmbus_monitor_trigger_group trigger_group[4]; + uint64_t rsvd_z2[3]; + + int32_t next_check_time[4][32]; + + uint16_t latency[4][32]; + uint64_t rsvd_z3[32]; + + hv_vmbus_monitor_parameter parameter[4][32]; + + uint8_t rsvd_z4[1984]; +} hv_vmbus_monitor_page; + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HV_CPU_ID_FUNCTION_VERSION_AND_FEATURES). 
+ */ +typedef enum { + HV_CPU_ID_FUNCTION_VERSION_AND_FEATURES = 0x00000001, + HV_CPU_ID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION = 0x40000000, + HV_CPU_ID_FUNCTION_HV_INTERFACE = 0x40000001, + /* + * The remaining functions depend on the value + * of hv_cpu_id_function_interface + */ + HV_CPU_ID_FUNCTION_MS_HV_VERSION = 0x40000002, + HV_CPU_ID_FUNCTION_MS_HV_FEATURES = 0x40000003, + HV_CPU_ID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION = 0x40000004, + HV_CPU_ID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS = 0x40000005 + +} hv_vmbus_cpuid_function; + +/* + * Define the format of the SIMP register + */ +typedef union { + uint64_t as_uint64_t; + struct { + uint64_t simp_enabled : 1; + uint64_t preserved : 11; + uint64_t base_simp_gpa : 52; + }; +} hv_vmbus_synic_simp; + +/* + * Define the format of the SIEFP register + */ +typedef union { + uint64_t as_uint64_t; + struct { + uint64_t siefp_enabled : 1; + uint64_t preserved : 11; + uint64_t base_siefp_gpa : 52; + }; +} hv_vmbus_synic_siefp; + +/* + * Define synthetic interrupt source + */ +typedef union { + uint64_t as_uint64_t; + struct { + uint64_t vector : 8; + uint64_t reserved1 : 8; + uint64_t masked : 1; + uint64_t auto_eoi : 1; + uint64_t reserved2 : 46; + }; +} hv_vmbus_synic_sint; + +/* + * Define syn_ic control register + */ +typedef union _hv_vmbus_synic_scontrol { + uint64_t as_uint64_t; + struct { + uint64_t enable : 1; + uint64_t reserved : 63; + }; +} hv_vmbus_synic_scontrol; + +/* + * Define the hv_vmbus_post_message hypercall input structure + */ +typedef struct { + hv_vmbus_connection_id connection_id; + uint32_t reserved; + hv_vmbus_msg_type message_type; + uint32_t payload_size; + uint64_t payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; +} hv_vmbus_input_post_message; + +/* + * Define the synthetic interrupt controller event flags format + */ +typedef union { + uint8_t flags8[HV_EVENT_FLAGS_BYTE_COUNT]; + uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; +} hv_vmbus_synic_event_flags; + + +/* + * Define synthetic interrupt controller model specific registers + */ +#define HV_X64_MSR_SCONTROL (0x40000080) +#define HV_X64_MSR_SVERSION (0x40000081) +#define HV_X64_MSR_SIEFP (0x40000082) +#define HV_X64_MSR_SIMP (0x40000083) +#define HV_X64_MSR_EOM (0x40000084) + +#define HV_X64_MSR_SINT0 (0x40000090) +#define HV_X64_MSR_SINT1 (0x40000091) +#define HV_X64_MSR_SINT2 (0x40000092) +#define HV_X64_MSR_SINT3 (0x40000093) +#define HV_X64_MSR_SINT4 (0x40000094) +#define HV_X64_MSR_SINT5 (0x40000095) +#define HV_X64_MSR_SINT6 (0x40000096) +#define HV_X64_MSR_SINT7 (0x40000097) +#define HV_X64_MSR_SINT8 (0x40000098) +#define HV_X64_MSR_SINT9 (0x40000099) +#define HV_X64_MSR_SINT10 (0x4000009A) +#define HV_X64_MSR_SINT11 (0x4000009B) +#define HV_X64_MSR_SINT12 (0x4000009C) +#define HV_X64_MSR_SINT13 (0x4000009D) +#define HV_X64_MSR_SINT14 (0x4000009E) +#define HV_X64_MSR_SINT15 (0x4000009F) + +/* + * Declare the various hypercall operations + */ +typedef enum { + HV_CALL_POST_MESSAGE = 0x005c, + HV_CALL_SIGNAL_EVENT = 0x005d, +} hv_vmbus_call_code; + +/** + * Global variables + */ + +extern hv_vmbus_context hv_vmbus_g_context; +extern hv_vmbus_connection hv_vmbus_g_connection; + + +/* + * Private, VM Bus functions + */ + +int hv_vmbus_ring_buffer_init( + hv_vmbus_ring_buffer_info *ring_info, + void *buffer, + uint32_t buffer_len); + +void hv_ring_buffer_cleanup( + hv_vmbus_ring_buffer_info *ring_info); + +int hv_ring_buffer_write( + hv_vmbus_ring_buffer_info *ring_info, + hv_vmbus_sg_buffer_list sg_buffers[], + uint32_t sg_buff_count); + +int hv_ring_buffer_peek( 
+	hv_vmbus_ring_buffer_info	*ring_info,
+	void				*buffer,
+	uint32_t			buffer_len);
+
+int		hv_ring_buffer_read(
+	hv_vmbus_ring_buffer_info	*ring_info,
+	void				*buffer,
+	uint32_t			buffer_len,
+	uint32_t			offset);
+
+uint32_t	hv_vmbus_get_ring_buffer_interrupt_mask(
+	hv_vmbus_ring_buffer_info	*ring_info);
+
+void		hv_vmbus_dump_ring_info(
+	hv_vmbus_ring_buffer_info	*ring_info,
+	char				*prefix);
+
+hv_vmbus_channel*	hv_vmbus_allocate_channel(void);
+void			hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel);
+void			hv_vmbus_on_channel_message(void *context);
+int			hv_vmbus_request_channel_offers(void);
+void			hv_vmbus_release_unattached_channels(void);
+int			hv_vmbus_init(void);
+void			hv_vmbus_cleanup(void);
+
+uint16_t		hv_vmbus_post_msg_via_msg_ipc(
+	hv_vmbus_connection_id	connection_id,
+	hv_vmbus_msg_type	message_type,
+	void			*payload,
+	size_t			payload_size);
+
+uint16_t		hv_vmbus_signal_event(void);
+void			hv_vmbus_synic_init(void *irq_arg);
+void			hv_vmbus_synic_cleanup(void *arg);
+int			hv_vmbus_query_hypervisor_presence(void);
+
+struct hv_device*	hv_vmbus_child_device_create(
+	hv_guid			device_type,
+	hv_guid			device_instance,
+	hv_vmbus_channel	*channel);
+
+int			hv_vmbus_child_device_register(
+				struct hv_device *child_dev);
+int			hv_vmbus_child_device_unregister(
+				struct hv_device *child_dev);
+hv_vmbus_channel*	hv_vmbus_get_channel_from_rel_id(uint32_t rel_id);
+
+/**
+ * Connection interfaces
+ */
+int			hv_vmbus_connect(void);
+int			hv_vmbus_disconnect(void);
+int			hv_vmbus_post_message(void *buffer, size_t buf_size);
+int			hv_vmbus_set_event(uint32_t child_rel_id);
+void			hv_vmbus_on_events(void *);
+
+/*
+ * static inline functions
+ * (with some helper macros for reading/writing model-specific registers)
+ */
+
+#ifdef __x86_64__
+
+#define HV_VMBUS_READ_MSR(reg, v) { \
+	uint32_t h, l; \
+	__asm__ __volatile__("rdmsr" \
+	: "=a" (l), "=d" (h) \
+	: "c" (reg)); \
+	v = (((uint64_t)h) << 32) | l; \
+}
+
+#define HV_VMBUS_WRITE_MSR(reg, v) { \
+	uint32_t h, l; \
+	l = (uint32_t)(((uint64_t)(v)) & 0xFFFFFFFF); \
+	h = (uint32_t)((((uint64_t)(v)) >> 32) & 0xFFFFFFFF); \
+	__asm__ __volatile__("wrmsr" \
+	: /* no outputs */ \
+	: "c" (reg), "a" (l), "d" (h)); \
+}
+
+#else
+
+#define HV_VMBUS_READ_MSR(reg, v) \
+	__asm__ __volatile__("rdmsr" \
+	: "=A" (v) \
+	: "c" (reg))
+
+#define HV_VMBUS_WRITE_MSR(reg, v) \
+	__asm__ __volatile__("wrmsr" \
+	: /* no outputs */ \
+	: "c" (reg), "A" ((uint64_t)v))
+
+#endif
+
+static inline unsigned long long
+hv_vmbus_read_msr(int msr)
+{
+	unsigned long long val;
+	HV_VMBUS_READ_MSR(msr, val);
+	return (val);
+}
+
+static inline void
+hv_vmbus_write_msr(int msr, uint64_t val)
+{
+	HV_VMBUS_WRITE_MSR(msr, val);
+}
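+/*
+ * Usage sketch (illustrative, mirroring the call in the message software
+ * interrupt path): signaling end-of-message to the hypervisor is
+ *
+ *	hv_vmbus_write_msr(HV_X64_MSR_EOM, 0);
+ *
+ * which on amd64 expands to a single wrmsr with %ecx = 0x40000084 and
+ * %edx:%eax = 0.
+ */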
+/*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * http://msdn.microsoft.com/en-us/library/windows/
+ * hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how FreeBSD guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * FreeBSD and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to these as yet
+ * unpublished guidelines.
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - OS Type; Linux is 0x100, FreeBSD is 0x200
+ * 55:48 - Distro specific identification
+ * 47:16 - FreeBSD kernel version number
+ * 15:0 - Distro specific identification
+ *
+ */
+
+#define HV_FREEBSD_VENDOR_ID	0x8200
+#define HV_FREEBSD_GUEST_ID	hv_generate_guest_id(0, 0)
+
+static inline uint64_t
+hv_generate_guest_id(
+	uint8_t distro_id_part1,
+	uint16_t distro_id_part2)
+{
+	uint64_t guest_id;
+
+	guest_id =  (((uint64_t)HV_FREEBSD_VENDOR_ID) << 48);
+	guest_id |= (((uint64_t)(distro_id_part1)) << 48);
+	guest_id |= (((uint64_t)(__FreeBSD_version)) << 16); /* in param.h */
+	guest_id |= ((uint64_t)(distro_id_part2));
+	return (guest_id);
+}
+
+
+#endif  /* __HYPERV_PRIV_H__ */
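+/*
+ * Worked example (illustrative only; the version number below is
+ * hypothetical): with __FreeBSD_version == 1000000 (0xF4240),
+ * HV_FREEBSD_GUEST_ID evaluates to
+ *
+ *	(0x8200ULL << 48) | (0xF4240ULL << 16) == 0x8200000f42400000
+ *
+ * i.e. the vendor ID in the top 16 bits, the kernel version in bits
+ * 47:16, and zero for both distro-specific fields.
+ */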