summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authorwhu <whu@FreeBSD.org>2015-05-22 09:03:55 +0000
committerwhu <whu@FreeBSD.org>2015-05-22 09:03:55 +0000
commit30cd3b9808be2ed4002993166f0790b8f07d95d8 (patch)
treef2a61a02982282e7de44dcf271ef00cd723d44c5 /sys
parentb453b295750133b89170fcb27025f932be66ad18 (diff)
downloadFreeBSD-src-30cd3b9808be2ed4002993166f0790b8f07d95d8.zip
FreeBSD-src-30cd3b9808be2ed4002993166f0790b8f07d95d8.tar.gz
MFC r282212:
Microsoft vmbus, storage and other related driver enhancements for HyperV. - Vmbus multi channel support. - Vector interrupt support. - Signal optimization. - Storvsc driver performance improvement. - Scatter and gather support for storvsc driver. - Minor bug fix for KVP driver. Thanks royger, jhb and delphij from FreeBSD community for the reviews and comments. Also thanks Hovy Xu from NetApp for the contributions to the storvsc driver. PR: 195238 Submitted by: whu Reviewed by: royger Approved by: royger Relnotes: yes Sponsored by: Microsoft OSTC Differential Revision: https://reviews.freebsd.org/D2575
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/amd64/apic_vector.S16
-rw-r--r--sys/amd64/conf/GENERIC4
-rw-r--r--sys/amd64/conf/NOTES2
-rw-r--r--sys/amd64/include/apicvar.h1
-rw-r--r--sys/conf/options.amd642
-rw-r--r--sys/conf/options.i3862
-rw-r--r--sys/dev/hyperv/include/hyperv.h167
-rw-r--r--sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c801
-rw-r--r--sys/dev/hyperv/storvsc/hv_vstorage.h16
-rw-r--r--sys/dev/hyperv/utilities/hv_kvp.c11
-rw-r--r--sys/dev/hyperv/utilities/hv_util.c9
-rw-r--r--sys/dev/hyperv/vmbus/hv_channel.c98
-rw-r--r--sys/dev/hyperv/vmbus/hv_channel_mgmt.c268
-rw-r--r--sys/dev/hyperv/vmbus/hv_connection.c289
-rw-r--r--sys/dev/hyperv/vmbus/hv_hv.c66
-rw-r--r--sys/dev/hyperv/vmbus/hv_ring_buffer.c78
-rw-r--r--sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c364
-rw-r--r--sys/dev/hyperv/vmbus/hv_vmbus_priv.h71
-rw-r--r--sys/i386/conf/GENERIC4
-rw-r--r--sys/i386/i386/apic_vector.s19
20 files changed, 1812 insertions, 476 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 4b2bef0..69a1043 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -150,6 +150,22 @@ IDTVEC(xen_intr_upcall)
jmp doreti
#endif
+#ifdef HYPERV
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ *
+ * The stub hands the current stack pointer to hv_vector_handler() in
+ * %rdi and then leaves through the common doreti path.
+ * NOTE(review): PUSH_FRAME is presumably what builds the trap frame that
+ * %rsp points at when the handler is called -- confirm against the macro
+ * definition.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(hv_vmbus_callback)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rdi
+ call hv_vector_handler
+ MEXITCOUNT
+ jmp doreti
+#endif
+
#ifdef SMP
/*
* Global address space TLB shootdown.
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC
index 7955824..b0138c1 100644
--- a/sys/amd64/conf/GENERIC
+++ b/sys/amd64/conf/GENERIC
@@ -346,7 +346,9 @@ device virtio_blk # VirtIO Block device
device virtio_scsi # VirtIO SCSI device
device virtio_balloon # VirtIO Memory Balloon device
-# HyperV drivers
+# HyperV drivers and enhancement support
+# NOTE: HYPERV depends on hyperv. They must be added or removed together.
+options HYPERV # Hyper-V kernel infrastructure
device hyperv # HyperV drivers
# Xen HVM Guest Optimizations
diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES
index 61f4315..7babc67 100644
--- a/sys/amd64/conf/NOTES
+++ b/sys/amd64/conf/NOTES
@@ -479,6 +479,8 @@ device virtio_balloon # VirtIO Memory Balloon device
device virtio_random # VirtIO Entropy device
device virtio_console # VirtIO Console device
+# Microsoft Hyper-V enhancement support
+options HYPERV # Hyper-V kernel infrastructure
device hyperv # HyperV drivers
# Xen HVM Guest Optimizations
diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h
index 62e5ff2..2073fa5 100644
--- a/sys/amd64/include/apicvar.h
+++ b/sys/amd64/include/apicvar.h
@@ -216,6 +216,7 @@ int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
void lapic_set_tpr(u_int vector);
void lapic_setup(int boot);
void xen_intr_handle_upcall(struct trapframe *frame);
+void hv_vector_handler(struct trapframe *frame);
#endif /* !LOCORE */
#endif /* _MACHINE_APICVAR_H_ */
diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64
index 52aef61..b249771 100644
--- a/sys/conf/options.amd64
+++ b/sys/conf/options.amd64
@@ -67,5 +67,7 @@ BPF_JITTER opt_bpf.h
XENHVM opt_global.h
+HYPERV opt_global.h
+
# options for the Intel C600 SAS driver (isci)
ISCI_LOGGING opt_isci.h
diff --git a/sys/conf/options.i386 b/sys/conf/options.i386
index 7102d8f..0e21ed6 100644
--- a/sys/conf/options.i386
+++ b/sys/conf/options.i386
@@ -127,5 +127,7 @@ NATIVE opt_global.h
XEN opt_global.h
XENHVM opt_global.h
+HYPERV opt_global.h
+
# options for the Intel C600 SAS driver (isci)
ISCI_LOGGING opt_isci.h
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h
index 8a45d89..5360b7c 100644
--- a/sys/dev/hyperv/include/hyperv.h
+++ b/sys/dev/hyperv/include/hyperv.h
@@ -46,6 +46,7 @@
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/sema.h>
+#include <sys/smp.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <vm/vm.h>
@@ -63,11 +64,22 @@ typedef uint8_t hv_bool_uint8_t;
#define HV_ERROR_MACHINE_LOCKED 0x800704F7
/*
- * A revision number of vmbus that is used for ensuring both ends on a
- * partition are using compatible versions.
+ * VMBUS version is 32 bit, upper 16 bit for major_number and lower
+ * 16 bit for minor_number.
+ *
+ * 0.13 -- Windows Server 2008
+ * 1.1 -- Windows 7
+ * 2.4 -- Windows 8
+ * 3.0 -- Windows 8.1
*/
+#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13))
+#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1))
+#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4))
+#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0))
+
+#define HV_VMBUS_VERSION_INVALID -1
-#define HV_VMBUS_REVISION_NUMBER 13
+#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1
/*
* Make maximum size of pipe payload of 16K
@@ -112,6 +124,18 @@ typedef struct hv_guid {
unsigned char data[16];
} __packed hv_guid;
+#define HV_NIC_GUID \
+ .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \
+ 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
+
+#define HV_IDE_GUID \
+ .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \
+ 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5}
+
+#define HV_SCSI_GUID \
+ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \
+ 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f}
+
/*
* At the center of the Channel Management library is
* the Channel Offer. This struct contains the
@@ -147,7 +171,11 @@ typedef struct hv_vmbus_channel_offer {
} __packed pipe;
} u;
- uint32_t padding;
+ /*
+ * Sub_channel_index, newly added in Win8.
+ */
+ uint16_t sub_channel_index;
+ uint16_t padding;
} __packed hv_vmbus_channel_offer;
@@ -344,7 +372,25 @@ typedef struct {
hv_vmbus_channel_offer offer;
uint32_t child_rel_id;
uint8_t monitor_id;
- hv_bool_uint8_t monitor_allocated;
+ /*
+ * This field has been split into a bit field on Win7
+ * and higher.
+ */
+ uint8_t monitor_allocated:1;
+ uint8_t reserved:7;
+ /*
+ * Following fields were added in win7 and higher.
+ * Make sure to check the version before accessing these fields.
+ *
+ * If "is_dedicated_interrupt" is set, we must not set the
+ * associated bit in the channel bitmap while sending the
+ * interrupt to the host.
+ *
+ * connection_id is used in signaling the host.
+ */
+ uint16_t is_dedicated_interrupt:1;
+ uint16_t reserved1:15;
+ uint32_t connection_id;
} __packed hv_vmbus_channel_offer_channel;
/*
@@ -394,9 +440,11 @@ typedef struct
hv_gpadl_handle ring_buffer_gpadl_handle;
/*
- * GPADL for the channel's server context save area.
+ * Before win8, all incoming channel interrupts are only
+ * delivered on cpu 0. Setting this value to 0 would
+ * preserve the earlier behavior.
*/
- hv_gpadl_handle server_context_area_gpadl_handle;
+ uint32_t target_vcpu;
/*
* The upstream ring buffer begins at offset zero in the memory described
@@ -646,14 +694,42 @@ typedef struct {
} hv_vmbus_ring_buffer_info;
typedef void (*hv_vmbus_pfn_channel_callback)(void *context);
+typedef void (*hv_vmbus_sc_creation_callback)(void *context);
typedef enum {
HV_CHANNEL_OFFER_STATE,
HV_CHANNEL_OPENING_STATE,
HV_CHANNEL_OPEN_STATE,
+ HV_CHANNEL_OPENED_STATE,
HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE,
} hv_vmbus_channel_state;
+/*
+ * Connection identifier type
+ */
+typedef union {
+ uint32_t as_uint32_t;
+ struct {
+ uint32_t id:24;
+ uint32_t reserved:8;
+ } u;
+
+} __packed hv_vmbus_connection_id;
+
+/*
+ * Definition of the hv_vmbus_signal_event hypercall input structure
+ *
+ * Packed layout: a 32-bit connection identifier followed by two 16-bit
+ * fields, 8 bytes in total.
+ */
+typedef struct {
+ hv_vmbus_connection_id connection_id;
+ uint16_t flag_number;
+ uint16_t rsvd_z;
+} __packed hv_vmbus_input_signal_event;
+
+/*
+ * Storage that holds one hv_vmbus_input_signal_event plus 8 extra bytes.
+ * NOTE(review): the extra "align8" word presumably leaves room to locate
+ * an 8-byte aligned event inside the buffer (the channel keeps a separate
+ * signal_event_param pointer described as the aligned view) -- confirm
+ * against the code that sets that pointer.
+ */
+typedef struct {
+ uint64_t align8;
+ hv_vmbus_input_signal_event event;
+} __packed hv_vmbus_input_signal_event_buffer;
+
typedef struct hv_vmbus_channel {
TAILQ_ENTRY(hv_vmbus_channel) list_entry;
struct hv_device* device;
@@ -688,8 +764,82 @@ typedef struct hv_vmbus_channel {
hv_vmbus_pfn_channel_callback on_channel_callback;
void* channel_callback_context;
+ /*
+ * If batched_reading is set to "true", mask the interrupt
+ * and read until the channel is empty.
+ * If batched_reading is set to "false", the channel is not
+ * going to perform batched reading.
+ *
+ * Batched reading is enabled by default; specific
+ * drivers that don't want this behavior can turn it off.
+ */
+ boolean_t batched_reading;
+
+ boolean_t is_dedicated_interrupt;
+
+ /*
+ * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall.
+ */
+ hv_vmbus_input_signal_event_buffer signal_event_buffer;
+ /*
+ * 8-bytes aligned of the buffer above
+ */
+ hv_vmbus_input_signal_event *signal_event_param;
+
+ /*
+ * From Win8, this field specifies the target virtual processor
+ * on which to deliver the interrupt from the host to the guest.
+ * Before Win8, all channel interrupts would only be
+ * delivered on cpu 0. Setting this value to 0 would preserve
+ * the earlier behavior.
+ */
+ uint32_t target_vcpu;
+ /* The corresponding CPUID in the guest */
+ uint32_t target_cpu;
+
+ /*
+ * Support for multi-channels.
+ * The initial offer is considered the primary channel and this
+ * offer message will indicate if the host supports multi-channels.
+ * The guest is free to ask for multi-channels to be offerred and can
+ * open these multi-channels as a normal "primary" channel. However,
+ * all multi-channels will have the same type and instance guids as the
+ * primary channel. Requests sent on a given channel will result in a
+ * response on the same channel.
+ */
+
+ /*
+ * Multi-channel creation callback. This callback will be called in
+ * process context when a Multi-channel offer is received from the host.
+ * The guest can open the Multi-channel in the context of this callback.
+ */
+ hv_vmbus_sc_creation_callback sc_creation_callback;
+
+ struct mtx sc_lock;
+
+ /*
+ * Link list of all the multi-channels if this is a primary channel
+ */
+ TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor;
+ TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry;
+
+ /*
+ * The primary channel this sub-channel belongs to.
+ * This will be NULL for the primary channel.
+ */
+ struct hv_vmbus_channel *primary_channel;
+ /*
+ * Support per channel state for use by vmbus drivers.
+ */
+ void *per_channel_state;
} hv_vmbus_channel;
+/*
+ * Set the channel's batched_reading flag to "state".
+ * The flag's effect on interrupt masking is described next to the
+ * batched_reading member of hv_vmbus_channel.
+ */
+static inline void
+hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state)
+{
+ channel->batched_reading = state;
+}
+
typedef struct hv_device {
hv_guid class_id;
hv_guid device_id;
@@ -760,6 +910,8 @@ int hv_vmbus_channel_teardown_gpdal(
hv_vmbus_channel* channel,
uint32_t gpadl_handle);
+struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary);
+
/*
* Work abstraction defines
*/
@@ -819,6 +971,7 @@ typedef struct hv_vmbus_service {
extern uint8_t* receive_buffer[];
extern hv_vmbus_service service_table[];
+extern uint32_t hv_vmbus_protocal_version;
void hv_kvp_callback(void *context);
int hv_kvp_init(hv_vmbus_service *serv);
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
index d00d279..f8a871b 100644
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/condvar.h>
+#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
@@ -53,8 +54,12 @@ __FBSDID("$FreeBSD$");
#include <sys/callout.h>
#include <vm/vm.h>
#include <vm/pmap.h>
+#include <vm/uma.h>
#include <sys/lock.h>
#include <sys/sema.h>
+#include <sys/sglist.h>
+#include <machine/bus.h>
+#include <sys/bus_dma.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
@@ -66,7 +71,6 @@ __FBSDID("$FreeBSD$");
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>
-
#include <dev/hyperv/include/hyperv.h>
#include "hv_vstorage.h"
@@ -77,8 +81,29 @@ __FBSDID("$FreeBSD$");
#define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS
#define STORVSC_MAX_TARGETS (2)
+#define STORVSC_WIN7_MAJOR 4
+#define STORVSC_WIN7_MINOR 2
+
+#define STORVSC_WIN8_MAJOR 5
+#define STORVSC_WIN8_MINOR 1
+
+#define HV_ALIGN(x, a) roundup2(x, a)
+
struct storvsc_softc;
+struct hv_sgl_node {
+ LIST_ENTRY(hv_sgl_node) link;
+ struct sglist *sgl_data;
+};
+
+struct hv_sgl_page_pool{
+ LIST_HEAD(, hv_sgl_node) in_use_sgl_list;
+ LIST_HEAD(, hv_sgl_node) free_sgl_list;
+ boolean_t is_init;
+} g_hv_sgl_page_pool;
+
+/*
+ * Product of the request count and the per-request page-buffer count.
+ * The expansion is parenthesized so the macro can be used safely inside
+ * a larger expression (e.g. division or modulo).
+ */
+#define STORVSC_MAX_SG_PAGE_CNT \
+	(STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT)
+
enum storvsc_request_type {
WRITE_TYPE,
READ_TYPE,
@@ -96,20 +121,24 @@ struct hv_storvsc_request {
struct storvsc_softc *softc;
struct callout callout;
struct sema synch_sema; /*Synchronize the request/response if needed */
+ struct sglist *bounce_sgl;
+ unsigned int bounce_sgl_count;
+ uint64_t not_aligned_seg_bits;
};
struct storvsc_softc {
struct hv_device *hs_dev;
- LIST_HEAD(, hv_storvsc_request) hs_free_list;
- struct mtx hs_lock;
- struct storvsc_driver_props *hs_drv_props;
- int hs_unit;
- uint32_t hs_frozen;
- struct cam_sim *hs_sim;
- struct cam_path *hs_path;
+ LIST_HEAD(, hv_storvsc_request) hs_free_list;
+ struct mtx hs_lock;
+ struct storvsc_driver_props *hs_drv_props;
+ int hs_unit;
+ uint32_t hs_frozen;
+ struct cam_sim *hs_sim;
+ struct cam_path *hs_path;
uint32_t hs_num_out_reqs;
boolean_t hs_destroy;
boolean_t hs_drain_notify;
+ boolean_t hs_open_multi_channel;
struct sema hs_drain_sema;
struct hv_storvsc_request hs_init_req;
struct hv_storvsc_request hs_reset_req;
@@ -124,7 +153,7 @@ struct storvsc_softc {
* The first can be tested by "sg_senddiag -vv /dev/daX",
* and the second and third can be done by
* "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX".
- */
+ */
#define HVS_TIMEOUT_TEST 0
/*
@@ -138,7 +167,7 @@ struct storvsc_driver_props {
char *drv_name;
char *drv_desc;
uint8_t drv_max_luns_per_target;
- uint8_t drv_max_ios_per_target;
+ uint8_t drv_max_ios_per_target;
uint32_t drv_ringbuffer_size;
};
@@ -150,6 +179,8 @@ enum hv_storage_type {
#define HS_MAX_ADAPTERS 10
+#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1
+
/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */
static const hv_guid gStorVscDeviceType={
.data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d,
@@ -171,13 +202,16 @@ static struct storvsc_driver_props g_drv_props_table[] = {
STORVSC_RINGBUFFER_SIZE}
};
+static int storvsc_current_major;
+static int storvsc_current_minor;
+
/* static functions */
static int storvsc_probe(device_t dev);
static int storvsc_attach(device_t dev);
static int storvsc_detach(device_t dev);
static void storvsc_poll(struct cam_sim * sim);
static void storvsc_action(struct cam_sim * sim, union ccb * ccb);
-static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp);
+static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp);
static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp);
static enum hv_storage_type storvsc_get_storage_type(device_t dev);
static void hv_storvsc_on_channel_callback(void *context);
@@ -186,6 +220,14 @@ static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc,
struct hv_storvsc_request *request);
static int hv_storvsc_connect_vsp(struct hv_device *device);
static void storvsc_io_done(struct hv_storvsc_request *reqp);
+static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+ bus_dma_segment_t *orig_sgl,
+ unsigned int orig_sgl_count,
+ uint64_t seg_bits);
+void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+ unsigned int dest_sgl_count,
+ struct sglist* src_sgl,
+ uint64_t seg_bits);
static device_method_t storvsc_methods[] = {
/* Device interface */
@@ -207,7 +249,7 @@ MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);
/**
- * The host is capable of sending messages to us that are
+ * The host is capable of sending messages to us that are
* completely unsolicited. So, we need to address the race
* condition where we may be in the process of unloading the
* driver when the host may send us an unsolicited message.
@@ -223,7 +265,7 @@ MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);
* destroyed.
*
* 3. Once the device is marked as being destroyed, we only
- * permit incoming traffic to properly account for
+ * permit incoming traffic to properly account for
* packets already sent out.
*/
static inline struct storvsc_softc *
@@ -260,6 +302,113 @@ get_stor_device(struct hv_device *device,
}
/**
+ * @brief Callback handler invoked when a multi-channel offer is received
+ *
+ * @param context new multi-channel
+ */
+static void
+storvsc_handle_sc_creation(void *context)
+{
+	hv_vmbus_channel *new_channel;
+	struct hv_device *device;
+	struct storvsc_softc *sc;
+	struct vmstor_chan_props props;
+	int ret;
+
+	/* The context is the newly offered sub-channel. */
+	new_channel = (hv_vmbus_channel *)context;
+	device = new_channel->primary_channel->device;
+	sc = get_stor_device(device, TRUE);
+	if (sc == NULL)
+		return;
+
+	/*
+	 * Only open sub-channels once the multi-channel request has been
+	 * acknowledged by the host (flag set by
+	 * storvsc_send_multichannel_request()).
+	 */
+	if (FALSE == sc->hs_open_multi_channel)
+		return;
+
+	memset(&props, 0, sizeof(props));
+
+	/*
+	 * The sub-channel itself is passed as the callback context so the
+	 * receive path reads from the channel the packet arrived on.
+	 */
+	ret = hv_vmbus_channel_open(new_channel,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    (void *)&props,
+	    sizeof(struct vmstor_chan_props),
+	    hv_storvsc_on_channel_callback,
+	    new_channel);
+
+	/* Do not silently drop a failed open; at least report it. */
+	if (ret != 0)
+		printf("Storvsc_error: open multi-channel failed, %d\n",
+		    ret);
+}
+
+/**
+ * @brief Send multi-channel creation request to host
+ *
+ * Asks the host for MIN(max_chans, mp_ncpus) channels, waits up to five
+ * seconds for the completion and, on success, sets
+ * sc->hs_open_multi_channel so that offered sub-channels get opened by
+ * storvsc_handle_sc_creation().
+ *
+ * @param dev a Hyper-V device pointer
+ * @param max_chans the max channels supported by vmbus
+ */
+static void
+storvsc_send_multichannel_request(struct hv_device *dev, int max_chans)
+{
+	struct storvsc_softc *sc;
+	struct hv_storvsc_request *request;
+	struct vstor_packet *vstor_packet;
+	int request_channels_cnt;
+	int ret;
+
+	/* get multichannels count that need to create */
+	request_channels_cnt = MIN(max_chans, mp_ncpus);
+
+	sc = get_stor_device(dev, TRUE);
+	if (sc == NULL) {
+		printf("Storvsc_error: get sc failed while send mutilchannel "
+		    "request\n");
+		return;
+	}
+
+	request = &sc->hs_init_req;
+
+	/* Establish a handler for multi-channel */
+	dev->channel->sc_creation_callback = storvsc_handle_sc_creation;
+
+	/*
+	 * request the host to create multi-channel
+	 *
+	 * NOTE(review): hs_init_req is also the request used by
+	 * hv_storvsc_channel_init(), which destroys synch_sema in its
+	 * cleanup path; that is why the semaphore is not destroyed here.
+	 * Re-initialising it after the memset should be confirmed as safe.
+	 */
+	memset(request, 0, sizeof(struct hv_storvsc_request));
+
+	sema_init(&request->synch_sema, 0, ("stor_synch_sema"));
+
+	vstor_packet = &request->vstor_packet;
+
+	vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+	vstor_packet->u.multi_channels_cnt = request_channels_cnt;
+
+	ret = hv_vmbus_channel_send_packet(
+	    dev->channel,
+	    vstor_packet,
+	    sizeof(struct vstor_packet),
+	    (uint64_t)(uintptr_t)request,
+	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
+	    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+
+	/*
+	 * If the request never reached the host no completion will arrive;
+	 * bail out now instead of sleeping for the full timeout.
+	 */
+	if (ret != 0) {
+		printf("Storvsc_error: send multi-channel request failed, "
+		    "%d\n", ret);
+		return;
+	}
+
+	/* wait for 5 seconds */
+	ret = sema_timedwait(&request->synch_sema, 5 * hz);
+	if (ret != 0) {
+		printf("Storvsc_error: create multi-channel timeout, %d\n",
+		    ret);
+		return;
+	}
+
+	/* The channel callback copied the host's reply into vstor_packet. */
+	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+	    vstor_packet->status != 0) {
+		printf("Storvsc_error: create multi-channel invalid operation "
+		    "(%d) or statue (%u)\n",
+		    vstor_packet->operation, vstor_packet->status);
+		return;
+	}
+
+	sc->hs_open_multi_channel = TRUE;
+
+	if (bootverbose)
+		printf("Storvsc create multi-channel success!\n");
+}
+
+/**
* @brief initialize channel connection to parent partition
*
* @param dev a Hyper-V device pointer
@@ -272,11 +421,15 @@ hv_storvsc_channel_init(struct hv_device *dev)
struct hv_storvsc_request *request;
struct vstor_packet *vstor_packet;
struct storvsc_softc *sc;
+ uint16_t max_chans = 0;
+ boolean_t support_multichannel = FALSE;
+
+ max_chans = 0;
+ support_multichannel = FALSE;
sc = get_stor_device(dev, TRUE);
- if (sc == NULL) {
- return ENODEV;
- }
+ if (sc == NULL)
+ return (ENODEV);
request = &sc->hs_init_req;
memset(request, 0, sizeof(struct hv_storvsc_request));
@@ -300,15 +453,13 @@ hv_storvsc_channel_init(struct hv_device *dev)
HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
- if (ret != 0) {
+ if (ret != 0)
goto cleanup;
- }
-
- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */
- if (ret != 0) {
+ /* wait 5 seconds */
+ ret = sema_timedwait(&request->synch_sema, 5 * hz);
+ if (ret != 0)
goto cleanup;
- }
if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
vstor_packet->status != 0) {
@@ -321,7 +472,8 @@ hv_storvsc_channel_init(struct hv_device *dev)
vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION;
vstor_packet->flags = REQUEST_COMPLETION_FLAG;
- vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT;
+ vstor_packet->u.version.major_minor =
+ VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor);
/* revision is only significant for Windows guests */
vstor_packet->u.version.revision = 0;
@@ -334,21 +486,19 @@ hv_storvsc_channel_init(struct hv_device *dev)
HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
- if (ret != 0) {
+ if (ret != 0)
goto cleanup;
- }
- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */
+ /* wait 5 seconds */
+ ret = sema_timedwait(&request->synch_sema, 5 * hz);
- if (ret) {
+ if (ret)
goto cleanup;
- }
/* TODO: Check returned version */
if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
- vstor_packet->status != 0) {
+ vstor_packet->status != 0)
goto cleanup;
- }
/**
* Query channel properties
@@ -365,22 +515,30 @@ hv_storvsc_channel_init(struct hv_device *dev)
HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
- if ( ret != 0) {
+ if ( ret != 0)
goto cleanup;
- }
- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */
+ /* wait 5 seconds */
+ ret = sema_timedwait(&request->synch_sema, 5 * hz);
- if (ret != 0) {
+ if (ret != 0)
goto cleanup;
- }
/* TODO: Check returned version */
if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
- vstor_packet->status != 0) {
+ vstor_packet->status != 0) {
goto cleanup;
}
+ /* multi-channels feature is supported by WIN8 and above version */
+ max_chans = vstor_packet->u.chan_props.max_channel_cnt;
+ if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) &&
+ (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) &&
+ (vstor_packet->u.chan_props.flags &
+ HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) {
+ support_multichannel = TRUE;
+ }
+
memset(vstor_packet, 0, sizeof(struct vstor_packet));
vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION;
vstor_packet->flags = REQUEST_COMPLETION_FLAG;
@@ -397,16 +555,22 @@ hv_storvsc_channel_init(struct hv_device *dev)
goto cleanup;
}
- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */
+ /* wait 5 seconds */
+ ret = sema_timedwait(&request->synch_sema, 5 * hz);
- if (ret != 0) {
+ if (ret != 0)
goto cleanup;
- }
if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
- vstor_packet->status != 0) {
+ vstor_packet->status != 0)
goto cleanup;
- }
+
+ /*
+ * If multi-channel is supported, send multichannel create
+ * request to host.
+ */
+ if (support_multichannel)
+ storvsc_send_multichannel_request(dev, max_chans);
cleanup:
sema_destroy(&request->synch_sema);
@@ -443,8 +607,7 @@ hv_storvsc_connect_vsp(struct hv_device *dev)
(void *)&props,
sizeof(struct vmstor_chan_props),
hv_storvsc_on_channel_callback,
- dev);
-
+ dev->channel);
if (ret != 0) {
return ret;
@@ -490,7 +653,7 @@ hv_storvsc_host_reset(struct hv_device *dev)
goto cleanup;
}
- ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */
+ ret = sema_timedwait(&request->synch_sema, 5 * hz); /* KYS 5 seconds */
if (ret) {
goto cleanup;
@@ -498,7 +661,7 @@ hv_storvsc_host_reset(struct hv_device *dev)
/*
- * At this point, all outstanding requests in the adapter
+ * At this point, all outstanding requests in the adapter
* should have been flushed out and return to us
*/
@@ -521,6 +684,7 @@ hv_storvsc_io_request(struct hv_device *device,
{
struct storvsc_softc *sc;
struct vstor_packet *vstor_packet = &request->vstor_packet;
+ struct hv_vmbus_channel* outgoing_channel = NULL;
int ret = 0;
sc = get_stor_device(device, TRUE);
@@ -539,19 +703,20 @@ hv_storvsc_io_request(struct hv_device *device,
vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB;
+ outgoing_channel = vmbus_select_outgoing_channel(device->channel);
mtx_unlock(&request->softc->hs_lock);
if (request->data_buf.length) {
ret = hv_vmbus_channel_send_packet_multipagebuffer(
- device->channel,
+ outgoing_channel,
&request->data_buf,
- vstor_packet,
- sizeof(struct vstor_packet),
+ vstor_packet,
+ sizeof(struct vstor_packet),
(uint64_t)(uintptr_t)request);
} else {
ret = hv_vmbus_channel_send_packet(
- device->channel,
+ outgoing_channel,
vstor_packet,
sizeof(struct vstor_packet),
(uint64_t)(uintptr_t)request,
@@ -610,7 +775,8 @@ static void
hv_storvsc_on_channel_callback(void *context)
{
int ret = 0;
- struct hv_device *device = (struct hv_device *)context;
+ hv_vmbus_channel *channel = (hv_vmbus_channel *)context;
+ struct hv_device *device = NULL;
struct storvsc_softc *sc;
uint32_t bytes_recvd;
uint64_t request_id;
@@ -618,15 +784,22 @@ hv_storvsc_on_channel_callback(void *context)
struct hv_storvsc_request *request;
struct vstor_packet *vstor_packet;
+ if (channel->primary_channel != NULL){
+ device = channel->primary_channel->device;
+ } else {
+ device = channel->device;
+ }
+
+ KASSERT(device, ("device is NULL"));
+
sc = get_stor_device(device, FALSE);
if (sc == NULL) {
+ printf("Storvsc_error: get stor device failed.\n");
return;
}
- KASSERT(device, ("device"));
-
ret = hv_vmbus_channel_recv_packet(
- device->channel,
+ channel,
packet,
roundup2(sizeof(struct vstor_packet), 8),
&bytes_recvd,
@@ -634,21 +807,28 @@ hv_storvsc_on_channel_callback(void *context)
while ((ret == 0) && (bytes_recvd > 0)) {
request = (struct hv_storvsc_request *)(uintptr_t)request_id;
- KASSERT(request, ("request"));
if ((request == &sc->hs_init_req) ||
(request == &sc->hs_reset_req)) {
memcpy(&request->vstor_packet, packet,
sizeof(struct vstor_packet));
- sema_post(&request->synch_sema);
+ sema_post(&request->synch_sema);
} else {
vstor_packet = (struct vstor_packet *)packet;
switch(vstor_packet->operation) {
case VSTOR_OPERATION_COMPLETEIO:
+ if (request == NULL)
+ panic("VMBUS: storvsc received a "
+ "packet with NULL request id in "
+ "COMPLETEIO operation.");
+
hv_storvsc_on_iocompletion(sc,
vstor_packet, request);
break;
case VSTOR_OPERATION_REMOVEDEVICE:
+ case VSTOR_OPERATION_ENUMERATE_BUS:
+ printf("VMBUS: storvsc operation %d not "
+ "implemented.\n", vstor_packet->operation);
/* TODO: implement */
break;
default:
@@ -656,7 +836,7 @@ hv_storvsc_on_channel_callback(void *context)
}
}
ret = hv_vmbus_channel_recv_packet(
- device->channel,
+ channel,
packet,
roundup2(sizeof(struct vstor_packet), 8),
&bytes_recvd,
@@ -680,7 +860,16 @@ storvsc_probe(device_t dev)
{
int ata_disk_enable = 0;
int ret = ENXIO;
-
+
+ if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) ||
+ (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){
+ storvsc_current_major = STORVSC_WIN8_MAJOR;
+ storvsc_current_minor = STORVSC_WIN8_MINOR;
+ } else {
+ storvsc_current_major = STORVSC_WIN7_MAJOR;
+ storvsc_current_minor = STORVSC_WIN7_MINOR;
+ }
+
switch (storvsc_get_storage_type(dev)) {
case DRIVER_BLKVSC:
if(bootverbose)
@@ -721,9 +910,11 @@ storvsc_attach(device_t dev)
enum hv_storage_type stor_type;
struct storvsc_softc *sc;
struct cam_devq *devq;
- int ret, i;
+ int ret, i, j;
struct hv_storvsc_request *reqp;
struct root_hold_token *root_mount_token = NULL;
+ struct hv_sgl_node *sgl_node = NULL;
+ void *tmp_buff = NULL;
/*
* We need to serialize storvsc attach calls.
@@ -764,8 +955,41 @@ storvsc_attach(device_t dev)
LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
}
+ /* create sg-list page pool */
+ if (FALSE == g_hv_sgl_page_pool.is_init) {
+ g_hv_sgl_page_pool.is_init = TRUE;
+ LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list);
+ LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list);
+
+ /*
+ * Pre-create SG list, each SG list with
+ * HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each
+ * segment has one page buffer
+ */
+ for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) {
+ sgl_node = malloc(sizeof(struct hv_sgl_node),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+
+ sgl_node->sgl_data =
+ sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT,
+ M_WAITOK|M_ZERO);
+
+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) {
+ tmp_buff = malloc(PAGE_SIZE,
+ M_DEVBUF, M_WAITOK|M_ZERO);
+
+ sgl_node->sgl_data->sg_segs[j].ss_paddr =
+ (vm_paddr_t)tmp_buff;
+ }
+
+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list,
+ sgl_node, link);
+ }
+ }
+
sc->hs_destroy = FALSE;
sc->hs_drain_notify = FALSE;
+ sc->hs_open_multi_channel = FALSE;
sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");
ret = hv_storvsc_connect_vsp(hv_dev);
@@ -834,6 +1058,20 @@ cleanup:
LIST_REMOVE(reqp, link);
free(reqp, M_DEVBUF);
}
+
+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) {
+ if (NULL !=
+ (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
+ }
+ }
+ sglist_free(sgl_node->sgl_data);
+ free(sgl_node, M_DEVBUF);
+ }
+
return (ret);
}
@@ -853,6 +1091,8 @@ storvsc_detach(device_t dev)
struct storvsc_softc *sc = device_get_softc(dev);
struct hv_storvsc_request *reqp = NULL;
struct hv_device *hv_device = vmbus_get_devctx(dev);
+ struct hv_sgl_node *sgl_node = NULL;
+ int j = 0;
mtx_lock(&hv_device->channel->inbound_lock);
sc->hs_destroy = TRUE;
@@ -884,6 +1124,20 @@ storvsc_detach(device_t dev)
free(reqp, M_DEVBUF);
}
mtx_unlock(&sc->hs_lock);
+
+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){
+ if (NULL !=
+ (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
+ }
+ }
+ sglist_free(sgl_node->sgl_data);
+ free(sgl_node, M_DEVBUF);
+ }
+
return (0);
}
@@ -939,7 +1193,7 @@ storvsc_timeout_test(struct hv_storvsc_request *reqp,
ticks, __func__, (ret == 0)?
"IO return detected" :
"IO return not detected");
- /*
+ /*
* Now both the timer handler and io done are running
* simultaneously. We want to confirm the io done always
* finishes after the timer handler exits. So reqp used by
@@ -1023,7 +1277,7 @@ storvsc_poll(struct cam_sim *sim)
mtx_assert(&sc->hs_lock, MA_OWNED);
mtx_unlock(&sc->hs_lock);
- hv_storvsc_on_channel_callback(sc->hs_dev);
+ hv_storvsc_on_channel_callback(sc->hs_dev->channel);
mtx_lock(&sc->hs_lock);
}
@@ -1151,9 +1405,13 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb)
bzero(reqp, sizeof(struct hv_storvsc_request));
reqp->softc = sc;
-
- ccb->ccb_h.status |= CAM_SIM_QUEUED;
- create_storvsc_request(ccb, reqp);
+
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ if ((res = create_storvsc_request(ccb, reqp)) != 0) {
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+ }
if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
callout_init(&reqp->callout, CALLOUT_MPSAFE);
@@ -1194,6 +1452,212 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb)
}
/**
+ * @brief destroy bounce buffer
+ *
+ * This function is responsible for destroying a Scatter/Gather list
+ * that was created by storvsc_create_bounce_buffer().
+ *
+ * @param sgl - the Scatter/Gather list that needs to be destroyed
+ * @param sg_count- page count of the SG list.
+ *
+ */
+static void
+storvsc_destroy_bounce_buffer(struct sglist *sgl)
+{
+	struct hv_sgl_node *sgl_node = NULL;
+
+	/*
+	 * LIST_FIRST() yields NULL when the in-use list is empty, so the
+	 * node has to be checked before LIST_REMOVE() dereferences it.
+	 */
+	sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
+	if (NULL == sgl_node) {
+		printf("storvsc error: not enough in use sgl\n");
+		return;
+	}
+	LIST_REMOVE(sgl_node, link);
+
+	/* Attach the sglist to the node and return it to the free pool. */
+	sgl_node->sgl_data = sgl;
+	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link);
+}
+
+/**
+ * @brief create bounce buffer
+ *
+ * This function is responsible for create a Scatter/Gather list,
+ * which hold several pages that can be aligned with page size.
+ *
+ * @param seg_count- SG-list segments count
+ * @param write - if WRITE_TYPE, set SG list page used size to 0,
+ * otherwise set used size to page size.
+ *
+ * return NULL if create failed
+ */
+static struct sglist *
+storvsc_create_bounce_buffer(uint16_t seg_count, int write)
+{
+	int i = 0;
+	struct sglist *bounce_sgl = NULL;
+	/* Writes start with empty segments; other requests use full pages. */
+	unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE);
+	struct hv_sgl_node *sgl_node = NULL;
+
+	/*
+	 * Get a struct sglist from free_sgl_list.  LIST_FIRST() yields NULL
+	 * when the pool is exhausted, so check the node before
+	 * LIST_REMOVE() dereferences it.
+	 */
+	sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+	if (NULL == sgl_node) {
+		printf("storvsc error: not enough free sgl\n");
+		return NULL;
+	}
+	LIST_REMOVE(sgl_node, link);
+
+	/* Track the node as in use until the bounce buffer is destroyed. */
+	bounce_sgl = sgl_node->sgl_data;
+	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link);
+
+	bounce_sgl->sg_maxseg = seg_count;
+
+	if (write == WRITE_TYPE)
+		bounce_sgl->sg_nseg = 0;
+	else
+		bounce_sgl->sg_nseg = seg_count;
+
+	for (i = 0; i < seg_count; i++)
+		bounce_sgl->sg_segs[i].ss_len = buf_len;
+
+	return bounce_sgl;
+}
+
+/**
+ * @brief copy data from SG list to bounce buffer
+ *
+ * This function is responsible for copy data from one SG list's segments
+ * to another SG list which used as bounce buffer.
+ *
+ * @param bounce_sgl - the destination SG list
+ * @param orig_sgl - the segment of the source SG list.
+ * @param orig_sgl_count - the count of segments.
+ * @param seg_bits - indicates which segments need the bounce buffer;
+ * a bit set to 1 means the corresponding segment needs it.
+ *
+ */
+static void
+storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+			       bus_dma_segment_t *orig_sgl,
+			       unsigned int orig_sgl_count,
+			       uint64_t seg_bits)
+{
+	unsigned int src_sgl_idx = 0;
+
+	for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) {
+		/*
+		 * seg_bits is 64 bits wide, so build the mask from a 64-bit
+		 * constant; shifting a plain int 1 into or past the sign bit
+		 * is undefined.
+		 */
+		if (seg_bits & (1ULL << src_sgl_idx)) {
+			memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr,
+			    (void*)orig_sgl[src_sgl_idx].ds_addr,
+			    orig_sgl[src_sgl_idx].ds_len);
+
+			/* Record how many bytes of the bounce page are used. */
+			bounce_sgl->sg_segs[src_sgl_idx].ss_len =
+			    orig_sgl[src_sgl_idx].ds_len;
+		}
+	}
+}
+
+/**
+ * @brief copy data from SG list which used as bounce to another SG list
+ *
+ * This function is responsible for copy data from one SG list with bounce
+ * buffer to another SG list's segments.
+ *
+ * @param dest_sgl - the destination SG list's segments
+ * @param dest_sgl_count - the count of destination SG list's segment.
+ * @param src_sgl - the source SG list.
+ * @param seg_bits - indicate which segment used bounce buffer of src SG-list.
+ *
+ */
+void
+storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+				    unsigned int dest_sgl_count,
+				    struct sglist* src_sgl,
+				    uint64_t seg_bits)
+{
+	unsigned int sgl_idx = 0;
+
+	for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) {
+		/*
+		 * seg_bits is 64 bits wide, so build the mask from a 64-bit
+		 * constant; shifting a plain int 1 into or past the sign bit
+		 * is undefined.
+		 */
+		if (seg_bits & (1ULL << sgl_idx)) {
+			/* Copy back only segments that were bounced. */
+			memcpy((void*)(dest_sgl[sgl_idx].ds_addr),
+			    (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr),
+			    src_sgl->sg_segs[sgl_idx].ss_len);
+		}
+	}
+}
+
+/**
+ * @brief check SG list with bounce buffer or not
+ *
+ * This function is responsible for check if need bounce buffer for SG list.
+ *
+ * @param sgl - the SG list's segments
+ * @param sg_count - the count of SG list's segment.
+ * @param bits - bitmask of the segments that need bounce buffer
+ *
+ * return -1 if SG list needless bounce buffer
+ */
+static int
+storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl,
+				unsigned int sg_count,
+				uint64_t *bits)
+{
+	unsigned int i = 0;
+	int offset = 0;
+	uint64_t phys_addr = 0;
+	uint64_t tmp_bits = 0;
+	boolean_t found_hole = FALSE;
+	boolean_t pre_aligned = TRUE;
+
+	/* Give the caller a defined mask on every return path. */
+	*bits = 0;
+
+	/* Fewer than two segments never need a bounce buffer. */
+	if (sg_count < 2){
+		return -1;
+	}
+
+	phys_addr = vtophys(sgl[0].ds_addr);
+	offset = phys_addr - trunc_page(phys_addr);
+
+	if (offset != 0) {
+		pre_aligned = FALSE;
+		tmp_bits |= 1;
+	}
+
+	for (i = 1; i < sg_count; i++) {
+		phys_addr = vtophys(sgl[i].ds_addr);
+		offset = phys_addr - trunc_page(phys_addr);
+
+		if (offset == 0) {
+			if (FALSE == pre_aligned){
+				/*
+				 * This segment is aligned, if the previous
+				 * one is not aligned, find a hole
+				 */
+				found_hole = TRUE;
+			}
+			pre_aligned = TRUE;
+		} else {
+			/*
+			 * tmp_bits is 64 bits wide, so set the bit with a
+			 * 64-bit constant; an int shift is undefined once it
+			 * reaches the sign bit.
+			 */
+			tmp_bits |= 1ULL << i;
+			if (!pre_aligned) {
+				if (phys_addr != vtophys(sgl[i-1].ds_addr +
+				    sgl[i-1].ds_len)) {
+					/*
+					 * Check whether connect to previous
+					 * segment,if not, find the hole
+					 */
+					found_hole = TRUE;
+				}
+			} else {
+				found_hole = TRUE;
+			}
+			pre_aligned = FALSE;
+		}
+	}
+
+	/* The mask is only reported when a hole was actually found. */
+	if (!found_hole) {
+		return (-1);
+	} else {
+		*bits = tmp_bits;
+		return 0;
+	}
+}
+
+/**
* @brief Fill in a request structure based on a CAM control block
*
* Fills in a request structure based on the contents of a CAM control
@@ -1203,7 +1667,7 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb)
* @param ccb pointer to a CAM contorl block
* @param reqp pointer to a request structure
*/
-static void
+static int
create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
{
struct ccb_scsiio *csio = &ccb->csio;
@@ -1211,6 +1675,7 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
uint32_t bytes_to_copy = 0;
uint32_t pfn_num = 0;
uint32_t pfn;
+ uint64_t not_aligned_seg_bits = 0;
/* refer to struct vmscsi_req for meanings of these two fields */
reqp->vstor_packet.u.vm_srb.port =
@@ -1231,48 +1696,172 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
}
switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
- case CAM_DIR_OUT:
- reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE;
- break;
- case CAM_DIR_IN:
- reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE;
- break;
- case CAM_DIR_NONE:
- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
- break;
- default:
- reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
- break;
+ case CAM_DIR_OUT:
+ reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE;
+ break;
+ case CAM_DIR_IN:
+ reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE;
+ break;
+ case CAM_DIR_NONE:
+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
+ break;
+ default:
+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
+ break;
}
reqp->sense_data = &csio->sense_data;
reqp->sense_info_len = csio->sense_len;
reqp->ccb = ccb;
- /*
- KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0,
- ("ccb is scatter gather valid\n"));
- */
- if (csio->dxfer_len != 0) {
- reqp->data_buf.length = csio->dxfer_len;
+
+ if (0 == csio->dxfer_len) {
+ return (0);
+ }
+
+ reqp->data_buf.length = csio->dxfer_len;
+
+ switch (ccb->ccb_h.flags & CAM_DATA_MASK) {
+ case CAM_DATA_VADDR:
+ {
bytes_to_copy = csio->dxfer_len;
phys_addr = vtophys(csio->data_ptr);
- reqp->data_buf.offset = phys_addr - trunc_page(phys_addr);
+ reqp->data_buf.offset = phys_addr & PAGE_MASK;
+
+ while (bytes_to_copy != 0) {
+ int bytes, page_offset;
+ phys_addr =
+ vtophys(&csio->data_ptr[reqp->data_buf.length -
+ bytes_to_copy]);
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[pfn_num] = pfn;
+ page_offset = phys_addr & PAGE_MASK;
+
+ bytes = min(PAGE_SIZE - page_offset, bytes_to_copy);
+
+ bytes_to_copy -= bytes;
+ pfn_num++;
+ }
+ break;
}
- while (bytes_to_copy != 0) {
- int bytes, page_offset;
- phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length -
- bytes_to_copy]);
- pfn = phys_addr >> PAGE_SHIFT;
- reqp->data_buf.pfn_array[pfn_num] = pfn;
- page_offset = phys_addr - trunc_page(phys_addr);
+ case CAM_DATA_SG:
+ {
+ int i = 0;
+ int offset = 0;
+ int ret;
+
+ bus_dma_segment_t *storvsc_sglist =
+ (bus_dma_segment_t *)ccb->csio.data_ptr;
+ u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt;
+
+ printf("Storvsc: get SG I/O operation, %d\n",
+ reqp->vstor_packet.u.vm_srb.data_in);
+
+ if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){
+ printf("Storvsc: %d segments is too much, "
+ "only support %d segments\n",
+ storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT);
+ return (EINVAL);
+ }
+
+ /*
+ * We create our own bounce buffer function currently. Ideally
+ * we should use BUS_DMA(9) framework. But with current BUS_DMA
+ * code there is no callback API to check the page alignment of
+ * middle segments before busdma can decide if a bounce buffer
+ * is needed for particular segment. There is callback,
+ * "bus_dma_filter_t *filter", but the parameters are not
+ * sufficient for storvsc driver.
+ * TODO:
+ * Add page alignment check in BUS_DMA(9) callback. Once
+ * this is complete, switch the following code to use
+ * BUS_DMA(9) for storvsc bounce buffer support.
+ */
+ /* check if we need to create bounce buffer */
+ ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist,
+ storvsc_sg_count, &not_aligned_seg_bits);
+ if (ret != -1) {
+ reqp->bounce_sgl =
+ storvsc_create_bounce_buffer(storvsc_sg_count,
+ reqp->vstor_packet.u.vm_srb.data_in);
+ if (NULL == reqp->bounce_sgl) {
+ printf("Storvsc_error: "
+ "create bounce buffer failed.\n");
+ return (ENOMEM);
+ }
+
+ reqp->bounce_sgl_count = storvsc_sg_count;
+ reqp->not_aligned_seg_bits = not_aligned_seg_bits;
+
+ /*
+ * if it is write, we need copy the original data
+ *to bounce buffer
+ */
+ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+ storvsc_copy_sgl_to_bounce_buf(
+ reqp->bounce_sgl,
+ storvsc_sglist,
+ storvsc_sg_count,
+ reqp->not_aligned_seg_bits);
+ }
+
+ /* transfer virtual address to physical frame number */
+ if (reqp->not_aligned_seg_bits & 0x1){
+ phys_addr =
+ vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr);
+ }else{
+ phys_addr =
+ vtophys(storvsc_sglist[0].ds_addr);
+ }
+ reqp->data_buf.offset = phys_addr & PAGE_MASK;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[0] = pfn;
+
+ for (i = 1; i < storvsc_sg_count; i++) {
+ if (reqp->not_aligned_seg_bits & (1 << i)) {
+ phys_addr =
+ vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr);
+ } else {
+ phys_addr =
+ vtophys(storvsc_sglist[i].ds_addr);
+ }
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[i] = pfn;
+ }
+ } else {
+ phys_addr = vtophys(storvsc_sglist[0].ds_addr);
+
+ reqp->data_buf.offset = phys_addr & PAGE_MASK;
- bytes = min(PAGE_SIZE - page_offset, bytes_to_copy);
+ for (i = 0; i < storvsc_sg_count; i++) {
+ phys_addr = vtophys(storvsc_sglist[i].ds_addr);
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[i] = pfn;
+ }
- bytes_to_copy -= bytes;
- pfn_num++;
+ /* check the last segment cross boundary or not */
+ offset = phys_addr & PAGE_MASK;
+ if (offset) {
+ phys_addr =
+ vtophys(storvsc_sglist[i-1].ds_addr +
+ PAGE_SIZE - offset);
+ pfn = phys_addr >> PAGE_SHIFT;
+ reqp->data_buf.pfn_array[i] = pfn;
+ }
+
+ reqp->bounce_sgl_count = 0;
+ }
+ break;
+ }
+ default:
+ printf("Unknow flags: %d\n", ccb->ccb_h.flags);
+ return(EINVAL);
}
+
+ return(0);
}
/**
@@ -1291,7 +1880,29 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
struct ccb_scsiio *csio = &ccb->csio;
struct storvsc_softc *sc = reqp->softc;
struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
-
+ bus_dma_segment_t *ori_sglist = NULL;
+ int ori_sg_count = 0;
+
+ /* destroy bounce buffer if it is used */
+ if (reqp->bounce_sgl_count) {
+ ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
+ ori_sg_count = ccb->csio.sglist_cnt;
+
+ /*
+ * If it is READ operation, we should copy back the data
+ * to original SG list.
+ */
+ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+ storvsc_copy_from_bounce_buf_to_sgl(ori_sglist,
+ ori_sg_count,
+ reqp->bounce_sgl,
+ reqp->not_aligned_seg_bits);
+ }
+
+ storvsc_destroy_bounce_buffer(reqp->bounce_sgl);
+ reqp->bounce_sgl_count = 0;
+ }
+
if (reqp->retries > 0) {
mtx_lock(&sc->hs_lock);
#if HVS_TIMEOUT_TEST
@@ -1309,7 +1920,7 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
mtx_unlock(&sc->hs_lock);
}
- /*
+ /*
* callout_drain() will wait for the timer handler to finish
* if it is running. So we don't need any lock to synchronize
* between this routine and the timer handler.
diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h
index 2632676..deb9183 100644
--- a/sys/dev/hyperv/storvsc/hv_vstorage.h
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@@ -53,7 +53,7 @@
* V1 RC > 2008/1/31 2.0
*/
-#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0)
+#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1)
/**
* Packet structure ops describing virtual storage requests.
@@ -69,7 +69,10 @@ enum vstor_packet_ops {
VSTOR_OPERATION_ENDINITIALIZATION = 8,
VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9,
VSTOR_OPERATION_QUERYPROPERTIES = 10,
- VSTOR_OPERATION_MAXIMUM = 10
+ VSTOR_OPERATION_ENUMERATE_BUS = 11,
+ VSTOR_OPERATION_FCHBA_DATA = 12,
+ VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13,
+ VSTOR_OPERATION_MAXIMUM = 13
};
@@ -123,10 +126,12 @@ struct vmstor_chan_props {
uint8_t path_id;
uint8_t target_id;
+ uint16_t max_channel_cnt;
+
/**
* Note: port number is only really known on the client side
*/
- uint32_t port;
+ uint16_t port;
uint32_t flags;
uint32_t max_transfer_bytes;
@@ -193,6 +198,11 @@ struct vstor_packet {
* Used during version negotiations.
*/
struct vmstor_proto_ver version;
+
+ /**
+ * Number of multichannels to create
+ */
+ uint16_t multi_channels_cnt;
} u;
} __packed;
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c
index 848d364..4598510 100644
--- a/sys/dev/hyperv/utilities/hv_kvp.c
+++ b/sys/dev/hyperv/utilities/hv_kvp.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
#include <sys/_null.h>
#include <sys/signal.h>
#include <sys/syslog.h>
+#include <sys/systm.h>
#include <sys/mutex.h>
#include <net/if_arp.h>
@@ -232,7 +233,7 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp,
*/
if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) {
icframe_vercnt = 3;
- if (icmsg_vercnt >= 2)
+ if (icmsg_vercnt > 2)
icmsg_vercnt = 4;
else
icmsg_vercnt = 3;
@@ -734,8 +735,8 @@ hv_kvp_process_request(void *context)
recvlen = 0;
ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE,
&recvlen, &requestid);
- hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n",
- __func__, context, pending_cnt, ret, recvlen);
+ hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n",
+ __func__, context, (unsigned long long)pending_cnt, ret, recvlen);
}
}
@@ -813,9 +814,9 @@ static void
hv_kvp_dev_destroy(void)
{
- if (daemon_task != NULL) {
+ if (daemon_task != NULL) {
PROC_LOCK(daemon_task);
- kern_psignal(daemon_task, SIGKILL);
+ kern_psignal(daemon_task, SIGKILL);
PROC_UNLOCK(daemon_task);
}
diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c
index 3e545cf..dc4b1e2 100644
--- a/sys/dev/hyperv/utilities/hv_util.c
+++ b/sys/dev/hyperv/utilities/hv_util.c
@@ -408,6 +408,15 @@ hv_util_attach(device_t dev)
}
}
+ /*
+ * These services are not performance critical and do not need
+ * batched reading. Furthermore, some services such as KVP can
+ * only handle one message from the host at a time.
+ * Turn off batched reading for all util drivers before we open the
+ * channel.
+ */
+ hv_set_channel_read_state(hv_dev->channel, FALSE);
+
ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE,
4 * PAGE_SIZE, NULL, 0,
service->callback, hv_dev->channel);
diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c
index 103260a..94137fb 100644
--- a/sys/dev/hyperv/vmbus/hv_channel.c
+++ b/sys/dev/hyperv/vmbus/hv_channel.c
@@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel)
(uint32_t *)&monitor_page->
trigger_group[channel->monitor_group].u.pending);
} else {
- hv_vmbus_set_event(channel->offer_msg.child_rel_id);
+ hv_vmbus_set_event(channel);
}
}
@@ -99,6 +99,18 @@ hv_vmbus_channel_open(
hv_vmbus_channel_open_channel* open_msg;
hv_vmbus_channel_msg_info* open_info;
+ mtx_lock(&new_channel->sc_lock);
+ if (new_channel->state == HV_CHANNEL_OPEN_STATE) {
+ new_channel->state = HV_CHANNEL_OPENING_STATE;
+ } else {
+ mtx_unlock(&new_channel->sc_lock);
+ if(bootverbose)
+ printf("VMBUS: Trying to open channel <%p> which in "
+ "%d state.\n", new_channel, new_channel->state);
+ return (EINVAL);
+ }
+ mtx_unlock(&new_channel->sc_lock);
+
new_channel->on_channel_callback = pfn_on_channel_callback;
new_channel->channel_callback_context = context;
@@ -162,7 +174,7 @@ hv_vmbus_channel_open(
new_channel->ring_buffer_gpadl_handle;
open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size
>> PAGE_SHIFT;
- open_msg->server_context_area_gpadl_handle = 0;
+ open_msg->target_vcpu = new_channel->target_vcpu;
if (user_data_len)
memcpy(open_msg->user_data, user_data, user_data_len);
@@ -182,10 +194,14 @@ hv_vmbus_channel_open(
ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */
- if (ret)
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: channel <%p> open timeout.\n", new_channel);
goto cleanup;
+ }
if (open_info->response.open_result.status == 0) {
+ new_channel->state = HV_CHANNEL_OPENED_STATE;
if(bootverbose)
printf("VMBUS: channel <%p> open success.\n", new_channel);
} else {
@@ -497,16 +513,20 @@ cleanup:
return (ret);
}
-/**
- * @brief Close the specified channel
- */
-void
-hv_vmbus_channel_close(hv_vmbus_channel *channel)
+static void
+hv_vmbus_channel_close_internal(hv_vmbus_channel *channel)
{
int ret = 0;
hv_vmbus_channel_close_channel* msg;
hv_vmbus_channel_msg_info* info;
+ channel->state = HV_CHANNEL_OPEN_STATE;
+ channel->sc_creation_callback = NULL;
+
+ /*
+ * Grab the lock to prevent race condition when a packet received
+ * and unloading driver is in the process.
+ */
mtx_lock(&channel->inbound_lock);
channel->on_channel_callback = NULL;
mtx_unlock(&channel->inbound_lock);
@@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel)
M_DEVBUF);
free(info, M_DEVBUF);
+}
+
+/**
+ * @brief Close the specified channel
+ */
+void
+hv_vmbus_channel_close(hv_vmbus_channel *channel)
+{
+ hv_vmbus_channel* sub_channel;
+
+ if (channel->primary_channel != NULL) {
+ /*
+ * We only close multi-channels when the primary is
+ * closed.
+ */
+ return;
+ }
/*
- * If we are closing the channel during an error path in
- * opening the channel, don't free the channel
- * since the caller will free the channel
+ * Close all multi-channels first.
*/
- if (channel->state == HV_CHANNEL_OPEN_STATE) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
-
- hv_vmbus_free_vmbus_channel(channel);
+ TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor,
+ sc_list_entry) {
+ if (sub_channel->state != HV_CHANNEL_OPENED_STATE)
+ continue;
+ hv_vmbus_channel_close_internal(sub_channel);
}
-
+ /*
+ * Then close the primary channel.
+ */
+ hv_vmbus_channel_close_internal(channel);
}
/**
@@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet(
uint32_t packet_len;
uint64_t aligned_data;
uint32_t packet_len_aligned;
+ boolean_t need_sig;
hv_vmbus_sg_buffer_list buffer_list[3];
packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len;
@@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0
- && !hv_vmbus_get_ring_buffer_interrupt_mask(
- &channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer(
int ret = 0;
int i = 0;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packetLen_aligned;
hv_vmbus_sg_buffer_list buffer_list[3];
@@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer(
buffer_list[2].data = &alignedData;
buffer_list[2].length = packetLen_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
@@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer(
int ret = 0;
uint32_t desc_size;
+ boolean_t need_sig;
uint32_t packet_len;
uint32_t packet_len_aligned;
uint32_t pfn_count;
@@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer(
buffer_list[2].data = &aligned_data;
buffer_list[2].length = packet_len_aligned - packet_len;
- ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3);
+ ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3,
+ &need_sig);
/* TODO: We should determine if this is optional */
- if (ret == 0 &&
- !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) {
+ if (ret == 0 && need_sig) {
vmbus_channel_set_event(channel);
}
diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
index 011e305..d13ece5 100644
--- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
+++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/mbuf.h>
@@ -50,6 +53,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr);
static void vmbus_channel_process_offer(void *context);
+/* Selects the outgoing channel for a primary channel; defined later in this file. */
+struct hv_vmbus_channel*
+ vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary);
/**
* Channel message dispatch table
@@ -233,6 +238,9 @@ hv_vmbus_allocate_channel(void)
return (NULL);
mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF);
+ mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF);
+
+ TAILQ_INIT(&channel->sc_list_anchor);
channel->control_work_queue = hv_work_queue_create("control");
@@ -262,6 +270,7 @@ ReleaseVmbusChannel(void *context)
void
hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
{
+ mtx_destroy(&channel->sc_lock);
mtx_destroy(&channel->inbound_lock);
/*
* We have to release the channel's workqueue/thread in
@@ -279,10 +288,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel)
static void
vmbus_channel_process_offer(void *context)
{
- int ret;
hv_vmbus_channel* new_channel;
boolean_t f_new;
hv_vmbus_channel* channel;
+ int ret;
new_channel = (hv_vmbus_channel*) context;
f_new = TRUE;
@@ -291,38 +300,76 @@ vmbus_channel_process_offer(void *context)
/*
* Make sure this is a new offer
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor,
list_entry)
{
- if (!memcmp(
- &channel->offer_msg.offer.interface_type,
- &new_channel->offer_msg.offer.interface_type,
- sizeof(hv_guid))
- && !memcmp(
- &channel->offer_msg.offer.interface_instance,
+ if (memcmp(&channel->offer_msg.offer.interface_type,
+ &new_channel->offer_msg.offer.interface_type,
+ sizeof(hv_guid)) == 0 &&
+ memcmp(&channel->offer_msg.offer.interface_instance,
&new_channel->offer_msg.offer.interface_instance,
- sizeof(hv_guid))) {
- f_new = FALSE;
- break;
- }
+ sizeof(hv_guid)) == 0) {
+ f_new = FALSE;
+ break;
+ }
}
if (f_new) {
- /* Insert at tail */
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
+ /* Insert at tail */
+ TAILQ_INSERT_TAIL(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ /*XXX add new channel to percpu_list */
if (!f_new) {
+ /*
+ * Check if this is a sub channel.
+ */
+ if (new_channel->offer_msg.offer.sub_channel_index != 0) {
+ /*
+ * It is a sub channel offer, process it.
+ */
+ new_channel->primary_channel = channel;
+ mtx_lock(&channel->sc_lock);
+ TAILQ_INSERT_TAIL(
+ &channel->sc_list_anchor,
+ new_channel,
+ sc_list_entry);
+ mtx_unlock(&channel->sc_lock);
+
+ /* Insert new channel into channel_anchor. */
+ printf("Storvsc get multi-channel offer, rel=%u.\n",
+ new_channel->offer_msg.child_rel_id);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor,
+ new_channel, list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+
+ if(bootverbose)
+ printf("VMBUS: new multi-channel offer <%p>.\n",
+ new_channel);
+
+ /*XXX add it to percpu_list */
+
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+ if (channel->sc_creation_callback != NULL) {
+ channel->sc_creation_callback(new_channel);
+ }
+ return;
+ }
+
hv_vmbus_free_vmbus_channel(new_channel);
return;
}
+ new_channel->state = HV_CHANNEL_OPEN_STATE;
+
/*
* Start the process of binding this offer to the driver
* (We need to set the device field before calling
@@ -333,35 +380,86 @@ vmbus_channel_process_offer(void *context)
new_channel->offer_msg.offer.interface_instance, new_channel);
/*
- * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below
- * but in the "open" channel request. The ret != 0 logic below
- * doesn't take into account that a channel
- * may have been opened successfully
- */
-
- /*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
*/
ret = hv_vmbus_child_device_register(new_channel->device);
if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_anchor,
- new_channel,
- list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
- hv_vmbus_free_vmbus_channel(new_channel);
- } else {
- /*
- * This state is used to indicate a successful open
- * so that when we do close the channel normally,
- * we can clean up properly
- */
- new_channel->state = HV_CHANNEL_OPEN_STATE;
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
+ TAILQ_REMOVE(
+ &hv_vmbus_g_connection.channel_anchor,
+ new_channel,
+ list_entry);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+ hv_vmbus_free_vmbus_channel(new_channel);
+ }
+}
+
+/**
+ * Array of device guids that are performance critical. We try to distribute
+ * the interrupt load for these devices across all online cpus.
+ */
+static const hv_guid high_perf_devices[] = {
+ {HV_NIC_GUID, },
+ {HV_IDE_GUID, },
+ {HV_SCSI_GUID, },
+};
+
+/*
+ * Indices into high_perf_devices[]; the enumerators are listed in the
+ * same order as the array entries.  MAX_PERF_CHN is the entry count and
+ * is used as the loop bound when scanning the table.
+ */
+enum {
+ PERF_CHN_NIC = 0,
+ PERF_CHN_IDE,
+ PERF_CHN_SCSI,
+ MAX_PERF_CHN,
+};
+
+/*
+ * We use this static number to distribute the channel interrupt load.
+ */
+static uint32_t next_vcpu;
+
+/**
+ * Starting with Win8, we can statically distribute the incoming
+ * channel interrupt load by binding a channel to VCPU. We
+ * implement here a simple round robin scheme for distributing
+ * the interrupt load.
+ * We will bind channels that are not performance critical to cpu 0 and
+ * performance critical channels (IDE, SCSI and Network) will be uniformly
+ * distributed across all available CPUs.
+ */
+static void
+vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid)
+{
+ uint32_t current_cpu;
+ int i;
+ boolean_t is_perf_channel = FALSE;
+
+ /* Look the offered GUID up in the performance-critical device table. */
+ for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) {
+ if (memcmp(guid->data, high_perf_devices[i].data,
+ sizeof(hv_guid)) == 0) {
+ is_perf_channel = TRUE;
+ break;
+ }
+ }
+ /*
+ * Channels negotiated with the WS2008 or WIN7 protocol version, and
+ * channels whose GUID is not in the table, are always bound to cpu 0.
+ */
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) ||
+ (!is_perf_channel)) {
+ /* Host's view of guest cpu */
+ channel->target_vcpu = 0;
+ /* Guest's own view of cpu */
+ channel->target_cpu = 0;
+ return;
}
+ /* mp_ncpus should have the number cpus currently online */
+ /* Round robin: advance the shared counter and wrap it to mp_ncpus. */
+ current_cpu = (++next_vcpu % mp_ncpus);
+ channel->target_cpu = current_cpu;
+ /* Look up the vcpu index recorded for the chosen cpu. */
+ channel->target_vcpu =
+ hv_vmbus_g_context.hv_vcpu_index[current_cpu];
+ /* Here i is the index of the matching high_perf_devices[] entry. */
+ if (bootverbose)
+ printf("VMBUS: Total online cpus %d, assign perf channel %d "
+ "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu,
+ current_cpu);
}
/**
@@ -391,6 +489,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr)
if (new_channel == NULL)
return;
+ /*
+ * By default we setup state to enable batched
+ * reading. A specific service can choose to
+ * disable this prior to opening the channel.
+ */
+ new_channel->batched_reading = TRUE;
+
+ new_channel->signal_event_param =
+ (hv_vmbus_input_signal_event *)
+ (HV_ALIGN_UP((unsigned long)
+ &new_channel->signal_event_buffer,
+ HV_HYPERCALL_PARAM_ALIGN));
+
+ new_channel->signal_event_param->connection_id.as_uint32_t = 0;
+ new_channel->signal_event_param->connection_id.u.id =
+ HV_VMBUS_EVENT_CONNECTION_ID;
+ new_channel->signal_event_param->flag_number = 0;
+ new_channel->signal_event_param->rsvd_z = 0;
+
+ if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) {
+ new_channel->is_dedicated_interrupt =
+ (offer->is_dedicated_interrupt != 0);
+ new_channel->signal_event_param->connection_id.u.id =
+ offer->connection_id;
+ }
+
+ /*
+ * Bind the channel to a chosen cpu.
+ */
+ vmbus_channel_select_cpu(new_channel,
+ &offer->offer.interface_type);
+
memcpy(&new_channel->offer_msg, offer,
sizeof(hv_vmbus_channel_offer_channel));
new_channel->monitor_group = (uint8_t) offer->monitor_id / 32;
@@ -666,7 +796,7 @@ hv_vmbus_release_unattached_channels(void)
{
hv_vmbus_channel *channel;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) {
channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor);
@@ -676,5 +806,61 @@ hv_vmbus_release_unattached_channels(void)
hv_vmbus_child_device_unregister(channel->device);
hv_vmbus_free_vmbus_channel(channel);
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+}
+
+/**
+ * @brief Select the best outgoing channel
+ *
+ * The channel whose vcpu binding is closest to the current vcpu will
+ * be selected.
+ * If no multi-channel, always select primary channel
+ *
+ * @param primary - primary channel
+ */
+struct hv_vmbus_channel *
+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary)
+{
+	hv_vmbus_channel *candidate = NULL;
+	hv_vmbus_channel *best = primary;
+	int best_distance = 0;
+	int candidate_distance = 0;
+	int this_vcpu = 0;
+	int cpu_id = PCPU_GET(cpuid);
+
+	/*
+	 * With no sub-channels, or with a cpu id outside the vcpu index
+	 * table, the primary channel is the only choice.
+	 */
+	if (TAILQ_EMPTY(&primary->sc_list_anchor) || cpu_id >= MAXCPU)
+		return best;
+
+	this_vcpu = hv_vmbus_g_context.hv_vcpu_index[cpu_id];
+
+	TAILQ_FOREACH(candidate, &primary->sc_list_anchor, sc_list_entry) {
+		/* Only sub-channels that are already opened can be used. */
+		if (candidate->state != HV_CHANNEL_OPENED_STATE)
+			continue;
+
+		/* An exact vcpu match wins immediately. */
+		if (candidate->target_vcpu == this_vcpu)
+			return candidate;
+
+		if (best->target_vcpu > this_vcpu)
+			best_distance = best->target_vcpu - this_vcpu;
+		else
+			best_distance = this_vcpu - best->target_vcpu;
+
+		if (candidate->target_vcpu > this_vcpu)
+			candidate_distance = candidate->target_vcpu - this_vcpu;
+		else
+			candidate_distance = this_vcpu - candidate->target_vcpu;
+
+		/* On a tie the later candidate replaces the current best. */
+		if (candidate_distance <= best_distance)
+			best = candidate;
+	}
+
+	return(best);
}
diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c
index c8e0b48..cc83037 100644
--- a/sys/dev/hyperv/vmbus/hv_connection.c
+++ b/sys/dev/hyperv/vmbus/hv_connection.c
@@ -26,6 +26,9 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
@@ -45,14 +48,113 @@ hv_vmbus_connection hv_vmbus_g_connection =
{ .connect_state = HV_DISCONNECTED,
.next_gpadl_handle = 0xE1E10, };
+uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008;
+
+/*
+ * Return the protocol version to fall back to after current_ver:
+ * WIN8_1 -> WIN8 -> WIN7 -> WS2008. HV_VMBUS_VERSION_INVALID is returned
+ * for WS2008 and for any other value, i.e. there is nothing left to try.
+ */
+static uint32_t
+hv_vmbus_get_next_version(uint32_t current_ver)
+{
+	if (current_ver == HV_VMBUS_VERSION_WIN8_1)
+		return (HV_VMBUS_VERSION_WIN8);
+
+	if (current_ver == HV_VMBUS_VERSION_WIN8)
+		return (HV_VMBUS_VERSION_WIN7);
+
+	if (current_ver == HV_VMBUS_VERSION_WIN7)
+		return (HV_VMBUS_VERSION_WS2008);
+
+	return (HV_VMBUS_VERSION_INVALID);
+}
+
+/**
+ * Ask the host to accept one specific vmbus protocol version.
+ *
+ * Fills msg_info->msg with an INITIATED_CONTACT request carrying the
+ * requested version and the physical addresses of the interrupt page and
+ * the two monitor pages, posts it, and waits for the version response.
+ *
+ * @param msg_info - caller-allocated request tracking structure; its
+ *                   wait_sema is initialised here
+ * @param version  - vmbus protocol version to request
+ *
+ * @return 0 if the host accepted the version (connect_state is then set
+ *         to HV_CONNECTED), the hv_vmbus_post_message() error if the
+ *         request could not be posted, the sema_timedwait() error
+ *         (EWOULDBLOCK) if no response arrived in time, or ECONNREFUSED
+ *         if the host answered but does not support the version.
+ */
+static int
+hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info,
+	uint32_t version)
+{
+	int ret = 0;
+	hv_vmbus_channel_initiate_contact *msg;
+
+	sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
+	msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
+
+	msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
+	msg->vmbus_version_requested = version;
+
+	msg->interrupt_page = hv_get_phys_addr(
+	    hv_vmbus_g_connection.interrupt_page);
+
+	msg->monitor_page_1 = hv_get_phys_addr(
+	    hv_vmbus_g_connection.monitor_pages);
+
+	/* The second monitor page starts PAGE_SIZE bytes after the first. */
+	msg->monitor_page_2 =
+	    hv_get_phys_addr(
+		((uint8_t *) hv_vmbus_g_connection.monitor_pages
+		    + PAGE_SIZE));
+
+	/**
+	 * Add to list before we send the request since we may receive the
+	 * response before returning from this routine
+	 */
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	TAILQ_INSERT_TAIL(
+	    &hv_vmbus_g_connection.channel_msg_anchor,
+	    msg_info,
+	    msg_list_entry);
+
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	ret = hv_vmbus_post_message(
+	    msg,
+	    sizeof(hv_vmbus_channel_initiate_contact));
+
+	if (ret != 0) {
+		/* Nothing was sent, so no response will ever arrive. */
+		mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		TAILQ_REMOVE(
+		    &hv_vmbus_g_connection.channel_msg_anchor,
+		    msg_info,
+		    msg_list_entry);
+		mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+		return (ret);
+	}
+
+	/**
+	 * Wait for the connection response.
+	 * NOTE(review): sema_timedwait() takes its timeout in ticks, so 500
+	 * is 5 seconds only when hz == 100 -- confirm the intended duration.
+	 */
+	ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+
+	mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+	TAILQ_REMOVE(
+	    &hv_vmbus_g_connection.channel_msg_anchor,
+	    msg_info,
+	    msg_list_entry);
+	mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+
+	/*
+	 * Report a timeout as a timeout. Falling through would inspect a
+	 * response that never arrived and replace the error with
+	 * ECONNREFUSED, so the caller could not tell "no answer" from
+	 * "version rejected".
+	 */
+	if (ret != 0)
+		return (ret);
+
+	/**
+	 * Check if successful
+	 */
+	if (msg_info->response.version_response.version_supported) {
+		hv_vmbus_g_connection.connect_state = HV_CONNECTED;
+	} else {
+		ret = ECONNREFUSED;
+	}
+
+	return (ret);
+}
+
/**
* Send a connect request on the partition service connection
*/
int
hv_vmbus_connect(void) {
int ret = 0;
+ uint32_t version;
hv_vmbus_channel_msg_info* msg_info = NULL;
- hv_vmbus_channel_initiate_contact* msg;
/**
* Make sure we are not connecting or connected
@@ -74,7 +176,7 @@ hv_vmbus_connect(void) {
TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor);
mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel",
- NULL, MTX_SPIN);
+ NULL, MTX_DEF);
/**
* Setup the vmbus event connection for channel interrupt abstraction
@@ -130,71 +232,30 @@ hv_vmbus_connect(void) {
goto cleanup;
}
- sema_init(&msg_info->wait_sema, 0, "Msg Info Sema");
- msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg;
-
- msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT;
- msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER;
-
- msg->interrupt_page = hv_get_phys_addr(
- hv_vmbus_g_connection.interrupt_page);
-
- msg->monitor_page_1 = hv_get_phys_addr(
- hv_vmbus_g_connection.monitor_pages);
-
- msg->monitor_page_2 =
- hv_get_phys_addr(
- ((uint8_t *) hv_vmbus_g_connection.monitor_pages
- + PAGE_SIZE));
-
- /**
- * Add to list before we send the request since we may receive the
- * response before returning from this routine
+ /*
+ * Find the highest vmbus version number we can support.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- TAILQ_INSERT_TAIL(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
-
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
-
- ret = hv_vmbus_post_message(
- msg,
- sizeof(hv_vmbus_channel_initiate_contact));
-
- if (ret != 0) {
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- goto cleanup;
- }
+ version = HV_VMBUS_VERSION_CURRENT;
+
+ do {
+ ret = hv_vmbus_negotiate_version(msg_info, version);
+ if (ret == EWOULDBLOCK) {
+ /*
+ * We timed out.
+ */
+ goto cleanup;
+ }
- /**
- * Wait for the connection response
- */
- ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */
+ if (hv_vmbus_g_connection.connect_state == HV_CONNECTED)
+ break;
- mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock);
- TAILQ_REMOVE(
- &hv_vmbus_g_connection.channel_msg_anchor,
- msg_info,
- msg_list_entry);
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock);
+ version = hv_vmbus_get_next_version(version);
+ } while (version != HV_VMBUS_VERSION_INVALID);
- /**
- * Check if successful
- */
- if (msg_info->response.version_response.version_supported) {
- hv_vmbus_g_connection.connect_state = HV_CONNECTED;
- } else {
- ret = ECONNREFUSED;
- goto cleanup;
- }
+ hv_vmbus_protocal_version = version;
+ if (bootverbose)
+ printf("VMBUS: Portocal Version: %d.%d\n",
+ version >> 16, version & 0xFFFF);
sema_destroy(&msg_info->wait_sema);
free(msg_info, M_DEVBUF);
@@ -286,7 +347,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
* and channels are accessed without the need to take this lock or search
* the list.
*/
- mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_lock(&hv_vmbus_g_connection.channel_lock);
TAILQ_FOREACH(channel,
&hv_vmbus_g_connection.channel_anchor, list_entry) {
@@ -295,7 +356,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
break;
}
}
- mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+ mtx_unlock(&hv_vmbus_g_connection.channel_lock);
return (foundChannel);
}
@@ -306,7 +367,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) {
static void
VmbusProcessChannelEvent(uint32_t relid)
{
+ void* arg;
+ uint32_t bytes_to_read;
hv_vmbus_channel* channel;
+ boolean_t is_batched_reading;
/**
* Find the channel based on this relid and invokes
@@ -327,31 +391,98 @@ VmbusProcessChannelEvent(uint32_t relid)
* callback to NULL. This closes the window.
*/
- mtx_lock(&channel->inbound_lock);
+ /*
+ * Disable the lock due to newly added WITNESS check in r277723.
+ * Will seek other way to avoid race condition.
+ * -- whu
+ */
+ // mtx_lock(&channel->inbound_lock);
if (channel->on_channel_callback != NULL) {
- channel->on_channel_callback(channel->channel_callback_context);
+ arg = channel->channel_callback_context;
+ is_batched_reading = channel->batched_reading;
+ /*
+ * Optimize host to guest signaling by ensuring:
+ * 1. While reading the channel, we disable interrupts from
+ * host.
+ * 2. Ensure that we process all posted messages from the host
+ * before returning from this callback.
+ * 3. Once we return, enable signaling from the host. Once this
+ * state is set we check to see if additional packets are
+ * available to read. In this case we repeat the process.
+ */
+ do {
+ if (is_batched_reading)
+ hv_ring_buffer_read_begin(&channel->inbound);
+
+ channel->on_channel_callback(arg);
+
+ if (is_batched_reading)
+ bytes_to_read =
+ hv_ring_buffer_read_end(&channel->inbound);
+ else
+ bytes_to_read = 0;
+ } while (is_batched_reading && (bytes_to_read != 0));
}
- mtx_unlock(&channel->inbound_lock);
+ // mtx_unlock(&channel->inbound_lock);
}
+#ifdef HV_DEBUG_INTR
+extern uint32_t hv_intr_count;
+extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+extern uint32_t hv_vmbus_intr_cpu[MAXCPU];
+#endif
+
/**
* Handler for events
*/
void
hv_vmbus_on_events(void *arg)
{
- int dword;
int bit;
+ int cpu;
+ int dword;
+ void *page_addr;
+ uint32_t* recv_interrupt_page = NULL;
int rel_id;
- int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ int maxdword;
+ hv_vmbus_synic_event_flags *event;
/* int maxdword = PAGE_SIZE >> 3; */
- /*
- * receive size is 1/2 page and divide that by 4 bytes
- */
-
- uint32_t* recv_interrupt_page =
- hv_vmbus_g_connection.recv_interrupt_page;
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: "
+ "cpu out of range!"));
+
+#ifdef HV_DEBUG_INTR
+ int i;
+ hv_vmbus_swintr_event_cpu[cpu]++;
+ if (hv_intr_count % 10000 == 0) {
+ printf("VMBUS: Total interrupt %d\n", hv_intr_count);
+ for (i = 0; i < mp_ncpus; i++)
+ printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n",
+ i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]);
+ }
+#endif
+
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5;
+ /*
+ * receive size is 1/2 page and divide that by 4 bytes
+ */
+ recv_interrupt_page =
+ hv_vmbus_g_connection.recv_interrupt_page;
+ } else {
+ /*
+ * On Host with Win8 or above, the event page can be
+ * checked directly to get the id of the channel
+ * that has the pending interrupt.
+ */
+ maxdword = HV_EVENT_FLAGS_DWORD_COUNT;
+ page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu];
+ event = (hv_vmbus_synic_event_flags *)
+ page_addr + HV_VMBUS_MESSAGE_SINT;
+ recv_interrupt_page = event->flags32;
+ }
/*
* Check events
@@ -416,16 +547,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) {
* Send an event notification to the parent
*/
int
-hv_vmbus_set_event(uint32_t child_rel_id) {
+hv_vmbus_set_event(hv_vmbus_channel *channel) {
int ret = 0;
+ uint32_t child_rel_id = channel->offer_msg.child_rel_id;
/* Each uint32_t represents 32 channels */
synch_set_bit(child_rel_id & 31,
(((uint32_t *)hv_vmbus_g_connection.send_interrupt_page
+ (child_rel_id >> 5))));
- ret = hv_vmbus_signal_event();
+ ret = hv_vmbus_signal_event(channel->signal_event_param);
return (ret);
}
-
diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c
index 80a1f42..84e2a5e 100644
--- a/sys/dev/hyperv/vmbus/hv_hv.c
+++ b/sys/dev/hyperv/vmbus/hv_hv.c
@@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax,
hv_vmbus_context hv_vmbus_g_context = {
.syn_ic_initialized = FALSE,
.hypercall_page = NULL,
- .signal_event_param = NULL,
- .signal_event_buffer = NULL,
};
static struct timecounter hv_timecounter = {
@@ -256,28 +254,6 @@ hv_vmbus_init(void)
hv_vmbus_g_context.hypercall_page = virt_addr;
- /*
- * Setup the global signal event param for the signal event hypercall
- */
- hv_vmbus_g_context.signal_event_buffer =
- malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF,
- M_ZERO | M_NOWAIT);
- KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL,
- ("Error VMBUS: Failed to allocate signal_event_buffer\n"));
- if (hv_vmbus_g_context.signal_event_buffer == NULL)
- goto cleanup;
-
- hv_vmbus_g_context.signal_event_param =
- (hv_vmbus_input_signal_event*)
- (HV_ALIGN_UP((unsigned long)
- hv_vmbus_g_context.signal_event_buffer,
- HV_HYPERCALL_PARAM_ALIGN));
- hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0;
- hv_vmbus_g_context.signal_event_param->connection_id.u.id =
- HV_VMBUS_EVENT_CONNECTION_ID;
- hv_vmbus_g_context.signal_event_param->flag_number = 0;
- hv_vmbus_g_context.signal_event_param->rsvd_z = 0;
-
tc_init(&hv_timecounter); /* register virtual timecount */
return (0);
@@ -303,12 +279,6 @@ hv_vmbus_cleanup(void)
{
hv_vmbus_x64_msr_hypercall_contents hypercall_msr;
- if (hv_vmbus_g_context.signal_event_buffer != NULL) {
- free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF);
- hv_vmbus_g_context.signal_event_buffer = NULL;
- hv_vmbus_g_context.signal_event_param = NULL;
- }
-
if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) {
if (hv_vmbus_g_context.hypercall_page != NULL) {
hypercall_msr.as_uint64_t = 0;
@@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc(
* event IPC. (This involves a hypercall.)
*/
hv_vmbus_status
-hv_vmbus_signal_event()
+hv_vmbus_signal_event(void *con_id)
{
hv_vmbus_status status;
status = hv_vmbus_do_hypercall(
HV_CALL_SIGNAL_EVENT,
- hv_vmbus_g_context.signal_event_param,
+ con_id,
0) & 0xFFFF;
return (status);
@@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg)
{
int cpu;
+ uint64_t hv_vcpu_index;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
hv_vmbus_synic_scontrol sctrl;
@@ -403,23 +374,14 @@ hv_vmbus_synic_init(void *arg)
return;
/*
- * KYS: Looks like we can only initialize on cpu0; don't we support
- * SMP guests?
- *
- * TODO: Need to add SMP support for FreeBSD V9
- */
-
- if (cpu != 0)
- return;
-
- /*
* TODO: Check the version
*/
version = rdmsr(HV_X64_MSR_SVERSION);
-
- hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0];
- hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1];
+ hv_vmbus_g_context.syn_ic_msg_page[cpu] =
+ setup_args->page_buffers[2 * cpu];
+ hv_vmbus_g_context.syn_ic_event_page[cpu] =
+ setup_args->page_buffers[2 * cpu + 1];
/*
* Setup the Synic's message page
@@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg)
wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t);
/*HV_SHARED_SINT_IDT_VECTOR + 0x20; */
+ shared_sint.as_uint64_t = 0;
shared_sint.u.vector = setup_args->vector;
shared_sint.u.masked = FALSE;
- shared_sint.u.auto_eoi = FALSE;
+ shared_sint.u.auto_eoi = TRUE;
wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT,
shared_sint.as_uint64_t);
@@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg)
hv_vmbus_g_context.syn_ic_initialized = TRUE;
+ /*
+ * Set up the cpuid mapping from Hyper-V to FreeBSD.
+ * The array is indexed using FreeBSD cpuid.
+ */
+ hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX);
+ hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index;
+
return;
}
@@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg)
hv_vmbus_synic_sint shared_sint;
hv_vmbus_synic_simp simp;
hv_vmbus_synic_siefp siefp;
- int cpu = PCPU_GET(cpuid);
if (!hv_vmbus_g_context.syn_ic_initialized)
return;
- if (cpu != 0)
- return; /* TODO: XXXKYS: SMP? */
-
shared_sint.as_uint64_t = rdmsr(
HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT);
diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
index f7c1965..0e51ef7 100644
--- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c
+++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c
@@ -26,6 +26,8 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
@@ -144,6 +146,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info)
return (uint64_t) ring_info->ring_buffer->write_index << 32;
}
+/*
+ * Mark the start of a batched read of the ring: set interrupt_mask in
+ * the ring buffer header, then issue a full memory barrier.
+ *
+ * NOTE(review): presumably the mask asks the peer not to signal while the
+ * ring is being drained -- confirm against the host protocol.
+ */
+void
+hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ ring_info->ring_buffer->interrupt_mask = 1;
+ mb();
+}
+
+/*
+ * Mark the end of a batched read of the ring: clear interrupt_mask,
+ * issue a full memory barrier, and then re-check the ring.
+ *
+ * Returns the "read" value reported by get_ring_buffer_avail_bytes().
+ * A non-zero result means data arrived while the mask was still set and
+ * the caller has more to process.
+ */
+uint32_t
+hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info* ring_info)
+{
+ uint32_t read, write;
+
+ ring_info->ring_buffer->interrupt_mask = 0;
+ mb();
+
+ /*
+ * Now check to see if the ring buffer is still empty.
+ * If it is not, we raced and we need to process new
+ * incoming messages.
+ */
+ get_ring_buffer_avail_bytes(ring_info, &read, &write);
+
+ return (read);
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to
+ * be signaled. Here are the details of this protocol:
+ *
+ * 1. The host guarantees that while it is draining the
+ * ring buffer, it will set the interrupt_mask to
+ * indicate it does not need to be interrupted when
+ * new data is placed.
+ *
+ * 2. The host guarantees that it will completely drain
+ * the ring buffer before exiting the read loop. Further,
+ * once the ring buffer is empty, it will clear the
+ * interrupt_mask and re-check to see if new data has
+ * arrived.
+ *
+ * old_write_location is the write location before the current write;
+ * rbi is the ring that was written to.
+ * Returns TRUE when the host must be signaled, FALSE otherwise.
+ */
+static boolean_t
+hv_ring_buffer_needsig_on_write(
+ uint32_t old_write_location,
+ hv_vmbus_ring_buffer_info* rbi)
+{
+ /* Full barrier before interrupt_mask is sampled. */
+ mb();
+ if (rbi->ring_buffer->interrupt_mask)
+ return (FALSE);
+
+ /* Read memory barrier */
+ rmb();
+ /*
+ * The only case in which we need to signal is when the
+ * ring transitions from being empty to non-empty, i.e. the
+ * read index equals the location this write started at.
+ */
+ if (old_write_location == rbi->ring_buffer->read_index)
+ return (TRUE);
+
+ return (FALSE);
+}
+
static uint32_t copy_to_ring_buffer(
hv_vmbus_ring_buffer_info* ring_info,
uint32_t start_write_offset,
@@ -204,11 +269,13 @@ int
hv_ring_buffer_write(
hv_vmbus_ring_buffer_info* out_ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buffer_count)
+ uint32_t sg_buffer_count,
+ boolean_t *need_sig)
{
int i = 0;
uint32_t byte_avail_to_write;
uint32_t byte_avail_to_read;
+ uint32_t old_write_location;
uint32_t total_bytes_to_write = 0;
volatile uint32_t next_write_location;
@@ -242,6 +309,8 @@ hv_ring_buffer_write(
*/
next_write_location = get_next_write_location(out_ring_info);
+ old_write_location = next_write_location;
+
for (i = 0; i < sg_buffer_count; i++) {
next_write_location = copy_to_ring_buffer(out_ring_info,
next_write_location, (char *) sg_buffers[i].data,
@@ -258,9 +327,9 @@ hv_ring_buffer_write(
(char *) &prev_indices, sizeof(uint64_t));
/*
- * Make sure we flush all writes before updating the writeIndex
+ * Full memory barrier before updating the write index.
*/
- wmb();
+ mb();
/*
* Now, update the write location
@@ -269,6 +338,9 @@ hv_ring_buffer_write(
mtx_unlock_spin(&out_ring_info->ring_lock);
+ *need_sig = hv_ring_buffer_needsig_on_write(old_write_location,
+ out_ring_info);
+
return (0);
}
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
index ca28fd5..91813bb 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c
@@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$");
#include <machine/stdarg.h>
#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/segments.h>
#include <sys/pcpu.h>
+#include <machine/apicvar.h>
#include "hv_vmbus_priv.h"
#define VMBUS_IRQ 0x5
-static struct intr_event *hv_msg_intr_event;
-static struct intr_event *hv_event_intr_event;
-static void *msg_swintr;
-static void *event_swintr;
static device_t vmbus_devp;
-static void *vmbus_cookiep;
-static int vmbus_rid;
-struct resource *intr_res;
-static int vmbus_irq = VMBUS_IRQ;
static int vmbus_inited;
static hv_setup_args setup_args; /* only CPU 0 supported at this time */
@@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */
* the hypervisor.
*/
static void
-vmbus_msg_swintr(void *dummy)
+vmbus_msg_swintr(void *arg)
{
int cpu;
void* page_addr;
hv_vmbus_message* msg;
hv_vmbus_message* copied;
- cpu = PCPU_GET(cpuid);
+ cpu = (int)(long)arg;
+ KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: "
+ "cpu out of range!"));
+
page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu];
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
@@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy)
*
* The purpose of this routine is to determine the type of VMBUS protocol
* message to process - an event or a channel message.
- * As this is an interrupt filter routine, the function runs in a very
- * restricted envinronment. From the manpage for bus_setup_intr(9)
- *
- * In this restricted environment, care must be taken to account for all
- * races. A careful analysis of races should be done as well. It is gener-
- * ally cheaper to take an extra interrupt, for example, than to protect
- * variables with spinlocks. Read, modify, write cycles of hardware regis-
- * ters need to be carefully analyzed if other threads are accessing the
- * same registers.
*/
-static int
+static inline int
hv_vmbus_isr(void *unused)
{
int cpu;
@@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused)
void* page_addr;
cpu = PCPU_GET(cpuid);
- /* (Temporary limit) */
- KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero"));
/*
* The Windows team has advised that we check for events
@@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused)
event = (hv_vmbus_synic_event_flags*)
page_addr + HV_VMBUS_MESSAGE_SINT;
- /* Since we are a child, we only need to check bit 0 */
- if (synch_test_and_clear_bit(0, &event->flags32[0])) {
- swi_sched(event_swintr, 0);
+ if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) ||
+ (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) {
+ /* Since we are a child, we only need to check bit 0 */
+ if (synch_test_and_clear_bit(0, &event->flags32[0])) {
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
+ }
+ } else {
+ /*
+ * On host with Win8 or above, we can directly look at
+ * the event page. If bit n is set, we have an interrupt
+ * on the channel with id n.
+ * Directly schedule the event software interrupt on
+ * current cpu.
+ */
+ swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0);
}
/* Check if there are actual msgs to be process */
@@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused)
msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT;
if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) {
- swi_sched(msg_swintr, 0);
+ swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0);
}
return FILTER_HANDLED;
}
+#ifdef HV_DEBUG_INTR
+uint32_t hv_intr_count = 0;
+#endif
+uint32_t hv_vmbus_swintr_event_cpu[MAXCPU];
+uint32_t hv_vmbus_intr_cpu[MAXCPU];
+
+/*
+ * Run the vmbus interrupt service routine with preemption disabled.
+ * When HV_DEBUG_INTR is defined, the per-cpu and total interrupt counters
+ * are bumped first. The trap frame argument is not used.
+ *
+ * NOTE(review): presumably reached from the hv_vmbus_callback IDT entry --
+ * confirm against apic_vector.S.
+ */
+void
+hv_vector_handler(struct trapframe *trap_frame)
+{
+	/* Enter a critical section so the isr cannot be preempted. */
+	critical_enter();
+
+#ifdef HV_DEBUG_INTR
+	/* Interrupt statistics for the cpu taking the interrupt. */
+	hv_vmbus_intr_cpu[PCPU_GET(cpuid)]++;
+	hv_intr_count++;
+#endif
+
+	hv_vmbus_isr(NULL);
+
+	/* Leave the critical section; preemption is allowed again. */
+	critical_exit();
+}
+
static int
vmbus_read_ivar(
device_t dev,
@@ -316,6 +350,81 @@ vmbus_probe(device_t dev) {
return (BUS_PROBE_NOWILDCARD);
}
+#ifdef HYPERV
+extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback);
+
+/**
+ * @brief Find a free IDT slot and setup the interrupt handler.
+ *
+ * @return the vector now pointing at hv_vmbus_callback, or 0 when no slot
+ *         in the range [APIC_IPI_INTS, APIC_SPURIOUS_INT) is free.
+ */
+static int
+vmbus_vector_alloc(void)
+{
+	struct gate_descriptor *gate;
+	uintptr_t handler;
+	int slot;
+
+	/*
+	 * Search backwards from the highest IDT vector available for use
+	 * as vmbus channel callback vector. A slot that still points at
+	 * the 'rsvd' handler is free; install 'hv_vmbus_callback' there
+	 * and use it to interrupt vcpus.
+	 */
+	for (slot = APIC_SPURIOUS_INT - 1; slot >= APIC_IPI_INTS; slot--) {
+		gate = &idt[slot];
+		handler = ((long)gate->gd_hioffset << 16 | gate->gd_looffset);
+		if (handler != (uintptr_t)&IDTVEC(rsvd))
+			continue;
+
+#ifdef __i386__
+		setidt(slot , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT,
+		    SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#else
+		setidt(slot , IDTVEC(hv_vmbus_callback), SDT_SYSIGT,
+		    SEL_KPL, 0);
+#endif
+
+		return (slot);
+	}
+
+	return (0);
+}
+
+/**
+ * @brief Restore the IDT slot to rsvd.
+ *
+ * A vector of 0 means no vector is held and is ignored. Any other value
+ * must lie in [APIC_IPI_INTS, APIC_SPURIOUS_INT) and must currently point
+ * at hv_vmbus_callback; both conditions are asserted.
+ *
+ * @param vector - IDT vector to release
+ */
+static void
+vmbus_vector_free(int vector)
+{
+	uintptr_t func;
+	struct gate_descriptor *ip;
+
+	if (vector == 0)
+		return;
+
+	KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT,
+	    ("invalid vector %d", vector));
+
+	ip = &idt[vector];
+	func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+	KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback),
+	    ("invalid vector %d", vector));
+
+	/*
+	 * Use the same per-architecture gate type and selector that
+	 * vmbus_vector_alloc() uses when it installs the handler, so the
+	 * restored 'rsvd' gate is valid on i386 as well as on amd64.
+	 */
+#ifdef __i386__
+	setidt(vector, IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
+	    GSEL(GCODE_SEL, SEL_KPL));
+#else
+	setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+#endif
+}
+
+#else /* HYPERV */
+
+/*
+ * Stub used when the kernel is built without the HYPERV option: no IDT
+ * vector is allocated and 0 is always returned.
+ */
+static int
+vmbus_vector_alloc(void)
+{
+ return(0);
+}
+
+/*
+ * Stub used when the kernel is built without the HYPERV option: there is
+ * no vector to release, so this does nothing.
+ */
+static void
+vmbus_vector_free(int vector)
+{
+}
+
+#endif /* HYPERV */
+
/**
* @brief Main vmbus driver initialization routine.
*
@@ -331,22 +440,7 @@ vmbus_probe(device_t dev) {
static int
vmbus_bus_init(void)
{
- struct ioapic_intsrc {
- struct intsrc io_intsrc;
- u_int io_irq;
- u_int io_intpin:8;
- u_int io_vector:8;
- u_int io_cpu:8;
- u_int io_activehi:1;
- u_int io_edgetrigger:1;
- u_int io_masked:1;
- int io_bus:4;
- uint32_t io_lowreg;
- };
- int i, ret;
- unsigned int vector = 0;
- struct intsrc *isrc;
- struct ioapic_intsrc *intpin;
+ int i, j, n, ret;
if (vmbus_inited)
return (0);
@@ -361,80 +455,100 @@ vmbus_bus_init(void)
return (ret);
}
- ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr,
- NULL, SWI_CLOCK, 0, &msg_swintr);
-
- if (ret)
- goto cleanup;
-
/*
- * Message SW interrupt handler checks a per-CPU page and
- * thus the thread needs to be bound to CPU-0 - which is where
- * all interrupts are processed.
+ * Find a free IDT slot for vmbus callback.
*/
- ret = intr_event_bind(hv_msg_intr_event, 0);
-
- if (ret)
- goto cleanup1;
+ hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc();
- ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events,
- NULL, SWI_CLOCK, 0, &event_swintr);
-
- if (ret)
- goto cleanup1;
+ if (hv_vmbus_g_context.hv_cb_vector == 0) {
+ if(bootverbose)
+ printf("Error VMBUS: Cannot find free IDT slot for "
+ "vmbus callback!\n");
+ goto cleanup;
+ }
- intr_res = bus_alloc_resource(vmbus_devp,
- SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE);
+ if(bootverbose)
+ printf("VMBUS: vmbus callback vector %d\n",
+ hv_vmbus_g_context.hv_cb_vector);
- if (intr_res == NULL) {
- ret = ENOMEM; /* XXXKYS: Need a better errno */
- goto cleanup2;
+ /*
+ * Notify the hypervisor of our vector.
+ */
+ setup_args.vector = hv_vmbus_g_context.hv_cb_vector;
+
+ CPU_FOREACH(j) {
+ hv_vmbus_intr_cpu[j] = 0;
+ hv_vmbus_swintr_event_cpu[j] = 0;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.event_swintr[j] = NULL;
+ hv_vmbus_g_context.msg_swintr[j] = NULL;
+
+ for (i = 0; i < 2; i++)
+ setup_args.page_buffers[2 * j + i] = NULL;
}
/*
- * Setup interrupt filter handler
+ * Per cpu setup.
*/
- ret = bus_setup_intr(vmbus_devp, intr_res,
- INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL,
- NULL, &vmbus_cookiep);
-
- if (ret != 0)
- goto cleanup3;
-
- ret = bus_bind_intr(vmbus_devp, intr_res, 0);
- if (ret != 0)
- goto cleanup4;
-
- isrc = intr_lookup_source(vmbus_irq);
- if ((isrc == NULL) || (isrc->is_event == NULL)) {
- ret = EINVAL;
- goto cleanup4;
- }
+ CPU_FOREACH(j) {
+ /*
+ * Setup software interrupt thread and handler for msg handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j],
+ "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0,
+ &hv_vmbus_g_context.msg_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup msg swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /* vector = isrc->is_event->ie_vector; */
- intpin = (struct ioapic_intsrc *)isrc;
- vector = intpin->io_vector;
+ /*
+ * Bind the swi thread to the cpu.
+ */
+ ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j],
+ j);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to bind msg swi thread "
+ "to cpu %d\n", j);
+ goto cleanup1;
+ }
- if(bootverbose)
- printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector);
+ /*
+ * Setup software interrupt thread and handler for
+ * event handling.
+ */
+ ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j],
+ "hv_event", hv_vmbus_on_events, (void *)(long)j,
+ SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]);
+ if (ret) {
+ if(bootverbose)
+ printf("VMBUS: failed to setup event swi for "
+ "cpu %d\n", j);
+ goto cleanup1;
+ }
- /**
- * Notify the hypervisor of our irq.
- */
- setup_args.vector = vector;
- for(i = 0; i < 2; i++) {
- setup_args.page_buffers[i] =
+ /*
+ * Prepare the per cpu msg and event pages to be used on each cpu.
+ */
+ for(i = 0; i < 2; i++) {
+ setup_args.page_buffers[2 * j + i] =
malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (setup_args.page_buffers[i] == NULL) {
- KASSERT(setup_args.page_buffers[i] != NULL,
+ if (setup_args.page_buffers[2 * j + i] == NULL) {
+ KASSERT(setup_args.page_buffers[2 * j + i] != NULL,
("Error VMBUS: malloc failed!"));
- if (i > 0)
- free(setup_args.page_buffers[0], M_DEVBUF);
- goto cleanup4;
+ goto cleanup1;
+ }
}
}
- /* only CPU #0 supported at this time */
+ if (bootverbose)
+ printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n",
+ smp_started);
+
smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args);
/*
@@ -443,26 +557,32 @@ vmbus_bus_init(void)
ret = hv_vmbus_connect();
if (ret != 0)
- goto cleanup4;
+ goto cleanup1;
hv_vmbus_request_channel_offers();
return (ret);
- cleanup4:
-
+ cleanup1:
/*
- * remove swi, bus and intr resource
+ * Free pages alloc'ed
*/
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
+ for (n = 0; n < 2 * MAXCPU; n++)
+ if (setup_args.page_buffers[n] != NULL)
+ free(setup_args.page_buffers[n], M_DEVBUF);
- cleanup3:
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
-
- cleanup2:
- swi_remove(event_swintr);
+ /*
+ * remove swi and vmbus callback vector;
+ */
+ CPU_FOREACH(j) {
+ if (hv_vmbus_g_context.msg_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[j]);
+ if (hv_vmbus_g_context.event_swintr[j] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[j]);
+ hv_vmbus_g_context.hv_msg_intr_event[j] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[j] = NULL;
+ }
- cleanup1:
- swi_remove(msg_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
cleanup:
hv_vmbus_cleanup();
@@ -515,20 +635,24 @@ vmbus_bus_exit(void)
smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL);
- for(i = 0; i < 2; i++) {
+ for(i = 0; i < 2 * MAXCPU; i++) {
if (setup_args.page_buffers[i] != 0)
free(setup_args.page_buffers[i], M_DEVBUF);
}
hv_vmbus_cleanup();
- /* remove swi, bus and intr resource */
- bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep);
-
- bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res);
+ /* remove swi */
+ CPU_FOREACH(i) {
+ if (hv_vmbus_g_context.msg_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.msg_swintr[i]);
+ if (hv_vmbus_g_context.event_swintr[i] != NULL)
+ swi_remove(hv_vmbus_g_context.event_swintr[i]);
+ hv_vmbus_g_context.hv_msg_intr_event[i] = NULL;
+ hv_vmbus_g_context.hv_event_intr_event[i] = NULL;
+ }
- swi_remove(msg_swintr);
- swi_remove(event_swintr);
+ vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector);
return;
}
@@ -603,6 +727,6 @@ devclass_t vmbus_devclass;
DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0);
MODULE_VERSION(vmbus,1);
-/* TODO: We want to be earlier than SI_SUB_VFS */
-SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL);
+/* We want to be started after SMP is initialized */
+SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL);
diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
index 6bc875d..faa6dec 100644
--- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
+++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h
@@ -181,49 +181,30 @@ enum {
#define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t)
-/*
- * Connection identifier type
- */
-typedef union {
- uint32_t as_uint32_t;
- struct {
- uint32_t id:24;
- uint32_t reserved:8;
- } u;
-
-} __packed hv_vmbus_connection_id;
-
-/*
- * Definition of the hv_vmbus_signal_event hypercall input structure
- */
-typedef struct {
- hv_vmbus_connection_id connection_id;
- uint16_t flag_number;
- uint16_t rsvd_z;
-} __packed hv_vmbus_input_signal_event;
-
-typedef struct {
- uint64_t align8;
- hv_vmbus_input_signal_event event;
-} __packed hv_vmbus_input_signal_event_buffer;
-
typedef struct {
uint64_t guest_id;
void* hypercall_page;
hv_bool_uint8_t syn_ic_initialized;
+
+ hv_vmbus_handle syn_ic_msg_page[MAXCPU];
+ hv_vmbus_handle syn_ic_event_page[MAXCPU];
/*
- * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall.
- * The input param is immutable in our usage and
- * must be dynamic mem (vs stack or global).
+ * For FreeBSD cpuid to Hyper-V vcpuid mapping.
*/
- hv_vmbus_input_signal_event_buffer *signal_event_buffer;
+ uint32_t hv_vcpu_index[MAXCPU];
/*
- * 8-bytes aligned of the buffer above
+ * Each cpu has its own software interrupt handler for channel
+ * event and msg handling.
*/
- hv_vmbus_input_signal_event *signal_event_param;
-
- hv_vmbus_handle syn_ic_msg_page[MAXCPU];
- hv_vmbus_handle syn_ic_event_page[MAXCPU];
+ struct intr_event *hv_event_intr_event[MAXCPU];
+ struct intr_event *hv_msg_intr_event[MAXCPU];
+ void *event_swintr[MAXCPU];
+ void *msg_swintr[MAXCPU];
+ /*
+ * Host uses this vector to interrupt the guest for vmbus channel
+ * event and msg.
+ */
+ unsigned int hv_cb_vector;
} hv_vmbus_context;
/*
@@ -368,7 +349,8 @@ typedef struct {
TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor;
struct mtx channel_msg_lock;
/**
- * List of channels
+ * List of primary channels. Sub channels will be linked
+ * under their primary channel.
*/
TAILQ_HEAD(, hv_vmbus_channel) channel_anchor;
struct mtx channel_lock;
@@ -560,6 +542,8 @@ typedef union {
uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT];
} hv_vmbus_synic_event_flags;
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX (0x40000002)
/*
* Define synthetic interrupt controller model specific registers
@@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup(
int hv_ring_buffer_write(
hv_vmbus_ring_buffer_info *ring_info,
hv_vmbus_sg_buffer_list sg_buffers[],
- uint32_t sg_buff_count);
+ uint32_t sg_buff_count,
+ boolean_t *need_sig);
int hv_ring_buffer_peek(
hv_vmbus_ring_buffer_info *ring_info,
@@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info(
hv_vmbus_ring_buffer_info *ring_info,
char *prefix);
+void hv_ring_buffer_read_begin(
+ hv_vmbus_ring_buffer_info *ring_info);
+
+uint32_t hv_ring_buffer_read_end(
+ hv_vmbus_ring_buffer_info *ring_info);
+
hv_vmbus_channel* hv_vmbus_allocate_channel(void);
void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel);
void hv_vmbus_on_channel_message(void *context);
@@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc(
void *payload,
size_t payload_size);
-uint16_t hv_vmbus_signal_event(void);
+uint16_t hv_vmbus_signal_event(void *con_id);
void hv_vmbus_synic_init(void *irq_arg);
void hv_vmbus_synic_cleanup(void *arg);
int hv_vmbus_query_hypervisor_presence(void);
@@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id);
int hv_vmbus_connect(void);
int hv_vmbus_disconnect(void);
int hv_vmbus_post_message(void *buffer, size_t buf_size);
-int hv_vmbus_set_event(uint32_t child_rel_id);
+int hv_vmbus_set_event(hv_vmbus_channel *channel);
void hv_vmbus_on_events(void *);
@@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id(
typedef struct {
unsigned int vector;
- void *page_buffers[2];
+ void *page_buffers[2 * MAXCPU];
} hv_setup_args;
#endif /* __HYPERV_PRIV_H__ */
diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC
index 52ea39c..2b00964 100644
--- a/sys/i386/conf/GENERIC
+++ b/sys/i386/conf/GENERIC
@@ -356,7 +356,9 @@ device virtio_blk # VirtIO Block device
device virtio_scsi # VirtIO SCSI device
device virtio_balloon # VirtIO Memory Balloon device
-# HyperV drivers
+# HyperV drivers and enhancement support
+# NOTE: HYPERV depends on hyperv. They must be added or removed together.
+options HYPERV # Hyper-V kernel infrastructure
device hyperv # HyperV drivers
# Xen HVM Guest Optimizations
diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s
index 3d1999e..da7eac6 100644
--- a/sys/i386/i386/apic_vector.s
+++ b/sys/i386/i386/apic_vector.s
@@ -157,6 +157,25 @@ IDTVEC(xen_intr_upcall)
jmp doreti
#endif
+#ifdef HYPERV
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(hv_vmbus_callback)
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ cld
+ FAKE_MCOUNT(TF_EIP(%esp))
+ pushl %esp
+ call hv_vector_handler
+ add $4, %esp
+ MEXITCOUNT
+ jmp doreti
+#endif
+
#ifdef SMP
/*
* Global address space TLB shootdown.
OpenPOWER on IntegriCloud