summaryrefslogtreecommitdiffstats
path: root/usr.sbin/bhyve
diff options
context:
space:
mode:
authorneel <neel@FreeBSD.org>2013-01-30 04:30:36 +0000
committerneel <neel@FreeBSD.org>2013-01-30 04:30:36 +0000
commit6b8dd85cc6a6fb909199883b05cc590385133db1 (patch)
treed2b32ff5d9a64e8c2daa5f143b50a5cfcfd8d795 /usr.sbin/bhyve
parent1ae7af0ed87a074e560452e9f87dc50964a20275 (diff)
downloadFreeBSD-src-6b8dd85cc6a6fb909199883b05cc590385133db1.zip
FreeBSD-src-6b8dd85cc6a6fb909199883b05cc590385133db1.tar.gz
Add support for MSI-X interrupts in the virtio network device and make that
the default. The current behavior of advertising a single MSI vector can be requested by setting the environment variable "BHYVE_USE_MSI" to "yes" (note: the code checks for "yes" via strcasecmp, not "true" as originally stated). The use of MSI is not compliant with the virtio specification and will be eventually phased out.

Submitted by:	Gopakumar T
Obtained from:	NetApp
Diffstat (limited to 'usr.sbin/bhyve')
-rw-r--r--usr.sbin/bhyve/pci_emul.c208
-rw-r--r--usr.sbin/bhyve/pci_emul.h8
-rw-r--r--usr.sbin/bhyve/pci_virtio_net.c159
-rw-r--r--usr.sbin/bhyve/virtio.h3
4 files changed, 348 insertions, 30 deletions
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index e086aeb..d6315cf 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -167,6 +167,94 @@ pci_parse_slot(char *opt, int legacy)
}
static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+
+ if (offset < pi->pi_msix.pba_offset)
+ return (0);
+
+ if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
+ return (0);
+ }
+
+ return (1);
+}
+
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+ uint64_t value)
+{
+ int msix_entry_offset;
+ int tab_index;
+ char *dest;
+
+ /* support only 4 or 8 byte writes */
+ if (size != 4 && size != 8)
+ return (-1);
+
+ /*
+ * Return if table index is beyond what device supports
+ */
+ tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+ if (tab_index >= pi->pi_msix.table_count)
+ return (-1);
+
+ msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+ /* support only aligned writes */
+ if ((msix_entry_offset % size) != 0)
+ return (-1);
+
+ dest = (char *)(pi->pi_msix.table + tab_index);
+ dest += msix_entry_offset;
+
+ if (size == 4)
+ *((uint32_t *)dest) = value;
+ else
+ *((uint64_t *)dest) = value;
+
+ return (0);
+}
+
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+ char *dest;
+ int msix_entry_offset;
+ int tab_index;
+ uint64_t retval = ~0;
+
+ /* support only 4 or 8 byte reads */
+ if (size != 4 && size != 8)
+ return (retval);
+
+ msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+ /* support only aligned reads */
+ if ((msix_entry_offset % size) != 0) {
+ return (retval);
+ }
+
+ tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+
+ if (tab_index < pi->pi_msix.table_count) {
+ /* valid MSI-X Table access */
+ dest = (char *)(pi->pi_msix.table + tab_index);
+ dest += msix_entry_offset;
+
+ if (size == 4)
+ retval = *((uint32_t *)dest);
+ else
+ retval = *((uint64_t *)dest);
+ } else if (pci_valid_pba_offset(pi, offset)) {
+ /* return 0 for PBA access */
+ retval = 0;
+ }
+
+ return (retval);
+}
+
+static int
pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
@@ -178,8 +266,7 @@ pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
for (i = 0; i <= PCI_BARMAX; i++) {
if (pdi->pi_bar[i].type == PCIBAR_IO &&
port >= pdi->pi_bar[i].addr &&
- port + bytes <=
- pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
offset = port - pdi->pi_bar[i].addr;
if (in)
*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
@@ -484,13 +571,95 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
}
+static void
+pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
+ uint32_t msix_tab_size, int nextptr)
+{
+ CTASSERT(sizeof(struct msixcap) == 12);
+
+ assert(msix_tab_size % 4096 == 0);
+
+ bzero(msixcap, sizeof(struct msixcap));
+ msixcap->capid = PCIY_MSIX;
+ msixcap->nextptr = nextptr;
+
+ /*
+ * Message Control Register, all fields set to
+ * zero except for the Table Size.
+ * Note: Table size N is encoded as N-1
+ */
+ msixcap->msgctrl = msgnum - 1;
+
+ /*
+ * MSI-X BAR setup:
+ * - MSI-X table start at offset 0
+ * - PBA table starts at a 4K aligned offset after the MSI-X table
+ */
+ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
+ msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
+}
+
+static void
+pci_msix_table_init(struct pci_devinst *pi, int table_entries)
+{
+ int i, table_size;
+
+ assert(table_entries > 0);
+ assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
+
+ table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
+ pi->pi_msix.table = malloc(table_size);
+ bzero(pi->pi_msix.table, table_size);
+
+ /* set mask bit of vector control register */
+ for (i = 0; i < table_entries; i++)
+ pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
+}
+
+int
+pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
+{
+ uint16_t pba_index;
+ uint32_t tab_size;
+ struct msixcap msixcap;
+
+ assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
+ assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
+
+ tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
+
+ /* Align table size to nearest 4K */
+ tab_size = roundup2(tab_size, 4096);
+
+ pi->pi_msix.table_bar = barnum;
+ pi->pi_msix.pba_bar = barnum;
+ pi->pi_msix.table_offset = 0;
+ pi->pi_msix.table_count = msgnum;
+ pi->pi_msix.pba_offset = tab_size;
+
+ /* calculate the MMIO size required for MSI-X PBA */
+ pba_index = (msgnum - 1) / (PBA_TABLE_ENTRY_SIZE * 8);
+ pi->pi_msix.pba_size = (pba_index + 1) * PBA_TABLE_ENTRY_SIZE;
+
+ pci_msix_table_init(pi, msgnum);
+
+ pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size, 0);
+
+ /* allocate memory for MSI-X Table and PBA */
+ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
+ tab_size + pi->pi_msix.pba_size);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msixcap,
+ sizeof(msixcap)));
+}
+
void
msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
uint16_t msgctrl, rwmask;
int off, table_bar;
-
+
off = offset - capoff;
table_bar = pi->pi_msix.table_bar;
/* Message Control Register */
@@ -502,6 +671,7 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
val = msgctrl;
pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+ pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
}
CFGWRITE(pi, offset, val, bytes);
@@ -589,6 +759,9 @@ pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
case PCIY_MSI:
msicap_cfgwrite(pi, capoff, offset, bytes, val);
break;
+ case PCIY_MSIX:
+ msixcap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
default:
break;
}
@@ -668,6 +841,35 @@ pci_msi_msgnum(struct pci_devinst *pi)
return (0);
}
+int
+pci_msix_enabled(struct pci_devinst *pi)
+{
+
+ return (pi->pi_msix.enabled && !pi->pi_msi.enabled);
+}
+
+void
+pci_generate_msix(struct pci_devinst *pi, int index)
+{
+ struct msix_table_entry *mte;
+
+ if (!pci_msix_enabled(pi))
+ return;
+
+ if (pi->pi_msix.function_mask)
+ return;
+
+ if (index >= pi->pi_msix.table_count)
+ return;
+
+ mte = &pi->pi_msix.table[index];
+ if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* XXX Set PBA bit if interrupt is disabled */
+ vm_lapic_irq(pi->pi_vmctx,
+ (mte->addr >> 12) & 0xff, mte->msg_data & 0xff);
+ }
+}
+
void
pci_generate_msi(struct pci_devinst *pi, int msg)
{
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index f007bdf..5338aec 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -96,6 +96,8 @@ struct msix_table_entry {
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_ENTRIES 2048
+#define PBA_TABLE_ENTRY_SIZE 8
struct pci_devinst {
struct pci_devemu *pi_d;
@@ -120,6 +122,8 @@ struct pci_devinst {
size_t table_offset;
int table_count;
size_t pba_offset;
+ size_t pba_size;
+ int function_mask;
struct msix_table_entry *table; /* allocated at runtime */
} pi_msix;
@@ -168,6 +172,10 @@ int pci_msix_enabled(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
void pci_parse_slot(char *opt, int legacy);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
+int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+ uint64_t value);
+uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index 3f6f88a..444e0e5 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -59,17 +59,17 @@ __FBSDID("$FreeBSD$");
/*
* PCI config-space register offsets
*/
-#define VTNET_R_CFG0 20
-#define VTNET_R_CFG1 21
-#define VTNET_R_CFG2 22
-#define VTNET_R_CFG3 23
-#define VTNET_R_CFG4 24
-#define VTNET_R_CFG5 25
-#define VTNET_R_CFG6 26
-#define VTNET_R_CFG7 27
-#define VTNET_R_MAX 27
-
-#define VTNET_REGSZ VTNET_R_MAX+1
+#define VTNET_R_CFG0 24
+#define VTNET_R_CFG1 25
+#define VTNET_R_CFG2 26
+#define VTNET_R_CFG3 27
+#define VTNET_R_CFG4 28
+#define VTNET_R_CFG5 29
+#define VTNET_R_CFG6 30
+#define VTNET_R_CFG7 31
+#define VTNET_R_MAX 31
+
+#define VTNET_REGSZ VTNET_R_MAX+1
/*
* Host capabilities
@@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$");
#define VTNET_MAXQ 3
+static int use_msix = 1;
+
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
@@ -144,9 +146,24 @@ struct pci_vtnet_softc {
uint64_t vsc_pfn[VTNET_MAXQ];
struct vring_hqueue vsc_hq[VTNET_MAXQ];
+ uint16_t vsc_msix_table_idx[VTNET_MAXQ];
};
/*
+ * Return the size of IO BAR that maps virtio header and device specific
+ * region. The size would vary depending on whether MSI-X is enabled or
+ * not.
+ */
+static uint64_t
+pci_vtnet_iosize(struct pci_devinst *pi)
+{
+ if (pci_msix_enabled(pi))
+ return (VTNET_REGSZ);
+ else
+ return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+}
+
+/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
*/
@@ -344,8 +361,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
hq->hq_cur_aidx = aidx;
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- sc->vsc_isr |= 1;
- pci_generate_msi(sc->vsc_pi, 0);
+ if (use_msix) {
+ pci_generate_msix(sc->vsc_pi,
+ sc->vsc_msix_table_idx[VTNET_RXQ]);
+ } else {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
}
}
@@ -438,8 +460,13 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- sc->vsc_isr |= 1;
- pci_generate_msi(sc->vsc_pi, 0);
+ if (use_msix) {
+ pci_generate_msix(sc->vsc_pi,
+ sc->vsc_msix_table_idx[VTNET_TXQ]);
+ } else {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
}
}
@@ -512,6 +539,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
unsigned char digest[16];
char nstr[80];
struct pci_vtnet_softc *sc;
+ const char *env_msi;
/*
* Access to guest memory is required. Fail if
@@ -527,6 +555,14 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->vsc_pi = pi;
pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /*
+ * Use MSI if set by user
+ */
+ if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
+ if (strcasecmp(env_msi, "yes") == 0)
+ use_msix = 0;
+ }
/*
* Attempt to open the tap device
@@ -594,7 +630,24 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
- pci_emul_add_msicap(pi, 1);
+
+ if (use_msix) {
+ /* MSI-X support */
+ int i;
+
+ for (i = 0; i < VTNET_MAXQ; i++)
+ sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
+
+ /*
+ * BAR 1 used to map MSI-X table and PBA
+ */
+ if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
+ return (1);
+ } else {
+ /* MSI support */
+ pci_emul_add_msicap(pi, 1);
+ }
+
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
return (0);
@@ -609,6 +662,21 @@ static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
pci_vtnet_ping_ctlq
};
+static uint64_t
+vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
+{
+ /*
+ * Device specific offsets used by guest would change based on
+ * whether MSI-X capability is enabled or not
+ */
+ if (!pci_msix_enabled(pi)) {
+ if (offset >= VTCFG_R_MSIX)
+ return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+ }
+
+ return (offset);
+}
+
static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
@@ -616,9 +684,17 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
struct pci_vtnet_softc *sc = pi->pi_arg;
void *ptr;
+ if (use_msix) {
+ if (baridx == pi->pi_msix.table_bar ||
+ baridx == pi->pi_msix.pba_bar) {
+ pci_emul_msix_twrite(pi, offset, size, value);
+ return;
+ }
+ }
+
assert(baridx == 0);
- if (offset + size > VTNET_REGSZ) {
+ if (offset + size > pci_vtnet_iosize(pi)) {
DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
offset, size));
return;
@@ -626,6 +702,8 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
pthread_mutex_lock(&sc->vsc_mtx);
+ offset = vtnet_adjust_offset(pi, offset);
+
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
@@ -649,6 +727,15 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
assert(size == 1);
pci_vtnet_update_status(sc, value);
break;
+ case VTCFG_R_CFGVEC:
+ assert(size == 2);
+ sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
+ break;
+ case VTCFG_R_QVEC:
+ assert(size == 2);
+ assert(sc->vsc_curq != VTNET_CTLQ);
+ sc->vsc_msix_table_idx[sc->vsc_curq] = value;
+ break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
@@ -693,9 +780,16 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
void *ptr;
uint64_t value;
+ if (use_msix) {
+ if (baridx == pi->pi_msix.table_bar ||
+ baridx == pi->pi_msix.pba_bar) {
+ return (pci_emul_msix_tread(pi, offset, size));
+ }
+ }
+
assert(baridx == 0);
- if (offset + size > VTNET_REGSZ) {
+ if (offset + size > pci_vtnet_iosize(pi)) {
DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
offset, size));
return (0);
@@ -703,6 +797,8 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
pthread_mutex_lock(&sc->vsc_mtx);
+ offset = vtnet_adjust_offset(pi, offset);
+
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
@@ -737,21 +833,30 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
value = sc->vsc_isr;
sc->vsc_isr = 0; /* a read clears this flag */
break;
+ case VTCFG_R_CFGVEC:
+ assert(size == 2);
+ value = sc->vsc_msix_table_idx[VTNET_CTLQ];
+ break;
+ case VTCFG_R_QVEC:
+ assert(size == 2);
+ assert(sc->vsc_curq != VTNET_CTLQ);
+ value = sc->vsc_msix_table_idx[sc->vsc_curq];
+ break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
- assert((size + offset) <= (VTNET_R_CFG5 + 1));
- ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
- if (size == 1) {
- value = *(uint8_t *) ptr;
- } else if (size == 2) {
- value = *(uint16_t *) ptr;
- } else {
- value = *(uint32_t *) ptr;
- }
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
break;
case VTNET_R_CFG6:
assert(size != 4);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index 474e244..04ef586 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -36,6 +36,7 @@
#define VRING_DESC_F_INDIRECT (1 << 2)
#define VRING_AVAIL_F_NO_INTERRUPT 1
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
struct virtio_desc {
uint64_t vd_addr;
@@ -78,6 +79,8 @@ struct virtio_used {
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
+#define VTCFG_R_CFGVEC 20
+#define VTCFG_R_QVEC 22
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
OpenPOWER on IntegriCloud