From 6b8dd85cc6a6fb909199883b05cc590385133db1 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 30 Jan 2013 04:30:36 +0000 Subject: Add support for MSI-X interrupts in the virtio network device and make that the default. The current behavior of advertising a single MSI vector can be requested by setting the environment variable "BHYVE_USE_MSI" to "yes". The use of MSI is not compliant with the virtio specification and will be eventually phased out. Submitted by: Gopakumar T Obtained from: NetApp --- usr.sbin/bhyve/pci_emul.c | 208 +++++++++++++++++++++++++++++++++++++++- usr.sbin/bhyve/pci_emul.h | 8 ++ usr.sbin/bhyve/pci_virtio_net.c | 159 ++++++++++++++++++++++++------ usr.sbin/bhyve/virtio.h | 3 + 4 files changed, 348 insertions(+), 30 deletions(-) (limited to 'usr.sbin/bhyve') diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index e086aeb..d6315cf 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -167,6 +167,94 @@ pci_parse_slot(char *opt, int legacy) } static int +pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) +{ + + if (offset < pi->pi_msix.pba_offset) + return (0); + + if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + return (0); + } + + return (1); +} + +int +pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value) +{ + int msix_entry_offset; + int tab_index; + char *dest; + + /* support only 4 or 8 byte writes */ + if (size != 4 && size != 8) + return (-1); + + /* + * Return if table index is beyond what device supports + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + if (tab_index >= pi->pi_msix.table_count) + return (-1); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned writes */ + if ((msix_entry_offset % size) != 0) + return (-1); + + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + *((uint32_t *)dest) = value; + else + *((uint64_t *)dest) = value; + + return (0); 
+} + +uint64_t +pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) +{ + char *dest; + int msix_entry_offset; + int tab_index; + uint64_t retval = ~0; + + /* support only 4 or 8 byte reads */ + if (size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + retval = *((uint32_t *)dest); + else + retval = *((uint64_t *)dest); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { @@ -178,8 +266,7 @@ pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && - port + bytes <= - pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, @@ -484,13 +571,95 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size, int nextptr) +{ + CTASSERT(sizeof(struct msixcap) == 12); + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + msixcap->nextptr = nextptr; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. 
+ * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = msgnum - 1; + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = malloc(table_size); + bzero(pi->pi_msix.table, table_size); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint16_t pba_index; + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + + /* calculate the MMIO size required for MSI-X PBA */ + pba_index = (msgnum - 1) / (PBA_TABLE_ENTRY_SIZE * 8); + pi->pi_msix.pba_size = (pba_index + 1) * PBA_TABLE_ENTRY_SIZE; + + pci_msix_table_init(pi, msgnum); + + pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size, 0); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + tab_size + pi->pi_msix.pba_size); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, 
rwmask; int off, table_bar; - + off = offset - capoff; table_bar = pi->pi_msix.table_bar; /* Message Control Register */ @@ -502,6 +671,7 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; } CFGWRITE(pi, offset, val, bytes); @@ -589,6 +759,9 @@ pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; default: break; } @@ -668,6 +841,35 @@ pci_msi_msgnum(struct pci_devinst *pi) return (0); } +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + vm_lapic_irq(pi->pi_vmctx, + (mte->addr >> 12) & 0xff, mte->msg_data & 0xff); + } +} + void pci_generate_msi(struct pci_devinst *pi, int msg) { diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index f007bdf..5338aec 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -96,6 +96,8 @@ struct msix_table_entry { * for the size that should be emulated. 
*/ #define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define PBA_TABLE_ENTRY_SIZE 8 struct pci_devinst { struct pci_devemu *pi_d; @@ -120,6 +122,8 @@ struct pci_devinst { size_t table_offset; int table_count; size_t pba_offset; + size_t pba_size; + int function_mask; struct msix_table_entry *table; /* allocated at runtime */ } pi_msix; @@ -168,6 +172,10 @@ int pci_msix_enabled(struct pci_devinst *pi); int pci_msi_msgnum(struct pci_devinst *pi); void pci_parse_slot(char *opt, int legacy); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 3f6f88a..444e0e5 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -59,17 +59,17 @@ __FBSDID("$FreeBSD$"); /* * PCI config-space register offsets */ -#define VTNET_R_CFG0 20 -#define VTNET_R_CFG1 21 -#define VTNET_R_CFG2 22 -#define VTNET_R_CFG3 23 -#define VTNET_R_CFG4 24 -#define VTNET_R_CFG5 25 -#define VTNET_R_CFG6 26 -#define VTNET_R_CFG7 27 -#define VTNET_R_MAX 27 - -#define VTNET_REGSZ VTNET_R_MAX+1 +#define VTNET_R_CFG0 24 +#define VTNET_R_CFG1 25 +#define VTNET_R_CFG2 26 +#define VTNET_R_CFG3 27 +#define VTNET_R_CFG4 28 +#define VTNET_R_CFG5 29 +#define VTNET_R_CFG6 30 +#define VTNET_R_CFG7 31 +#define VTNET_R_MAX 31 + +#define VTNET_REGSZ VTNET_R_MAX+1 /* * Host capabilities @@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$"); #define VTNET_MAXQ 3 +static int use_msix = 1; + struct vring_hqueue { /* Internal state */ uint16_t hq_size; @@ -144,9 +146,24 @@ struct pci_vtnet_softc { uint64_t vsc_pfn[VTNET_MAXQ]; struct vring_hqueue 
vsc_hq[VTNET_MAXQ]; + uint16_t vsc_msix_table_idx[VTNET_MAXQ]; }; /* + * Return the size of IO BAR that maps virtio header and device specific + * region. The size would vary depending on whether MSI-X is enabled or + * not. + */ +static uint64_t +pci_vtnet_iosize(struct pci_devinst *pi) +{ + if (pci_msix_enabled(pi)) + return (VTNET_REGSZ); + else + return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX)); +} + +/* * Return the number of available descriptors in the vring taking care * of the 16-bit index wraparound. */ @@ -344,8 +361,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) hq->hq_cur_aidx = aidx; if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - sc->vsc_isr |= 1; - pci_generate_msi(sc->vsc_pi, 0); + if (use_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[VTNET_RXQ]); + } else { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } } } @@ -438,8 +460,13 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq) * Generate an interrupt if able */ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - sc->vsc_isr |= 1; - pci_generate_msi(sc->vsc_pi, 0); + if (use_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[VTNET_TXQ]); + } else { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } } } @@ -512,6 +539,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) unsigned char digest[16]; char nstr[80]; struct pci_vtnet_softc *sc; + const char *env_msi; /* * Access to guest memory is required. 
Fail if @@ -527,6 +555,14 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->vsc_pi = pi; pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* + * Use MSI if set by user + */ + if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) { + if (strcasecmp(env_msi, "yes") == 0) + use_msix = 0; + } /* * Attempt to open the tap device @@ -594,7 +630,24 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); - pci_emul_add_msicap(pi, 1); + + if (use_msix) { + /* MSI-X support */ + int i; + + for (i = 0; i < VTNET_MAXQ; i++) + sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; + + /* + * BAR 1 used to map MSI-X table and PBA + */ + if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1)) + return (1); + } else { + /* MSI support */ + pci_emul_add_msicap(pi, 1); + } + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ); return (0); @@ -609,6 +662,21 @@ static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { pci_vtnet_ping_ctlq }; +static uint64_t +vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset) +{ + /* + * Device specific offsets used by guest would change based on + * whether MSI-X capability is enabled or not + */ + if (!pci_msix_enabled(pi)) { + if (offset >= VTCFG_R_MSIX) + return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); + } + + return (offset); +} + static void pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) @@ -616,9 +684,17 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, struct pci_vtnet_softc *sc = pi->pi_arg; void *ptr; + if (use_msix) { + if (baridx == pi->pi_msix.table_bar || + baridx == pi->pi_msix.pba_bar) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + assert(baridx == 0); - if (offset + size > VTNET_REGSZ) { + if (offset + size > 
pci_vtnet_iosize(pi)) { DPRINTF(("vtnet_write: 2big, offset %ld size %d\n", offset, size)); return; @@ -626,6 +702,8 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, pthread_mutex_lock(&sc->vsc_mtx); + offset = vtnet_adjust_offset(pi, offset); + switch (offset) { case VTCFG_R_GUESTCAP: assert(size == 4); @@ -649,6 +727,15 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 1); pci_vtnet_update_status(sc, value); break; + case VTCFG_R_CFGVEC: + assert(size == 2); + sc->vsc_msix_table_idx[VTNET_CTLQ] = value; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VTNET_CTLQ); + sc->vsc_msix_table_idx[sc->vsc_curq] = value; + break; case VTNET_R_CFG0: case VTNET_R_CFG1: case VTNET_R_CFG2: @@ -693,9 +780,16 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, void *ptr; uint64_t value; + if (use_msix) { + if (baridx == pi->pi_msix.table_bar || + baridx == pi->pi_msix.pba_bar) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + assert(baridx == 0); - if (offset + size > VTNET_REGSZ) { + if (offset + size > pci_vtnet_iosize(pi)) { DPRINTF(("vtnet_read: 2big, offset %ld size %d\n", offset, size)); return (0); @@ -703,6 +797,8 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, pthread_mutex_lock(&sc->vsc_mtx); + offset = vtnet_adjust_offset(pi, offset); + switch (offset) { case VTCFG_R_HOSTCAP: assert(size == 4); @@ -737,21 +833,30 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, value = sc->vsc_isr; sc->vsc_isr = 0; /* a read clears this flag */ break; + case VTCFG_R_CFGVEC: + assert(size == 2); + value = sc->vsc_msix_table_idx[VTNET_CTLQ]; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VTNET_CTLQ); + value = sc->vsc_msix_table_idx[sc->vsc_curq]; + break; case VTNET_R_CFG0: case VTNET_R_CFG1: case VTNET_R_CFG2: case VTNET_R_CFG3: case VTNET_R_CFG4: case VTNET_R_CFG5: - assert((size + offset) <= 
(VTNET_R_CFG5 + 1)); - ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; - if (size == 1) { - value = *(uint8_t *) ptr; - } else if (size == 2) { - value = *(uint16_t *) ptr; - } else { - value = *(uint32_t *) ptr; - } + assert((size + offset) <= (VTNET_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; + if (size == 1) { + value = *(uint8_t *) ptr; + } else if (size == 2) { + value = *(uint16_t *) ptr; + } else { + value = *(uint32_t *) ptr; + } break; case VTNET_R_CFG6: assert(size != 4); diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index 474e244..04ef586 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -36,6 +36,7 @@ #define VRING_DESC_F_INDIRECT (1 << 2) #define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VIRTIO_MSI_NO_VECTOR 0xFFFF struct virtio_desc { uint64_t vd_addr; @@ -78,6 +79,8 @@ struct virtio_used { #define VTCFG_R_QNOTIFY 16 #define VTCFG_R_STATUS 18 #define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 #define VTCFG_R_CFG0 20 /* No MSI-X */ #define VTCFG_R_CFG1 24 /* With MSI-X */ #define VTCFG_R_MSIX 20 -- cgit v1.1