diff options
| author | grehan <grehan@FreeBSD.org> | 2012-04-28 16:28:00 +0000 |
|---|---|---|
| committer | grehan <grehan@FreeBSD.org> | 2012-04-28 16:28:00 +0000 |
| commit | 9f0c999f8126597eb572b80056df88335dbd0070 (patch) | |
| tree | e25fe0456b877e9c40defdc19a863563208d18de /usr.sbin/bhyve | |
| parent | 94d2b7f64912987093f1a98573737a32e4e5d8d1 (diff) | |
| download | FreeBSD-src-9f0c999f8126597eb572b80056df88335dbd0070.zip FreeBSD-src-9f0c999f8126597eb572b80056df88335dbd0070.tar.gz | |
MSI-x interrupt support for PCI pass-thru devices.
Includes instruction emulation for memory r/w access. This
opens the door for io-apic, local apic, hpet timer, and
legacy device emulation.
Submitted by: ryan dot berryhill at sandvine dot com
Reviewed by: grehan
Obtained from: Sandvine
Diffstat (limited to 'usr.sbin/bhyve')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | usr.sbin/bhyve/Makefile | 3 |
| -rw-r--r-- | usr.sbin/bhyve/fbsdrun.c | 29 |
| -rw-r--r-- | usr.sbin/bhyve/instruction_emul.c | 555 |
| -rw-r--r-- | usr.sbin/bhyve/instruction_emul.h | 47 |
| -rw-r--r-- | usr.sbin/bhyve/pci_emul.c | 29 |
| -rw-r--r-- | usr.sbin/bhyve/pci_emul.h | 40 |
| -rw-r--r-- | usr.sbin/bhyve/pci_passthru.c | 270 |
7 files changed, 953 insertions, 20 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index b0398ed..f64e579 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -4,7 +4,8 @@ PROG= bhyve -SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c +SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c +SRCS+= instruction_emul.c mevent.c SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c index 6f009b5..c2295ea 100644 --- a/usr.sbin/bhyve/fbsdrun.c +++ b/usr.sbin/bhyve/fbsdrun.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include "mevent.h" #include "pci_emul.h" #include "xmsr.h" +#include "instruction_emul.h" #define DEFAULT_GUEST_HZ 100 #define DEFAULT_GUEST_TSLICE 200 @@ -108,6 +109,7 @@ struct fbsdstats { uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; + uint64_t vmexit_paging; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; int io_reset; @@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) return (VMEXIT_RESTART); } +static int +vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_paging++; + + if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) { + printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + static void sigalrm(int sig) { @@ -446,12 +462,13 @@ setup_timeslice(void) } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { - [VM_EXITCODE_INOUT] = vmexit_inout, - [VM_EXITCODE_VMX] = vmexit_vmx, - [VM_EXITCODE_BOGUS] = vmexit_bogus, - [VM_EXITCODE_RDMSR] = vmexit_rdmsr, - [VM_EXITCODE_WRMSR] = vmexit_wrmsr, - [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] 
= vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_PAGING] = vmexit_paging }; static void diff --git a/usr.sbin/bhyve/instruction_emul.c b/usr.sbin/bhyve/instruction_emul.c new file mode 100644 index 0000000..8c99194 --- /dev/null +++ b/usr.sbin/bhyve/instruction_emul.c @@ -0,0 +1,555 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <strings.h> +#include <unistd.h> +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "fbsdrun.h" +#include "instruction_emul.h" + +#define PREFIX_LOCK 0xF0 +#define PREFIX_REPNE 0xF2 +#define PREFIX_REPE 0xF3 +#define PREFIX_CS_OVERRIDE 0x2E +#define PREFIX_SS_OVERRIDE 0x36 +#define PREFIX_DS_OVERRIDE 0x3E +#define PREFIX_ES_OVERRIDE 0x26 +#define PREFIX_FS_OVERRIDE 0x64 +#define PREFIX_GS_OVERRIDE 0x65 +#define PREFIX_BRANCH_NOT_TAKEN 0x2E +#define PREFIX_BRANCH_TAKEN 0x3E +#define PREFIX_OPSIZE 0x66 +#define PREFIX_ADDRSIZE 0x67 + +#define OPCODE_2BYTE_ESCAPE 0x0F +#define OPCODE_3BYTE_ESCAPE 0x38 + +#define MODRM_MOD_MASK 0xC0 +#define MODRM_MOD_SHIFT 6 +#define MODRM_RM_MASK 0x07 +#define MODRM_RM_SHIFT 0 +#define MODRM_REG_MASK 0x38 +#define MODRM_REG_SHIFT 3 + +#define MOD_INDIRECT 0x0 +#define MOD_INDIRECT_DISP8 0x1 +#define MOD_INDIRECT_DISP32 0x2 +#define MOD_DIRECT 0x3 + +#define RM_EAX 0x0 +#define RM_ECX 0x1 +#define RM_EDX 0x2 +#define RM_EBX 0x3 +#define RM_SIB 0x4 +#define RM_DISP32 0x5 +#define RM_EBP RM_DISP32 +#define RM_ESI 0x6 +#define RM_EDI 0x7 + +#define REG_EAX 0x0 +#define REG_ECX 0x1 +#define REG_EDX 0x2 +#define REG_EBX 0x3 +#define REG_ESP 0x4 +#define REG_EBP 0x5 +#define REG_ESI 0x6 +#define REG_EDI 0x7 +#define REG_R8 0x8 +#define REG_R9 0x9 +#define REG_R10 0xA +#define REG_R11 0xB +#define REG_R12 0xC +#define REG_R13 0xD +#define REG_R14 0xE +#define REG_R15 0xF + +#define HAS_MODRM 1 +#define FROM_RM (1<<1) +#define FROM_REG (1<<2) +#define TO_RM (1<<3) +#define TO_REG (1<<4) + +#define REX_MASK 0xF0 +#define REX_PREFIX 0x40 +#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX ) +#define REX_W_MASK 0x8 +#define REX_R_MASK 0x4 +#define REX_X_MASK 0x2 +#define REX_B_MASK 0x1 + +#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \ + (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \ + (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \ + (x) == 
PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \ + (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \ + (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \ + (x) == PREFIX_ADDRSIZE || is_rex_prefix((x))) + +#define PAGE_FRAME_MASK 0x80 +#define PAGE_OFFSET_MASK 0xFFF +#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK) +#define PML4E_OFFSET_MASK 0x0000FF8000000000 +#define PML4E_SHIFT 39 + +#define MAX_EMULATED_REGIONS 8 +int registered_regions = 0; +struct memory_region +{ + uintptr_t start; + uintptr_t end; + emulated_read_func_t memread; + emulated_write_func_t memwrite; + void *arg; +} emulated_regions[MAX_EMULATED_REGIONS]; + +struct decoded_instruction +{ + void *instruction; + uint8_t *opcode; + uint8_t *modrm; + uint8_t *sib; + uint8_t *displacement; + uint8_t *immediate; + + uint8_t opcode_flags; + + uint8_t addressing_mode; + uint8_t rm; + uint8_t reg; + uint8_t rex_r; + uint8_t rex_w; + uint8_t rex_b; + uint8_t rex_x; + + int32_t disp; +}; + +static enum vm_reg_name vm_reg_name_mappings[] = { + [REG_EAX] = VM_REG_GUEST_RAX, + [REG_EBX] = VM_REG_GUEST_RBX, + [REG_ECX] = VM_REG_GUEST_RCX, + [REG_EDX] = VM_REG_GUEST_RDX, + [REG_ESP] = VM_REG_GUEST_RSP, + [REG_EBP] = VM_REG_GUEST_RBP, + [REG_ESI] = VM_REG_GUEST_RSI, + [REG_EDI] = VM_REG_GUEST_RDI, + [REG_R8] = VM_REG_GUEST_R8, + [REG_R9] = VM_REG_GUEST_R9, + [REG_R10] = VM_REG_GUEST_R10, + [REG_R11] = VM_REG_GUEST_R11, + [REG_R12] = VM_REG_GUEST_R12, + [REG_R13] = VM_REG_GUEST_R13, + [REG_R14] = VM_REG_GUEST_R14, + [REG_R15] = VM_REG_GUEST_R15 +}; + +uint8_t one_byte_opcodes[256] = { + [0x89] = HAS_MODRM | FROM_REG | TO_RM, + [0x8B] = HAS_MODRM | FROM_RM | TO_REG, +}; + +static uintptr_t +gla2gpa(uint64_t gla, uint64_t guest_cr3) +{ + uint64_t *table; + uint64_t mask, entry; + int level, shift; + uintptr_t page_frame; + + table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK); + mask = PML4E_OFFSET_MASK; + shift = PML4E_SHIFT; + for (level = 0; level < 4; ++level) + { + entry = 
table[(gla & mask) >> shift]; + table = (uint64_t*)(entry & PAGE_TABLE_ENTRY_MASK); + + /* This entry does not point to another page table */ + if (entry & PAGE_FRAME_MASK || level >= 3) + break; + + table = paddr_guest2host((uintptr_t)table); + mask >>= 9; + shift -= 9; + } + + mask = (1 << shift) - 1; + page_frame = ((uintptr_t)table & ~mask); + return (page_frame | (gla & mask)); +} + +static void * +gla2hla(uint64_t gla, uint64_t guest_cr3) +{ + uintptr_t gpa; + + gpa = gla2gpa(gla, guest_cr3); + return paddr_guest2host(gpa); +} + +/* + * Decodes all of the prefixes of the instruction. Only a subset of REX + * prefixes are currently supported. If any unsupported prefix is + * encountered, returns -1. + */ +static int +decode_prefixes(struct decoded_instruction *decoded) +{ + uint8_t *current_prefix; + + current_prefix = decoded->instruction; + + if (is_rex_prefix(*current_prefix)) { + decoded->rex_w = *current_prefix & REX_W_MASK; + decoded->rex_r = *current_prefix & REX_R_MASK; + decoded->rex_x = *current_prefix & REX_X_MASK; + decoded->rex_b = *current_prefix & REX_B_MASK; + current_prefix++; + } else if (is_prefix(*current_prefix)) { + return (-1); + } + + decoded->opcode = current_prefix; + return (0); +} + +/* + * Decodes the instruction's opcode. If the opcode is not understood, returns + * -1 indicating an error. Sets the instruction's mod_rm pointer to the + * location of the ModR/M field. + */ +static int +decode_opcode(struct decoded_instruction *decoded) +{ + uint8_t opcode, flags; + + opcode = *decoded->opcode; + flags = one_byte_opcodes[opcode]; + + if (!flags) + return (-1); + + if (flags & HAS_MODRM) { + decoded->modrm = decoded->opcode + 1; + } + + decoded->opcode_flags = flags; + + return (0); +} + +/* + * Decodes the instruction's ModR/M field. Sets the instruction's sib pointer + * to the location of the SIB if one is expected to be present, or 0 if not. 
+ */ +static int +decode_mod_rm(struct decoded_instruction *decoded) +{ + uint8_t modrm; + uint8_t *extension_operands; + + if (decoded->modrm) { + modrm = *decoded->modrm; + + decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT; + decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT; + decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT; + + if (decoded->rex_b) + decoded->rm |= (1<<3); + + if (decoded->rex_r) + decoded->reg |= (1<<3); + + extension_operands = decoded->modrm + 1; + + if (decoded->rm == RM_SIB) { + decoded->sib = decoded->modrm + 1; + extension_operands = decoded->sib + 1; + } + + switch (decoded->addressing_mode) { + case MOD_INDIRECT: + case MOD_DIRECT: + decoded->displacement = 0; + break; + case MOD_INDIRECT_DISP8: + decoded->displacement = extension_operands; + break; + case MOD_INDIRECT_DISP32: + decoded->displacement = extension_operands; + break; + } + } + + return (0); +} + +/* + * Decodes the instruction's SIB field. No such instructions are currently + * supported, so do nothing and return -1 if there is a SIB field, 0 otherwise. + */ +static int +decode_sib(struct decoded_instruction *decoded) +{ + + if (decoded->sib) + return (-1); + + return (0); +} + +/* + * Grabs and saves the instruction's immediate operand and displacement if + * they are present. Immediates are not currently supported, so if an + * immediate is present it will return -1 indicating an error. 
+ */ +static int +decode_extension_operands(struct decoded_instruction *decoded) +{ + + if (decoded->displacement) { + if (decoded->addressing_mode == MOD_INDIRECT_DISP8) { + decoded->disp = (int32_t)*decoded->displacement; + } else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) { + decoded->disp = *((int32_t*)decoded->displacement); + } + } + + if (decoded->immediate) { + return (-1); + } + + return (0); +} + +static int +decode_instruction(void *instr, struct decoded_instruction *decoded) +{ + int error; + + bzero(decoded, sizeof(*decoded)); + decoded->instruction = instr; + + error = decode_prefixes(decoded); + if (error) + return (error); + + error = decode_opcode(decoded); + if (error) + return (error); + + error = decode_mod_rm(decoded); + if (error) + return (error); + + error = decode_sib(decoded); + if (error) + return (error); + + error = decode_extension_operands(decoded); + if (error) + return (error); + + return (0); +} + +static struct memory_region * +find_region(uintptr_t addr) +{ + int i; + + for (i = 0; i < registered_regions; ++i) { + if (emulated_regions[i].start <= addr && + emulated_regions[i].end >= addr) { + return &emulated_regions[i]; + } + } + + return (0); +} + +static enum vm_reg_name +get_vm_reg_name(uint8_t reg) +{ + return vm_reg_name_mappings[reg]; +} + +static int +get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t *operand) +{ + enum vm_reg_name regname; + uint64_t reg; + uintptr_t target; + int error; + uint8_t rm, addressing_mode; + struct memory_region *emulated_memory; + + if (instruction->opcode_flags & FROM_RM) { + rm = instruction->rm; + addressing_mode = instruction->addressing_mode; + } else if (instruction->opcode_flags & FROM_REG) { + rm = instruction->reg; + addressing_mode = MOD_DIRECT; + } else + return (-1); + + regname = get_vm_reg_name(rm); + error = vm_get_register(vm, vcpu, regname, ®); + if (error) + return (error); + + switch 
(addressing_mode) { + case MOD_DIRECT: + *operand = reg; + return (0); + case MOD_INDIRECT: + target = gla2gpa(reg, guest_cr3); + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memread(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + case MOD_INDIRECT_DISP8: + case MOD_INDIRECT_DISP32: + target = gla2gpa(reg, guest_cr3); + target += instruction->disp; + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memread(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + default: + return (-1); + } +} + +static int +perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t operand) +{ + enum vm_reg_name regname; + uintptr_t target; + int error; + uint64_t reg; + struct memory_region *emulated_memory; + uint8_t addressing_mode; + + if (instruction->opcode_flags & TO_RM) { + reg = instruction->rm; + addressing_mode = instruction->addressing_mode; + } else if (instruction->opcode_flags & TO_REG) { + reg = instruction->reg; + addressing_mode = MOD_DIRECT; + } else + return (-1); + + regname = get_vm_reg_name(reg); + error = vm_get_register(vm, vcpu, regname, ®); + if (error) + return (error); + + switch(addressing_mode) { + case MOD_DIRECT: + return vm_set_register(vm, vcpu, regname, operand); + case MOD_INDIRECT: + target = gla2gpa(reg, guest_cr3); + emulated_memory = find_region(target); + if (emulated_memory) { + return emulated_memory->memwrite(vm, vcpu, target, + 4, operand, + emulated_memory->arg); + } + return (-1); + default: + return (-1); + } +} + +static int +emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3, + const struct decoded_instruction *instruction) +{ + uint64_t operand; + int error; + + error = get_operand(vm, vcpu, cr3, instruction, &operand); + if (error) + return (error); + + return perform_write(vm, vcpu, cr3, instruction, operand); +} + 
+int +emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3) +{ + struct decoded_instruction instr; + int error; + void *instruction = gla2hla(rip, cr3); + + if ((error = decode_instruction(instruction, &instr)) != 0) + return (error); + + return emulate_decoded_instruction(vm, vcpu, cr3, &instr); +} + +struct memory_region * +register_emulated_memory(uintptr_t start, size_t len, emulated_read_func_t memread, + emulated_write_func_t memwrite, void *arg) +{ + if (registered_regions > MAX_EMULATED_REGIONS) + return (NULL); + + struct memory_region *region = &emulated_regions[registered_regions]; + region->start = start; + region->end = start + len; + region->memread = memread; + region->memwrite = memwrite; + region->arg = arg; + + registered_regions++; + return (region); +} + +void +move_memory_region(struct memory_region *region, uintptr_t start) +{ + size_t len; + + len = region->end - region->start; + region->start = start; + region->end = start + len; +} + diff --git a/usr.sbin/bhyve/instruction_emul.h b/usr.sbin/bhyve/instruction_emul.h new file mode 100644 index 0000000..e7b6bff --- /dev/null +++ b/usr.sbin/bhyve/instruction_emul.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _INSTRUCTION_EMUL_H_ +#define _INSTRUCTION_EMUL_H_ + +struct memory_region; + +typedef int (*emulated_read_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr, + int size, uint64_t *data, void *arg); +typedef int (*emulated_write_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr, + int size, uint64_t data, void *arg); + +int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, + uint64_t cr3); +struct memory_region *register_emulated_memory(uintptr_t start, size_t len, + emulated_read_func_t memread, + emulated_write_func_t memwrite, + void *arg); +void move_memory_region(struct memory_region *memory_region, uintptr_t start); + +#endif diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 650c4de..9de87ad 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include "fbsdrun.h" #include "inout.h" #include "pci_emul.h" +#include "instruction_emul.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc @@ -572,6 +573,29 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) } void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + 
uint16_t msgctrl, rwmask; + int off, table_bar; + + off = offset - capoff; + table_bar = pi->pi_msix.table_bar; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + } + + CFGWRITE(pi, offset, val, bytes); +} + +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { @@ -847,6 +871,11 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, assert(0); } pci_set_cfgdata32(pi, coff, bar); + + if (pi->pi_bar[idx].handler) { + pi->pi_bar[idx].handler(pi, idx, bar); + } + } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); } else { diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index f5f8e22..588e5ba 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -42,6 +42,7 @@ struct vmctx; struct pci_devinst; +struct memory_region; struct pci_devemu { char *pe_emu; /* Name of device emulation */ @@ -73,14 +74,30 @@ enum pcibar_type { PCIBAR_MEMHI64 }; +typedef int (*bar_write_func_t)(struct pci_devinst *pdi, int idx, uint64_t bar); + struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; + bar_write_func_t handler; }; #define PI_NAMESZ 40 +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +} __packed; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. 
+ */ +#define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_SIZE 2048 + struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; @@ -96,6 +113,19 @@ struct pci_devinst { int msgnum; } pi_msi; + struct { + int enabled; + int table_bar; + int pba_bar; + size_t table_offset; + uintptr_t table_gpa; + size_t table_size; + int table_count; + size_t pba_offset; + struct memory_region *table_bar_region; + struct msix_table_entry table[MAX_MSIX_TABLE_SIZE]; + } pi_msix; + void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; @@ -111,6 +141,14 @@ struct msicap { uint16_t msgdata; } __packed; +struct msixcap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t table_offset; + uint32_t pba_offset; +} __packed; + void init_pci(struct vmctx *ctx); void pci_parse_slot(char *opt); void pci_parse_name(char *opt); @@ -120,6 +158,8 @@ int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); void pci_generate_msi(struct pci_devinst *pi, int msgnum); int pci_msi_enabled(struct pci_devinst *pi); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index 1c417fd..a6f1f63 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <vmmapi.h> #include "pci_emul.h" +#include "instruction_emul.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" @@ -58,6 +59,11 @@ __FBSDID("$FreeBSD$"); #define LEGACY_SUPPORT 1 +#define MSIX_TABLE_BIR_MASK 7 +#define MSIX_TABLE_OFFSET_MASK (~MSIX_TABLE_BIR_MASK); +#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1) +#define MSIX_CAPLEN 12 + static int pcifd = -1; static int iofd = -1; @@ -69,6 +75,9 @@ struct passthru_softc 
{ int msgctrl; int emulated; } psc_msi; + struct { + int capoff; + } psc_msix; struct pcisel psc_sel; }; @@ -152,17 +161,19 @@ passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) static int cfginitmsi(struct passthru_softc *sc) { - int ptr, cap, sts, caplen; + int ptr, capptr, cap, sts, caplen; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI - * capability. + * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { @@ -179,18 +190,44 @@ cfginitmsi(struct passthru_softc *sc) ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; while (caplen > 0) { - u32 = read_config(&sel, ptr, 4); - pci_set_cfgdata32(pi, ptr, u32); + u32 = read_config(&sel, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; - ptr += 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; } - break; } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } + if (sc->psc_msix.capoff == 0) + return (-1); + + pi->pi_msix.pba_bar = msixcap.pba_offset & MSIX_TABLE_BIR_MASK; + pi->pi_msix.pba_offset = msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK; + pi->pi_msix.table_bar = msixcap.table_offset & MSIX_TABLE_BIR_MASK; + pi->pi_msix.table_offset = msixcap.table_offset & MSIX_TABLE_OFFSET_MASK; + + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a @@ -208,12 +245,182 @@ cfginitmsi(struct passthru_softc *sc) } #endif - if (sc->psc_msi.capoff 
== 0) /* MSI or bust */ + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } +static int +msix_table_read(struct vmctx *vm, int vcpu, uintptr_t addr, + int size, uint64_t *data, void *arg) +{ + struct passthru_softc *sc; + struct pci_devinst *pi; + int index; + size_t offset, entry_offset; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + struct msix_table_entry *entry; + + sc = arg; + pi = sc->psc_pi; + offset = addr - pi->pi_msix.table_gpa; + entry_offset = addr % MSIX_TABLE_ENTRY_SIZE; + index = offset / MSIX_TABLE_ENTRY_SIZE; + entry = &pi->pi_msix.table[index]; + + switch(size) { + case 1: + src8 = (uint8_t*)((void*)entry + entry_offset); + *data = *src8; + break; + case 2: + src16 = (uint16_t*)((void*)entry + entry_offset); + *data = *src16; + break; + case 4: + src32 = (uint32_t*)((void*)entry + entry_offset); + *data = *src32; + break; + case 8: + src64 = (uint64_t*)((void*)entry + entry_offset); + *data = *src64; + break; + default: + return (-1); + } + + return (0); +} + +static int +msix_table_write(struct vmctx *vm, int vcpu, uintptr_t addr, + int size, uint64_t data, void *arg) +{ + struct passthru_softc *sc; + struct pci_devinst *pi; + int error, index; + size_t offset, entry_offset; + uint32_t *dest; + struct msix_table_entry *entry; + uint32_t vector_control; + + sc = arg; + pi = sc->psc_pi; + offset = addr - pi->pi_msix.table_gpa; + entry_offset = addr % MSIX_TABLE_ENTRY_SIZE; + index = offset / MSIX_TABLE_ENTRY_SIZE; + entry = &pi->pi_msix.table[index]; + + /* Only 4 byte naturally-aligned writes are supported */ + if (size == 4 && entry_offset % 4 == 0) { + vector_control = entry->vector_control; + dest = (uint32_t*)((void*)entry + entry_offset); + *dest = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & 
PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + error = vm_setup_msix(vm, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, + index, entry->msg_data, + entry->vector_control, + entry->addr); + if (error) + return (-1); + } + } + } else { + printf("Unsupported unaligned or non-4-byte write to MSI-X table\n"); + return (-1); + } + return (0); +} + +static int +msix_bar_handler(struct pci_devinst *pdi, int idx, uint64_t bar) +{ + uintptr_t start; + + start = (bar & PCIM_BAR_MEM_BASE) + pdi->pi_msix.table_offset; + move_memory_region(pdi->pi_msix.table_bar_region, start); + pdi->pi_msix.table_gpa = start; + return (0); +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int idx; + size_t table_size; + vm_paddr_t start; + size_t len; + struct pci_devinst *pi = sc->psc_pi; + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. 
+ */ + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar && + ((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) { + /* Need to also emulate the PBA, not supported yet */ + printf("Unsupported MSI-X table and PBA in same page\n"); + return (-1); + } + /* + * May need to split the BAR into 3 regions: + * Before the MSI-X table, the MSI-X table, and after it + * XXX for now, assume that the table is not in the middle + */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table_size = table_size; + idx = pi->pi_msix.table_bar; + + /* Round up to page size */ + table_size = (table_size + 0x1000) & ~0xFFF; + if (pi->pi_msix.table_offset == 0) { + /* Map everything after the MSI-X table */ + start = pi->pi_bar[idx].addr + table_size; + len = pi->pi_bar[idx].size - table_size; + } else { + /* Map everything before the MSI-X table */ + start = pi->pi_bar[idx].addr; + len = pi->pi_msix.table_offset; + } + return vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + start, len, base + table_size); +} + +static int +cfginitmsix(struct passthru_softc *sc) +{ + int table_bar; + struct pci_devinst *pi; + + pi = sc->psc_pi; + table_bar = pi->pi_msix.table_bar; + pi->pi_msix.table_gpa = sc->psc_bar[table_bar].addr + pi->pi_msix.table_offset; + pi->pi_msix.table_bar_region = register_emulated_memory(pi->pi_msix.table_gpa, + pi->pi_msix.table_size, + msix_table_read, + msix_table_write, sc); + if (!pi->pi_msix.table_bar_region) + return (-1); + + pi->pi_bar[table_bar].handler = msix_bar_handler; + + return (0); +} + static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { @@ -262,10 +469,13 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) if (error) return (-1); - /* - * Map the physical MMIO space in the guest MMIO space - */ - if (bartype != PCIBAR_IO) { + /* The MSI-X table needs special handling */ + if (i == pi->pi_msix.table_bar) { + error = init_msix_table(ctx, sc, base); + if 
(error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical MMIO space in the guest MMIO space */ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_bar[i].addr, pi->pi_bar[i].size, base); @@ -299,10 +509,13 @@ cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; + if (cfginitmsi(sc) != 0) + goto done; + if (cfginitbar(ctx, sc) != 0) goto done; - if (cfginitmsi(sc) != 0) + if (cfginitmsix(sc) != 0) goto done; error = 0; /* success */ @@ -381,6 +594,16 @@ msicap_access(struct passthru_softc *sc, int coff) return (0); } +static int +msixcap_access(struct passthru_softc *sc, int coff) +{ + if (sc->psc_msix.capoff == 0) + return (0); + + return (coff >= sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) @@ -416,7 +639,7 @@ static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { - int error; + int error, msix_table_entries, i; struct passthru_softc *sc; sc = pi->pi_arg; @@ -443,6 +666,27 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, return (0); } + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control, + pi->pi_msix.table[i].addr); + + if (error) { + printf("vm_setup_msix returned error %d\r\n", errno); + exit(1); + } + } + } + return (0); + } + #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let |