diff options
author | grehan <grehan@FreeBSD.org> | 2012-10-19 18:11:17 +0000 |
---|---|---|
committer | grehan <grehan@FreeBSD.org> | 2012-10-19 18:11:17 +0000 |
commit | beaad57fa07508b2383454e13f0a26824ada6328 (patch) | |
tree | 32133e495715159b674a250fa62c5c04595e4753 /usr.sbin | |
parent | 8fb5b5f8de608d18362583be1e90150aab0b4d33 (diff) | |
download | FreeBSD-src-beaad57fa07508b2383454e13f0a26824ada6328.zip FreeBSD-src-beaad57fa07508b2383454e13f0a26824ada6328.tar.gz |
Rework how guest MMIO regions are dealt with.
- New memory region interface. An RB tree holds the regions,
with a last-found per-vCPU cache to deal with the common case
of repeated guest accesses to MMIO registers in the same page.
- Support memory-mapped BARs in PCI emulation.
mem.c/h - memory region interface
instruction_emul.c/h - remove old region interface.
Use the GPA from the EPT exit to avoid a page-table walk to
determine the operand address. Determine the operand size
and use it when calling through to the region handler.
fbsdrun.c - call into the region interface on paging
exit. Distinguish between an instruction emulation error
and a region-not-found error.
pci_emul.c/h - implement the new BAR callback API.
Split the BAR alloc routine into routines that
require/don't require the BAR physical address.
ioapic.c
pci_passthru.c
pci_virtio_block.c
pci_virtio_net.c
pci_uart.c - update to the new BAR callback interface
Reviewed by: neel
Obtained from: NetApp
Diffstat (limited to 'usr.sbin')
-rw-r--r-- | usr.sbin/bhyve/Makefile | 2 | ||||
-rw-r--r-- | usr.sbin/bhyve/fbsdrun.c | 17 | ||||
-rw-r--r-- | usr.sbin/bhyve/instruction_emul.c | 290 | ||||
-rw-r--r-- | usr.sbin/bhyve/instruction_emul.h | 15 | ||||
-rw-r--r-- | usr.sbin/bhyve/ioapic.c | 79 | ||||
-rw-r--r-- | usr.sbin/bhyve/mem.c | 196 | ||||
-rw-r--r-- | usr.sbin/bhyve/mem.h | 58 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_emul.c | 221 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_emul.h | 26 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_passthru.c | 225 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_uart.c | 25 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_virtio_block.c | 35 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_virtio_net.c | 35 |
13 files changed, 850 insertions, 374 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index f2e49ca..72d60ae 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -5,7 +5,7 @@ PROG= bhyve SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c -SRCS+= instruction_emul.c ioapic.c mevent.c +SRCS+= instruction_emul.c ioapic.c mem.c mevent.c SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c post.c rtc.c uart.c xmsr.c SRCS+= spinup_ap.c diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c index 1d59425..d7061f9 100644 --- a/usr.sbin/bhyve/fbsdrun.c +++ b/usr.sbin/bhyve/fbsdrun.c @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #include "fbsdrun.h" #include "inout.h" #include "dbgport.h" +#include "mem.h" #include "mevent.h" #include "pci_emul.h" #include "xmsr.h" @@ -446,11 +447,21 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) static int vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - + int err; stats.vmexit_paging++; - if (emulate_instruction(ctx, *pvcpu, vmexit->rip, vmexit->u.paging.cr3) != 0) { - printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip); + err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa, vmexit->rip, + vmexit->u.paging.cr3, vmexit->u.paging.rwx); + + if (err) { + if (err == EINVAL) { + printf("Failed to emulate instruction at 0x%lx\n", + vmexit->rip); + } else if (err == ESRCH) { + printf("Unhandled memory access to 0x%lx\n", + vmexit->u.paging.gpa); + } + return (VMEXIT_ABORT); } diff --git a/usr.sbin/bhyve/instruction_emul.c b/usr.sbin/bhyve/instruction_emul.c index 790c5ff..78c3608 100644 --- a/usr.sbin/bhyve/instruction_emul.c +++ b/usr.sbin/bhyve/instruction_emul.c @@ -28,10 +28,12 @@ #include <strings.h> #include <unistd.h> +#include <assert.h> #include <machine/vmm.h> #include <vmmapi.h> #include "fbsdrun.h" +#include "mem.h" #include "instruction_emul.h" #define PREFIX_LOCK 0xF0 @@ -46,6 +48,7 @@ #define 
PREFIX_BRANCH_NOT_TAKEN 0x2E #define PREFIX_BRANCH_TAKEN 0x3E #define PREFIX_OPSIZE 0x66 +#define is_opsz_prefix(x) ((x) == PREFIX_OPSIZE) #define PREFIX_ADDRSIZE 0x67 #define OPCODE_2BYTE_ESCAPE 0x0F @@ -95,6 +98,11 @@ #define FROM_REG (1<<2) #define TO_RM (1<<3) #define TO_REG (1<<4) +#define ZEXT (1<<5) +#define FROM_8 (1<<6) +#define FROM_16 (1<<7) +#define TO_8 (1<<8) +#define TO_16 (1<<9) #define REX_MASK 0xF0 #define REX_PREFIX 0x40 @@ -118,16 +126,7 @@ #define PML4E_OFFSET_MASK 0x0000FF8000000000 #define PML4E_SHIFT 39 -#define MAX_EMULATED_REGIONS 8 -int registered_regions = 0; -struct memory_region -{ - uintptr_t start; - uintptr_t end; - emulated_read_func_t memread; - emulated_write_func_t memwrite; - void *arg; -} emulated_regions[MAX_EMULATED_REGIONS]; +#define INSTR_VERIFY struct decoded_instruction { @@ -138,11 +137,12 @@ struct decoded_instruction uint8_t *displacement; uint8_t *immediate; - uint8_t opcode_flags; + uint16_t opcode_flags; uint8_t addressing_mode; uint8_t rm; uint8_t reg; + uint8_t opsz; uint8_t rex_r; uint8_t rex_w; uint8_t rex_b; @@ -170,11 +170,17 @@ static enum vm_reg_name vm_reg_name_mappings[] = { [REG_R15] = VM_REG_GUEST_R15 }; -uint8_t one_byte_opcodes[256] = { - [0x89] = HAS_MODRM | FROM_REG | TO_RM, +uint16_t one_byte_opcodes[256] = { + [0x88] = HAS_MODRM | FROM_REG | TO_RM | TO_8 | FROM_8, + [0x89] = HAS_MODRM | FROM_REG | TO_RM, [0x8B] = HAS_MODRM | FROM_RM | TO_REG, }; +uint16_t two_byte_opcodes[256] = { + [0xB6] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_8, + [0xB7] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_16, +}; + static uintptr_t gla2gpa(uint64_t gla, uint64_t guest_cr3) { @@ -211,7 +217,8 @@ gla2hla(uint64_t gla, uint64_t guest_cr3) uintptr_t gpa; gpa = gla2gpa(gla, guest_cr3); - return paddr_guest2host(gpa); + + return (paddr_guest2host(gpa)); } /* @@ -232,6 +239,9 @@ decode_prefixes(struct decoded_instruction *decoded) decoded->rex_x = *current_prefix & REX_X_MASK; decoded->rex_b = *current_prefix & 
REX_B_MASK; current_prefix++; + } else if (is_opsz_prefix(*current_prefix)) { + decoded->opsz = 1; + current_prefix++; } else if (is_prefix(*current_prefix)) { return (-1); } @@ -248,16 +258,26 @@ decode_prefixes(struct decoded_instruction *decoded) static int decode_opcode(struct decoded_instruction *decoded) { - uint8_t opcode, flags; + uint8_t opcode; + uint16_t flags; + int extra; opcode = *decoded->opcode; - flags = one_byte_opcodes[opcode]; - + extra = 0; + + if (opcode != 0xf) + flags = one_byte_opcodes[opcode]; + else { + opcode = *(decoded->opcode + 1); + flags = two_byte_opcodes[opcode]; + extra = 1; + } + if (!flags) return (-1); if (flags & HAS_MODRM) { - decoded->modrm = decoded->opcode + 1; + decoded->modrm = decoded->opcode + 1 + extra; } decoded->opcode_flags = flags; @@ -381,37 +401,70 @@ decode_instruction(void *instr, struct decoded_instruction *decoded) return (0); } -static struct memory_region * -find_region(uintptr_t addr) +static enum vm_reg_name +get_vm_reg_name(uint8_t reg) { - int i; - - for (i = 0; i < registered_regions; ++i) { - if (emulated_regions[i].start <= addr && - emulated_regions[i].end >= addr) { - return &emulated_regions[i]; - } - } - return (0); + return (vm_reg_name_mappings[reg]); } -static enum vm_reg_name -get_vm_reg_name(uint8_t reg) +static uint64_t +adjust_operand(const struct decoded_instruction *instruction, uint64_t val, + int size) { - return vm_reg_name_mappings[reg]; + uint64_t ret; + + if (instruction->opcode_flags & ZEXT) { + switch (size) { + case 1: + ret = val & 0xff; + break; + case 2: + ret = val & 0xffff; + break; + case 4: + ret = val & 0xffffffff; + break; + case 8: + ret = val; + break; + default: + break; + } + } else { + /* + * Extend the sign + */ + switch (size) { + case 1: + ret = (int8_t)(val & 0xff); + break; + case 2: + ret = (int16_t)(val & 0xffff); + break; + case 4: + ret = (int32_t)(val & 0xffffffff); + break; + case 8: + ret = val; + break; + default: + break; + } + } + + return (ret); } 
static int -get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3, - const struct decoded_instruction *instruction, uint64_t *operand) +get_operand(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t *operand, + struct mem_range *mr) { enum vm_reg_name regname; uint64_t reg; - uintptr_t target; int error; - uint8_t rm, addressing_mode; - struct memory_region *emulated_memory; + uint8_t rm, addressing_mode, size; if (instruction->opcode_flags & FROM_RM) { rm = instruction->rm; @@ -422,6 +475,17 @@ get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3, } else return (-1); + /* + * Determine size of operand + */ + size = 4; + if (instruction->opcode_flags & FROM_8) { + size = 1; + } else if (instruction->opcode_flags & FROM_16 || + instruction->opsz) { + size = 2; + } + regname = get_vm_reg_name(rm); error = vm_get_register(vm, vcpu, regname, ®); if (error) @@ -430,33 +494,67 @@ get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3, switch (addressing_mode) { case MOD_DIRECT: *operand = reg; - return (0); + error = 0; + break; case MOD_INDIRECT: case MOD_INDIRECT_DISP8: case MOD_INDIRECT_DISP32: +#ifdef INSTR_VERIFY + { + uintptr_t target; + target = gla2gpa(reg, guest_cr3); target += instruction->disp; - emulated_memory = find_region(target); - if (emulated_memory) { - return emulated_memory->memread(vm, vcpu, target, - 4, operand, - emulated_memory->arg); - } - return (-1); + assert(gpa == target); + } +#endif + error = (*mr->handler)(vm, vcpu, MEM_F_READ, gpa, size, + operand, mr->arg1, mr->arg2); + break; default: return (-1); } + + if (!error) + *operand = adjust_operand(instruction, *operand, size); + + return (error); +} + +static uint64_t +adjust_write(uint64_t reg, uint64_t operand, int size) +{ + uint64_t val; + + switch (size) { + case 1: + val = (reg & ~0xff) | (operand & 0xff); + break; + case 2: + val = (reg & ~0xffff) | (operand & 0xffff); + break; + case 4: + val = (reg 
& ~0xffffffff) | (operand & 0xffffffff); + break; + case 8: + val = operand; + default: + break; + } + + return (val); } static int -perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3, - const struct decoded_instruction *instruction, uint64_t operand) +perform_write(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3, + const struct decoded_instruction *instruction, uint64_t operand, + struct mem_range *mr) { enum vm_reg_name regname; uintptr_t target; int error; + int size; uint64_t reg; - struct memory_region *emulated_memory; uint8_t addressing_mode; if (instruction->opcode_flags & TO_RM) { @@ -467,83 +565,77 @@ perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3, addressing_mode = MOD_DIRECT; } else return (-1); - - regname = get_vm_reg_name(reg); - error = vm_get_register(vm, vcpu, regname, ®); - if (error) - return (error); - + + /* + * Determine the operand size. rex.w has priority + */ + size = 4; + if (instruction->rex_w) { + size = 8; + } else if (instruction->opcode_flags & TO_8) { + size = 1; + } else if (instruction->opsz) { + size = 2; + }; + switch(addressing_mode) { case MOD_DIRECT: - return vm_set_register(vm, vcpu, regname, operand); + regname = get_vm_reg_name(reg); + error = vm_get_register(vm, vcpu, regname, ®); + if (error) + return (error); + operand = adjust_write(reg, operand, size); + + return (vm_set_register(vm, vcpu, regname, operand)); case MOD_INDIRECT: case MOD_INDIRECT_DISP8: case MOD_INDIRECT_DISP32: +#ifdef INSTR_VERIFY + regname = get_vm_reg_name(reg); + error = vm_get_register(vm, vcpu, regname, ®); + assert(!error); target = gla2gpa(reg, guest_cr3); target += instruction->disp; - emulated_memory = find_region(target); - if (emulated_memory) { - return emulated_memory->memwrite(vm, vcpu, target, - 4, operand, - emulated_memory->arg); - } - return (-1); + assert(gpa == target); +#endif + error = (*mr->handler)(vm, vcpu, MEM_F_WRITE, gpa, size, + &operand, mr->arg1, mr->arg2); + return (error); default: 
return (-1); } } static int -emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3, - const struct decoded_instruction *instruction) +emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t gpa, + uint64_t cr3, + const struct decoded_instruction *instruction, + struct mem_range *mr) { uint64_t operand; int error; - error = get_operand(vm, vcpu, cr3, instruction, &operand); + error = get_operand(vm, vcpu, gpa, cr3, instruction, &operand, mr); if (error) return (error); - return perform_write(vm, vcpu, cr3, instruction, operand); + return perform_write(vm, vcpu, gpa, cr3, instruction, operand, mr); } -int -emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3) +int +emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3, + uint64_t gpa, int flags, struct mem_range *mr) { struct decoded_instruction instr; int error; - void *instruction = gla2hla(rip, cr3); - - if ((error = decode_instruction(instruction, &instr)) != 0) - return (error); - - return emulate_decoded_instruction(vm, vcpu, cr3, &instr); -} + void *instruction; -struct memory_region * -register_emulated_memory(uintptr_t start, size_t len, emulated_read_func_t memread, - emulated_write_func_t memwrite, void *arg) -{ - if (registered_regions >= MAX_EMULATED_REGIONS) - return (NULL); - - struct memory_region *region = &emulated_regions[registered_regions]; - region->start = start; - region->end = start + len; - region->memread = memread; - region->memwrite = memwrite; - region->arg = arg; - - registered_regions++; - return (region); -} + instruction = gla2hla(rip, cr3); -void -move_memory_region(struct memory_region *region, uintptr_t start) -{ - size_t len; + error = decode_instruction(instruction, &instr); + if (!error) + error = emulate_decoded_instruction(vm, vcpu, gpa, cr3, + &instr, mr); - len = region->end - region->start; - region->start = start; - region->end = start + len; + return (error); } diff --git 
a/usr.sbin/bhyve/instruction_emul.h b/usr.sbin/bhyve/instruction_emul.h index e7b6bff..ef85796 100644 --- a/usr.sbin/bhyve/instruction_emul.h +++ b/usr.sbin/bhyve/instruction_emul.h @@ -29,19 +29,8 @@ #ifndef _INSTRUCTION_EMUL_H_ #define _INSTRUCTION_EMUL_H_ -struct memory_region; - -typedef int (*emulated_read_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr, - int size, uint64_t *data, void *arg); -typedef int (*emulated_write_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr, - int size, uint64_t data, void *arg); - int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, - uint64_t cr3); -struct memory_region *register_emulated_memory(uintptr_t start, size_t len, - emulated_read_func_t memread, - emulated_write_func_t memwrite, - void *arg); -void move_memory_region(struct memory_region *memory_region, uintptr_t start); + uint64_t cr3, uint64_t gpa, int flags, + struct mem_range *mr); #endif diff --git a/usr.sbin/bhyve/ioapic.c b/usr.sbin/bhyve/ioapic.c index dc74cfa..ea6e47c 100644 --- a/usr.sbin/bhyve/ioapic.c +++ b/usr.sbin/bhyve/ioapic.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <vmmapi.h> #include "inout.h" +#include "mem.h" #include "instruction_emul.h" #include "fbsdrun.h" @@ -67,10 +68,13 @@ struct ioapic { static struct ioapic ioapics[1]; /* only a single ioapic for now */ -static int ioapic_region_read(struct vmctx *vm, int vcpu, uintptr_t paddr, - int size, uint64_t *data, void *arg); -static int ioapic_region_write(struct vmctx *vm, int vcpu, uintptr_t paddr, - int size, uint64_t data, void *arg); +static int ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, + int size, uint64_t *data); +static int ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, + int size, uint64_t data); +static int ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, + uintptr_t paddr, int size, uint64_t *val, + void *arg1, long arg2); static void ioapic_set_pinstate(struct vmctx *ctx, int pin, bool newstate) @@ -139,8 +143,10 @@ 
ioapic_assert_pin(struct vmctx *ctx, int pin) void ioapic_init(int which) { - int i; + struct mem_range memp; struct ioapic *ioapic; + int error; + int i; assert(which == 0); @@ -153,14 +159,19 @@ ioapic_init(int which) for (i = 0; i < REDIR_ENTRIES; i++) ioapic->redtbl[i] = 0x0001000000010000UL; - /* Register emulated memory region */ ioapic->paddr = IOAPIC_PADDR; - ioapic->region = register_emulated_memory(ioapic->paddr, - sizeof(struct IOAPIC), - ioapic_region_read, - ioapic_region_write, - (void *)(uintptr_t)which); - assert(ioapic->region != NULL); + + /* Register emulated memory region */ + memp.name = "ioapic"; + memp.flags = MEM_F_RW; + memp.handler = ioapic_region_handler; + memp.arg1 = ioapic; + memp.arg2 = which; + memp.base = ioapic->paddr; + memp.size = sizeof(struct IOAPIC); + error = register_mem(&memp); + + assert (error == 0); ioapic->inited = 1; } @@ -237,15 +248,11 @@ ioapic_write(struct ioapic *ioapic, uint32_t addr, uint32_t data) } static int -ioapic_region_read(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, - uint64_t *data, void *arg) +ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, int size, + uint64_t *data) { - int which, offset; - struct ioapic *ioapic; - - which = (uintptr_t)arg; + int offset; - ioapic = &ioapics[which]; offset = paddr - ioapic->paddr; /* @@ -255,7 +262,7 @@ ioapic_region_read(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { #if 1 printf("invalid access to ioapic%d: size %d, offset %d\n", - which, size, offset); + (int)(ioapic - ioapics), size, offset); #endif *data = 0; return (0); @@ -270,15 +277,11 @@ ioapic_region_read(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, } static int -ioapic_region_write(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, - uint64_t data, void *arg) +ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, int size, + uint64_t data) { - int which, offset; - struct ioapic *ioapic; - - which = 
(uintptr_t)arg; + int offset; - ioapic = &ioapics[which]; offset = paddr - ioapic->paddr; /* @@ -288,7 +291,7 @@ ioapic_region_write(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { #if 1 printf("invalid access to ioapic%d: size %d, offset %d\n", - which, size, offset); + (int)(ioapic - ioapics), size, offset); #endif return (0); } @@ -300,3 +303,23 @@ ioapic_region_write(struct vmctx *vm, int vcpu, uintptr_t paddr, int size, return (0); } + +static int +ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, uintptr_t paddr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct ioapic *ioapic; + int which; + + ioapic = arg1; + which = arg2; + + assert(ioapic == &ioapics[which]); + + if (dir == MEM_F_READ) + ioapic_region_read(ioapic, paddr, size, val); + else + ioapic_region_write(ioapic, paddr, size, *val); + + return (0); +} diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c new file mode 100644 index 0000000..deb91dc --- /dev/null +++ b/usr.sbin/bhyve/mem.c @@ -0,0 +1,196 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + * + * It is assumed that all setup of ranges takes place in single-threaded + * mode before vCPUs have been started. As such, no locks are used on the + * RB tree. If this is no longer the case, then a r/w lock could be used, + * with readers on the lookup and a writer if the tree needs to be changed + * (and per vCPU caches flushed) + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/tree.h> +#include <sys/errno.h> +#include <machine/vmm.h> + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "mem.h" +#include "instruction_emul.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. 
+ */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(void) +{ + struct mmio_rb_range *np; + + RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, uint64_t rip, + uint64_t cr3, int mode) +{ + struct mmio_rb_range *entry; + int err; + + err = 0; + + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + err = emulate_instruction(ctx, vcpu, rip, cr3, paddr, mode, + &mmio_hint[vcpu]->mr_param); + } else { + if (mmio_rb_lookup(paddr, &entry)) { + err = ENOENT; + } else { + mmio_hint[vcpu] = entry; + err = emulate_instruction(ctx, vcpu, rip, cr3, paddr, + mode, &entry->mr_param); + } + } + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + struct mmio_rb_range *mrp; + int err; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + + if (mrp != NULL) { + 
mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + + err = mmio_rb_add(mrp); + if (err) + free(mrp); + } else + err = ENOMEM; + + return (err); +} + +void +init_mem(void) +{ + + RB_INIT(&mmio_rbroot); +} diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h new file mode 100644 index 0000000..53c4f72 --- /dev/null +++ b/usr.sbin/bhyve/mem.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include <sys/linker_set.h> + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, uint64_t rip, + uint64_t cr3, int mode); + +int register_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 96667d5..06fbcc8 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include "fbsdrun.h" #include "inout.h" +#include "mem.h" #include "pci_emul.h" #include "ioapic.h" @@ -364,22 +365,26 @@ pci_finish_mptable_names(void) } static int -pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; - int offset, i; + uint64_t offset; + int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && - port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + port + bytes <= + pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) - *eax = (*pe->pe_ior)(pdi, i, offset, bytes); + *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, + offset, bytes); else - (*pe->pe_iow)(pdi, i, offset, bytes, *eax); + (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, + bytes, *eax); return (0); } } @@ -387,6 +392,32 @@ pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, } static int +pci_emul_mem_handler(struct 
vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || + pdi->pi_bar[bidx].type == PCIBAR_MEM64); + assert(addr >= pdi->pi_bar[bidx].addr && + addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); + else + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); + + return (0); +} + + +static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { @@ -405,12 +436,21 @@ pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, } int -pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, - enum pcibar_type type, uint64_t size) +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +int +pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) { int i, error; uint64_t *baseptr, limit, addr, mask, lobits, bar; struct inout_port iop; + struct mem_range memp; assert(idx >= 0 && idx <= PCI_BARMAX); @@ -497,13 +537,25 @@ pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, if (type == PCIBAR_IO) { iop.name = pdi->pi_name; iop.flags = IOPORT_F_INOUT; - iop.handler = pci_emul_handler; + iop.handler = pci_emul_io_handler; iop.arg = pdi; for (i = 0; i < size; i++) { iop.port = addr + i; register_inout(&iop); } + } else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) { + /* add memory bar intercept handler */ + memp.name = pdi->pi_name; + memp.flags = MEM_F_RW; + memp.base = addr; + memp.size = size; + memp.handler = pci_emul_mem_handler; 
+ memp.arg1 = pdi; + memp.arg2 = idx; + + error = register_mem(&memp); + assert(error == 0); } return (0); @@ -1061,10 +1113,6 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, } pci_set_cfgdata32(pi, coff, bar); - if (pi->pi_bar[idx].handler) { - pi->pi_bar[idx].handler(pi, idx, bar); - } - } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); } else { @@ -1098,12 +1146,15 @@ INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler); /* * Define a dummy test device */ -#define DREGSZ 20 +#define DIOSZ 20 +#define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t regs[DREGSZ]; + uint8_t ioregs[DIOSZ]; + uint8_t memregs[DMEMSZ]; }; -#define PCI_EMUL_MSGS 4 +#define PCI_EMUL_MSI_MSGS 4 +#define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) @@ -1120,64 +1171,132 @@ pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); - error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ); + error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); - error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS); + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void -pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size, - uint32_t value) +pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; - if (offset + size > DREGSZ) { - printf("diow: too large, offset %d size %d\n", offset, size); - return; - } + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("diow: iow too large, offset %ld size %d\n", + offset, size); + return; + } - if (size == 1) { - sc->regs[offset] = value & 0xff; - } else if (size == 2) 
{ - *(uint16_t *)&sc->regs[offset] = value & 0xffff; - } else { - *(uint32_t *)&sc->regs[offset] = value; + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)&sc->ioregs[offset] = value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_msgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_msgnum(pi); i++) + pci_generate_msi(pi, i); + } } - /* - * Special magic value to generate an interrupt - */ - if (offset == 4 && size == 4 && pci_msi_enabled(pi)) - pci_generate_msi(pi, value % pci_msi_msgnum(pi)); + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("diow: memw too large, offset %ld size %d\n", + offset, size); + return; + } - if (value == 0xabcdef) { - for (i = 0; i < pci_msi_msgnum(pi); i++) - pci_generate_msi(pi, i); + if (size == 1) { + sc->memregs[offset] = value; + } else if (size == 2) { + *(uint16_t *)&sc->memregs[offset] = value; + } else if (size == 4) { + *(uint32_t *)&sc->memregs[offset] = value; + } else if (size == 8) { + *(uint64_t *)&sc->memregs[offset] = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? 
+ */ + } + + if (baridx > 1) { + printf("diow: unknown bar idx %d\n", baridx); } } -static uint32_t -pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size) +static uint64_t +pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; - if (offset + size > DREGSZ) { - printf("dior: too large, offset %d size %d\n", offset, size); - return (0); + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("dior: ior too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->ioregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->ioregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } } - if (size == 1) { - value = sc->regs[offset]; - } else if (size == 2) { - value = *(uint16_t *) &sc->regs[offset]; - } else { - value = *(uint32_t *) &sc->regs[offset]; + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("dior: memr too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->memregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->memregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->memregs[offset]; + } else if (size == 8) { + value = *(uint64_t *) &sc->memregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 1) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); } return (value); @@ -1186,8 +1305,8 @@ pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size) struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, - .pe_iow = pci_emul_diow, - .pe_ior = pci_emul_dior + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior }; PCI_EMUL_SET(pci_dummy); diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h 
index 19dba99..79b86d1 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -48,7 +48,8 @@ struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ - int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); + int (*pe_init)(struct vmctx *, struct pci_devinst *, + char *opts); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, @@ -58,11 +59,13 @@ struct pci_devemu { struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); - /* I/O space read/write callbacks */ - void (*pe_iow)(struct pci_devinst *pi, int baridx, - int offset, int size, uint32_t value); - uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx, - int offset, int size); + /* BAR read/write callbacks */ + void (*pe_barwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); @@ -74,13 +77,10 @@ enum pcibar_type { PCIBAR_MEMHI64 }; -typedef int (*bar_write_func_t)(struct pci_devinst *pdi, int idx, uint64_t bar); - struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; - bar_write_func_t handler; }; #define PI_NAMESZ 40 @@ -119,11 +119,9 @@ struct pci_devinst { int table_bar; int pba_bar; size_t table_offset; - uintptr_t table_gpa; size_t table_size; int table_count; size_t pba_offset; - struct memory_region *table_bar_region; struct msix_table_entry table[MAX_MSIX_TABLE_SIZE]; } pi_msix; @@ -156,15 +154,19 @@ void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void pci_callback(void); -int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, 
enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_is_legacy(struct pci_devinst *pi); void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); int pci_lintr_request(struct pci_devinst *pi, int ivec); int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); int pci_msi_msgnum(struct pci_devinst *pi); void pci_parse_name(char *opt); void pci_parse_slot(char *opt, int legacy); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index a6f1f63..94e4416 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <vmmapi.h> #include "pci_emul.h" +#include "mem.h" #include "instruction_emul.h" #ifndef _PATH_DEVPCI @@ -218,15 +219,17 @@ cfginitmsi(struct passthru_softc *sc) } } - if (sc->psc_msix.capoff == 0) - return (-1); - - pi->pi_msix.pba_bar = msixcap.pba_offset & MSIX_TABLE_BIR_MASK; - pi->pi_msix.pba_offset = msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK; - pi->pi_msix.table_bar = msixcap.table_offset & MSIX_TABLE_BIR_MASK; - pi->pi_msix.table_offset = msixcap.table_offset & MSIX_TABLE_OFFSET_MASK; - - pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_offset & MSIX_TABLE_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK; + pi->pi_msix.table_bar = + msixcap.table_offset & MSIX_TABLE_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_offset & MSIX_TABLE_OFFSET_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + } #ifdef LEGACY_SUPPORT /* @@ -252,106 
+255,84 @@ cfginitmsi(struct passthru_softc *sc) return (0); } -static int -msix_table_read(struct vmctx *vm, int vcpu, uintptr_t addr, - int size, uint64_t *data, void *arg) +static uint64_t +msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { - struct passthru_softc *sc; struct pci_devinst *pi; - int index; - size_t offset, entry_offset; + struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; - struct msix_table_entry *entry; + uint64_t data; + size_t entry_offset; + int index; - sc = arg; pi = sc->psc_pi; - offset = addr - pi->pi_msix.table_gpa; - entry_offset = addr % MSIX_TABLE_ENTRY_SIZE; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; index = offset / MSIX_TABLE_ENTRY_SIZE; entry = &pi->pi_msix.table[index]; switch(size) { case 1: - src8 = (uint8_t*)((void*)entry + entry_offset); - *data = *src8; + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; break; case 2: - src16 = (uint16_t*)((void*)entry + entry_offset); - *data = *src16; + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; break; case 4: - src32 = (uint32_t*)((void*)entry + entry_offset); - *data = *src32; + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; break; case 8: - src64 = (uint64_t*)((void*)entry + entry_offset); - *data = *src64; + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; break; default: return (-1); } - return (0); + return (data); } -static int -msix_table_write(struct vmctx *vm, int vcpu, uintptr_t addr, - int size, uint64_t data, void *arg) +static void +msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, + uint64_t offset, int size, uint64_t data) { - struct passthru_softc *sc; struct pci_devinst *pi; - int error, index; - size_t offset, entry_offset; - uint32_t *dest; struct msix_table_entry *entry; + uint32_t *dest; + size_t entry_offset; uint32_t vector_control; + int error, index; - sc = arg; pi = sc->psc_pi; - 
offset = addr - pi->pi_msix.table_gpa; - entry_offset = addr % MSIX_TABLE_ENTRY_SIZE; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; index = offset / MSIX_TABLE_ENTRY_SIZE; entry = &pi->pi_msix.table[index]; /* Only 4 byte naturally-aligned writes are supported */ - if (size == 4 && entry_offset % 4 == 0) { - vector_control = entry->vector_control; - dest = (uint32_t*)((void*)entry + entry_offset); - *dest = data; - /* If MSI-X hasn't been enabled, do nothing */ - if (pi->pi_msix.enabled) { - /* If the entry is masked, don't set it up */ - if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || - (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { - error = vm_setup_msix(vm, vcpu, sc->psc_sel.pc_bus, - sc->psc_sel.pc_dev, - sc->psc_sel.pc_func, - index, entry->msg_data, - entry->vector_control, - entry->addr); - if (error) - return (-1); - } + assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest = (uint32_t *)((void *)entry + entry_offset); + *dest = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, + index, entry->msg_data, + entry->vector_control, + entry->addr); } - } else { - printf("Unsupported unaligned or non-4-byte write to MSI-X table\n"); - return (-1); } - return (0); -} - -static int -msix_bar_handler(struct pci_devinst *pdi, int idx, uint64_t bar) -{ - uintptr_t start; - - start = (bar & PCIM_BAR_MEM_BASE) + pdi->pi_msix.table_offset; - move_memory_region(pdi->pi_msix.table_bar_region, start); - pdi->pi_msix.table_gpa = start; - return (0); } static int @@ -375,6 +356,7 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) printf("Unsupported MSI-X table and PBA in same page\n"); return 
(-1); } + /* * May need to split the BAR into 3 regions: * Before the MSI-X table, the MSI-X table, and after it @@ -395,30 +377,9 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) start = pi->pi_bar[idx].addr; len = pi->pi_msix.table_offset; } - return vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, - sc->psc_sel.pc_dev, sc->psc_sel.pc_func, - start, len, base + table_size); -} - -static int -cfginitmsix(struct passthru_softc *sc) -{ - int table_bar; - struct pci_devinst *pi; - - pi = sc->psc_pi; - table_bar = pi->pi_msix.table_bar; - pi->pi_msix.table_gpa = sc->psc_bar[table_bar].addr + pi->pi_msix.table_offset; - pi->pi_msix.table_bar_region = register_emulated_memory(pi->pi_msix.table_gpa, - pi->pi_msix.table_size, - msix_table_read, - msix_table_write, sc); - if (!pi->pi_msix.table_bar_region) - return (-1); - - pi->pi_bar[table_bar].handler = msix_bar_handler; - - return (0); + return (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + start, len, base + table_size)); } static int @@ -464,8 +425,8 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ - error = pci_emul_alloc_bar(pi, i, base, bartype, - bar.pbi_length); + error = pci_emul_alloc_pbar(pi, i, base, bartype, + bar.pbi_length); if (error) return (-1); @@ -515,9 +476,6 @@ cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) if (cfginitbar(ctx, sc) != 0) goto done; - if (cfginitmsix(sc) != 0) - goto done; - error = 0; /* success */ done: return (error); @@ -544,7 +502,8 @@ passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) goto done; } - if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) + if (opts == NULL || + sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) goto done; if (vm_assign_pptdev(ctx, bus, slot, func) != 0) @@ -557,7 +516,7 @@ passthru_init(struct vmctx *ctx, struct 
pci_devinst *pi, char *opts) sc->psc_pi = pi; /* initialize config space */ - if (cfginit(ctx, pi, bus, slot, func) != 0) + if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) goto done; error = 0; /* success */ @@ -605,8 +564,8 @@ msixcap_access(struct passthru_softc *sc, int coff) } static int -passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, - int bytes, uint32_t *rv) +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; @@ -636,8 +595,8 @@ passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, } static int -passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, - int bytes, uint32_t val) +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; @@ -705,40 +664,54 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, } static void -passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size, - uint32_t value) +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct iodev_pio_req pio; sc = pi->pi_arg; - bzero(&pio, sizeof(struct iodev_pio_req)); - pio.access = IODEV_PIO_WRITE; - pio.port = sc->psc_bar[baridx].addr + offset; - pio.width = size; - pio.val = value; - - (void)ioctl(iofd, IODEV_PIO, &pio); + if (pi->pi_msix.enabled && pi->pi_msix.table_bar == baridx) { + msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); + } } -static uint32_t -passthru_ior(struct pci_devinst *pi, int 
baridx, int offset, int size) +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) { struct passthru_softc *sc; struct iodev_pio_req pio; + uint64_t val; sc = pi->pi_arg; - bzero(&pio, sizeof(struct iodev_pio_req)); - pio.access = IODEV_PIO_READ; - pio.port = sc->psc_bar[baridx].addr + offset; - pio.width = size; - pio.val = 0; + if (pi->pi_msix.enabled && pi->pi_msix.table_bar == baridx) { + val = msix_table_read(sc, offset, size); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); - (void)ioctl(iofd, IODEV_PIO, &pio); + val = pio.val; + } - return (pio.val); + return (val); } struct pci_devemu passthru = { @@ -746,7 +719,7 @@ struct pci_devemu passthru = { .pe_init = passthru_init, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, - .pe_iow = passthru_iow, - .pe_ior = passthru_ior, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, }; PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c index 0f8a281..51876f5 100644 --- a/usr.sbin/bhyve/pci_uart.c +++ b/usr.sbin/bhyve/pci_uart.c @@ -320,8 +320,8 @@ pci_uart_drain(int fd, enum ev_type ev, void *arg) } static void -pci_uart_write(struct pci_devinst *pi, int baridx, int offset, int size, - uint32_t value) +pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_uart_softc *sc; int fifosz; @@ -329,6 +329,7 @@ pci_uart_write(struct pci_devinst *pi, int baridx, int offset, int size, sc = pi->pi_arg; + assert(baridx == 0); assert(size == 1); /* Open terminal */ @@ -459,15 +460,17 @@ done: pci_uart_toggle_intr(sc); } -uint32_t -pci_uart_read(struct pci_devinst *pi, int baridx, int offset, int size) 
+uint64_t +pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) { struct pci_uart_softc *sc; uint8_t iir, intr_reason; - uint32_t reg; + uint64_t reg; sc = pi->pi_arg; + assert(baridx == 0); assert(size == 1); /* Open terminal */ @@ -573,11 +576,11 @@ pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); if (pci_is_legacy(pi)) { pci_uart_legacy_res(&bar, &ivec); + pci_emul_alloc_pbar(pi, 0, bar, PCIBAR_IO, 8); } else { - bar = 0; ivec = -1; + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, 8); } - pci_emul_alloc_bar(pi, 0, bar, PCIBAR_IO, 8); pci_lintr_request(pi, ivec); if (opts != NULL && !strcmp("stdio", opts) && !pci_uart_stdio) { @@ -591,9 +594,9 @@ pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) } struct pci_devemu pci_de_com = { - .pe_emu = "uart", - .pe_init = pci_uart_init, - .pe_iow = pci_uart_write, - .pe_ior = pci_uart_read, + .pe_emu = "uart", + .pe_init = pci_uart_init, + .pe_barwrite = pci_uart_write, + .pe_barread = pci_uart_read }; PCI_EMUL_SET(pci_de_com); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 98c0695..916c7c3 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -382,20 +382,22 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); - pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ); pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ); return (0); } static void -pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size, - uint32_t value) +pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_vtblk_softc *sc = pi->pi_arg; - + + 
assert(baridx == 0); + if (offset + size > VTBLK_REGSZ) { - DPRINTF(("vtblk_write: 2big, offset %d size %d\n", + DPRINTF(("vtblk_write: 2big, offset %ld size %d\n", offset, size)); return; } @@ -426,24 +428,27 @@ pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size, case VTCFG_R_QNUM: case VTCFG_R_ISR: case VTBLK_R_CFG ... VTBLK_R_CFG_END: - DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset)); break; default: - DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset)); + DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset)); value = 0; break; } } -uint32_t -pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size) +uint64_t +pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) { struct pci_vtblk_softc *sc = pi->pi_arg; void *ptr; uint32_t value; + assert(baridx == 0); + if (offset + size > VTBLK_REGSZ) { - DPRINTF(("vtblk_read: 2big, offset %d size %d\n", + DPRINTF(("vtblk_read: 2big, offset %ld size %d\n", offset, size)); return (0); } @@ -493,7 +498,7 @@ pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size) } break; default: - DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset)); + DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset)); value = 0; break; } @@ -502,9 +507,9 @@ pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size) } struct pci_devemu pci_de_vblk = { - .pe_emu = "virtio-blk", - .pe_init = pci_vtblk_init, - .pe_iow = pci_vtblk_write, - .pe_ior = pci_vtblk_read, + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_barwrite = pci_vtblk_write, + .pe_barread = pci_vtblk_read }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 6160b14..26c8ee8 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -574,8 +574,8 @@ pci_vtnet_init(struct vmctx 
*ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); - pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ); pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ); return (0); } @@ -590,14 +590,16 @@ static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { }; static void -pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size, - uint32_t value) +pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_vtnet_softc *sc = pi->pi_arg; void *ptr; + assert(baridx == 0); + if (offset + size > VTNET_REGSZ) { - DPRINTF(("vtnet_write: 2big, offset %d size %d\n", + DPRINTF(("vtnet_write: 2big, offset %ld size %d\n", offset, size)); return; } @@ -652,10 +654,10 @@ pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size, case VTCFG_R_ISR: case VTNET_R_CFG6: case VTNET_R_CFG7: - DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset)); break; default: - DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset)); + DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset)); value = 0; break; } @@ -663,15 +665,18 @@ pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size, pthread_mutex_unlock(&sc->vsc_mtx); } -uint32_t -pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size) +uint64_t +pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) { struct pci_vtnet_softc *sc = pi->pi_arg; void *ptr; - uint32_t value; + uint64_t value; + + assert(baridx == 0); if (offset + size > VTNET_REGSZ) { - DPRINTF(("vtnet_read: 2big, offset %d size %d\n", + DPRINTF(("vtnet_read: 2big, offset %ld size %d\n", offset, size)); return (0); } @@ -737,7 
+742,7 @@ pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size) value = 0; /* XXX link status in LSB */ break; default: - DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset)); + DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset)); value = 0; break; } @@ -748,9 +753,9 @@ pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size) } struct pci_devemu pci_de_vnet = { - .pe_emu = "virtio-net", - .pe_init = pci_vtnet_init, - .pe_iow = pci_vtnet_write, - .pe_ior = pci_vtnet_read, + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_barwrite = pci_vtnet_write, + .pe_barread = pci_vtnet_read }; PCI_EMUL_SET(pci_de_vnet); |