summaryrefslogtreecommitdiffstats
path: root/usr.sbin/bhyve
diff options
context:
space:
mode:
authorgrehan <grehan@FreeBSD.org>2012-04-28 16:28:00 +0000
committergrehan <grehan@FreeBSD.org>2012-04-28 16:28:00 +0000
commit9f0c999f8126597eb572b80056df88335dbd0070 (patch)
treee25fe0456b877e9c40defdc19a863563208d18de /usr.sbin/bhyve
parent94d2b7f64912987093f1a98573737a32e4e5d8d1 (diff)
downloadFreeBSD-src-9f0c999f8126597eb572b80056df88335dbd0070.zip
FreeBSD-src-9f0c999f8126597eb572b80056df88335dbd0070.tar.gz
MSI-x interrupt support for PCI pass-thru devices.
Includes instruction emulation for memory r/w access. This opens the door for io-apic, local apic, hpet timer, and legacy device emulation. Submitted by: ryan dot berryhill at sandvine dot com Reviewed by: grehan Obtained from: Sandvine
Diffstat (limited to 'usr.sbin/bhyve')
-rw-r--r--usr.sbin/bhyve/Makefile3
-rw-r--r--usr.sbin/bhyve/fbsdrun.c29
-rw-r--r--usr.sbin/bhyve/instruction_emul.c555
-rw-r--r--usr.sbin/bhyve/instruction_emul.h47
-rw-r--r--usr.sbin/bhyve/pci_emul.c29
-rw-r--r--usr.sbin/bhyve/pci_emul.h40
-rw-r--r--usr.sbin/bhyve/pci_passthru.c270
7 files changed, 953 insertions, 20 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index b0398ed..f64e579 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -4,7 +4,8 @@
PROG= bhyve
-SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
+SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c
+SRCS+= instruction_emul.c mevent.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c
index 6f009b5..c2295ea 100644
--- a/usr.sbin/bhyve/fbsdrun.c
+++ b/usr.sbin/bhyve/fbsdrun.c
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include "mevent.h"
#include "pci_emul.h"
#include "xmsr.h"
+#include "instruction_emul.h"
#define DEFAULT_GUEST_HZ 100
#define DEFAULT_GUEST_TSLICE 200
@@ -108,6 +109,7 @@ struct fbsdstats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
+ uint64_t vmexit_paging;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
return (VMEXIT_RESTART);
}
+/*
+ * VM-exit handler for paging exits: counts the exit and asks the
+ * instruction emulator to emulate the instruction at the exit RIP using
+ * the CR3 value reported with the exit.
+ *
+ * Returns VMEXIT_CONTINUE when emulation succeeds, VMEXIT_ABORT otherwise.
+ */
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	int error;
+
+	stats.vmexit_paging++;
+
+	error = emulate_instruction(ctx, *pvcpu, vmexit->rip,
+	    vmexit->u.paging.cr3);
+	if (error == 0)
+		return (VMEXIT_CONTINUE);
+
+	printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip);
+	return (VMEXIT_ABORT);
+}
+
static void
sigalrm(int sig)
{
@@ -446,12 +462,13 @@ setup_timeslice(void)
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
- [VM_EXITCODE_INOUT] = vmexit_inout,
- [VM_EXITCODE_VMX] = vmexit_vmx,
- [VM_EXITCODE_BOGUS] = vmexit_bogus,
- [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
- [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
- [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_PAGING] = vmexit_paging
};
static void
diff --git a/usr.sbin/bhyve/instruction_emul.c b/usr.sbin/bhyve/instruction_emul.c
new file mode 100644
index 0000000..8c99194
--- /dev/null
+++ b/usr.sbin/bhyve/instruction_emul.c
@@ -0,0 +1,555 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <strings.h>
+#include <unistd.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "instruction_emul.h"
+
+#define PREFIX_LOCK 0xF0
+#define PREFIX_REPNE 0xF2
+#define PREFIX_REPE 0xF3
+#define PREFIX_CS_OVERRIDE 0x2E
+#define PREFIX_SS_OVERRIDE 0x36
+#define PREFIX_DS_OVERRIDE 0x3E
+#define PREFIX_ES_OVERRIDE 0x26
+#define PREFIX_FS_OVERRIDE 0x64
+#define PREFIX_GS_OVERRIDE 0x65
+#define PREFIX_BRANCH_NOT_TAKEN 0x2E
+#define PREFIX_BRANCH_TAKEN 0x3E
+#define PREFIX_OPSIZE 0x66
+#define PREFIX_ADDRSIZE 0x67
+
+#define OPCODE_2BYTE_ESCAPE 0x0F
+#define OPCODE_3BYTE_ESCAPE 0x38
+
+#define MODRM_MOD_MASK 0xC0
+#define MODRM_MOD_SHIFT 6
+#define MODRM_RM_MASK 0x07
+#define MODRM_RM_SHIFT 0
+#define MODRM_REG_MASK 0x38
+#define MODRM_REG_SHIFT 3
+
+#define MOD_INDIRECT 0x0
+#define MOD_INDIRECT_DISP8 0x1
+#define MOD_INDIRECT_DISP32 0x2
+#define MOD_DIRECT 0x3
+
+#define RM_EAX 0x0
+#define RM_ECX 0x1
+#define RM_EDX 0x2
+#define RM_EBX 0x3
+#define RM_SIB 0x4
+#define RM_DISP32 0x5
+#define RM_EBP RM_DISP32
+#define RM_ESI 0x6
+#define RM_EDI 0x7
+
+#define REG_EAX 0x0
+#define REG_ECX 0x1
+#define REG_EDX 0x2
+#define REG_EBX 0x3
+#define REG_ESP 0x4
+#define REG_EBP 0x5
+#define REG_ESI 0x6
+#define REG_EDI 0x7
+#define REG_R8 0x8
+#define REG_R9 0x9
+#define REG_R10 0xA
+#define REG_R11 0xB
+#define REG_R12 0xC
+#define REG_R13 0xD
+#define REG_R14 0xE
+#define REG_R15 0xF
+
+#define HAS_MODRM 1
+#define FROM_RM (1<<1)
+#define FROM_REG (1<<2)
+#define TO_RM (1<<3)
+#define TO_REG (1<<4)
+
+#define REX_MASK 0xF0
+#define REX_PREFIX 0x40
+#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX )
+#define REX_W_MASK 0x8
+#define REX_R_MASK 0x4
+#define REX_X_MASK 0x2
+#define REX_B_MASK 0x1
+
+#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \
+ (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \
+ (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \
+ (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \
+ (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \
+ (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \
+ (x) == PREFIX_ADDRSIZE || is_rex_prefix((x)))
+
+#define PAGE_FRAME_MASK 0x80
+#define PAGE_OFFSET_MASK 0xFFF
+#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK)
+#define PML4E_OFFSET_MASK 0x0000FF8000000000
+#define PML4E_SHIFT 39
+
+#define MAX_EMULATED_REGIONS 8
+int registered_regions = 0;
+struct memory_region
+{
+ uintptr_t start;
+ uintptr_t end;
+ emulated_read_func_t memread;
+ emulated_write_func_t memwrite;
+ void *arg;
+} emulated_regions[MAX_EMULATED_REGIONS];
+
+struct decoded_instruction
+{
+ void *instruction;
+ uint8_t *opcode;
+ uint8_t *modrm;
+ uint8_t *sib;
+ uint8_t *displacement;
+ uint8_t *immediate;
+
+ uint8_t opcode_flags;
+
+ uint8_t addressing_mode;
+ uint8_t rm;
+ uint8_t reg;
+ uint8_t rex_r;
+ uint8_t rex_w;
+ uint8_t rex_b;
+ uint8_t rex_x;
+
+ int32_t disp;
+};
+
+static enum vm_reg_name vm_reg_name_mappings[] = {
+ [REG_EAX] = VM_REG_GUEST_RAX,
+ [REG_EBX] = VM_REG_GUEST_RBX,
+ [REG_ECX] = VM_REG_GUEST_RCX,
+ [REG_EDX] = VM_REG_GUEST_RDX,
+ [REG_ESP] = VM_REG_GUEST_RSP,
+ [REG_EBP] = VM_REG_GUEST_RBP,
+ [REG_ESI] = VM_REG_GUEST_RSI,
+ [REG_EDI] = VM_REG_GUEST_RDI,
+ [REG_R8] = VM_REG_GUEST_R8,
+ [REG_R9] = VM_REG_GUEST_R9,
+ [REG_R10] = VM_REG_GUEST_R10,
+ [REG_R11] = VM_REG_GUEST_R11,
+ [REG_R12] = VM_REG_GUEST_R12,
+ [REG_R13] = VM_REG_GUEST_R13,
+ [REG_R14] = VM_REG_GUEST_R14,
+ [REG_R15] = VM_REG_GUEST_R15
+};
+
+uint8_t one_byte_opcodes[256] = {
+ [0x89] = HAS_MODRM | FROM_REG | TO_RM,
+ [0x8B] = HAS_MODRM | FROM_RM | TO_REG,
+};
+
+/*
+ * Translates a guest linear address to a guest physical address by walking
+ * the guest's 4-level page tables rooted at guest_cr3.
+ *
+ * The walk stops at the first entry that has PAGE_FRAME_MASK set (a large
+ * page) or at the fourth level, and the low 'shift' bits of the linear
+ * address are used as the offset into the resulting frame.
+ *
+ * NOTE(review): entries are not checked for the present bit; a non-present
+ * mapping yields a meaningless address. The function has no way to report
+ * an error to its callers.
+ */
+static uintptr_t
+gla2gpa(uint64_t gla, uint64_t guest_cr3)
+{
+	/*
+	 * Only bits 51:12 of CR3 / a paging-structure entry hold a physical
+	 * address; higher bits (e.g. NX in bit 63) are attributes and must
+	 * not leak into the derived address.
+	 */
+	const uint64_t addr_bits = 0x000FFFFFFFFFF000UL;
+	uint64_t *table;
+	uint64_t mask, entry, frame;
+	int level, shift;
+
+	table = paddr_guest2host(guest_cr3 & addr_bits);
+	mask = PML4E_OFFSET_MASK;
+	shift = PML4E_SHIFT;
+	frame = 0;
+	for (level = 0; level < 4; ++level) {
+		entry = table[(gla & mask) >> shift];
+		frame = entry & addr_bits;
+
+		/* This entry does not point to another page table */
+		if ((entry & PAGE_FRAME_MASK) != 0 || level >= 3)
+			break;
+
+		table = paddr_guest2host(frame);
+		mask >>= 9;
+		shift -= 9;
+	}
+
+	/* Shift in 64 bits: 'shift' may exceed the width of int. */
+	mask = ((uint64_t)1 << shift) - 1;
+	return ((frame & ~mask) | (gla & mask));
+}
+
+/*
+ * Translates a guest linear address to a host pointer: first to a guest
+ * physical address via gla2gpa(), then through paddr_guest2host().
+ */
+static void *
+gla2hla(uint64_t gla, uint64_t guest_cr3)
+{
+
+	return (paddr_guest2host(gla2gpa(gla, guest_cr3)));
+}
+
+/*
+ * Examines the first byte of the instruction for a prefix. A single REX
+ * prefix is accepted and its W/R/X/B bits are recorded; any other
+ * recognised prefix is unsupported and makes the function return -1.
+ * On success the opcode pointer is set to the byte following the
+ * (optional) REX prefix and 0 is returned.
+ */
+static int
+decode_prefixes(struct decoded_instruction *decoded)
+{
+	uint8_t *cursor;
+	uint8_t first;
+
+	cursor = decoded->instruction;
+	first = *cursor;
+
+	if (!is_rex_prefix(first)) {
+		/* Legacy prefixes are not emulated. */
+		if (is_prefix(first))
+			return (-1);
+	} else {
+		decoded->rex_w = first & REX_W_MASK;
+		decoded->rex_r = first & REX_R_MASK;
+		decoded->rex_x = first & REX_X_MASK;
+		decoded->rex_b = first & REX_B_MASK;
+		cursor++;
+	}
+
+	decoded->opcode = cursor;
+	return (0);
+}
+
+/*
+ * Looks the opcode byte up in one_byte_opcodes[]. An opcode with no flags
+ * in that table is not understood and yields -1. Otherwise the flags are
+ * stored in the decoded instruction and, when the opcode takes a ModR/M
+ * byte, the modrm pointer is set to the byte after the opcode.
+ */
+static int
+decode_opcode(struct decoded_instruction *decoded)
+{
+	uint8_t flags;
+
+	flags = one_byte_opcodes[*decoded->opcode];
+	if (flags == 0)
+		return (-1);
+
+	if ((flags & HAS_MODRM) != 0)
+		decoded->modrm = decoded->opcode + 1;
+
+	decoded->opcode_flags = flags;
+	return (0);
+}
+
+/*
+ * Decodes the instruction's ModR/M field, if the opcode has one.
+ *
+ * Fills in the addressing mode and the rm/reg register numbers (extended
+ * by REX.B / REX.R), sets the sib pointer when a SIB byte follows, and
+ * sets the displacement pointer for the disp8/disp32 addressing modes.
+ *
+ * Returns 0 on success, or -1 for the mod=00 rm=101 encoding (disp32 /
+ * RIP-relative), which has no base register and is not supported.
+ */
+static int
+decode_mod_rm(struct decoded_instruction *decoded)
+{
+	uint8_t modrm;
+	uint8_t *extension_operands;
+
+	if (!decoded->modrm)
+		return (0);
+
+	modrm = *decoded->modrm;
+
+	decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT;
+	decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT;
+	decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT;
+
+	extension_operands = decoded->modrm + 1;
+
+	/*
+	 * The SIB and disp32 escapes are defined on the raw 3-bit rm field,
+	 * independent of REX.B, and only exist for memory operands. Test
+	 * them before REX.B is merged in so that e.g. (%r12) is recognised
+	 * as requiring a SIB byte, and so that register-direct forms using
+	 * %esp/%r12 are not mistaken for SIB forms.
+	 */
+	if (decoded->addressing_mode != MOD_DIRECT) {
+		if (decoded->rm == RM_SIB) {
+			decoded->sib = decoded->modrm + 1;
+			extension_operands = decoded->sib + 1;
+		} else if (decoded->addressing_mode == MOD_INDIRECT &&
+		    decoded->rm == RM_DISP32) {
+			/* No base register: disp32 / RIP-relative. */
+			return (-1);
+		}
+	}
+
+	if (decoded->rex_b)
+		decoded->rm |= (1<<3);
+
+	if (decoded->rex_r)
+		decoded->reg |= (1<<3);
+
+	switch (decoded->addressing_mode) {
+	case MOD_INDIRECT:
+	case MOD_DIRECT:
+		decoded->displacement = 0;
+		break;
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		decoded->displacement = extension_operands;
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * SIB-byte addressing is not emulated: report -1 when the ModR/M decode
+ * found a SIB byte, and 0 when the instruction has none.
+ */
+static int
+decode_sib(struct decoded_instruction *decoded)
+{
+
+	return (decoded->sib ? -1 : 0);
+}
+
+/*
+ * Grabs and saves the instruction's displacement if one is present.
+ * An 8-bit displacement is sign-extended, as the architecture requires;
+ * a 32-bit displacement is assembled byte-wise (little-endian) so that no
+ * unaligned access through a cast pointer is needed.
+ *
+ * Immediates are not currently supported, so if an immediate is present
+ * -1 is returned; otherwise 0.
+ */
+static int
+decode_extension_operands(struct decoded_instruction *decoded)
+{
+	const uint8_t *p;
+
+	p = decoded->displacement;
+	if (p) {
+		if (decoded->addressing_mode == MOD_INDIRECT_DISP8) {
+			/* disp8 is a signed byte: 0xFC means -4, not +252. */
+			decoded->disp = (int8_t)p[0];
+		} else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) {
+			decoded->disp = (int32_t)((uint32_t)p[0] |
+			    ((uint32_t)p[1] << 8) |
+			    ((uint32_t)p[2] << 16) |
+			    ((uint32_t)p[3] << 24));
+		}
+	}
+
+	if (decoded->immediate) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Decodes the instruction at 'instr' into 'decoded' by running the decode
+ * stages in order: prefixes, opcode, ModR/M, SIB, extension operands.
+ * 'decoded' is zeroed first. The first stage that fails ends the decode
+ * and its error is returned; 0 means every stage succeeded.
+ */
+static int
+decode_instruction(void *instr, struct decoded_instruction *decoded)
+{
+	static int (* const stages[])(struct decoded_instruction *) = {
+		decode_prefixes,
+		decode_opcode,
+		decode_mod_rm,
+		decode_sib,
+		decode_extension_operands
+	};
+	size_t i;
+	int error;
+
+	bzero(decoded, sizeof(*decoded));
+	decoded->instruction = instr;
+
+	for (i = 0; i < sizeof(stages) / sizeof(stages[0]); i++) {
+		error = stages[i](decoded);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Returns the registered emulated memory region that contains 'addr', or
+ * NULL if no region does.
+ *
+ * Regions are stored as [start, end) -- register_emulated_memory() and
+ * move_memory_region() both set end = start + len -- so the end address
+ * itself is not part of the region.
+ */
+static struct memory_region *
+find_region(uintptr_t addr)
+{
+	struct memory_region *region;
+	int i;
+
+	for (i = 0; i < registered_regions; ++i) {
+		region = &emulated_regions[i];
+		if (addr >= region->start && addr < region->end)
+			return (region);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Maps a decoded 4-bit register number (REG_EAX .. REG_R15) to the
+ * corresponding vm_reg_name via the vm_reg_name_mappings[] table.
+ */
+static enum vm_reg_name
+get_vm_reg_name(uint8_t reg)
+{
+
+	return (vm_reg_name_mappings[reg]);
+}
+
+/*
+ * Fetches the source operand of a decoded instruction into *operand.
+ *
+ * The source is either the ModR/M r/m operand (FROM_RM) or the reg operand
+ * (FROM_REG, always register-direct). For a register-direct source the
+ * register value is returned. For a memory source the effective linear
+ * address (base register plus displacement) is translated to a guest
+ * physical address and read through the emulated region that contains it.
+ *
+ * Returns 0 on success, the vm_get_register() error if the register
+ * cannot be read, the region read handler's result for memory sources, or
+ * -1 if the operand form is unsupported or the address is not inside any
+ * emulated region.
+ *
+ * NOTE(review): memory accesses are always issued with a size of 4 bytes,
+ * regardless of REX.W.
+ */
+static int
+get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+    const struct decoded_instruction *instruction, uint64_t *operand)
+{
+	enum vm_reg_name regname;
+	uint64_t reg;
+	uintptr_t target;
+	int error;
+	uint8_t rm, addressing_mode;
+	struct memory_region *emulated_memory;
+
+	if (instruction->opcode_flags & FROM_RM) {
+		rm = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & FROM_REG) {
+		rm = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(rm);
+	error = vm_get_register(vm, vcpu, regname, &reg);
+	if (error)
+		return (error);
+
+	switch (addressing_mode) {
+	case MOD_DIRECT:
+		*operand = reg;
+		return (0);
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/*
+		 * The displacement is part of the linear address and must be
+		 * applied before translation; adding it to the translated
+		 * physical address is wrong whenever base and base+disp lie
+		 * in different pages.
+		 */
+		reg += (int64_t)instruction->disp;
+		/* FALLTHROUGH */
+	case MOD_INDIRECT:
+		target = gla2gpa(reg, guest_cr3);
+		emulated_memory = find_region(target);
+		if (emulated_memory) {
+			return emulated_memory->memread(vm, vcpu, target,
+			    4, operand, emulated_memory->arg);
+		}
+		return (-1);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Stores 'operand' into the destination of a decoded instruction.
+ *
+ * The destination is either the ModR/M r/m operand (TO_RM) or the reg
+ * operand (TO_REG, always register-direct). A register-direct destination
+ * is written with vm_set_register(). A memory destination is resolved the
+ * same way get_operand() resolves a memory source -- base register plus
+ * displacement, translated to a guest physical address -- and written
+ * through the emulated region that contains it.
+ *
+ * Returns 0 on success, the vm_get_register()/vm_set_register() error on
+ * register access failure, the region write handler's result for memory
+ * destinations, or -1 if the operand form is unsupported or the address is
+ * not inside any emulated region.
+ *
+ * NOTE(review): memory accesses are always issued with a size of 4 bytes,
+ * regardless of REX.W.
+ */
+static int
+perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+    const struct decoded_instruction *instruction, uint64_t operand)
+{
+	enum vm_reg_name regname;
+	uintptr_t target;
+	int error;
+	uint64_t base;
+	struct memory_region *emulated_memory;
+	uint8_t regidx, addressing_mode;
+
+	if (instruction->opcode_flags & TO_RM) {
+		regidx = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & TO_REG) {
+		regidx = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(regidx);
+	if (addressing_mode == MOD_DIRECT)
+		return vm_set_register(vm, vcpu, regname, operand);
+
+	/* Memory destination: the register holds the base address. */
+	error = vm_get_register(vm, vcpu, regname, &base);
+	if (error)
+		return (error);
+
+	switch (addressing_mode) {
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/* Apply the displacement to the linear address. */
+		base += (int64_t)instruction->disp;
+		/* FALLTHROUGH */
+	case MOD_INDIRECT:
+		target = gla2gpa(base, guest_cr3);
+		emulated_memory = find_region(target);
+		if (emulated_memory) {
+			return emulated_memory->memwrite(vm, vcpu, target,
+			    4, operand, emulated_memory->arg);
+		}
+		return (-1);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Executes an already-decoded move: fetch the source operand, then store
+ * it to the destination. Returns the first non-zero error from either
+ * step, or 0 when both succeed.
+ */
+static int
+emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3,
+    const struct decoded_instruction *instruction)
+{
+	uint64_t value;
+	int error;
+
+	error = get_operand(vm, vcpu, cr3, instruction, &value);
+	if (error != 0)
+		return (error);
+
+	error = perform_write(vm, vcpu, cr3, instruction, value);
+	return (error);
+}
+
+/*
+ * Emulates the guest instruction located at linear address 'rip', using
+ * 'cr3' as the root of the guest page tables. The instruction bytes are
+ * located through gla2hla(), decoded, and then executed.
+ *
+ * Returns 0 on success or a non-zero error if decoding or emulation fails.
+ */
+int
+emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3)
+{
+	struct decoded_instruction decoded;
+	void *host_insn;
+	int error;
+
+	host_insn = gla2hla(rip, cr3);
+
+	error = decode_instruction(host_insn, &decoded);
+	if (error != 0)
+		return (error);
+
+	return (emulate_decoded_instruction(vm, vcpu, cr3, &decoded));
+}
+
+/*
+ * Registers a guest-physical address range [start, start + len) whose
+ * accesses are to be emulated through the given read/write callbacks;
+ * 'arg' is passed back to the callbacks unchanged.
+ *
+ * Returns the new region, or NULL when all MAX_EMULATED_REGIONS slots are
+ * already in use.
+ */
+struct memory_region *
+register_emulated_memory(uintptr_t start, size_t len, emulated_read_func_t memread,
+    emulated_write_func_t memwrite, void *arg)
+{
+	struct memory_region *region;
+
+	/*
+	 * Valid slots are 0 .. MAX_EMULATED_REGIONS - 1, so the table is
+	 * full as soon as the count reaches MAX_EMULATED_REGIONS.
+	 */
+	if (registered_regions >= MAX_EMULATED_REGIONS)
+		return (NULL);
+
+	region = &emulated_regions[registered_regions];
+	region->start = start;
+	region->end = start + len;
+	region->memread = memread;
+	region->memwrite = memwrite;
+	region->arg = arg;
+
+	registered_regions++;
+	return (region);
+}
+
+/*
+ * Relocates an emulated memory region so that it begins at 'start',
+ * keeping its length unchanged.
+ */
+void
+move_memory_region(struct memory_region *region, uintptr_t start)
+{
+	size_t span;
+
+	span = region->end - region->start;
+
+	region->end = start + span;
+	region->start = start;
+}
+
diff --git a/usr.sbin/bhyve/instruction_emul.h b/usr.sbin/bhyve/instruction_emul.h
new file mode 100644
index 0000000..e7b6bff
--- /dev/null
+++ b/usr.sbin/bhyve/instruction_emul.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INSTRUCTION_EMUL_H_
+#define _INSTRUCTION_EMUL_H_
+
+/* Make the header self-contained: size_t, uintptr_t, uint64_t. */
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Forward declarations. Without a file-scope declaration, 'struct vmctx'
+ * would be declared with prototype scope inside the typedefs below and
+ * would not be compatible with the type used by callers.
+ */
+struct vmctx;
+struct memory_region;
+
+/*
+ * Callback invoked to emulate a read of 'size' bytes at guest physical
+ * address 'addr'. The value read is returned through 'data'; 'arg' is the
+ * pointer supplied at registration. Returns 0 on success.
+ */
+typedef int (*emulated_read_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr,
+    int size, uint64_t *data, void *arg);
+
+/*
+ * Callback invoked to emulate a write of 'size' bytes of 'data' at guest
+ * physical address 'addr'; 'arg' is the pointer supplied at registration.
+ * Returns 0 on success.
+ */
+typedef int (*emulated_write_func_t)(struct vmctx *vm, int vcpu, uintptr_t addr,
+    int size, uint64_t data, void *arg);
+
+/* Emulate the guest instruction at linear address 'rip'. */
+int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip,
+    uint64_t cr3);
+
+/* Register an emulated region; returns NULL if no slot is available. */
+struct memory_region *register_emulated_memory(uintptr_t start, size_t len,
+    emulated_read_func_t memread,
+    emulated_write_func_t memwrite,
+    void *arg);
+
+/* Move a registered region to a new start address, preserving its length. */
+void move_memory_region(struct memory_region *memory_region, uintptr_t start);
+
+#endif
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 650c4de..9de87ad 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include "fbsdrun.h"
#include "inout.h"
#include "pci_emul.h"
+#include "instruction_emul.h"
#define CONF1_ADDR_PORT 0x0cf8
#define CONF1_DATA_PORT 0x0cfc
@@ -572,6 +573,29 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
}
void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+	uint16_t msgctrl, rwmask;
+	int off;
+
+	/* Offset of the access relative to the start of the capability. */
+	off = offset - capoff;
+
+	/*
+	 * Message Control Register: only the MSI-X enable and function mask
+	 * bits are writable; all other bits keep their current value. The
+	 * cached 'enabled' state is refreshed from the merged value.
+	 */
+	if (off == 2 && bytes == 2) {
+		rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+		msgctrl = pci_get_cfgdata16(pi, offset);
+		msgctrl &= ~rwmask;
+		msgctrl |= val & rwmask;
+		val = msgctrl;
+
+		pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+	}
+
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+void
msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
@@ -847,6 +871,11 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
assert(0);
}
pci_set_cfgdata32(pi, coff, bar);
+
+ if (pi->pi_bar[idx].handler) {
+ pi->pi_bar[idx].handler(pi, idx, bar);
+ }
+
} else if (pci_emul_iscap(pi, coff)) {
pci_emul_capwrite(pi, coff, bytes, *eax);
} else {
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index f5f8e22..588e5ba 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -42,6 +42,7 @@
struct vmctx;
struct pci_devinst;
+struct memory_region;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
@@ -73,14 +74,30 @@ enum pcibar_type {
PCIBAR_MEMHI64
};
+typedef int (*bar_write_func_t)(struct pci_devinst *pdi, int idx, uint64_t bar);
+
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
+ bar_write_func_t handler;
};
#define PI_NAMESZ 40
+struct msix_table_entry {
+ uint64_t addr;
+ uint32_t msg_data;
+ uint32_t vector_control;
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_SIZE 2048
+
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
@@ -96,6 +113,19 @@ struct pci_devinst {
int msgnum;
} pi_msi;
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ size_t table_offset;
+ uintptr_t table_gpa;
+ size_t table_size;
+ int table_count;
+ size_t pba_offset;
+ struct memory_region *table_bar_region;
+ struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
+ } pi_msix;
+
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
@@ -111,6 +141,14 @@ struct msicap {
uint16_t msgdata;
} __packed;
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_offset;
+ uint32_t pba_offset;
+} __packed;
+
void init_pci(struct vmctx *ctx);
void pci_parse_slot(char *opt);
void pci_parse_name(char *opt);
@@ -120,6 +158,8 @@ int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
+void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
int pci_msi_enabled(struct pci_devinst *pi);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
index 1c417fd..a6f1f63 100644
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
+#include "instruction_emul.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
@@ -58,6 +59,11 @@ __FBSDID("$FreeBSD$");
#define LEGACY_SUPPORT 1
+/* Low three bits of the table/PBA offset registers select the BAR (BIR). */
+#define MSIX_TABLE_BIR_MASK 7
+/* Remaining bits are the offset; no trailing ';' so it works in expressions. */
+#define MSIX_TABLE_OFFSET_MASK (~MSIX_TABLE_BIR_MASK)
+/* Message Control bits 10:0 encode the table size minus one. */
+#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
+/* Length in bytes of the MSI-X capability structure. */
+#define MSIX_CAPLEN 12
+
static int pcifd = -1;
static int iofd = -1;
@@ -69,6 +75,9 @@ struct passthru_softc {
int msgctrl;
int emulated;
} psc_msi;
+ struct {
+ int capoff;
+ } psc_msix;
struct pcisel psc_sel;
};
@@ -152,17 +161,19 @@ passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
static int
cfginitmsi(struct passthru_softc *sc)
{
- int ptr, cap, sts, caplen;
+ int ptr, capptr, cap, sts, caplen;
uint32_t u32;
struct pcisel sel;
struct pci_devinst *pi;
+ struct msixcap msixcap;
+ uint32_t *msixcap_ptr;
pi = sc->psc_pi;
sel = sc->psc_sel;
/*
* Parse the capabilities and cache the location of the MSI
- * capability.
+ * and MSI-X capabilities.
*/
sts = read_config(&sel, PCIR_STATUS, 2);
if (sts & PCIM_STATUS_CAPPRESENT) {
@@ -179,18 +190,44 @@ cfginitmsi(struct passthru_softc *sc)
ptr + 2, 2);
sc->psc_msi.emulated = 0;
caplen = msi_caplen(sc->psc_msi.msgctrl);
+ capptr = ptr;
while (caplen > 0) {
- u32 = read_config(&sel, ptr, 4);
- pci_set_cfgdata32(pi, ptr, u32);
+ u32 = read_config(&sel, capptr, 4);
+ pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
- ptr += 4;
+ capptr += 4;
+ }
+ } else if (cap == PCIY_MSIX) {
+ /*
+ * Copy the MSI-X capability
+ */
+ sc->psc_msix.capoff = ptr;
+ caplen = 12;
+ msixcap_ptr = (uint32_t*) &msixcap;
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ *msixcap_ptr = u32;
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ msixcap_ptr++;
}
- break;
}
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
}
}
+ if (sc->psc_msix.capoff == 0)
+ return (-1);
+
+ pi->pi_msix.pba_bar = msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.pba_offset = msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_bar = msixcap.table_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.table_offset = msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
+
+ pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
+
#ifdef LEGACY_SUPPORT
/*
* If the passthrough device does not support MSI then craft a
@@ -208,12 +245,182 @@ cfginitmsi(struct passthru_softc *sc)
}
#endif
- if (sc->psc_msi.capoff == 0) /* MSI or bust */
+ /* Make sure one of the capabilities is present */
+ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
return (-1);
else
return (0);
}
+/*
+ * Emulated read handler for the MSI-X table of a passthru device.
+ *
+ * 'addr' is the guest physical address being read and 'arg' is the
+ * passthru softc supplied at registration. The access is served from the
+ * shadow copy of the table kept in pi_msix.table[].
+ *
+ * Returns 0 and stores the value in *data on success. Returns -1 if the
+ * address is outside the table, the access would cross the end of a table
+ * entry, or the size is not 1, 2, 4 or 8.
+ */
+static int
+msix_table_read(struct vmctx *vm, int vcpu, uintptr_t addr,
+    int size, uint64_t *data, void *arg)
+{
+	struct passthru_softc *sc;
+	struct pci_devinst *pi;
+	size_t index, offset, entry_offset;
+	uint8_t *src;
+
+	sc = arg;
+	pi = sc->psc_pi;
+
+	if (addr < pi->pi_msix.table_gpa)
+		return (-1);
+
+	/*
+	 * Both the entry index and the offset within the entry are relative
+	 * to the start of the table, not to the absolute guest address.
+	 */
+	offset = addr - pi->pi_msix.table_gpa;
+	index = offset / MSIX_TABLE_ENTRY_SIZE;
+	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* Never index past the shadow table. */
+	if (index >= MAX_MSIX_TABLE_SIZE ||
+	    index >= (size_t)pi->pi_msix.table_count)
+		return (-1);
+
+	if (size < 1 || entry_offset + (size_t)size > MSIX_TABLE_ENTRY_SIZE)
+		return (-1);
+
+	src = (uint8_t *)&pi->pi_msix.table[index] + entry_offset;
+
+	switch (size) {
+	case 1:
+		*data = *src;
+		break;
+	case 2:
+		*data = *(uint16_t *)src;
+		break;
+	case 4:
+		*data = *(uint32_t *)src;
+		break;
+	case 8:
+		*data = *(uint64_t *)src;
+		break;
+	default:
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Emulated write handler for the MSI-X table of a passthru device.
+ *
+ * 'addr' is the guest physical address being written and 'arg' is the
+ * passthru softc supplied at registration. Only naturally aligned 4-byte
+ * writes are supported. The value is stored in the shadow table and, when
+ * MSI-X is enabled and the entry is unmasked either before or after the
+ * write, the entry is programmed with vm_setup_msix().
+ *
+ * Returns 0 on success, or -1 if the address is outside the table, the
+ * access is not an aligned 4-byte write, or vm_setup_msix() fails.
+ */
+static int
+msix_table_write(struct vmctx *vm, int vcpu, uintptr_t addr,
+    int size, uint64_t data, void *arg)
+{
+	struct passthru_softc *sc;
+	struct pci_devinst *pi;
+	int error;
+	size_t index, offset, entry_offset;
+	uint32_t *dest;
+	struct msix_table_entry *entry;
+	uint32_t vector_control;
+
+	sc = arg;
+	pi = sc->psc_pi;
+
+	if (addr < pi->pi_msix.table_gpa)
+		return (-1);
+
+	/*
+	 * Both the entry index and the offset within the entry are relative
+	 * to the start of the table, not to the absolute guest address.
+	 */
+	offset = addr - pi->pi_msix.table_gpa;
+	index = offset / MSIX_TABLE_ENTRY_SIZE;
+	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+	/* Never index past the shadow table. */
+	if (index >= MAX_MSIX_TABLE_SIZE ||
+	    index >= (size_t)pi->pi_msix.table_count)
+		return (-1);
+
+	/* Only 4 byte naturally-aligned writes are supported */
+	if (size != 4 || entry_offset % 4 != 0) {
+		printf("Unsupported unaligned or non-4-byte write to MSI-X table\n");
+		return (-1);
+	}
+
+	entry = &pi->pi_msix.table[index];
+	/* Remember the mask state from before the write. */
+	vector_control = entry->vector_control;
+	dest = (uint32_t *)((uint8_t *)entry + entry_offset);
+	*dest = data;
+
+	/* If MSI-X hasn't been enabled, do nothing */
+	if (!pi->pi_msix.enabled)
+		return (0);
+
+	/* If the entry is masked (both before and after), don't set it up */
+	if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
+	    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+		error = vm_setup_msix(vm, vcpu, sc->psc_sel.pc_bus,
+		    sc->psc_sel.pc_dev,
+		    sc->psc_sel.pc_func,
+		    (int)index, entry->msg_data,
+		    entry->vector_control,
+		    entry->addr);
+		if (error)
+			return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * BAR write hook for the BAR that contains the MSI-X table: recomputes the
+ * guest physical address of the table from the new BAR value, moves the
+ * emulated memory region there and records the new address. Always
+ * returns 0.
+ */
+static int
+msix_bar_handler(struct pci_devinst *pdi, int idx, uint64_t bar)
+{
+	uintptr_t table_gpa;
+
+	table_gpa = (bar & PCIM_BAR_MEM_BASE) + pdi->pi_msix.table_offset;
+
+	move_memory_region(pdi->pi_msix.table_bar_region, table_gpa);
+	pdi->pi_msix.table_gpa = table_gpa;
+
+	return (0);
+}
+
+/*
+ * Maps the part of the MSI-X table BAR that does not contain the MSI-X
+ * table straight through to the device; the table pages themselves are
+ * left unmapped so that guest accesses to them fault and get emulated.
+ *
+ * 'base' is the host physical base address of the BAR. Also records the
+ * (unrounded) table size in pi_msix.table_size.
+ *
+ * Returns the result of vm_map_pptdev_mmio(), or -1 if the table and the
+ * PBA are too close together in the same BAR (not supported).
+ */
+static int
+init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
+{
+	int idx;
+	size_t table_size, distance;
+	vm_paddr_t start;
+	uint64_t hpa;
+	size_t len;
+	struct pci_devinst *pi = sc->psc_pi;
+
+	/*
+	 * If the MSI-X table BAR maps memory intended for
+	 * other uses, it is at least assured that the table
+	 * either resides in its own page within the region,
+	 * or it resides in a page shared with only the PBA.
+	 *
+	 * The offsets are unsigned, so take the absolute distance; a plain
+	 * subtraction would wrap when the PBA precedes the table.
+	 */
+	if (pi->pi_msix.pba_offset >= pi->pi_msix.table_offset)
+		distance = pi->pi_msix.pba_offset - pi->pi_msix.table_offset;
+	else
+		distance = pi->pi_msix.table_offset - pi->pi_msix.pba_offset;
+
+	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar && distance < 4096) {
+		/* Need to also emulate the PBA, not supported yet */
+		printf("Unsupported MSI-X table and PBA in same page\n");
+		return (-1);
+	}
+
+	/*
+	 * May need to split the BAR into 3 regions:
+	 * Before the MSI-X table, the MSI-X table, and after it
+	 * XXX for now, assume that the table is not in the middle
+	 */
+	table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+	pi->pi_msix.table_size = table_size;
+	idx = pi->pi_msix.table_bar;
+
+	/* Round up to page size; an already aligned size must not grow. */
+	table_size = (table_size + 0xFFF) & ~(size_t)0xFFF;
+
+	if (pi->pi_msix.table_offset == 0) {
+		/* Map everything after the MSI-X table */
+		start = pi->pi_bar[idx].addr + table_size;
+		len = pi->pi_bar[idx].size - table_size;
+		hpa = base + table_size;
+	} else {
+		/*
+		 * Map everything before the MSI-X table. This range starts
+		 * at the beginning of the BAR, so it is backed by the start
+		 * of the host BAR as well.
+		 */
+		start = pi->pi_bar[idx].addr;
+		len = pi->pi_msix.table_offset;
+		hpa = base;
+	}
+
+	return vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+	    sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+	    start, len, hpa);
+}
+
+/*
+ * Sets up emulation of the MSI-X table: computes the table's guest
+ * physical address from the table BAR and table offset, registers that
+ * range as emulated memory served by msix_table_read()/msix_table_write(),
+ * and installs msix_bar_handler() on the table BAR.
+ *
+ * Returns 0 on success, or -1 if the emulated region cannot be registered.
+ */
+static int
+cfginitmsix(struct passthru_softc *sc)
+{
+	struct pci_devinst *pi;
+	struct memory_region *region;
+	int bar_idx;
+
+	pi = sc->psc_pi;
+	bar_idx = pi->pi_msix.table_bar;
+
+	pi->pi_msix.table_gpa = sc->psc_bar[bar_idx].addr +
+	    pi->pi_msix.table_offset;
+
+	region = register_emulated_memory(pi->pi_msix.table_gpa,
+	    pi->pi_msix.table_size, msix_table_read, msix_table_write, sc);
+	pi->pi_msix.table_bar_region = region;
+	if (!region)
+		return (-1);
+
+	pi->pi_bar[bar_idx].handler = msix_bar_handler;
+	return (0);
+}
+
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
@@ -262,10 +469,13 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
if (error)
return (-1);
- /*
- * Map the physical MMIO space in the guest MMIO space
- */
- if (bartype != PCIBAR_IO) {
+ /* The MSI-X table needs special handling */
+ if (i == pi->pi_msix.table_bar) {
+ error = init_msix_table(ctx, sc, base);
+ if (error)
+ return (-1);
+ } else if (bartype != PCIBAR_IO) {
+ /* Map the physical MMIO space in the guest MMIO space */
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
@@ -299,10 +509,13 @@ cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
sc->psc_sel.pc_dev = slot;
sc->psc_sel.pc_func = func;
+ if (cfginitmsi(sc) != 0)
+ goto done;
+
if (cfginitbar(ctx, sc) != 0)
goto done;
- if (cfginitmsi(sc) != 0)
+ if (cfginitmsix(sc) != 0)
goto done;
error = 0; /* success */
@@ -381,6 +594,16 @@ msicap_access(struct passthru_softc *sc, int coff)
return (0);
}
+/*
+ * Reports whether config-space offset 'coff' falls inside the device's
+ * MSI-X capability. Returns 0 when the device has no MSI-X capability
+ * (capoff == 0) or the offset lies outside it, and 1 otherwise.
+ */
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+	int first;
+
+	first = sc->psc_msix.capoff;
+	if (first == 0)
+		return (0);
+
+	if (coff < first)
+		return (0);
+
+	return (coff < first + MSIX_CAPLEN);
+}
+
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
int bytes, uint32_t *rv)
@@ -416,7 +639,7 @@ static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
int bytes, uint32_t val)
{
- int error;
+ int error, msix_table_entries, i;
struct passthru_softc *sc;
sc = pi->pi_arg;
@@ -443,6 +666,27 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
return (0);
}
+ if (msixcap_access(sc, coff)) {
+ msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+ if (pi->pi_msix.enabled) {
+ msix_table_entries = pi->pi_msix.table_count;
+ for (i = 0; i < msix_table_entries; i++) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func, i,
+ pi->pi_msix.table[i].msg_data,
+ pi->pi_msix.table[i].vector_control,
+ pi->pi_msix.table[i].addr);
+
+ if (error) {
+ printf("vm_setup_msix returned error %d\r\n", errno);
+ exit(1);
+ }
+ }
+ }
+ return (0);
+ }
+
#ifdef LEGACY_SUPPORT
/*
* If this device does not support MSI natively then we cannot let
OpenPOWER on IntegriCloud