Diffstat (limited to 'sys/amd64')
43 files changed, 10025 insertions, 0 deletions
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619c..c95fee0 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -297,6 +297,7 @@ */ #define APICBASE_RESERVED 0x000006ff #define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h new file mode 100644 index 0000000..0f4c356 --- /dev/null +++ b/sys/amd64/include/vmm.h @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vlapic; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct vm_exit *vmexit); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, + size_t length, vm_memattr_t attr, + int prot, boolean_t superpages_ok); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, + int type, int vector, + uint32_t code, int code_valid); +typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_func_t vmmmap; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_inject_event_t vminject; + vmi_inject_nmi_t vmnmi; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +struct vm *vm_create(const char *name); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid); +int vm_set_pinning(struct vm *vm, int vcpu, int cpuid); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_event(struct vm *vm, int vcpu, int type, + int vector, uint32_t error_code, int error_code_valid); +int vm_inject_nmi(struct vm *vm, int vcpu); +uint64_t *vm_guest_msrs(struct vm *vm, int cpu); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +void vm_activate_cpu(struct vm *vm, int vcpu); +cpumask_t vm_active_cpus(struct vm *vm); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +#define VCPU_STOPPED 0 +#define VCPU_RUNNING 1 +void vm_set_run_state(struct vm *vm, int vcpu, int running); +int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); + +void *vcpu_stats(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +static cpumask_t __inline +vcpu_mask(int vcpuid) +{ + return ((cpumask_t)1 << vcpuid); +} + +#endif /* KERNEL */ + +#define VM_MAXCPU 8 /* maximum virtual cpus */ + +/* + * Identifiers for events that can be injected into the VM + */ +enum vm_event_type { + VM_EVENT_NONE, + VM_HW_INTR, + VM_NMI, + VM_HW_EXCEPTION, + VM_SW_INTR, + VM_PRIV_SW_EXCEPTION, + VM_SW_EXCEPTION, + VM_EVENT_MAX +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_LAST +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_MAX, +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; /* out is 0, in is 1 */ + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ + } inout; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int error; /* vmx inst error */ + uint32_t exit_reason; + uint64_t exit_qualification; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h new file mode 100644 index 0000000..1b143b5 --- /dev/null +++ b/sys/amd64/include/vmm_dev.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +void vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t hpa; /* out */ + vm_paddr_t gpa; /* in */ + size_t len; /* in */ +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_pin { + int vm_cpuid; + int host_cpuid; /* -1 to unpin */ +}; + +struct vm_run { + int cpuid; + uint64_t rip; /* start running here */ + struct vm_exit vm_exit; +}; + +struct vm_event { + int cpuid; + enum vm_event_type type; + int vector; + uint32_t error_code; + int error_code_valid; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + int vector; + int destcpu; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +enum { + IOCNUM_RUN, + IOCNUM_SET_PINNING, + IOCNUM_GET_PINNING, + IOCNUM_MAP_MEMORY, + IOCNUM_GET_MEMORY_SEG, + IOCNUM_SET_REGISTER, + IOCNUM_GET_REGISTER, + IOCNUM_SET_SEGMENT_DESCRIPTOR, + IOCNUM_GET_SEGMENT_DESCRIPTOR, + IOCNUM_INJECT_EVENT, + IOCNUM_LAPIC_IRQ, + IOCNUM_SET_CAPABILITY, + IOCNUM_GET_CAPABILITY, + IOCNUM_BIND_PPTDEV, + IOCNUM_UNBIND_PPTDEV, + IOCNUM_MAP_PPTDEV_MMIO, + IOCNUM_PPTDEV_MSI, + IOCNUM_INJECT_NMI, + IOCNUM_VM_STATS, + IOCNUM_VM_STAT_DESC, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SET_PINNING \ + _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) +#define VM_GET_PINNING \ + _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', 
IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EVENT \ + _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#endif diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 0000000..41e937a --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_inject_event(void *vmi, int vcpu, int type, int vector, + uint32_t error_code, int error_code_valid) +{ + + printf("amdv_inject_event: not implemented\n"); + return (EINVAL); +} + +static int +amdv_nmi(void *arg, int vcpu) +{ + + printf("amdv_nmi: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_inject_event, + amdv_nmi, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not 
implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, +}; diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 0000000..c9fca9d --- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/param.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmx_cpufunc.h" +#include "vmx_msr.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. 
+ * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * XXX should we enforce this memory type by setting the ignore PAT + * bit to 1. 
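+ * Setting EPT_PG_IGNORE_PAT (bit 6) in the leaf entry would make the EPT
+ * memory type in bits 5:3 authoritative and override the guest PAT.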
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 0000000..013c330 --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 0000000..80d45cc --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmcs.h" +#include "vmx_cpufunc.h" +#include "ept.h" +#include "vmx.h" + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. 
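+ *
+ * For example, passing VMCS_IDENT(VMCS_GUEST_ACTIVITY) as 'ident'
+ * produces a negative value whose lower 31 bits are the raw encoding
+ * 0x4826, bypassing vmcs_field_encoding() below.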
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + VMPTRLD(vmcs); + error = vmread(encoding, retval); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + VMPTRLD(vmcs); + error = vmwrite(encoding, val); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat; + uint32_t exc_bitmap; + + codesel = GSEL(GCODE_SEL, SEL_KPL); + datasel = GSEL(GDATA_SEL, SEL_KPL); + tsssel = GSEL(GPROC0_SEL, SEL_KPL); + + /* + * Make sure we have a "current" VMCS to work with. 
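+ * VMPTRLD makes it the current VMCS on this cpu so the vmwrites below
+ * are applied to it; VMCLEAR at the end flushes it back to memory and
+ * makes it inactive again.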
+ */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = rdmsr(MSR_PAT); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = rdmsr(MSR_EFER); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + cr0 = rcr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = rcr4(); + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + if (error != 0) + panic("vmcs_read(%u) error %d", encoding, error); + + return (val); +} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 0000000..c633a59 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval); +int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +uint64_t vmcs_read(uint32_t encoding); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) + +#endif /* _KERNEL */ + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
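+ *
+ * In each encoding, bits 14:13 give the field width (16-bit, 64-bit,
+ * 32-bit or natural-width), bits 11:10 the field type (control,
+ * read-only data, guest-state or host-state) and bit 0 selects the
+ * high half of a 64-bit field.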
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_EPTP 0x0000201A + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 +#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define 
VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define 
EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC 44 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * VMCS interrupt information fields + */ +#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) +#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 0000000..ec181c4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,1673 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/segments.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vmx_msr.h" +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "x86.h" +#include "vmx_controls.h" + +#define CR4_VMXE (1UL << 13) + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_PAT | \ + VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define HANDLED 1 +#define UNHANDLED 0 + +MALLOC_DEFINE(M_VMX, "vmx", "vmx"); + +extern struct pcpu __pcpu[]; + +static int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static volatile u_int nextvpid; + +/* + * Virtual NMI blocking conditions. + * + * Some processor implementations also require NMI to be blocked if + * the STI_BLOCKING bit is set. It is possible to detect this at runtime + * based on the (exit_reason,exit_qual) tuple being set to + * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). + * + * We take the easy way out and also include STI_BLOCKING as one of the + * gating items for vNMI injection. 
+ */ +static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | + VMCS_INTERRUPTIBILITY_NMI_BLOCKING | + VMCS_INTERRUPTIBILITY_STI_BLOCKING; + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC: + return "apic"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), 
"setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ + int error; + unsigned int regs[4]; + uint64_t fixed0, fixed1; + uint32_t tmp; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + do_cpuid(1, regs); + if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + 
printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + return (0); +} + +/* + * If this processor does not support VPIDs then simply return 0. + * + * Otherwise generate the next value of VPID to use. Any value is alright + * as long as it is non-zero. + * + * We always execute in VMX non-root context with EPT enabled. Thus all + * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This + * in turn means that multiple VMs can share the same VPID as long as + * they have distinct EPT page tables. + * + * XXX + * We should optimize this so that it returns VPIDs that are not in + * use. Then we will not unnecessarily invalidate mappings in + * vmx_set_pcpu_defaults() just because two or more vcpus happen to + * use the same 'vpid'. + */ +static uint16_t +vmx_vpid(void) +{ + uint16_t vpid = 0; + + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { + do { + vpid = atomic_fetchadd_int(&nextvpid, 1); + } while (vpid == 0); + } + + return (vpid); +} + +static int +vmx_setup_cr0_shadow(struct vmcs *vmcs) +{ + int error; + uint64_t mask, shadow; + + mask = cr0_ones_mask | cr0_zeros_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + if (error) + return (error); + + shadow = cr0_ones_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + if (error) + return (error); + + return (0); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid; + int i, error, guest_msr_count; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * Guest KGSBASE is saved and restored in the guest MSR save area. + * Host KGSBASE is restored before returning to userland from the pcb. + * There will be a window of time when we are executing in the host + * kernel context with a value of KGSBASE from the guest. This is ok + * because the value of KGSBASE is inconsequential in kernel context. 
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_KGSBASE) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest msr access"); + + for (i = 0; i < VM_MAXCPU; i++) { + vmx->vmcs[i].identifier = vmx_revision(); + error = vmclear(&vmx->vmcs[i]); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vpid = vmx_vpid(); + + error = vmcs_set_defaults(&vmx->vmcs[i], + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].request_nmi = 0; + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid; + + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(&vmx->vmcs[i], + vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); +#if 0 + printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", + __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, + vmxctx->guest_rcx, vmxctx->guest_rdx, handled); +#endif + + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled, int astpending) +{ +#ifdef KTR + VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); + + if (astpending) + VMM_CTR0(vmx->vm, vcpu, "astpending"); +#endif +} + +static int +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + int error, lastcpu; + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; + + vmxstate = &vmx->state[vcpu]; + lastcpu = vmxstate->lastcpu; + vmxstate->lastcpu = curcpu; + + if (lastcpu == curcpu) { + error = 0; + goto done; + } + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + if (error != 0) + goto done; + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +done: + return (error); +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); +} + +static int +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + int error; + uint64_t info, interruptibility; + + /* Bail out if no NMI requested */ + if (vmx->state[vcpu].request_nmi == 0) + return (0); + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_nmi: vmread(interruptibility) %d", + error); + } + if (interruptibility & nmi_blocking_bits) + goto nmiblocked; + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info |= IDT_NMI; + + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); + + VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vmx->state[vcpu].request_nmi = 0; + return (1); + +nmiblocked: + /* + * Set the NMI Window Exiting execution control so we can inject + * the virtual NMI as soon as blocking condition goes away. 
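+ *
+ * The resulting EXIT_REASON_NMI_WINDOW exit is handled in
+ * vmx_exit_process() by clearing this control again; the still-pending
+ * request is then picked up on the next pass through
+ * vmx_inject_interrupts().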
+ */ + vmx_set_nmi_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + return (1); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu) +{ + int error, vector; + uint64_t info, rflags, interruptibility; + + const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; + +#if 1 + /* + * XXX + * If an event is being injected from userland then just return. + * For e.g. we may inject a breakpoint exception to cause the + * guest to enter the debugger so we can inspect its state. + */ + error = vmread(VMCS_ENTRY_INTR_INFO, &info); + if (error) + panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); + if (info & VMCS_INTERRUPTION_INFO_VALID) + return; +#endif + /* + * NMI injection has priority so deal with those first + */ + if (vmx_inject_nmi(vmx, vcpu)) + return; + + /* Ask the local apic for a vector to inject */ + vector = lapic_pending_intr(vmx->vm, vcpu); + if (vector < 0) + return; + + if (vector < 32 || vector > 255) + panic("vmx_inject_interrupts: invalid vector %d\n", vector); + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + error = vmread(VMCS_GUEST_RFLAGS, &rflags); + if (error) + panic("vmx_inject_interrupts: vmread(rflags) %d", error); + + if ((rflags & PSL_I) == 0) + goto cantinject; + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_interrupts: vmread(interruptibility) %d", + error); + } + if (interruptibility & HWINTR_BLOCKED) + goto cantinject; + + /* Inject the interrupt */ + info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info |= vector; + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); + + /* Update the Local APIC ISR */ + lapic_intr_accepted(vmx->vm, vcpu, vector); + + VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); +} + +static int +vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + int error; + uint64_t regval; + const struct vmxctx *vmxctx; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xff) != 0x00) + return (UNHANDLED); + + vmxctx = &vmx->ctx[vcpu]; + + /* + * We must use vmwrite() directly here because vmcs_setreg() will + * call vmclear(vmcs) as a side-effect which we certainly don't want. 
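+ *
+ * (As noted for vmx_getreg/vmx_setreg below, the vmclear would drop the
+ * launched state of the current VMCS and make the subsequent
+ * vmlaunch/vmresume fail.)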
+ */
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ regval |= cr0_ones_mask;
+ regval &= ~cr0_zeros_mask;
+ error = vmwrite(VMCS_GUEST_CR0, regval);
+ if (error)
+ panic("vmx_emulate_cr_access: error %d writing cr0", error);
+
+ return (HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ }
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ }
+ break;
+ case EXIT_REASON_HLT:
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ?
1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + break; + case EXIT_REASON_CPUID: + handled = vmx_handle_cpuid(vmxctx); + break; + default: + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel (for e.g. 'astpending' is set in the run loop). + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + int error, vie, rc, handled, astpending, loopstart; + uint32_t exit_reason; + struct vmx *vmx; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + + vmx = arg; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + loopstart = 1; + + /* + * XXX Can we avoid doing this every time we do a vm run? + */ + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) + panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); + + if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); + + if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) + panic("vmx_run: error %d setting up pcpu defaults", error); + + do { + lapic_timer_tick(vmx->vm, vcpu); + vmx_inject_interrupts(vmx, vcpu); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (loopstart) { + loopstart = 0; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* + * XXX locking? + * See comments in exception.S about checking for ASTs + * atomically while interrupts are disabled. But it is + * not clear that they apply in our case. 
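+ *
+ * (In this loop an AST simply terminates the do/while so that we
+ * return to userland and let it be serviced before the guest is
+ * entered again.)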
+ */ + astpending = curthread->td_flags & TDF_ASTPENDING; + + /* enable interrupts */ + enable_intr(); + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + handled = vmx_exit_process(vmx, vcpu, vmexit); + + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, + astpending); + } while (handled && !astpending); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); + + /* + * XXX + * We need to do this to ensure that any VMCS state cached by the + * processor is flushed to memory. We need to do this in case the + * VM moves to a different cpu the next time it runs. + * + * Can we avoid doing this? + */ + VMCLEAR(vmcs); + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.error = vie; + VMCLEAR(vmcs); + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int error; + struct vmx *vmx = arg; + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + struct vmx *vmx = arg; + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. 
+ * + * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error; + uint64_t ctls; + struct vmx *vmx = arg; + + /* + * XXX Allow caller to set contents of the guest registers saved in + * the 'vmxctx' even though the vcpu might be running. We need this + * specifically to support the rdmsr emulation that will set the + * %eax and %edx registers during vm exit processing. + */ + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. + * + * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, + int code_valid) +{ + int error; + uint32_t info; + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + static uint32_t type_map[VM_EVENT_MAX] = { + 0x1, /* VM_EVENT_NONE */ + 0x0, /* VM_HW_INTR */ + 0x2, /* VM_NMI */ + 0x3, /* VM_HW_EXCEPTION */ + 0x4, /* VM_SW_INTR */ + 0x5, /* VM_PRIV_SW_EXCEPTION */ + 0x6, /* VM_SW_EXCEPTION */ + }; + + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); + info |= VMCS_INTERRUPTION_INFO_VALID; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); + if (error != 0) + return (error); + + if (code_valid) { + error = vmcs_setreg(vmcs, + VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), + code); + } + return (error); +} + +static int +vmx_nmi(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + atomic_set_int(&vmx->state[vcpu].request_nmi, 1); + + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_inject, + vmx_nmi, + vmx_getcap, + vmx_setcap +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 0000000..69697f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; +}; + +struct vmxstate { + int request_nmi; + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + char msr_bitmap[PAGE_SIZE]; + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..31f29f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1 << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000..e9f6c6d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,199 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE(varname) \ + do { \ + __asm __volatile(" jnc 1f;" \ + " mov $1, %0;" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %0;" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %0;" \ + "3: nop" \ + :"=r" (varname)); \ + } while (0) + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline void +vmxoff(void) +{ + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t 
_res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 0000000..c4b1efc --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/assym.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmx.h" +#include "vmx_cpufunc.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); +ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); + +ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); + +ASSYM(VM_SUCCESS, VM_SUCCESS); +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); + +ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); +ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); +ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); +ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 0000000..1e9a837 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> + +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr >= 0x00000000 && msr <= 0x00001FFF) + byte = msr / 8; + else if (msr 
>= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..e6379a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_EPT_VPID_CAP 0x48C + +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E + +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D + +#define MSR_VMX_PROCBASED_CTLS2 0x48B + +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f + +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 + +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. 
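+ *
+ * As a layout reminder for msr_bitmap_change_access(): read-intercept
+ * bits for MSRs 0x00000000-0x00001FFF occupy bytes 0-1023 of the page,
+ * those for MSRs 0xC0000000-0xC0001FFF occupy bytes 1024-2047, and the
+ * corresponding write-intercept bits sit 2048 bytes further in. A clear
+ * bit grants the guest direct access; msr_bitmap_initialize() starts
+ * with every bit set, i.e. all accesses intercepted.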
+ */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 0000000..4d1bf1d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <machine/asmacros.h> + +#include "vmx_assym.s" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx' + */ +#define VMX_GUEST_RESTORE \ + /* \ + * Make sure that interrupts are disabled before restoring CR2. \ + * Otherwise there could be a page fault during the interrupt \ + * handler execution that would end up trashing CR2. \ + */ \ + cli; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
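For illustration (not part of this change), the dispatch a caller is expected to build around these routines, inferred from the VMX_RETURN_* values used below; handle_vmexit() is a placeholder name:

	int rc;

	rc = vmx_setjmp(vmxctx);
	switch (rc) {
	case VMX_RETURN_DIRECT:
		vmx_launch(vmxctx);	/* first entry; returns only via vmx_setjmp() */
		break;
	case VMX_RETURN_LONGJMP:
		handle_vmexit(vmxctx);	/* a VM-exit came back through vmx_longjmp() */
		vmx_resume(vmxctx);	/* returns only via vmx_setjmp() */
		break;
	case VMX_RETURN_VMLAUNCH:
	case VMX_RETURN_VMRESUME:
		/* vmlaunch/vmresume itself failed; the error is saved in the vmxctx. */
		break;
	}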
+ */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + callq vmx_return +END(vmx_launch) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 0000000..24495a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1 << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1 << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t 
maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +/* + * Config space register definitions from the "Intel 5520 and 5500" datasheet. + */ +static int +tylersburg_vtd_ident(void) +{ + int units, nlbus; + uint16_t did, vid; + uint32_t miscsts, vtbar; + + const int bus = 0; + const int slot = 20; + const int func = 0; + + units = 0; + + vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2); + did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2); + if (vid != 0x8086 || did != 0x342E) + goto done; + + /* + * Check if this is a dual IOH configuration. + */ + miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4); + if (miscsts & (1 << 25)) + nlbus = pci_cfgregread(bus, slot, func, 0x160, 1); + else + nlbus = -1; + + vtbar = pci_cfgregread(bus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in legacy IOH is disabled!\n"); + + if (nlbus != -1) { + vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in non-legacy IOH is disabled!\n"); + } +done: + return (units); +} + +static drhd_ident_func_t drhd_ident_funcs[] = { + tylersburg_vtd_ident, + NULL +}; + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + 
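For illustration (not part of this change): vtd_wbflush(), vtd_translation_enable() and vtd_translation_disable() all use the same global command protocol, namely write a command value to GCR and spin until the matching GSR status bit reflects it. A generic helper capturing the pattern would look roughly like this; like the originals it spins without a timeout:

	static void
	vtd_gcr_wait(struct vtdmap *vtdmap, uint32_t cmd, uint32_t status, int set)
	{

		vtdmap->gcr = cmd;
		if (set) {
			while ((vtdmap->gsr & status) == 0)
				;	/* wait for the status bit to be set */
		} else {
			while ((vtdmap->gsr & status) != 0)
				;	/* wait for the status bit to clear */
		}
	}

For example, vtd_gcr_wait(vtdmap, VTD_GCR_TE, VTD_GSR_TES, 1) is equivalent to vtd_translation_enable() above.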
+static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + + for (i = 0; drhd_ident_funcs[i] != NULL; i++) { + units = (*drhd_ident_funcs[i])(); + if (units > 0) + break; + } + + if (units <= 0) + return (ENXIO); + + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, int bus, int slot, int func) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + vtdmap = vtdmaps[0]; + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = (slot << 3 | func) * 2; + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %d/%d/%d is already owned by " + "domain %d", bus, slot, func, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, int bus, int slot, int func) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + ctxp = ctx_tables[bus]; + idx = (slot << 3 | func) * 2; + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. 
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Create a 'gpa' -> 'hpa' mapping + */ + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. 
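As a worked example of the width calculation above (illustrative, not part of the change): for a guest with maxaddr = 4GB the loop leaves gaw = 33, because gaw is incremented once more after addr reaches maxaddr, so res = (33 - 12) % 9 = 3 and agaw = 33 + 9 - 3 = 39. The scan below then settles on a 3-level, 39-bit page table, provided the hardware advertises the 39-bit width in its SAGAW field.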
+ */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_add_device, + vtd_remove_device, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 0000000..baf2447 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" +#include "iommu.h" + +static boolean_t iommu_avail; +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = TRUE; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = ptoa(Maxmem); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything else belongs to the host domain */ + iommu_add_device(host_domain, bus, slot, func); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_add_device(void *dom, int bus, int slot, int func) +{ + + 
IOMMU_ADD_DEVICE(dom, bus, slot, func); +} + +void +iommu_remove_device(void *dom, int bus, int slot, int func) +{ + + IOMMU_REMOVE_DEVICE(dom, bus, slot, func); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 0000000..e4f7229 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); +typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_add_device(void *dom, int bus, int slot, int func); +void iommu_remove_device(void *dom, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 0000000..dc2f326 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/rman.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) +#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) +#define MAX_MSIMSGS 32 + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + int msg; +}; + +static struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + int vector; + int vcpu; + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; +} pptdevs[32]; + +static int num_pptdevs; + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if (vmm_is_pptdev(bus, slot, func) && + (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + return (0); + else + return (ENXIO); +} + +static int +ppt_attach(device_t dev) +{ + int n; + + if (num_pptdevs >= MAX_PPTDEVS) { + printf("ppt_attach: maximum number of pci passthrough devices " + "exceeded\n"); + return (ENXIO); + } + + n = num_pptdevs++; + pptdevs[n].dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + /* + * XXX check whether there are any pci passthrough devices assigned + * to guests before we allow this driver to detach. 
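A minimal sketch of the check this refers to, based only on the pptdevs[] state kept in this file (illustrative, not part of the change):

	int i;

	for (i = 0; i < num_pptdevs; i++) {
		if (pptdevs[i].dev == dev && pptdevs[i].vm != NULL)
			return (EBUSY);	/* still assigned to a guest */
	}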
+ */ + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + int i, b, s, f; + + for (i = 0; i < num_pptdevs; i++) { + dev = pptdevs[i].dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (&pptdevs[i]); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), bus, slot, func); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. 
+ */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + int i, bus, slot, func; + device_t dev; + + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) { + dev = pptdevs[i].dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + ppt_unassign_device(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + seg->hpa = hpa; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + int vec; + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + vec = ppt->msi.vector + pptarg->msg; + + if (ppt->vm != NULL) + (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +/* + * XXX + * When we try to free the MSI resource the kernel will bind the thread to + * the host cpu was originally handling the MSI. The function freeing the + * MSI vector (apic_free_vector()) will panic the kernel if the thread + * is already bound to a cpu. + * + * So, we temporarily unbind the vcpu thread before freeing the MSI resource. + */ +static void +PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) +{ + int pincpu = -1; + + vm_get_pinning(vm, vcpu, &pincpu); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, -1); + + ppt_teardown_msi(ppt); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, pincpu); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if ((destcpu >= VM_MAXCPU || destcpu < 0) || + (vector < 0 || vector > 255) || + (numvec < 0 || numvec > MAX_MSIMSGS)) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. 
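For illustration (not part of this change), one possible calling sequence for ppt_setup_msi(); the bus/slot/func, vector and message count are made up, and the teardown relies on the numvec == 0 early return above:

	/* Route 2 MSI messages from device 1/0/0 to vcpu 0 at guest vector 64. */
	error = ppt_setup_msi(vm, 0 /* vcpu */, 1, 0, 0,
			      0 /* destcpu */, 64 /* vector */, 2 /* numvec */);

	/* Later: release the vectors again by asking for zero messages. */
	if (error == 0)
		error = ppt_setup_msi(vm, 0, 1, 0, 0, 0, 64, 0);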
+ */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.vector = vector; + ppt->msi.vcpu = destcpu; + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. + */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].msg = i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + return (ENXIO); + } + + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 0000000..95f3ad0 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec); + +#endif diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c new file mode 100644 index 0000000..cd6c5d1 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct 
vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h new file mode 100644 index 0000000..6feeba8 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 0000000..a21addf --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/clock.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vdev.h" +#include "vlapic.h" + +#define VLAPIC_CTR0(vlapic, format) \ + VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic.irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic.isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) +#define VLAPIC_MAXLVT_ENTRIES (5) + +struct vlapic { + struct vm *vm; + int vcpuid; + + struct io_region *mmio; + struct vdev_ops *ops; + struct LAPIC apic; + + int esr_update; + + int divisor; + int ccr_ticks; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. 
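A worked example of the priority rule implemented by vlapic_update_ppr() later in this file (illustrative, not part of the change): with TPR = 0x20 and vector 0x35 on top of this stack, PRIO(0x20) = 2 is lower than PRIO(0x35) = 3, so PPR becomes 0x35 & 0xf0 = 0x30, and vlapic_pending_intr() will then report only pending vectors of priority class 4 or higher (vector 0x40 and up).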
+ */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; +}; + +static void +vlapic_mask_lvts(uint32_t *lvts, int num_lvt) +{ + int i; + for (i = 0; i < num_lvt; i++) { + *lvts |= APIC_LVT_M; + lvts += 4; + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint64_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + return lapic->ccr_timer; +} + +static void +vlapic_update_errors(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->esr = 0; // XXX +} + +static void +vlapic_init_ipi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1); +} + +static int +vlapic_op_reset(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + + memset(lapic, 0, sizeof(*lapic)); + lapic->id = vlapic->vcpuid << 24; + lapic->apr = vlapic->vcpuid; + vlapic_init_ipi(vlapic); + + return 0; + +} + +static int +vlapic_op_init(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_register_region(vlapic->ops, vlapic, vlapic->mmio); + return vlapic_op_reset(dev); +} + +static int +vlapic_op_halt(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_unregister_region(vlapic, vlapic->mmio); + return 0; + +} + +void +vlapic_set_intr_ready(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr; + int idx; + + if (vector < 0 || vector >= 256) + panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + + idx = (vector / 32) * 4; + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); +} + +#define VLAPIC_BUS_FREQ tsc_freq +#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +static void +vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) +{ + uint32_t icr_timer; + + icr_timer = vlapic->apic.icr_timer; + + vlapic->ccr_ticks = ticks; + if (elapsed < icr_timer) + vlapic->apic.ccr_timer = icr_timer - elapsed; + else { + /* + * This can happen when the guest is trying to run its local + * apic timer higher that the setting of 'hz' in the host. + * + * We deal with this by running the guest local apic timer + * at the rate of the host's 'hz' setting. 
+ */ + vlapic->apic.ccr_timer = 0; + } +} + +static __inline uint32_t * +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = &vlapic->apic; + int i; + + if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) { + panic("vlapic_get_lvt: invalid LVT\n"); + } + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic.isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic.tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic.isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic.ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *isrptr; + int i, idx, bitpos; + + isrptr = &lapic->isr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. 
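+ *
+ * A stand-alone sketch of the scan that follows (illustration only,
+ * using a hypothetical helper and a compacted uint32_t isr[8] view
+ * instead of the 16-byte-strided words in struct LAPIC): the highest
+ * vector currently in service is found by searching from the top word
+ * down with fls(9).
+ *
+ *	static int
+ *	highest_isr_vector(const uint32_t isr[8])
+ *	{
+ *		int i, bitpos;
+ *
+ *		for (i = 7; i > 0; i--) {
+ *			bitpos = fls(isr[i]);
+ *			if (bitpos != 0)
+ *				return (i * 32 + bitpos - 1);
+ *		}
+ *		return (-1);
+ *	}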
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << (bitpos - 1)); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +{ + return (*lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + int vector; + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vlapic_set_intr_ready(vlapic, vector); + } +} + +static int +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +{ + int i; + cpumask_t dmask, thiscpumask; + uint32_t dest, vec, mode; + + thiscpumask = vcpu_mask(vlapic->vcpuid); + + dmask = 0; + dest = icrval >> 32; + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + dmask = vcpu_mask(dest); + break; + case APIC_DEST_SELF: + dmask = thiscpumask; + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask; + break; + } + + for (i = 0; i < VM_MAXCPU; i++) { + if (dmask & vcpu_mask(i)) { + if (mode == APIC_DELMODE_FIXED) + lapic_set_intr(vlapic->vm, i, vec); + else + vm_inject_nmi(vlapic->vm, i); + } + } + + return (0); /* handled completely in the kernel */ + } + + /* + * XXX this assumes that the startup IPI always succeeds + */ + if (mode == APIC_DELMODE_STARTUP) + vm_activate_cpu(vlapic->vm, dest); + + /* + * This will cause a return to userland. + */ + return (1); +} + +int +vlapic_pending_intr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + return (vector); + } else + break; + } + } + VLAPIC_CTR0(vlapic, "no pending intr"); + return (-1); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. 
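+ *
+ * The index arithmetic below reflects the layout of struct LAPIC, where
+ * consecutive IRR/ISR words are 16 bytes (four uint32_t slots) apart,
+ * hence the "* 4".  Worked example: vector 0x40 (64) selects word
+ * (64 / 32) * 4 = 8, i.e. irr2/isr2, and bit 64 % 32 = 0 within it.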
+ */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +int +vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int i; + + if (offset > sizeof(*lapic)) { + *data = 0; + return 0; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = lapic->tpr; + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + *data = *(reg); + break; + case APIC_OFFSET_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } + return 0; +} + +int +vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int retval; + + if (offset > sizeof(*lapic)) { + return 0; + } + + retval = 0; + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + break; + case APIC_OFFSET_TPR: + lapic->tpr = data & 0xff; + vlapic_update_ppr(vlapic); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + break; + case APIC_OFFSET_DFR: + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + break; + case APIC_OFFSET_ICR_LOW: + retval = lapic_process_icr(vlapic, data); + break; + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + data |= APIC_LVT_M; + } + *reg = data; + // vlapic_dump_lvt(offset, reg); + break; + case APIC_OFFSET_ICR: + lapic->icr_timer = data; + vlapic_start_timer(vlapic, 0); + break; + + case APIC_OFFSET_DCR: + lapic->dcr_timer = data; + vlapic->divisor = vlapic_timer_divisor(data); + break; + + case APIC_OFFSET_ESR: + vlapic_update_errors(vlapic); + break; + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +void +vlapic_timer_tick(struct vlapic *vlapic) +{ + int curticks, delta, periodic; + uint32_t ccr; + uint32_t decrement, remainder; + + curticks = ticks; + + /* Common case */ + delta = curticks - vlapic->ccr_ticks; + if (delta == 0) + return; + + /* Local APIC timer is disabled */ + if (vlapic->apic.icr_timer == 0) + return; + + /* One-shot mode and timer has already counted down to zero */ + periodic = vlapic_periodic_timer(vlapic); + if (!periodic && vlapic->apic.ccr_timer == 0) + return; + /* + * The 'curticks' and 'ccr_ticks' are out of sync by more than + * 2^31 ticks. We deal with this by restarting the timer. + */ + if (delta < 0) { + vlapic_start_timer(vlapic, 0); + return; + } + + ccr = vlapic->apic.ccr_timer; + decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + while (delta-- > 0) { + if (ccr <= decrement) { + remainder = decrement - ccr; + vlapic_fire_timer(vlapic); + if (periodic) { + vlapic_start_timer(vlapic, remainder); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + break; + } + } else + ccr -= decrement; + } + + vlapic->ccr_ticks = curticks; + vlapic->apic.ccr_timer = ccr; +} + +struct vdev_ops vlapic_dev_ops = { + .name = "vlapic", + .init = vlapic_op_init, + .reset = vlapic_op_reset, + .halt = vlapic_op_halt, + .memread = vlapic_op_mem_read, + .memwrite = vlapic_op_mem_write, +}; +static struct io_region vlapic_mmio[VM_MAXCPU]; + +struct vlapic * +vlapic_init(struct vm *vm, int vcpuid) +{ + struct vlapic *vlapic; + + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vm; + vlapic->vcpuid = vcpuid; + vlapic->ops = &vlapic_dev_ops; + + vlapic->mmio = vlapic_mmio + vcpuid; + vlapic->mmio->base = DEFAULT_APIC_BASE; + vlapic->mmio->len = PAGE_SIZE; + vlapic->mmio->attr = MMIO_READ|MMIO_WRITE; + vlapic->mmio->vcpu = vcpuid; + + vdev_register(&vlapic_dev_ops, vlapic); + + vlapic_op_init(vlapic); + + return (vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + vdev_unregister(vlapic); + free(vlapic, M_VLAPIC); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 0000000..861ea8c --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +#include "vdev.h" + +struct vm; + +/* + * Map of APIC Registers: Offset Description Access + */ +#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W +#define APIC_OFFSET_VER 0x30 // Local APIC Version R +#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W +#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R +#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R +#define APIC_OFFSET_EOI 0xB0 // EOI Register W +#define APIC_OFFSET_RRR 0xC0 // Remote read R +#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W +#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W +#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W +#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R +#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R +#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R +#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R +#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R +#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R +#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R +#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R +#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R +#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R +#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R +#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R +#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R +#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R +#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R +#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R +#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R +#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R +#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R +#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R +#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R +#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R +#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R +#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R +#define APIC_OFFSET_ESR 0x280 // Error Status Register R +#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W +#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. 
(32-63) R/W +#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W +#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) +#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) +#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W +#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W +#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W +#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W +#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R +#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +struct vlapic *vlapic_init(struct vm *vm, int vcpuid); +void vlapic_cleanup(struct vlapic *vlapic); + +int vlapic_op_mem_write(void* dev, uint64_t gpa, + opsize_t size, uint64_t data); + +int vlapic_op_mem_read(void* dev, uint64_t gpa, + opsize_t size, uint64_t *data); + +int vlapic_pending_intr(struct vlapic *vlapic); +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); +void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); +void vlapic_timer_tick(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 0000000..c93c31e --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,737 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +#include <machine/vm.h> +#include <machine/pcb.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_mem.h" +#include "vmm_util.h" +#include <machine/vmm_dev.h> +#include "vlapic.h" +#include "vmm_msr.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +struct vcpu { + int flags; + int pincpu; /* host cpuid this vcpu is bound to */ + int hostcpu; /* host cpuid this vcpu last ran on */ + uint64_t guest_msrs[VMM_MSR_NUM]; + struct vlapic *vlapic; + int vcpuid; + struct savefpu savefpu; /* guest fpu state */ + void *stats; +}; +#define VCPU_F_PINNED 0x0001 +#define VCPU_F_RUNNING 0x0002 + +#define VCPU_PINCPU(vm, vcpuid) \ + ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) + +#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) + +#define VCPU_PIN(vm, vcpuid, host_cpuid) \ +do { \ + vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ + vm->vcpu[vcpuid].pincpu = host_cpuid; \ +} while(0) + +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Mask of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpumask_t active_cpus; +}; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip, vmexit) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ + (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) +#define VMNMI(vmi, vcpu) \ + (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define fpu_stop_emulating() __asm("clts") + +static MALLOC_DEFINE(M_VM, "vm", "vm"); +CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static void +vcpu_cleanup(struct vcpu *vcpu) +{ + vlapic_cleanup(vcpu->vlapic); + vmm_stat_free(vcpu->stats); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu->hostcpu = -1; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = vlapic_init(vm, vcpu_id); + fpugetregs(curthread, &vcpu->savefpu); + vcpu->stats = vmm_stat_alloc(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_ipi_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_msr_init(); + + return (VMM_INIT()); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + iommu_init(); + error = vmm_init(); + break; + case MOD_UNLOAD: + vmmdev_cleanup(); + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * Execute the module load handler after the pci passthru driver has had + * a chance to claim devices. We need this information at the time we do + * iommu initialization. 
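+ *
+ * (For illustration, not normative: when the module is compiled in or
+ * preloaded by the loader, its MOD_LOAD handler runs in the
+ * SI_SUB_CONFIGURE + 1 pass below, i.e. only after the pass in which
+ * PCI drivers such as the ppt passthru driver attach.)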
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +struct vm * +vm_create(const char *name) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (NULL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + guest_msrs_init(vm, i); + } + + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); + vm_activate_cpu(vm, BSP); + + return (vm); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + + ppt_unassign_all(vm); + + for (i = 0; i < vm->num_mem_segs; i++) + vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(&vm->vcpu[i]); + + iommu_destroy_domain(vm->iommu); + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_NONE, spok)); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +{ + int error; + vm_paddr_t hpa; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + /* + * find the hpa if already it was already vm_malloc'd. 
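+ *
+ * Illustrative usage (hypothetical caller, not part of this change):
+ * wiring 128MB of guest memory at gpa 0 is idempotent, so a second call
+ * with the same gpa simply hands back the hpa of the existing segment.
+ *
+ *	vm_paddr_t hpa;
+ *	error = vm_malloc(vm, 0, 128 * 1024 * 1024, &hpa);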
+ */ + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa != ((vm_paddr_t)-1)) + goto out; + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + hpa = vmm_mem_alloc(len); + if (hpa == 0) + return (ENOMEM); + + error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); + if (error) { + vmm_mem_free(hpa, len); + return (error); + } + + iommu_create_mapping(vm->iommu, gpa, hpa, len); + + vm->mem_segs[vm->num_mem_segs].gpa = gpa; + vm->mem_segs[vm->num_mem_segs].hpa = hpa; + vm->mem_segs[vm->num_mem_segs].len = len; + vm->num_mem_segs++; +out: + *ret_hpa = hpa; + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int i; + vm_paddr_t gpabase, gpalimit, hpabase; + + for (i = 0; i < vm->num_mem_segs; i++) { + hpabase = vm->mem_segs[i].hpa; + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa + len <= gpalimit) + return ((gpa - gpabase) + hpabase); + } + return ((vm_paddr_t)-1); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMSETREG(vm->cookie, vcpu, reg, val)); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *cpuid = VCPU_PINCPU(vm, vcpuid); + + return (0); +} + +int +vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) +{ + struct thread *td; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + td = curthread; /* XXXSMP only safe when muxing vcpus */ + + /* unpin */ + if (host_cpuid < 0) { + VCPU_UNPIN(vm, vcpuid); + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + return (0); + } + + if (CPU_ABSENT(host_cpuid)) + return (EINVAL); + + /* + * XXX we should check that 'host_cpuid' has not already been pinned + * by another vm. 
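+ *
+ * A minimal sketch of such a check (hypothetical; it assumes a global
+ * cpumask_t of host cpus already pinned by any vm were maintained):
+ *
+ *	if (pinned_host_cpus & ((cpumask_t)1 << host_cpuid))
+ *		return (EBUSY);
+ *	pinned_host_cpus |= (cpumask_t)1 << host_cpuid;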
+ */ + thread_lock(td); + sched_bind(td, host_cpuid); + thread_unlock(td); + VCPU_PIN(vm, vcpuid, host_cpuid); + + return (0); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxrstor(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxsave(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + critical_enter(); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + pcb->pcb_full_iret = 1; + + vcpu->hostcpu = curcpu; + + fpuexit(curthread); + restore_guest_msrs(vm, vcpuid); + restore_guest_fpustate(vcpu); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + save_guest_fpustate(vcpu); + restore_host_msrs(vm, vcpuid); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + return (error); +} + +int +vm_inject_event(struct vm *vm, int vcpuid, int type, + int vector, uint32_t code, int code_valid) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) + return (EINVAL); + + if (vector < 0 || vector > 255) + return (EINVAL); + + return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); +} + +int +vm_inject_nmi(struct vm *vm, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = VMNMI(vm->cookie, vcpu); + vm_interrupt_hostcpu(vm, vcpu); + return (error); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +uint64_t * +vm_guest_msrs(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].guest_msrs); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, b, s, f, n; + char *val, *cp, *cp2; + + /* + * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + */ + found = 0; + cp = val = getenv("pptdevs"); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +void +vm_set_run_state(struct vm *vm, int vcpuid, int state) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (state == VCPU_RUNNING) { + if (vcpu->flags & VCPU_F_RUNNING) { + panic("vm_set_run_state: %s[%d] is already running", + vm_name(vm), vcpuid); + } + vcpu->flags |= VCPU_F_RUNNING; + } else { + if ((vcpu->flags & 
VCPU_F_RUNNING) == 0) { + panic("vm_set_run_state: %s[%d] is already stopped", + vm_name(vm), vcpuid); + } + vcpu->flags &= ~VCPU_F_RUNNING; + } +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +{ + int retval, hostcpu; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + if (vcpu->flags & VCPU_F_RUNNING) { + retval = VCPU_RUNNING; + hostcpu = vcpu->hostcpu; + } else { + retval = VCPU_STOPPED; + hostcpu = -1; + } + + if (cpuptr) + *cpuptr = hostcpu; + + return (retval); +} + +void +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid >= 0 && vcpuid < VM_MAXCPU) + vm->active_cpus |= vcpu_mask(vcpuid); +} + +cpumask_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c new file mode 100644 index 0000000..cf443fc --- /dev/null +++ b/sys/amd64/vmm/vmm_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <sys/ioccom.h> +#include <sys/mman.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "io/ppt.h" +#include <machine/vmm_dev.h> + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; +}; +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (sc->cdev == cdev) + break; + } + + return (sc); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c; + vm_paddr_t hpa, gpa; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa2hpa(sc->vm, gpa, c); + if (hpa == (vm_paddr_t)-1) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else + error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } + + mtx_unlock(&vmmdev_mtx); + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc* vmsegdesc; + struct vm_pin *vmpin; + struct vm_run *vmrun; + struct vm_event *vmevent; + struct vm_lapic_irq *vmirq; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (ENXIO); + } + + /* + * Some VMM ioctls can operate only on vcpus that are not running. 
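+ *
+ * These ioctls rely on the convention that the target vcpu is the first
+ * field of the argument structure, e.g. (sketch of the assumed layout;
+ * the real definitions live in vmm_dev.h):
+ *
+ *	struct vm_register {
+ *		int		cpuid;
+ *		int		regnum;
+ *		uint64_t	regval;
+ *	};
+ *
+ * which is what lets the code below recover the vcpu with *(int *)data.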
+ */ + switch (cmd) { + case VM_RUN: + case VM_SET_PINNING: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EVENT: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + if (vcpu_is_running(sc->vm, vcpu, NULL)) { + error = EBUSY; + goto done; + } + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); + mtx_unlock(&vmmdev_mtx); + + error = vm_run(sc->vm, vmrun); + + mtx_lock(&vmmdev_mtx); + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); + break; + case VM_STAT_DESC: { + const char *desc; + statdesc = (struct vm_stat_desc *)data; + desc = vmm_stat_desc(statdesc->index); + if (desc != NULL) { + error = 0; + strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); + } else + error = EINVAL; + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->destcpu, pptmsi->vector, + pptmsi->numvec); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EVENT: + vmevent = (struct vm_event *)data; + error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, + vmevent->vector, + vmevent->error_code, + vmevent->error_code_valid); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_SET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, + vmpin->host_cpuid); + break; + case VM_GET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, + &vmpin->host_cpuid); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->hpa = seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + 
vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + default: + error = ENOTTY; + break; + } +done: + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) { + *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(struct vmmdev_softc *sc) +{ + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + /* + * XXX must stop virtual machine instances that may be still + * running and cleanup their state. + */ + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + destroy_dev(sc->cdev); + vm_destroy(sc->vm); + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + vmmdev_destroy(sc); + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct vmmdev_softc *sc; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup(buf); + if (sc != NULL) { + mtx_unlock(&vmmdev_mtx); + return (EEXIST); + } + + vm = vm_create(buf); + if (vm == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "vmm/%s", buf); + sc->cdev->si_drv1 = sc; + SLIST_INSERT_HEAD(&head, sc, link); + + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +void +vmmdev_cleanup(void) +{ + struct vmmdev_softc *sc, *sc2; + + mtx_lock(&vmmdev_mtx); + + SLIST_FOREACH_SAFE(sc, &head, link, sc2) + vmmdev_destroy(sc); + + mtx_unlock(&vmmdev_mtx); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c new file mode 100644 index 
0000000..c8e795b --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/bus.h> + +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/segments.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" + +extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); + +/* + * The default is to use the IPI_AST to interrupt a vcpu. + */ +static int ipinum = IPI_AST; + +CTASSERT(APIC_SPURIOUS_INT == 255); + +void +vmm_ipi_init(void) +{ + int idx; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as our IPI vector. We install the 'justreturn' handler at that + * vector and use it to interrupt the vcpus. + * + * We do this because the IPI_AST is heavyweight and saves all + * registers in the trapframe. This is overkill for our use case + * which is simply to EOI the interrupt and return. + */ + idx = APIC_SPURIOUS_INT; + while (--idx >= APIC_IPI_INTS) { + ip = &idt[idx]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { + ipinum = idx; + setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + SEL_KPL, 0); + break; + } + } + + if (ipinum != IPI_AST && bootverbose) { + printf("vmm_ipi_init: installing ipi handler to interrupt " + "vcpus at vector %d\n", ipinum); + } +} + +void +vmm_ipi_cleanup(void) +{ + if (ipinum != IPI_AST) + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) + ipi_selected((cpumask_t)1 << hostcpu, ipinum); +} diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h new file mode 100644 index 0000000..7ab94bf --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +struct vm; + +void vmm_ipi_init(void); +void vmm_ipi_cleanup(void); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + +#endif diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h new file mode 100644 index 0000000..e691c61 --- /dev/null +++ b/sys/amd64/vmm/vmm_ktr.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#define KTR_VMM KTR_GEN + +#define VMM_CTR0(vm, vcpuid, format) \ +CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu) + +#define VMM_CTR1(vm, vcpuid, format, p1) \ +CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1)) + +#define VMM_CTR2(vm, vcpuid, format, p1, p2) \ +CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2)) + +#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2), (p3)) +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 0000000..8704fcf --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +int +lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_pending_intr(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + return (vlapic_pending_intr(vlapic)); +} + +void +lapic_intr_accepted(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_intr_accepted(vlapic, vector); +} + +int +lapic_set_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + vlapic_set_intr_ready(vlapic, vector); + + vm_interrupt_hostcpu(vm, cpu); + + return (0); +} + +void +lapic_timer_tick(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_timer_tick(vlapic); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 0000000..815b2f7 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); +int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +void lapic_timer_tick(struct vm *vm, int cpu); + +/* + * Returns a vector between 32 and 255 if an interrupt is pending in the + * IRR that can be delivered based on the current state of ISR and TPR. 
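+ *
+ * A typical consumer loop in a hypervisor backend might look roughly
+ * like this (illustrative pseudo-helpers; the interruptibility check
+ * and the injection mechanism are backend specific):
+ *
+ *	vector = lapic_pending_intr(vm, cpu);
+ *	if (vector >= 32 && guest_interruptible()) {
+ *		inject_interrupt(vector);
+ *		lapic_intr_accepted(vm, cpu, vector);
+ *	}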
+ * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + * + * Returns -1 if there is no eligible vector that can be delivered to the + * guest at this time. + */ +int lapic_pending_intr(struct vm *vm, int cpu); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'lapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void lapic_intr_accepted(struct vm *vm, int cpu, int vector); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c new file mode 100644 index 0000000..9ce1e80 --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/linker.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/md_var.h> +#include <machine/metadata.h> +#include <machine/pc/bios.h> +#include <machine/vmparam.h> +#include <machine/pmap.h> + +#include "vmm_util.h" +#include "vmm_mem.h" + +static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); + +#define MB (1024 * 1024) +#define GB (1024 * MB) + +#define VMM_MEM_MAXSEGS 64 + +/* protected by vmm_mem_mtx */ +static struct { + vm_paddr_t base; + vm_size_t length; +} vmm_mem_avail[VMM_MEM_MAXSEGS]; + +static int vmm_mem_nsegs; + +static vm_paddr_t maxaddr; + +static struct mtx vmm_mem_mtx; + +/* + * Steal any memory that was deliberately hidden from FreeBSD either by + * the use of MAXMEM kernel config option or the hw.physmem loader tunable. 
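+ *
+ * For example (illustrative figures), booting an 8GB machine with
+ *
+ *	hw.physmem="4G"
+ *
+ * in /boot/loader.conf leaves roughly 4GB above the limit for vmm to
+ * claim here; the exact amount depends on the BIOS memory map.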
+ */ +static int +vmm_mem_steal_memory(void) +{ + int nsegs; + caddr_t kmdp; + uint32_t smapsize; + uint64_t base, length; + struct bios_smap *smapbase, *smap, *smapend; + + /* + * Borrowed from hammer_time() and getmemsize() in machdep.c + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); + if (smapbase == NULL) + panic("No BIOS smap info from loader!"); + + smapsize = *((uint32_t *)smapbase - 1); + smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + + nsegs = 0; + for (smap = smapbase; smap < smapend; smap++) { + /* + * XXX + * Assuming non-overlapping, monotonically increasing + * memory segments. + */ + if (smap->type != SMAP_TYPE_MEMORY) + continue; + if (smap->length == 0) + break; + + base = roundup(smap->base, NBPDR); + length = rounddown(smap->length, NBPDR); + + /* Skip this segment if FreeBSD is using all of it. */ + if (base + length <= ptoa(Maxmem)) + continue; + + /* + * If FreeBSD is using part of this segment then adjust + * 'base' and 'length' accordingly. + */ + if (base < ptoa(Maxmem)) { + uint64_t used; + used = roundup(ptoa(Maxmem), NBPDR) - base; + base += used; + length -= used; + } + + if (length == 0) + continue; + + vmm_mem_avail[nsegs].base = base; + vmm_mem_avail[nsegs].length = length; + + if (base + length > maxaddr) + maxaddr = base + length; + + if (0 && bootverbose) { + printf("vmm_mem_populate: index %d, base 0x%0lx, " + "length %ld\n", + nsegs, vmm_mem_avail[nsegs].base, + vmm_mem_avail[nsegs].length); + } + + nsegs++; + if (nsegs >= VMM_MEM_MAXSEGS) { + printf("vmm_mem_populate: maximum number of vmm memory " + "segments reached!\n"); + return (ENOSPC); + } + } + + vmm_mem_nsegs = nsegs; + + return (0); +} + +static void +vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +{ + vm_paddr_t addr, remaining; + int pdpi, pdi, superpage_size; + pml4_entry_t *pml4p; + pdp_entry_t *pdp; + pd_entry_t *pd; + uint64_t page_attr_bits; + + if (end >= NBPML4) + panic("Cannot map memory beyond %ldGB", NBPML4 / GB); + + /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ + if (0 && vmm_supports_1G_pages()) + superpage_size = NBPDP; + else + superpage_size = NBPDR; + + /* + * Get the page directory pointer page that contains the direct + * map address mappings. + */ + pml4p = kernel_pmap->pm_pml4; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); + + page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; + addr = start; + while (addr < end) { + remaining = end - addr; + pdpi = addr / NBPDP; + if (superpage_size == NBPDP && + remaining >= NBPDP && + addr % NBPDP == 0) { + /* + * If there isn't a mapping for this address then + * create one but if there is one already make sure + * it matches what we expect it to be. 
+ */ + if (pdp[pdpi] == 0) { + pdp[pdpi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 1GB page at " + "pdpi %d\n", addr, pdpi); + } + } else { + pdp_entry_t pdpe = pdp[pdpi]; + if ((pdpe & ~PAGE_MASK) != addr || + (pdpe & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pdpe, addr); + } + } + addr += NBPDP; + } else { + if (remaining < NBPDR) { + panic("vmm_mem_populate: remaining (%ld) must " + "be greater than NBPDR (%d)\n", + remaining, NBPDR); + } + if (pdp[pdpi] == 0) { + /* + * XXX we lose this memory forever because + * we do not keep track of the virtual address + * that would be required to free this page. + */ + pd = malloc(PAGE_SIZE, M_VMM_MEM, + M_WAITOK | M_ZERO); + if ((uintptr_t)pd & PAGE_MASK) { + panic("vmm_mem_populate: page directory" + "page not aligned on %d " + "boundary\n", PAGE_SIZE); + } + pdp[pdpi] = vtophys(pd); + pdp[pdpi] |= PG_RW | PG_V | PG_U; + if (0 && bootverbose) { + printf("Creating page directory " + "at pdp index %d for 0x%016lx\n", + pdpi, addr); + } + } + pdi = (addr % NBPDP) / NBPDR; + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); + + /* + * Create a new mapping if one doesn't already exist + * or validate it if it does. + */ + if (pd[pdi] == 0) { + pd[pdi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 2MB page at " + "pdpi %d, pdi %d\n", + addr, pdpi, pdi); + } + } else { + pd_entry_t pde = pd[pdi]; + if ((pde & ~PAGE_MASK) != addr || + (pde & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pde, addr); + } + } + addr += NBPDR; + } + } +} + +static int +vmm_mem_populate(void) +{ + int seg, error; + vm_paddr_t start, end; + + /* populate the vmm_mem_avail[] array */ + error = vmm_mem_steal_memory(); + if (error) + return (error); + + /* + * Now map the memory that was hidden from FreeBSD in + * the direct map VA space. 
+ */ + for (seg = 0; seg < vmm_mem_nsegs; seg++) { + start = vmm_mem_avail[seg].base; + end = start + vmm_mem_avail[seg].length; + if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { + panic("start (0x%016lx) and end (0x%016lx) must be " + "aligned on a %dMB boundary\n", + start, end, NBPDR / MB); + } + vmm_mem_direct_map(start, end); + } + + return (0); +} + +int +vmm_mem_init(void) +{ + int error; + + mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); + + error = vmm_mem_populate(); + if (error) + return (error); + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + int i; + vm_paddr_t addr; + + if ((size & PDRMASK) != 0) { + panic("vmm_mem_alloc: size 0x%0lx must be " + "aligned on a 0x%0x boundary\n", size, NBPDR); + } + + addr = 0; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].length >= size) { + addr = vmm_mem_avail[i].base; + vmm_mem_avail[i].base += size; + vmm_mem_avail[i].length -= size; + /* remove a zero length segment */ + if (vmm_mem_avail[i].length == 0) { + memmove(&vmm_mem_avail[i], + &vmm_mem_avail[i + 1], + (vmm_mem_nsegs - (i + 1)) * + sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + } + break; + } + } + mtx_unlock(&vmm_mem_mtx); + + return (addr); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + int i; + + if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " + "aligned on a 0x%0x boundary\n", base, length, NBPDR); + } + + mtx_lock(&vmm_mem_mtx); + + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].base > base) + break; + } + + if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) + panic("vmm_mem_free: cannot free any more segments"); + + /* Create a new segment at index 'i' */ + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], + (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); + + vmm_mem_avail[i].base = base; + vmm_mem_avail[i].length = length; + + vmm_mem_nsegs++; + +coalesce_some_more: + for (i = 0; i < vmm_mem_nsegs - 1; i++) { + if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == + vmm_mem_avail[i + 1].base) { + vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], + (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + goto coalesce_some_more; + } + } + + mtx_unlock(&vmm_mem_mtx); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (maxaddr); +} + +void +vmm_mem_dump(void) +{ + int i; + vm_paddr_t base; + vm_size_t length; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + base = vmm_mem_avail[i].base; + length = vmm_mem_avail[i].length; + printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); + } + mtx_unlock(&vmm_mem_mtx); +} diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 0000000..ef1bf1a --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); +void vmm_mem_dump(void); + +#endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c new file mode 100644 index 0000000..152aa7b --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/specialreg.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" + +#define VMM_MSR_F_EMULATE 0x01 +#define VMM_MSR_F_READONLY 0x02 + +struct vmm_msr { + int num; + int flags; + uint64_t hostval; +}; + +static struct vmm_msr vmm_msr[] = { + { MSR_LSTAR, 0 }, + { MSR_CSTAR, 0 }, + { MSR_STAR, 0 }, + { MSR_SF_MASK, 0 }, + { MSR_APICBASE, VMM_MSR_F_EMULATE }, + { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, + { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, +}; + +#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0])) +CTASSERT(VMM_MSR_NUM >= vmm_msr_num); + +#define readonly_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0) + +#define emulated_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) + +void +vmm_msr_init(void) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + /* + * XXX this assumes that the value of the host msr does not + * change after we have cached it. + */ + vmm_msr[i].hostval = rdmsr(vmm_msr[i].num); + } +} + +void +guest_msrs_init(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + switch (vmm_msr[i].num) { + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + case MSR_SF_MASK: + case MSR_BIOS_SIGN: + case MSR_MCG_CAP: + guest_msrs[i] = 0; + break; + case MSR_APICBASE: + guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | + APICBASE_X2APIC; + if (cpu == 0) + guest_msrs[i] |= APICBASE_BSP; + break; + default: + panic("guest_msrs_init: missing initialization for msr " + "0x%0x", vmm_msr[i].num); + } + } +} + +static boolean_t +x2apic_msr(u_int num) +{ + + if (num >= 0x800 && num <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +static boolean_t +x2apic_msr_id(u_int num) +{ + return (num == 0x802); +} + +static int +msr_num_to_idx(u_int num) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) + if (vmm_msr[i].num == num) + return (i); + + return (-1); +} + +int +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +{ + int handled, idx; + uint64_t *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) + return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + if (!readonly_msr(idx)) { + guest_msrs = vm_guest_msrs(vm, cpu); + + /* Stash the value */ + guest_msrs[idx] = val; + + /* Update processor state for non-emulated MSRs */ + if (!emulated_msr(idx)) + wrmsr(vmm_msr[idx].num, val); + } + + handled = 1; +done: + return (handled); +} + +int +emulate_rdmsr(struct vm *vm, int cpu, u_int num) +{ + int error, handled, idx; + uint32_t eax, edx; + uint64_t result, *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) { + handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), + &result); + /* + * The version ID needs to be massaged + */ + if (x2apic_msr_id(num)) { + result = result >> 24; + } + goto done; + } + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + guest_msrs = vm_guest_msrs(vm, cpu); + result = guest_msrs[idx]; + + /* + * If this is not an emulated msr register make sure that the processor + * state matches our cached state. 
+ */ + if (!emulated_msr(idx) && (rdmsr(num) != result)) { + panic("emulate_rdmsr: msr 0x%0x has inconsistent cached " + "(0x%016lx) and actual (0x%016lx) values", num, + result, rdmsr(num)); + } + + handled = 1; + +done: + if (handled) { + eax = result; + edx = result >> 32; + error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); + if (error) + panic("vm_set_register(rax) error %d", error); + error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx); + if (error) + panic("vm_set_register(rdx) error %d", error); + } + return (handled); +} + +void +restore_guest_msrs(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, guest_msrs[i]); + } +} + +void +restore_host_msrs(struct vm *vm, int cpu) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h new file mode 100644 index 0000000..1e15787 --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MSR_H_ +#define _VMM_MSR_H_ + +#define VMM_MSR_NUM 16 +struct vm; + +void vmm_msr_init(void); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +void guest_msrs_init(struct vm *vm, int cpu); +void restore_host_msrs(struct vm *vm, int cpu); +void restore_guest_msrs(struct vm *vm, int cpu); + +#endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c new file mode 100644 index 0000000..e6f5c48 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_stat.h" + +static int vstnum; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +void +vmm_stat_init(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vstnum >= MAX_VMM_STAT_TYPES) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vstnum; + vsttab[vstnum++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + int i; + uint64_t *stats; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vstnum; i++) + buf[i] = stats[i]; + *num_stats = vstnum; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + u_long size; + + size = vstnum * sizeof(uint64_t); + + return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK)); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +const char * +vmm_stat_desc(int index) +{ + + if (index >= 0 && index < vstnum) + return (vsttab[index]->desc); + else + return (NULL); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 0000000..7c075a6 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_TYPES 64 /* arbitrary */ + +struct vmm_stat_type { + const char *desc; /* description of statistic */ + int index; /* position in the stats buffer */ +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, desc) \ + struct vmm_stat_type type[1] = { \ + { desc, -1 } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +const char *vmm_stat_desc(int index); + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats = vcpu_stats(vm, vcpu); + if (vst->index >= 0) + stats[vst->index] += x; +#endif +} + +#endif diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S new file mode 100644 index 0000000..2afc608 --- /dev/null +++ b/sys/amd64/vmm/vmm_support.S @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define LOCORE + +#include <machine/asmacros.h> + +#define LA_EOI 0xB0 + + .text + SUPERALIGN_TEXT +IDTVEC(justreturn) + pushq %rax + movq lapic, %rax + movl $0, LA_EOI(%rax) + popq %rax + iretq diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 0000000..f245f92 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 0000000..7f82332 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 0000000..45c4c53 --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> + +#include "x86.h" + +int +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + unsigned int func, regs[4]; + + func = *eax; + + cpuid_count(*eax, *ecx, regs); + + switch(func) { + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_0000_0004: + case CPUID_0000_000A: + break; + + case CPUID_8000_0000: + case CPUID_8000_0001: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0007: + case CPUID_8000_0008: + + break; + + case CPUID_0000_0001: + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_0000_0001_APICID_MASK); + /* + * XXX fixme for MP case, set apicid properly for cpu. + */ + regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX capability. + * Advertise x2APIC capability. 
+ */ + regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] |= CPUID2_X2APIC; + + /* + * Machine check handling is done in the host. + * Hide MTRR capability. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + break; + + case CPUID_0000_000B: + /* + * XXXSMP fixme + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = 0; + break; + + default: + return (0); + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + return (1); +} + diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 0000000..bc4f8a4 --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx); + +#endif |
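
A minimal sketch (not part of the diff above) of the interrupt delivery sequence described by the comments in vmm_lapic.h: lapic_set_intr() latches a vector in the IRR and notifies the host cpu, lapic_pending_intr() then reports a deliverable vector (or -1), and lapic_intr_accepted() moves that vector from the IRR to the ISR once it has actually been injected. The helpers vcpu_interrupts_enabled() and backend_inject_intr() are placeholders for backend-specific (VT-x/SVM) logic and do not exist in this change.

#include "vmm_lapic.h"

/* Placeholder backend hooks, assumed here for illustration only. */
int	vcpu_interrupts_enabled(struct vm *vm, int vcpu);	/* e.g. RFLAGS.IF and blocking state */
void	backend_inject_intr(struct vm *vm, int vcpu, int vector);

static void
deliver_pending_intr(struct vm *vm, int vcpu)
{
	int vector;

	/* Ask the vLAPIC for a vector that the IRR/ISR/TPR state allows right now. */
	vector = lapic_pending_intr(vm, vcpu);
	if (vector < 0)
		return;

	/* Only inject if the guest can currently accept interrupts. */
	if (!vcpu_interrupts_enabled(vm, vcpu))
		return;

	backend_inject_intr(vm, vcpu, vector);

	/* Transition the vector from IRR to ISR now that it has been injected. */
	lapic_intr_accepted(vm, vcpu, vector);
}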
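
On the MSR side, the x2APIC MSR range 0x800-0xBFF handled in vmm_msr.c is translated to the equivalent xAPIC register offset as (msr - 0x800) << 4. For example, the TPR MSR 0x808 becomes offset 0x80 and the APIC ID MSR 0x802 becomes offset 0x20. The vLAPIC returns the ID register in xAPIC layout, with the ID in bits 31:24, which is presumably why emulate_rdmsr() shifts the value read for MSR 0x802 right by 24 before handing it to the guest as a flat x2APIC ID.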
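
Finally, a minimal sketch of how a statistic declared with the VMM_STAT_DEFINE() macro from vmm_stat.h would be maintained. The counter name and the calling function below are hypothetical stand-ins for real consumers such as the VT-x or SVM exit handlers.

#include "vmm_stat.h"

/* Hypothetical counter; VMM_STAT_DEFINE() registers it via SYSINIT at load time. */
VMM_STAT_DEFINE(VMEXIT_SAMPLE, "sample vm exits");

static void
sample_exit_handler(struct vm *vm, int vcpu)
{
	/* Bump the per-vcpu counter; compiled out unless VMM_KEEP_STATS is defined. */
	vmm_stat_incr(vm, vcpu, VMEXIT_SAMPLE, 1);
}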