From 3330f5394fa1d580c13d5ae2f59b89291fd579e1 Mon Sep 17 00:00:00 2001 From: attilio Date: Thu, 13 Aug 2009 17:54:11 +0000 Subject: MFC r196196: * Completely remove the option STOP_NMI from the kernel. This option has proven to have a good effect when entering KDB by using a NMI, but it completely violates all the good rules about interrupts disabled while holding a spinlock in other occasions. This can be the cause of deadlocks on events where a normal IPI_STOP is expected. * Add an new IPI called IPI_STOP_HARD on all the supported architectures. This IPI is responsible for sending a stop message among CPUs using a privileged channel when disponible. In other cases it just does match a normal IPI_STOP. Right now the IPI_STOP_HARD functionality uses a NMI on ia32 and amd64 architectures, while on the other has a normal IPI_STOP effect. It is responsibility of maintainers to eventually implement an hard stop when necessary and possible. * Use the new IPI facility in order to implement a new userend SMP kernel function called stop_cpus_hard(). That is specular to stop_cpu() but it does use the privileged channel for the stopping facility. * Let KDB use the newly introduced function stop_cpus_hard() and leave stop_cpus() for all the other cases * Disable interrupts on CPU0 when starting the process of APs suspension. * Style cleanup and comments adding This patch should fix the reboot/shutdown deadlocks many users are constantly reporting on mailing lists. Please don't forget to update your config file with the STOP_NMI option removal Reviewed by: jhb Tested by: pho, bz, rink Approved by: re (kib) --- sys/amd64/amd64/local_apic.c | 13 +++++- sys/amd64/amd64/mp_machdep.c | 100 ++++++++++++++----------------------------- sys/amd64/amd64/trap.c | 2 - sys/amd64/conf/GENERIC | 1 - sys/amd64/conf/NOTES | 5 --- sys/amd64/conf/XENHVM | 1 - sys/amd64/include/apicvar.h | 6 +-- sys/amd64/include/smp.h | 5 +-- 8 files changed, 44 insertions(+), 89 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 14559f3..cd3073c 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -1238,8 +1238,17 @@ lapic_ipi_vectored(u_int vector, int dest) KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); - icrlo = vector | APIC_DELMODE_FIXED | APIC_DESTMODE_PHY | - APIC_LEVEL_DEASSERT | APIC_TRIGMOD_EDGE; + icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE; + + /* + * IPI_STOP_HARD is just a "fake" vector used to send a NMI. + * Use special rules regard NMI if passed, otherwise specify + * the vector. + */ + if (vector == IPI_STOP_HARD) + icrlo |= APIC_DELMODE_NMI | APIC_LEVEL_ASSERT; + else + icrlo |= vector | APIC_DELMODE_FIXED | APIC_LEVEL_DEASSERT; destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 52c209c..0ef8017 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -114,31 +114,12 @@ volatile int smp_tlb_wait; extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); -#ifdef STOP_NMI -static volatile cpumask_t ipi_nmi_pending; - -static void ipi_nmi_selected(cpumask_t cpus); -#endif - /* * Local data and functions. */ -#ifdef STOP_NMI -/* - * Provide an alternate method of stopping other CPUs. If another CPU has - * disabled interrupts the conventional STOP IPI will be blocked. This - * NMI-based stop should get through in that case. 
- */ -static int stop_cpus_with_nmi = 1; -SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW, - &stop_cpus_with_nmi, 0, ""); -TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi); -#else -#define stop_cpus_with_nmi 0 -#endif - static u_int logical_cpus; +static volatile cpumask_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; @@ -1158,12 +1139,14 @@ ipi_selected(cpumask_t cpus, u_int ipi) ipi = IPI_BITMAP_VECTOR; } -#ifdef STOP_NMI - if (ipi == IPI_STOP && stop_cpus_with_nmi) { - ipi_nmi_selected(cpus); - return; - } -#endif + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + atomic_set_int(&ipi_nmi_pending, cpus); + CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; @@ -1194,64 +1177,43 @@ void ipi_all_but_self(u_int ipi) { - if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { + if (IPI_IS_BITMAPED(ipi)) { ipi_selected(PCPU_GET(other_cpus), ipi); return; } - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); -} -#ifdef STOP_NMI -/* - * send NMI IPI to selected CPUs - */ - -#define BEFORE_SPIN 1000000 - -static void -ipi_nmi_selected(cpumask_t cpus) -{ - int cpu; - register_t icrlo; - - icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT - | APIC_TRIGMOD_EDGE; - - CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus); - - atomic_set_int(&ipi_nmi_pending, cpus); - - while ((cpu = ffs(cpus)) != 0) { - cpu--; - cpus &= ~(1 << cpu); - - KASSERT(cpu_apic_ids[cpu] != -1, - ("IPI NMI to non-existent CPU %d", cpu)); - - /* Wait for an earlier IPI to finish. */ - if (!lapic_ipi_wait(BEFORE_SPIN)) - panic("ipi_nmi_selected: previous IPI has not cleared"); + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); - lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]); - } + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int -ipi_nmi_handler(void) +ipi_nmi_handler() { - int cpumask = PCPU_GET(cpumask); + cpumask_t cpumask; - if (!(ipi_nmi_pending & cpumask)) - return 1; + /* + * As long as there is not a simple way to know about a NMI's + * source, if the bitmask for the current CPU is present in + * the global pending bitword an IPI_STOP_HARD has been issued + * and should be handled. + */ + cpumask = PCPU_GET(cpumask); + if ((ipi_nmi_pending & cpumask) == 0) + return (1); atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); - return 0; + return (0); } -#endif /* STOP_NMI */ - /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index fee3caf..323e8d1 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -239,13 +239,11 @@ trap(struct trapframe *frame) type = frame->tf_trapno; #ifdef SMP -#ifdef STOP_NMI /* Handler for NMI IPIs used for stopping CPUs. 
*/ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } -#endif /* STOP_NMI */ #endif /* SMP */ #ifdef KDB diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 73a4fb6..a49f7bc 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -69,7 +69,6 @@ options P1003_1B_SEMAPHORES # POSIX-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options STOP_NMI # Stop CPUS using NMI instead of IPI options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options MAC # TrustedBSD MAC Framework diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 088a381..27fe068 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -30,11 +30,6 @@ device mptable # Optional MPSPEC mptable support # options MP_WATCHDOG -# -# Debugging options. -# -options STOP_NMI # Stop CPUS using NMI instead of IPI - ##################################################################### diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index 5e108d5..1536e3c 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -68,7 +68,6 @@ options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options STOP_NMI # Stop CPUS using NMI instead of IPI options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing #options KDTRACE_FRAME # Ensure frames are compiled in diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 84ba3b8..73fff6c 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -102,11 +102,6 @@ * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user * at a time) The second group uses a single interrupt and a bitmap to avoid * redundant IPI interrupts. - * - * Right now IPI_STOP used by kdb shares the interrupt priority class with - * the two IPI groups mentioned above. As such IPI_STOP may cause a deadlock. - * Eventually IPI_STOP should use NMI IPIs - this would eliminate this and - * other deadlocks caused by IPI_STOP. */ /* Interrupts for local APIC LVT entries other than the timer. */ @@ -134,6 +129,7 @@ #define IPI_STOP (APIC_IPI_INTS + 7) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 8) /* Suspend CPU until restarted. */ +#define IPI_STOP_HARD (APIC_IPI_INTS + 9) /* Stop CPU with a NMI. 
*/ /* * The spurious interrupt can share the priority class with the IPIs since diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index d295715..1cc21a4 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -52,6 +52,7 @@ void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); void init_secondary(void); +int ipi_nmi_handler(void); void ipi_selected(cpumask_t cpus, u_int ipi); void ipi_all_but_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); @@ -66,10 +67,6 @@ void smp_masked_invlpg_range(cpumask_t mask, vm_offset_t startva, void smp_invltlb(void); void smp_masked_invltlb(cpumask_t mask); -#ifdef STOP_NMI -int ipi_nmi_handler(void); -#endif - #endif /* !LOCORE */ #endif /* SMP */ -- cgit v1.1 From 7042429fac0d3bfb1df101cef080ac39f6400e91 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 14 Aug 2009 20:57:21 +0000 Subject: Adjust the handling of the local APIC PMC interrupt vector: - Provide lapic_disable_pmc(), lapic_enable_pmc(), and lapic_reenable_pmc() routines in the local APIC code that the hwpmc(4) driver can use to manage the local APIC PMC interrupt vector. - Do not enable the local APIC PMC interrupt vector by default when HWPMC_HOOKS is enabled. Instead, the hwpmc(4) driver explicitly enables the interrupt when it is succesfully initialized and disables the interrupt when it is unloaded. This avoids enabling the interrupt on unsupported CPUs which may result in spurious NMIs. Reported by: rnoland Reviewed by: jkoshy Approved by: re (kib) MFC after: 2 weeks --- sys/amd64/amd64/local_apic.c | 86 ++++++++++++++++++++++++++++++++++++++++++-- sys/amd64/include/apicvar.h | 3 ++ sys/amd64/include/pmc_mdep.h | 1 - 3 files changed, 86 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index cd3073c..13bd774 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -123,7 +123,7 @@ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ - { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ + { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ }; @@ -305,11 +305,9 @@ lapic_setup(int boot) lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); -#ifdef HWPMC_HOOKS /* Program the PMC LVT entry if present. */ if (maxlvt >= LVT_PMC) lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); -#endif /* Program timer LVT and setup handler. */ lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer); @@ -332,6 +330,88 @@ lapic_setup(int boot) intr_restore(eflags); } +void +lapic_reenable_pmc(void) +{ +#ifdef HWPMC_HOOKS + uint32_t value; + + value = lapic->lvt_pcint; + value &= ~APIC_LVT_M; + lapic->lvt_pcint = value; +#endif +} + +#ifdef HWPMC_HOOKS +static void +lapic_update_pmc(void *dummy) +{ + struct lapic *la; + + la = &lapics[lapic_id()]; + lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); +} +#endif + +int +lapic_enable_pmc(void) +{ +#ifdef HWPMC_HOOKS + u_int32_t maxlvt; + + /* Fail if the local APIC is not present. */ + if (lapic == NULL) + return (0); + + /* Fail if the PMC LVT is not present. 
*/ + maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + if (maxlvt < LVT_PMC) + return (0); + + lvts[LVT_PMC].lvt_masked = 0; + +#ifdef SMP + /* + * If hwpmc was loaded at boot time then the APs may not be + * started yet. In that case, don't forward the request to + * them as they will program the lvt when they start. + */ + if (smp_started) + smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); + else +#endif + lapic_update_pmc(NULL); + return (1); +#else + return (0); +#endif +} + +void +lapic_disable_pmc(void) +{ +#ifdef HWPMC_HOOKS + u_int32_t maxlvt; + + /* Fail if the local APIC is not present. */ + if (lapic == NULL) + return; + + /* Fail if the PMC LVT is not present. */ + maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + if (maxlvt < LVT_PMC) + return; + + lvts[LVT_PMC].lvt_masked = 1; + +#ifdef SMP + /* The APs should always be started when hwpmc is unloaded. */ + KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); +#endif + smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); +#endif +} + /* * Called by cpu_initclocks() on the BSP to setup the local APIC timer so * that it can drive hardclock, statclock, and profclock. This function diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 73fff6c..9d6d538 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -201,7 +201,9 @@ int ioapic_set_triggermode(void *cookie, u_int pin, int ioapic_set_smi(void *cookie, u_int pin); void lapic_create(u_int apic_id, int boot_cpu); void lapic_disable(void); +void lapic_disable_pmc(void); void lapic_dump(const char *str); +int lapic_enable_pmc(void); void lapic_eoi(void); u_int lapic_error(void); int lapic_id(void); @@ -212,6 +214,7 @@ void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); +void lapic_reenable_pmc(void); void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id); int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked); int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode); diff --git a/sys/amd64/include/pmc_mdep.h b/sys/amd64/include/pmc_mdep.h index f8c26f2..f233a51 100644 --- a/sys/amd64/include/pmc_mdep.h +++ b/sys/amd64/include/pmc_mdep.h @@ -115,7 +115,6 @@ union pmc_md_pmc { */ void start_exceptions(void), end_exceptions(void); -void pmc_x86_lapic_enable_pmc_interrupt(void); struct pmc_mdep *pmc_amd_initialize(void); void pmc_amd_finalize(struct pmc_mdep *_md); -- cgit v1.1 From b8de80198d700a59246c08580b7121e1bff8f268 Mon Sep 17 00:00:00 2001 From: kib Date: Mon, 17 Aug 2009 13:32:56 +0000 Subject: MFC r196318: Correct accounting error when allocating a a page table page to implement a user-space demotion. 
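
For orientation, an illustrative sketch (not part of the change; the helper name is hypothetical) of where pm_stats.resident_count surfaces: it is the figure behind a process's resident set size, which is why a page table page allocated while demoting a user-space 2MB mapping has to be counted as well.

#include <sys/param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

/*
 * Sketch only: report a process's resident pages straight from the pmap
 * statistics that pmap_demote_pde() now keeps consistent for user pmaps.
 */
static long
vmspace_rss_pages(struct vmspace *vm)
{

	return (pmap_resident_count(vmspace_pmap(vm)));
}
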
Approved by: re (rwatson) --- sys/amd64/amd64/pmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 622ed62..b9eee49 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -2261,6 +2261,8 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) " in pmap %p", va, pmap); return (FALSE); } + if (va < VM_MAXUSER_ADDRESS) + pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); -- cgit v1.1 From 594f0177cea5291ff5a34f8ae3e061bab9704c13 Mon Sep 17 00:00:00 2001 From: ed Date: Wed, 19 Aug 2009 20:44:22 +0000 Subject: MFC r196390: Make the MacBookPro3,1 hardware boot again. Tested by: Patrick Lamaiziere Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 8aee975..2c54be2 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -217,6 +217,7 @@ cpu_startup(dummy) strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || + strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " -- cgit v1.1 From 47bc5699d9da2d3c1c892b9f881316374c875607 Mon Sep 17 00:00:00 2001 From: jkim Date: Thu, 20 Aug 2009 23:04:21 +0000 Subject: MFC: r196412 Check whether the SMBIOS reports reasonable amount of memory. If it is less than "avail memory", fall back to Maxmem to avoid user confusion. We use SMBIOS information to display "real memory" since r190599 but some broken SMBIOS implementation reported only half of actual memory. Tested by: bz Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 2c54be2..0bfd7ad 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -236,19 +236,21 @@ cpu_startup(dummy) #ifdef PERFMON perfmon_init(); #endif + realmem = Maxmem; + + /* + * Display physical memory if SMBIOS reports reasonable amount. + */ + memsize = 0; sysenv = getenv("smbios.memory.enabled"); if (sysenv != NULL) { - memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10); + memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); - } else - memsize = 0; - if (memsize > 0) - printf("real memory = %ju (%ju MB)\n", memsize << 10, - memsize >> 10); - else - printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), - ptoa((uintmax_t)Maxmem) / 1048576); - realmem = Maxmem; + } + if (memsize < ptoa((uintmax_t)cnt.v_free_count)) + memsize = ptoa((uintmax_t)Maxmem); + printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); + /* * Display any holes after the first chunk of extended memory. */ -- cgit v1.1 From bef0247c6bb7c48a6b86bdb9a7b23a355d81fd06 Mon Sep 17 00:00:00 2001 From: bz Date: Thu, 27 Aug 2009 17:34:13 +0000 Subject: MFC r196512: Fix handling of .note.ABI-tag section for GNU systems [1]. Handle GNU/Linux according to LSB Core Specification 4.0, Chapter 11. Object Format, 11.8. ABI note tag. Also check the first word of desc, not only name, according to glibc abi-tags specification to distinguish between Linux and kFreeBSD. Add explicit handling for Debian GNU/kFreeBSD, which runs on our kernels as well [2]. 
In {amd64,i386}/trap.c, when checking osrel of the current process, also check the ABI to not change the signal behaviour for Linux binary processes, now that we save an osrel version for all three from the lists above in struct proc [2]. These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD and Linux binaries on the same machine again for at least i386 and amd64, and no longer break kFreeBSD which was detected as GNU(/Linux). PR: kern/135468 Submitted by: dchagin [1] (initial patch) Suggested by: kib [2] Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD Reviewed by: kib Approved by: re (kensmith) --- sys/amd64/amd64/elf_machdep.c | 17 +++++++++++++++++ sys/amd64/amd64/trap.c | 4 +++- sys/amd64/linux32/linux32_sysvec.c | 35 ++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index c5e19cf..ea48b25 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -108,6 +109,22 @@ SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo); +static Elf64_Brandinfo kfreebsd_brand_info = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", + .sysvec = &elf64_freebsd_sysvec, + .interp_newpath = NULL, + .brand_note = &elf64_kfreebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE +}; + +SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t) elf64_insert_brand_entry, + &kfreebsd_brand_info); + void elf64_dump_thread(struct thread *td __unused, void *dst __unused, diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 323e8d1..65f761e 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -409,7 +409,9 @@ trap(struct trapframe *frame) * This check also covers the images * without the ABI-tag ELF note. */ - if (p->p_osrel >= 700004) { + if (SV_CURPROC_ABI() == + SV_ABI_FREEBSD && + p->p_osrel >= 700004) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 77186a1..54a04ee 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -127,6 +127,7 @@ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings); static void linux32_fixlimit(struct rlimit *rl, int which); +static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static eventhandler_tag linux_exit_tag; static eventhandler_tag linux_schedtail_tag; @@ -1066,14 +1067,38 @@ struct sysentvec elf_linux_sysvec = { .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 }; -static char GNULINUX_ABI_VENDOR[] = "GNU"; +static char GNU_ABI_VENDOR[] = "GNU"; +static int GNULINUX_ABI_DESC = 0; + +static boolean_t +linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) +{ + const Elf32_Word *desc; + uintptr_t p; + + p = (uintptr_t)(note + 1); + p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); + + desc = (const Elf32_Word *)p; + if (desc[0] != GNULINUX_ABI_DESC) + return (FALSE); + + /* + * For linux we encode osrel as follows (see linux_mib.c): + * VVVMMMIII (version, major, minor), see linux_mib.c. 
+ */ + *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; + + return (TRUE); +} static Elf_Brandnote linux32_brandnote = { - .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), - .hdr.n_descsz = 16, + .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), + .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, - .vendor = GNULINUX_ABI_VENDOR, - .flags = 0 + .vendor = GNU_ABI_VENDOR, + .flags = BN_TRANSLATE_OSREL, + .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { -- cgit v1.1 From b29642064f11390c250c27e7c2abfa6de93a6b9b Mon Sep 17 00:00:00 2001 From: rnoland Date: Tue, 1 Sep 2009 16:41:28 +0000 Subject: MFC 196643 Swap the start/end virtual addresses in pmap_invalidate_cache_range(). This fixes the functionality on non SelfSnoop hardware. Found by: rnoland Submitted by: alc Reviewed by: kib Approved by: re (rwatson) --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index b9eee49..f0da536 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -943,8 +943,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) * coherence domain. */ mfence(); - for (; eva < sva; eva += cpu_clflush_line_size) - clflush(eva); + for (; sva < eva; sva += cpu_clflush_line_size) + clflush(sva); mfence(); } else { -- cgit v1.1 From 08cdcfb10a2c831ad0a66910b1abf2228435bfa4 Mon Sep 17 00:00:00 2001 From: bz Date: Wed, 2 Sep 2009 10:39:46 +0000 Subject: MFC r196653: Make sure FreeBSD binaries without .note.ABI-tag section work correctly and do not match a colliding Debian GNU/kFreeBSD brandinfo statements. For this mark the Debian GNU/kFreeBSD brandinfo that it must have an .note.ABI-tag section and ignore the old EI_OSABI brandinfo when comparing a possibly colliding set of options. Due to SYSINIT we add the brandinfo in a non-deterministic order, so native FreeBSD is not always first. We may want to consider to force native FreeBSD to come first as well. The only way a problem could currently be noticed is when running an i386 binary without the .note.ABI-tag on amd64 and the Debian GNU/kFreeBSD brandinfo was matched first, as the fallback to ld-elf32.so.1 does not exist in that case. Reported and tested by: ticso In collaboration with: kib MFC after: 3 days Approved by: re (rwatson) --- sys/amd64/amd64/elf_machdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index ea48b25..d5e7a6e 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -118,7 +118,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .sysvec = &elf64_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, -- cgit v1.1 From 9421144e6c2f592d781e0ad5d7866acfbb2be796 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 3 Sep 2009 13:54:58 +0000 Subject: MFC 196705 and 196707: - Improve pmap_change_attr() on i386 so that it is able to demote a large (2/4MB) page into 4KB pages as needed. This should be fairly rare in practice. - Simplify pmap_change_attr() a bit: - Always calculate the cache bits instead of doing it on-demand. - Always set changed to TRUE rather than only doing it if it is false. 
Approved by: re (kib) --- sys/amd64/amd64/pmap.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index f0da536..4e35ef4 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4476,7 +4476,8 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) if (base < DMAP_MIN_ADDRESS) return (EINVAL); - cache_bits_pde = cache_bits_pte = -1; + cache_bits_pde = pmap_cache_bits(mode, 1); + cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* @@ -4493,8 +4494,6 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; @@ -4522,8 +4521,6 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; @@ -4557,12 +4554,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { @@ -4588,12 +4582,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { @@ -4616,13 +4607,10 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { - if (cache_bits_pte < 0) - cache_bits_pte = pmap_cache_bits(mode, 0); pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { -- cgit v1.1 From 61b83a071cf0407f3e644c4e9e3610216bb637bd Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 8 Sep 2009 21:50:34 +0000 Subject: MFC 196745: Don't attempt to bind the current thread to the CPU an IRQ is bound to when removing an interrupt handler from an IRQ during shutdown. During shutdown we are already bound to CPU 0 and this was triggering a panic. Approved by: re (kib) --- sys/amd64/amd64/local_apic.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 13bd774..87bec91 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -990,18 +990,21 @@ apic_free_vector(u_int apic_id, u_int vector, u_int irq) * we don't lose an interrupt delivery race. 
*/ td = curthread; - thread_lock(td); - if (sched_is_bound(td)) - panic("apic_free_vector: Thread already bound.\n"); - sched_bind(td, apic_cpuid(apic_id)); - thread_unlock(td); + if (!rebooting) { + thread_lock(td); + if (sched_is_bound(td)) + panic("apic_free_vector: Thread already bound.\n"); + sched_bind(td, apic_cpuid(apic_id)); + thread_unlock(td); + } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; mtx_unlock_spin(&icu_lock); - thread_lock(td); - sched_unbind(td); - thread_unlock(td); - + if (!rebooting) { + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ -- cgit v1.1 From 90a09c13dfdde3ffcf990b535aa7fe8eeb253dbf Mon Sep 17 00:00:00 2001 From: kensmith Date: Thu, 10 Sep 2009 14:04:00 +0000 Subject: Remove extra debugging support that is turned on for head but turned off for stable branches: - shift to MALLOC_PRODUCTION - turn off automatic crash dumps - Remove kernel debuggers, INVARIANTS*[1], WITNESS* from GENERIC kernel config files[2] [1] INVARIANTS* left on for ia64 by request marcel [2] sun4v was left as-is Reviewed by: marcel, kib Approved by: re (implicit) --- sys/amd64/conf/GENERIC | 9 --------- 1 file changed, 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index a49f7bc..ddd3035 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -76,15 +76,6 @@ options FLOWTABLE # per-cpu routing cache #options KDTRACE_FRAME # Ensure frames are compiled in #options KDTRACE_HOOKS # Kernel DTrace hooks -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel -- cgit v1.1 From 5ee8918a73551dc2a9235ad05250c3225f12ddd1 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 25 Sep 2009 15:08:26 +0000 Subject: MFC 197410: - Split the logic to parse an SMAP entry out into a separate function on amd64 similar to i386. This fixes a bug on amd64 where overlapping entries would not cause the SMAP parsing to stop. - Change the SMAP parsing code to do a sorted insertion into physmap[] instead of an append to support systems with out-of-order SMAP entries. Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 106 +++++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 33 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 0bfd7ad..95db5d2 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1192,6 +1192,77 @@ isa_irq_pending(void) u_int basemem; +static int +add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) +{ + int i, insert_idx, physmap_idx; + + physmap_idx = *physmap_idxp; + + if (boothowto & RB_VERBOSE) + printf("SMAP type=%02x base=%016lx len=%016lx\n", + smap->type, smap->base, smap->length); + + if (smap->type != SMAP_TYPE_MEMORY) + return (1); + + if (smap->length == 0) + return (0); + + /* + * Find insertion point while checking for overlap. Start off by + * assuming the new entry will be added to the end. 
+ */ + insert_idx = physmap_idx + 2; + for (i = 0; i <= physmap_idx; i += 2) { + if (smap->base < physmap[i + 1]) { + if (smap->base + smap->length <= physmap[i]) { + insert_idx = i; + break; + } + if (boothowto & RB_VERBOSE) + printf( + "Overlapping memory regions, ignoring second region\n"); + return (1); + } + } + + /* See if we can prepend to the next entry. */ + if (insert_idx <= physmap_idx && + smap->base + smap->length == physmap[insert_idx]) { + physmap[insert_idx] = smap->base; + return (1); + } + + /* See if we can append to the previous entry. */ + if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { + physmap[insert_idx - 1] += smap->length; + return (1); + } + + physmap_idx += 2; + *physmap_idxp = physmap_idx; + if (physmap_idx == PHYSMAP_SIZE) { + printf( + "Too many segments in the physical address map, giving up\n"); + return (0); + } + + /* + * Move the last 'N' entries down to make room for the new + * entry if needed. + */ + for (i = physmap_idx; i > insert_idx; i -= 2) { + physmap[i] = physmap[i - 2]; + physmap[i + 1] = physmap[i - 1]; + } + + /* Insert the new entry. */ + physmap[insert_idx] = smap->base; + physmap[insert_idx + 1] = smap->base + smap->length; + return (1); +} + /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and @@ -1235,40 +1306,9 @@ getmemsize(caddr_t kmdp, u_int64_t first) smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); - for (smap = smapbase; smap < smapend; smap++) { - if (boothowto & RB_VERBOSE) - printf("SMAP type=%02x base=%016lx len=%016lx\n", - smap->type, smap->base, smap->length); - - if (smap->type != SMAP_TYPE_MEMORY) - continue; - - if (smap->length == 0) - continue; - - for (i = 0; i <= physmap_idx; i += 2) { - if (smap->base < physmap[i + 1]) { - if (boothowto & RB_VERBOSE) - printf( - "Overlapping or non-monotonic memory region, ignoring second region\n"); - continue; - } - } - - if (smap->base == physmap[physmap_idx + 1]) { - physmap[physmap_idx + 1] += smap->length; - continue; - } - - physmap_idx += 2; - if (physmap_idx == PHYSMAP_SIZE) { - printf( - "Too many segments in the physical address map, giving up\n"); + for (smap = smapbase; smap < smapend; smap++) + if (!add_smap_entry(smap, physmap, &physmap_idx)) break; - } - physmap[physmap_idx] = smap->base; - physmap[physmap_idx + 1] = smap->base + smap->length; - } /* * Find the 'base memory' segment for SMP -- cgit v1.1 From 05c6929c66e6fa7d7d8229d117ccc2cda3df4027 Mon Sep 17 00:00:00 2001 From: rpaulo Date: Thu, 1 Oct 2009 10:06:09 +0000 Subject: MFC r197653: Improve 802.11s comment. 
Approved by: re (kib) --- sys/amd64/conf/GENERIC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index ddd3035..24300bd 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -248,7 +248,7 @@ device xe # Xircom pccard Ethernet device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's -options IEEE80211_SUPPORT_MESH # enable 802.11s D3.0 support +options IEEE80211_SUPPORT_MESH # enable 802.11s draft support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support -- cgit v1.1 From 96a6dd3944a98cac6db65b9808bc9ed92e7d83ce Mon Sep 17 00:00:00 2001 From: alc Date: Fri, 2 Oct 2009 05:11:46 +0000 Subject: MFC r197580 Temporarily disable the use of 1GB page mappings by the direct map. Approved by: re (kib) --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 4e35ef4..97de6b6 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -440,7 +440,7 @@ create_pagetables(vm_paddr_t *firstaddr) if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = allocpages(firstaddr, NDMPML4E); - if ((amd_feature & AMDID_PAGE1GB) == 0) + if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) DMPDphys = allocpages(firstaddr, ndmpdp); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; @@ -474,7 +474,7 @@ create_pagetables(vm_paddr_t *firstaddr) /* Now set up the direct map space using either 2MB or 1GB pages */ /* Preset PG_M and PG_A because demotion expects it */ - if ((amd_feature & AMDID_PAGE1GB) == 0) { + if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) { for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | -- cgit v1.1 From 67dc2f16e41d0734777d6902781f14eb3a4b2042 Mon Sep 17 00:00:00 2001 From: kib Date: Sun, 4 Oct 2009 12:20:59 +0000 Subject: MFC r197663: As a workaround, for Intel CPUs, do not use CLFLUSH in pmap_invalidate_cache_range() when self-snoop is apparently not reported in cpu features. Approved by: re (bz, kensmith) --- sys/amd64/amd64/initcpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index c293c1a..0037d66 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -165,4 +165,10 @@ initializecpu(void) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; + /* + * XXXKIB: (temporary) hack to work around traps generated when + * CLFLUSHing APIC registers window. + */ + if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS)) + cpu_feature &= ~CPUID_CLFSH; } -- cgit v1.1 From 9bce578b0a2ab3f2a08aa77bdb16d4b6ed69bf46 Mon Sep 17 00:00:00 2001 From: attilio Date: Mon, 12 Oct 2009 16:05:31 +0000 Subject: MFC r197803, r197824, r197910: Per their definition, atomic instructions used in conjuction with memory barriers should also ensure that the compiler doesn't reorder paths where they are used. GCC, however, does that aggressively, even in presence of volatile operands. The most reliable way GCC offers for avoid instructions reordering is clobbering "memory". Not all our memory barriers, right now, clobber memory for GCC-like compilers. Fix these cases. 
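
To make the hazard concrete, a minimal sketch (illustrative only, not part of the patch) of the acquire/release pairing these primitives are meant to provide; without the "memory" clobber the compiler could legally move the plain accesses to payload across the flag operations.

#include <sys/types.h>
#include <machine/atomic.h>

static int payload;
static volatile u_int ready;

static void
publish(int value)
{

	payload = value;			/* plain store */
	atomic_store_rel_int(&ready, 1);	/* release: payload must be visible first */
}

static int
consume(void)
{

	while (atomic_load_acq_int(&ready) == 0)
		;				/* spin until published */
	return (payload);			/* acquire: this load cannot move up */
}

With the patch, the acquire/release variants are distinct from the unordered ones precisely because the compiler barrier (the "memory" clobber) becomes part of their contract.
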
Approved by: re (kib) --- sys/amd64/include/atomic.h | 104 ++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h index d2a3846..a2bd930 100644 --- a/sys/amd64/include/atomic.h +++ b/sys/amd64/include/atomic.h @@ -32,9 +32,9 @@ #error this file needs sys/cdefs.h as a prerequisite #endif -#define mb() __asm__ __volatile__ ("mfence;": : :"memory") -#define wmb() __asm__ __volatile__ ("sfence;": : :"memory") -#define rmb() __asm__ __volatile__ ("lfence;": : :"memory") +#define mb() __asm __volatile("mfence;" : : : "memory") +#define wmb() __asm __volatile("sfence;" : : : "memory") +#define rmb() __asm __volatile("lfence;" : : : "memory") /* * Various simple operations on memory, each of which is atomic in the @@ -73,7 +73,8 @@ */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ -void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v) +void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ +void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_int(volatile u_int *dst, u_int exp, u_int src); int atomic_cmpset_long(volatile u_long *dst, u_long exp, u_long src); @@ -97,8 +98,9 @@ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) #endif /* - * The assembly is volatilized to demark potential before-and-after side - * effects if an interrupt or SMP collision were to occur. + * The assembly is volatilized to avoid code chunk removal by the compiler. + * GCC aggressively reorders operations and memory clobbering is necessary + * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ @@ -108,6 +110,15 @@ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ : "=m" (*p) \ : CONS (V), "m" (*p)); \ } \ + \ +static __inline void \ +atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ +{ \ + __asm __volatile(MPLOCKED OP \ + : "=m" (*p) \ + : CONS (V), "m" (*p) \ + : "memory"); \ +} \ struct __hack /* @@ -205,18 +216,23 @@ atomic_fetchadd_long(volatile u_long *p, u_long v) * PentiumPro or higher, reads may pass writes, so for that case we have * to use a serializing instruction (i.e. with LOCK) to do the load in * SMP kernels. For UP kernels, however, the cache of the single processor - * is always consistent, so we don't need any memory barriers. + * is always consistent, so we only need to take care of compiler. */ #define ATOMIC_STORE_LOAD(TYPE, LOP, SOP) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ - return (*p); \ + u_##TYPE tmp; \ + \ + tmp = *p; \ + __asm __volatile ("" : : : "memory"); \ + return (tmp); \ } \ \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ + __asm __volatile ("" : : : "memory"); \ *p = v; \ } \ struct __hack @@ -247,7 +263,8 @@ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ __asm __volatile(SOP \ : "=m" (*p), /* 0 */ \ "+r" (v) /* 1 */ \ - : "m" (*p)); /* 2 */ \ + : "m" (*p) /* 2 */ \ + : "memory"); \ } \ struct __hack @@ -327,44 +344,43 @@ u_long atomic_readandclear_long(volatile u_long *addr); #endif /* __GNUCLIKE_ASM */ -/* Acquire and release variants are identical to the normal ones. 
*/ -#define atomic_set_acq_char atomic_set_char -#define atomic_set_rel_char atomic_set_char -#define atomic_clear_acq_char atomic_clear_char -#define atomic_clear_rel_char atomic_clear_char -#define atomic_add_acq_char atomic_add_char -#define atomic_add_rel_char atomic_add_char -#define atomic_subtract_acq_char atomic_subtract_char -#define atomic_subtract_rel_char atomic_subtract_char - -#define atomic_set_acq_short atomic_set_short -#define atomic_set_rel_short atomic_set_short -#define atomic_clear_acq_short atomic_clear_short -#define atomic_clear_rel_short atomic_clear_short -#define atomic_add_acq_short atomic_add_short -#define atomic_add_rel_short atomic_add_short -#define atomic_subtract_acq_short atomic_subtract_short -#define atomic_subtract_rel_short atomic_subtract_short - -#define atomic_set_acq_int atomic_set_int -#define atomic_set_rel_int atomic_set_int -#define atomic_clear_acq_int atomic_clear_int -#define atomic_clear_rel_int atomic_clear_int -#define atomic_add_acq_int atomic_add_int -#define atomic_add_rel_int atomic_add_int -#define atomic_subtract_acq_int atomic_subtract_int -#define atomic_subtract_rel_int atomic_subtract_int +#define atomic_set_acq_char atomic_set_barr_char +#define atomic_set_rel_char atomic_set_barr_char +#define atomic_clear_acq_char atomic_clear_barr_char +#define atomic_clear_rel_char atomic_clear_barr_char +#define atomic_add_acq_char atomic_add_barr_char +#define atomic_add_rel_char atomic_add_barr_char +#define atomic_subtract_acq_char atomic_subtract_barr_char +#define atomic_subtract_rel_char atomic_subtract_barr_char + +#define atomic_set_acq_short atomic_set_barr_short +#define atomic_set_rel_short atomic_set_barr_short +#define atomic_clear_acq_short atomic_clear_barr_short +#define atomic_clear_rel_short atomic_clear_barr_short +#define atomic_add_acq_short atomic_add_barr_short +#define atomic_add_rel_short atomic_add_barr_short +#define atomic_subtract_acq_short atomic_subtract_barr_short +#define atomic_subtract_rel_short atomic_subtract_barr_short + +#define atomic_set_acq_int atomic_set_barr_int +#define atomic_set_rel_int atomic_set_barr_int +#define atomic_clear_acq_int atomic_clear_barr_int +#define atomic_clear_rel_int atomic_clear_barr_int +#define atomic_add_acq_int atomic_add_barr_int +#define atomic_add_rel_int atomic_add_barr_int +#define atomic_subtract_acq_int atomic_subtract_barr_int +#define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int -#define atomic_set_acq_long atomic_set_long -#define atomic_set_rel_long atomic_set_long -#define atomic_clear_acq_long atomic_clear_long -#define atomic_clear_rel_long atomic_clear_long -#define atomic_add_acq_long atomic_add_long -#define atomic_add_rel_long atomic_add_long -#define atomic_subtract_acq_long atomic_subtract_long -#define atomic_subtract_rel_long atomic_subtract_long +#define atomic_set_acq_long atomic_set_barr_long +#define atomic_set_rel_long atomic_set_barr_long +#define atomic_clear_acq_long atomic_clear_barr_long +#define atomic_clear_rel_long atomic_clear_barr_long +#define atomic_add_acq_long atomic_add_barr_long +#define atomic_add_rel_long atomic_add_barr_long +#define atomic_subtract_acq_long atomic_subtract_barr_long +#define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long -- cgit v1.1 From 0c7713cc54c6b456ec24664e0b523960564cf1f4 Mon Sep 17 00:00:00 2001 
From: kib Date: Tue, 20 Oct 2009 13:32:28 +0000 Subject: MFC r197933: Define architectural load bases for PIE binaries. MFC r198203 (by marius): Change load base for sparc to match default gcc memory layout model. Approved by: re (kensmith) --- sys/amd64/include/elf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/elf.h b/sys/amd64/include/elf.h index e5c95f7..88f4398 100644 --- a/sys/amd64/include/elf.h +++ b/sys/amd64/include/elf.h @@ -106,4 +106,10 @@ __ElfType(Auxinfo); #define ELF_TARG_MACH EM_X86_64 #define ELF_TARG_VER 1 +#if __ELF_WORD_SIZE == 32 +#define ET_DYN_LOAD_ADDR 0x01001000 +#else +#define ET_DYN_LOAD_ADDR 0x01021000 +#endif + #endif /* !_MACHINE_ELF_H_ */ -- cgit v1.1 From d72d5acfe5ce013b74ce3ec8b4b585db39d70756 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 29 Oct 2009 16:00:27 +0000 Subject: MFC 197439: Extract the code to find and map the MADT ACPI table during early kernel startup and genericize it so it can be reused to map other tables as well: - Add a routine to walk a list of ACPI subtables such as those used in the APIC and SRAT tables in the MI acpi(4) driver. - Move the routines for mapping and unmapping an ACPI table as well as mapping the RSDT or XSDT and searching for a table with a given signature out into acpica_machdep.c for both amd64 and i386. --- sys/amd64/acpica/acpi_machdep.c | 244 +++++++++++++++++++++++++++++++++++++ sys/amd64/acpica/madt.c | 233 ++--------------------------------- sys/amd64/include/acpica_machdep.h | 3 + 3 files changed, 257 insertions(+), 223 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c index b902c12..0d866e8 100644 --- a/sys/amd64/acpica/acpi_machdep.c +++ b/sys/amd64/acpica/acpi_machdep.c @@ -32,8 +32,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include +#include +#include #include @@ -100,6 +104,246 @@ acpi_cpu_c1() } /* + * Support for mapping ACPI tables during early boot. Currently this + * uses the crashdump map to map each table. However, the crashdump + * map is created in pmap_bootstrap() right after the direct map, so + * we should be able to just use pmap_mapbios() here instead. + * + * This makes the following assumptions about how we use this KVA: + * pages 0 and 1 are used to map in the header of each table found via + * the RSDT or XSDT and pages 2 to n are used to map in the RSDT or + * XSDT. This has to use 2 pages for the table headers in case a + * header spans a page boundary. + * + * XXX: We don't ensure the table fits in the available address space + * in the crashdump map. + */ + +/* + * Map some memory using the crashdump map. 'offset' is an offset in + * pages into the crashdump map to use for the start of the mapping. + */ +static void * +table_map(vm_paddr_t pa, int offset, vm_offset_t length) +{ + vm_offset_t va, off; + void *data; + + off = pa & PAGE_MASK; + length = roundup(length + off, PAGE_SIZE); + pa = pa & PG_FRAME; + va = (vm_offset_t)pmap_kenter_temporary(pa, offset) + + (offset * PAGE_SIZE); + data = (void *)(va + off); + length -= PAGE_SIZE; + while (length > 0) { + va += PAGE_SIZE; + pa += PAGE_SIZE; + length -= PAGE_SIZE; + pmap_kenter(va, pa); + invlpg(va); + } + return (data); +} + +/* Unmap memory previously mapped with table_map(). 
*/ +static void +table_unmap(void *data, vm_offset_t length) +{ + vm_offset_t va, off; + + va = (vm_offset_t)data; + off = va & PAGE_MASK; + length = roundup(length + off, PAGE_SIZE); + va &= ~PAGE_MASK; + while (length > 0) { + pmap_kremove(va); + invlpg(va); + va += PAGE_SIZE; + length -= PAGE_SIZE; + } +} + +/* + * Map a table at a given offset into the crashdump map. It first + * maps the header to determine the table length and then maps the + * entire table. + */ +static void * +map_table(vm_paddr_t pa, int offset, const char *sig) +{ + ACPI_TABLE_HEADER *header; + vm_offset_t length; + void *table; + + header = table_map(pa, offset, sizeof(ACPI_TABLE_HEADER)); + if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) { + table_unmap(header, sizeof(ACPI_TABLE_HEADER)); + return (NULL); + } + length = header->Length; + table_unmap(header, sizeof(ACPI_TABLE_HEADER)); + table = table_map(pa, offset, length); + if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { + if (bootverbose) + printf("ACPI: Failed checksum for table %s\n", sig); + table_unmap(table, length); + return (NULL); + } + return (table); +} + +/* + * See if a given ACPI table is the requested table. Returns the + * length of the able if it matches or zero on failure. + */ +static int +probe_table(vm_paddr_t address, const char *sig) +{ + ACPI_TABLE_HEADER *table; + + table = table_map(address, 0, sizeof(ACPI_TABLE_HEADER)); + if (table == NULL) { + if (bootverbose) + printf("ACPI: Failed to map table at 0x%jx\n", + (uintmax_t)address); + return (0); + } + if (bootverbose) + printf("Table '%.4s' at 0x%jx\n", table->Signature, + (uintmax_t)address); + + if (strncmp(table->Signature, sig, ACPI_NAME_SIZE) != 0) { + table_unmap(table, sizeof(ACPI_TABLE_HEADER)); + return (0); + } + table_unmap(table, sizeof(ACPI_TABLE_HEADER)); + return (1); +} + +/* + * Try to map a table at a given physical address previously returned + * by acpi_find_table(). + */ +void * +acpi_map_table(vm_paddr_t pa, const char *sig) +{ + + return (map_table(pa, 0, sig)); +} + +/* Unmap a table previously mapped via acpi_map_table(). */ +void +acpi_unmap_table(void *table) +{ + ACPI_TABLE_HEADER *header; + + header = (ACPI_TABLE_HEADER *)table; + table_unmap(table, header->Length); +} + +/* + * Return the physical address of the requested table or zero if one + * is not found. + */ +vm_paddr_t +acpi_find_table(const char *sig) +{ + ACPI_PHYSICAL_ADDRESS rsdp_ptr; + ACPI_TABLE_RSDP *rsdp; + ACPI_TABLE_RSDT *rsdt; + ACPI_TABLE_XSDT *xsdt; + ACPI_TABLE_HEADER *table; + vm_paddr_t addr; + int i, count; + + if (resource_disabled("acpi", 0)) + return (0); + + /* + * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn + * calls pmap_mapbios() to find the RSDP, we assume that we can use + * pmap_mapbios() to map the RSDP. + */ + if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0) + return (0); + rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP)); + if (rsdp == NULL) { + if (bootverbose) + printf("ACPI: Failed to map RSDP\n"); + return (0); + } + + /* + * For ACPI >= 2.0, use the XSDT if it is available. + * Otherwise, use the RSDT. We map the XSDT or RSDT at page 2 + * in the crashdump area. Pages 0 and 1 are used to map in the + * headers of candidate ACPI tables. + */ + addr = 0; + if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) { + /* + * AcpiOsGetRootPointer only verifies the checksum for + * the version 1.0 portion of the RSDP. Version 2.0 has + * an additional checksum that we verify first. 
+ */ + if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) { + if (bootverbose) + printf("ACPI: RSDP failed extended checksum\n"); + return (0); + } + xsdt = map_table(rsdp->XsdtPhysicalAddress, 2, ACPI_SIG_XSDT); + if (xsdt == NULL) { + if (bootverbose) + printf("ACPI: Failed to map XSDT\n"); + return (0); + } + count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / + sizeof(UINT64); + for (i = 0; i < count; i++) + if (probe_table(xsdt->TableOffsetEntry[i], sig)) { + addr = xsdt->TableOffsetEntry[i]; + break; + } + acpi_unmap_table(xsdt); + } else { + rsdt = map_table(rsdp->RsdtPhysicalAddress, 2, ACPI_SIG_RSDT); + if (rsdt == NULL) { + if (bootverbose) + printf("ACPI: Failed to map RSDT\n"); + return (0); + } + count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / + sizeof(UINT32); + for (i = 0; i < count; i++) + if (probe_table(rsdt->TableOffsetEntry[i], sig)) { + addr = rsdt->TableOffsetEntry[i]; + break; + } + acpi_unmap_table(rsdt); + } + pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP)); + if (addr == 0) { + if (bootverbose) + printf("ACPI: No %s table found\n", sig); + return (0); + } + if (bootverbose) + printf("%s: Found table at 0x%jx\n", sig, (uintmax_t)addr); + + /* + * Verify that we can map the full table and that its checksum is + * correct, etc. + */ + table = map_table(addr, 0, sig); + if (table == NULL) + return (0); + acpi_unmap_table(table); + + return (addr); +} + +/* * ACPI nexus(4) driver. */ static int diff --git a/sys/amd64/acpica/madt.c b/sys/amd64/acpica/madt.c index b27f8e4..a409682 100644 --- a/sys/amd64/acpica/madt.c +++ b/sys/amd64/acpica/madt.c @@ -36,27 +36,19 @@ __FBSDID("$FreeBSD$"); #include #include #include - #include -#include #include #include -#include #include #include -#include -#include #include -#include #include #include #include -typedef void madt_entry_handler(ACPI_SUBTABLE_HEADER *entry, void *arg); - /* These two arrays are indexed by APIC IDs. */ struct ioapic_info { void *io_apic; @@ -79,8 +71,6 @@ static enum intr_polarity interrupt_polarity(UINT16 IntiFlags, UINT8 Source); static enum intr_trigger interrupt_trigger(UINT16 IntiFlags, UINT8 Source); static int madt_find_cpu(u_int acpi_id, u_int *apic_id); static int madt_find_interrupt(int intr, void **apic, u_int *pin); -static void *madt_map(vm_paddr_t pa, int offset, vm_offset_t length); -static void *madt_map_table(vm_paddr_t pa, int offset, const char *sig); static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg); static void madt_parse_interrupt_override( ACPI_MADT_INTERRUPT_OVERRIDE *intr); @@ -92,13 +82,10 @@ static int madt_probe(void); static int madt_probe_cpus(void); static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry, void *arg __unused); -static int madt_probe_table(vm_paddr_t address); static void madt_register(void *dummy); static int madt_setup_local(void); static int madt_setup_io(void); -static void madt_unmap(void *data, vm_offset_t length); -static void madt_unmap_table(void *table); -static void madt_walk_table(madt_entry_handler *handler, void *arg); +static void madt_walk_table(acpi_subtable_handler *handler, void *arg); static struct apic_enumerator madt_enumerator = { "MADT", @@ -109,224 +96,30 @@ static struct apic_enumerator madt_enumerator = { }; /* - * Code to abuse the crashdump map to map in the tables for the early - * probe. 
We cheat and make the following assumptions about how we - * use this KVA: pages 0 and 1 are used to map in the header of each - * table found via the RSDT or XSDT and pages 2 to n are used to map - * in the RSDT or XSDT. We have to use 2 pages for the table headers - * in case a header spans a page boundary. The offset is in pages; - * the length is in bytes. - */ -static void * -madt_map(vm_paddr_t pa, int offset, vm_offset_t length) -{ - vm_offset_t va, off; - void *data; - - off = pa & PAGE_MASK; - length = roundup(length + off, PAGE_SIZE); - pa = pa & PG_FRAME; - va = (vm_offset_t)pmap_kenter_temporary(pa, offset) + - (offset * PAGE_SIZE); - data = (void *)(va + off); - length -= PAGE_SIZE; - while (length > 0) { - va += PAGE_SIZE; - pa += PAGE_SIZE; - length -= PAGE_SIZE; - pmap_kenter(va, pa); - invlpg(va); - } - return (data); -} - -static void -madt_unmap(void *data, vm_offset_t length) -{ - vm_offset_t va, off; - - va = (vm_offset_t)data; - off = va & PAGE_MASK; - length = roundup(length + off, PAGE_SIZE); - va &= ~PAGE_MASK; - while (length > 0) { - pmap_kremove(va); - invlpg(va); - va += PAGE_SIZE; - length -= PAGE_SIZE; - } -} - -static void * -madt_map_table(vm_paddr_t pa, int offset, const char *sig) -{ - ACPI_TABLE_HEADER *header; - vm_offset_t length; - void *table; - - header = madt_map(pa, offset, sizeof(ACPI_TABLE_HEADER)); - if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) { - madt_unmap(header, sizeof(ACPI_TABLE_HEADER)); - return (NULL); - } - length = header->Length; - madt_unmap(header, sizeof(ACPI_TABLE_HEADER)); - table = madt_map(pa, offset, length); - if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { - if (bootverbose) - printf("MADT: Failed checksum for table %s\n", sig); - madt_unmap(table, length); - return (NULL); - } - return (table); -} - -static void -madt_unmap_table(void *table) -{ - ACPI_TABLE_HEADER *header; - - header = (ACPI_TABLE_HEADER *)table; - madt_unmap(table, header->Length); -} - -/* * Look for an ACPI Multiple APIC Description Table ("APIC") */ static int madt_probe(void) { - ACPI_PHYSICAL_ADDRESS rsdp_ptr; - ACPI_TABLE_RSDP *rsdp; - ACPI_TABLE_RSDT *rsdt; - ACPI_TABLE_XSDT *xsdt; - int i, count; - if (resource_disabled("acpi", 0)) + madt_physaddr = acpi_find_table(ACPI_SIG_MADT); + if (madt_physaddr == 0) return (ENXIO); - - /* - * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn - * calls pmap_mapbios() to find the RSDP, we assume that we can use - * pmap_mapbios() to map the RSDP. - */ - if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0) - return (ENXIO); - rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP)); - if (rsdp == NULL) { - if (bootverbose) - printf("MADT: Failed to map RSDP\n"); - return (ENXIO); - } - - /* - * For ACPI >= 2.0, use the XSDT if it is available. - * Otherwise, use the RSDT. We map the XSDT or RSDT at page 1 - * in the crashdump area. Page 0 is used to map in the - * headers of candidate ACPI tables. - */ - if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) { - /* - * AcpiOsGetRootPointer only verifies the checksum for - * the version 1.0 portion of the RSDP. Version 2.0 has - * an additional checksum that we verify first. 
- */ - if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) { - if (bootverbose) - printf("MADT: RSDP failed extended checksum\n"); - return (ENXIO); - } - xsdt = madt_map_table(rsdp->XsdtPhysicalAddress, 2, - ACPI_SIG_XSDT); - if (xsdt == NULL) { - if (bootverbose) - printf("MADT: Failed to map XSDT\n"); - return (ENXIO); - } - count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / - sizeof(UINT64); - for (i = 0; i < count; i++) - if (madt_probe_table(xsdt->TableOffsetEntry[i])) - break; - madt_unmap_table(xsdt); - } else { - rsdt = madt_map_table(rsdp->RsdtPhysicalAddress, 2, - ACPI_SIG_RSDT); - if (rsdt == NULL) { - if (bootverbose) - printf("MADT: Failed to map RSDT\n"); - return (ENXIO); - } - count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / - sizeof(UINT32); - for (i = 0; i < count; i++) - if (madt_probe_table(rsdt->TableOffsetEntry[i])) - break; - madt_unmap_table(rsdt); - } - pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP)); - if (madt_physaddr == 0) { - if (bootverbose) - printf("MADT: No MADT table found\n"); - return (ENXIO); - } - if (bootverbose) - printf("MADT: Found table at 0x%jx\n", - (uintmax_t)madt_physaddr); - - /* - * Verify that we can map the full table and that its checksum is - * correct, etc. - */ - madt = madt_map_table(madt_physaddr, 0, ACPI_SIG_MADT); - if (madt == NULL) - return (ENXIO); - madt_unmap_table(madt); - madt = NULL; - return (0); } /* - * See if a given ACPI table is the MADT. - */ -static int -madt_probe_table(vm_paddr_t address) -{ - ACPI_TABLE_HEADER *table; - - table = madt_map(address, 0, sizeof(ACPI_TABLE_HEADER)); - if (table == NULL) { - if (bootverbose) - printf("MADT: Failed to map table at 0x%jx\n", - (uintmax_t)address); - return (0); - } - if (bootverbose) - printf("Table '%.4s' at 0x%jx\n", table->Signature, - (uintmax_t)address); - - if (strncmp(table->Signature, ACPI_SIG_MADT, ACPI_NAME_SIZE) != 0) { - madt_unmap(table, sizeof(ACPI_TABLE_HEADER)); - return (0); - } - madt_physaddr = address; - madt_length = table->Length; - madt_unmap(table, sizeof(ACPI_TABLE_HEADER)); - return (1); -} - -/* * Run through the MP table enumerating CPUs. */ static int madt_probe_cpus(void) { - madt = madt_map_table(madt_physaddr, 0, ACPI_SIG_MADT); + madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT); + madt_length = madt->Header.Length; KASSERT(madt != NULL, ("Unable to re-map MADT")); madt_walk_table(madt_probe_cpus_handler, NULL); - madt_unmap_table(madt); + acpi_unmap_table(madt); madt = NULL; return (0); } @@ -417,17 +210,11 @@ SYSINIT(madt_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, * Call the handler routine for each entry in the MADT table. 
*/ static void -madt_walk_table(madt_entry_handler *handler, void *arg) +madt_walk_table(acpi_subtable_handler *handler, void *arg) { - ACPI_SUBTABLE_HEADER *entry; - u_char *p, *end; - - end = (u_char *)(madt) + madt->Header.Length; - for (p = (u_char *)(madt + 1); p < end; ) { - entry = (ACPI_SUBTABLE_HEADER *)p; - handler(entry, arg); - p += entry->Length; - } + + acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, + handler, arg); } static void diff --git a/sys/amd64/include/acpica_machdep.h b/sys/amd64/include/acpica_machdep.h index 76cc69e..9943af7 100644 --- a/sys/amd64/include/acpica_machdep.h +++ b/sys/amd64/include/acpica_machdep.h @@ -77,5 +77,8 @@ extern int acpi_release_global_lock(uint32_t *lock); void acpi_SetDefaultIntrModel(int model); void acpi_cpu_c1(void); +void *acpi_map_table(vm_paddr_t pa, const char *sig); +void acpi_unmap_table(void *table); +vm_paddr_t acpi_find_table(const char *sig); #endif /* __ACPICA_MACHDEP_H__ */ -- cgit v1.1 From 6248096baf3dba8807416018717b2cf49d9418e3 Mon Sep 17 00:00:00 2001 From: kib Date: Thu, 29 Oct 2009 16:24:39 +0000 Subject: MFC r197389: Do panic regardeless of execution mode at the moment of T_RESERVED trap. --- sys/amd64/amd64/trap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 65f761e..cfccf3c 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -253,6 +253,11 @@ trap(struct trapframe *frame) } #endif + if (type == T_RESERVED) { + trap_fatal(frame, 0); + goto out; + } + #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is -- cgit v1.1 From 23c01e6e9ddfd2d071ce9f2df153c573c89de41b Mon Sep 17 00:00:00 2001 From: alc Date: Sat, 31 Oct 2009 18:54:26 +0000 Subject: MFC r197316 Add a new sysctl for reporting all of the supported page sizes. --- sys/amd64/include/param.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index edcf427..10d3ab3 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -118,6 +118,8 @@ #define NBPML4 (1ul< Date: Sat, 31 Oct 2009 19:02:08 +0000 Subject: MFC r197317 When superpages are enabled, add the 2 or 4MB page size to the array of supported page sizes. --- sys/amd64/amd64/pmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 97de6b6..d3d653d 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -663,6 +663,11 @@ pmap_init(void) * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + if (pg_ps_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("pmap_init: can't assign to pagesizes[1]")); + pagesizes[1] = NBPDR; + } /* * Calculate the size of the pv head table for superpages. 
-- cgit v1.1 From c8f0456bbb423d4a66a049f99b5edaabde6bc7a0 Mon Sep 17 00:00:00 2001 From: avg Date: Sun, 1 Nov 2009 17:45:37 +0000 Subject: MFC 197647: cpufunc.h: unify/correct style of c extension names --- sys/amd64/include/cpufunc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index eb264ae..dee78cb 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -277,7 +277,7 @@ static __inline void mfence(void) { - __asm__ __volatile("mfence" : : : "memory"); + __asm __volatile("mfence" : : : "memory"); } static __inline void @@ -457,14 +457,14 @@ load_es(u_int sel) __asm __volatile("mov %0,%%es" : : "rm" (sel)); } -static inline void +static __inline void cpu_monitor(const void *addr, int extensions, int hints) { __asm __volatile("monitor;" : :"a" (addr), "c" (extensions), "d"(hints)); } -static inline void +static __inline void cpu_mwait(int extensions, int hints) { __asm __volatile("mwait;" : :"a" (hints), "c" (extensions)); -- cgit v1.1 From 6ecbe62b9495ba34cbecad58409cc5c20016dffe Mon Sep 17 00:00:00 2001 From: avg Date: Sun, 1 Nov 2009 18:39:26 +0000 Subject: MFC 197450: number of cleanups in i386 and amd64 pci md code --- sys/amd64/pci/pci_cfgreg.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/pci/pci_cfgreg.c b/sys/amd64/pci/pci_cfgreg.c index be9e404..3e29a58 100644 --- a/sys/amd64/pci/pci_cfgreg.c +++ b/sys/amd64/pci/pci_cfgreg.c @@ -181,9 +181,9 @@ pci_cfgenable(unsigned bus, unsigned slot, unsigned func, int reg, int bytes) { int dataport = 0; - if (bus <= PCI_BUSMAX && slot < 32 && func <= PCI_FUNCMAX && - reg <= PCI_REGMAX && bytes != 3 && (unsigned) bytes <= 4 && - (reg & (bytes - 1)) == 0) { + if (bus <= PCI_BUSMAX && slot <= PCI_SLOTMAX && func <= PCI_FUNCMAX && + (unsigned)reg <= PCI_REGMAX && bytes != 3 && + (unsigned)bytes <= 4 && (reg & (bytes - 1)) == 0) { outl(CONF1_ADDR_PORT, (1 << 31) | (bus << 16) | (slot << 11) | (func << 8) | (reg & ~0x03)); dataport = CONF1_DATA_PORT + (reg & 0x03); @@ -281,7 +281,7 @@ pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus) * fall back to using type 1 config access instead. */ if (pci_cfgregopen() != 0) { - for (slot = 0; slot < 32; slot++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { val1 = pcireg_cfgread(0, slot, 0, 0, 4); if (val1 == 0xffffffff) continue; @@ -309,8 +309,8 @@ pciereg_cfgread(int bus, unsigned slot, unsigned func, unsigned reg, volatile vm_offset_t va; int data = -1; - if (bus < pcie_minbus || bus > pcie_maxbus || slot >= 32 || - func > PCI_FUNCMAX || reg >= 0x1000) + if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX || + func > PCI_FUNCMAX || reg > PCIE_REGMAX) return (-1); va = PCIE_VADDR(pcie_base, reg, bus, slot, func); @@ -336,8 +336,8 @@ pciereg_cfgwrite(int bus, unsigned slot, unsigned func, unsigned reg, int data, { volatile vm_offset_t va; - if (bus < pcie_minbus || bus > pcie_maxbus || slot >= 32 || - func > PCI_FUNCMAX || reg >= 0x1000) + if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX || + func > PCI_FUNCMAX || reg > PCIE_REGMAX) return; va = PCIE_VADDR(pcie_base, reg, bus, slot, func); -- cgit v1.1 From 096a30cf601c35e8da952b7712d8474b7aca22b2 Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 4 Nov 2009 20:49:14 +0000 Subject: MFC 198554: Fix some problems with effective mmap() offsets > 32 bits. This was partially fixed on amd64 earlier. 
Rather than forcing linux_mmap_common() to use a 32-bit offset, have it accept a 64-bit file offset. This offset is then passed to the real mmap() call. Rather than inventing a structure to hold the normal linux_mmap args that has a 64-bit offset, just pass each of the arguments individually to linux_mmap_common() since that more closes matches the existing style of various kern_foo() functions. --- sys/amd64/linux32/linux32_machdep.c | 66 +++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c index 42ea070..46119b6 100644 --- a/sys/amd64/linux32/linux32_machdep.c +++ b/sys/amd64/linux32/linux32_machdep.c @@ -91,6 +91,10 @@ linux_to_bsd_sigaltstack(int lsa) return (bsa); } +static int linux_mmap_common(struct thread *td, l_uintptr_t addr, + l_size_t len, l_int prot, l_int flags, l_int fd, + l_loff_t pos); + int bsd_to_linux_sigaltstack(int bsa) { @@ -759,12 +763,9 @@ linux_clone(struct thread *td, struct linux_clone_args *args) #define STACK_SIZE (2 * 1024 * 1024) #define GUARD_SIZE (4 * PAGE_SIZE) -static int linux_mmap_common(struct thread *, struct l_mmap_argv *); - int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { - struct l_mmap_argv linux_args; #ifdef DEBUG if (ldebug(mmap2)) @@ -773,14 +774,9 @@ linux_mmap2(struct thread *td, struct linux_mmap2_args *args) args->flags, args->fd, args->pgoff); #endif - linux_args.addr = PTROUT(args->addr); - linux_args.len = args->len; - linux_args.prot = args->prot; - linux_args.flags = args->flags; - linux_args.fd = args->fd; - linux_args.pgoff = args->pgoff; - - return (linux_mmap_common(td, &linux_args)); + return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, + args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * + PAGE_SIZE)); } int @@ -799,15 +795,15 @@ linux_mmap(struct thread *td, struct linux_mmap_args *args) linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pgoff); #endif - if ((linux_args.pgoff % PAGE_SIZE) != 0) - return (EINVAL); - linux_args.pgoff /= PAGE_SIZE; - return (linux_mmap_common(td, &linux_args)); + return (linux_mmap_common(td, linux_args.addr, linux_args.len, + linux_args.prot, linux_args.flags, linux_args.fd, + (uint32_t)linux_args.pgoff)); } static int -linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) +linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, + l_int flags, l_int fd, l_loff_t pos) { struct proc *p = td->td_proc; struct mmap_args /* { @@ -830,21 +826,20 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * Linux mmap(2): * You must specify exactly one of MAP_SHARED and MAP_PRIVATE */ - if (! 
((linux_args->flags & LINUX_MAP_SHARED) ^ - (linux_args->flags & LINUX_MAP_PRIVATE))) + if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) return (EINVAL); - if (linux_args->flags & LINUX_MAP_SHARED) + if (flags & LINUX_MAP_SHARED) bsd_args.flags |= MAP_SHARED; - if (linux_args->flags & LINUX_MAP_PRIVATE) + if (flags & LINUX_MAP_PRIVATE) bsd_args.flags |= MAP_PRIVATE; - if (linux_args->flags & LINUX_MAP_FIXED) + if (flags & LINUX_MAP_FIXED) bsd_args.flags |= MAP_FIXED; - if (linux_args->flags & LINUX_MAP_ANON) + if (flags & LINUX_MAP_ANON) bsd_args.flags |= MAP_ANON; else bsd_args.flags |= MAP_NOSYNC; - if (linux_args->flags & LINUX_MAP_GROWSDOWN) + if (flags & LINUX_MAP_GROWSDOWN) bsd_args.flags |= MAP_STACK; /* @@ -852,12 +847,12 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * on Linux/i386. We do this to ensure maximum compatibility. * Linux/ia64 does the same in i386 emulation mode. */ - bsd_args.prot = linux_args->prot; + bsd_args.prot = prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ - bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : linux_args->fd; + bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; if (bsd_args.fd != -1) { /* * Linux follows Solaris mmap(2) description: @@ -882,7 +877,7 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) fdrop(fp, td); } - if (linux_args->flags & LINUX_MAP_GROWSDOWN) { + if (flags & LINUX_MAP_GROWSDOWN) { /* * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option @@ -905,8 +900,7 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * fixed size of (STACK_SIZE - GUARD_SIZE). */ - if ((caddr_t)PTRIN(linux_args->addr) + linux_args->len > - p->p_vmspace->vm_maxsaddr) { + if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { /* * Some Linux apps will attempt to mmap * thread stacks near the top of their @@ -937,19 +931,19 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * we map the full stack, since we don't have a way * to autogrow it. */ - if (linux_args->len > STACK_SIZE - GUARD_SIZE) { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); - bsd_args.len = linux_args->len; + if (len > STACK_SIZE - GUARD_SIZE) { + bsd_args.addr = (caddr_t)PTRIN(addr); + bsd_args.len = len; } else { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) - - (STACK_SIZE - GUARD_SIZE - linux_args->len); + bsd_args.addr = (caddr_t)PTRIN(addr) - + (STACK_SIZE - GUARD_SIZE - len); bsd_args.len = STACK_SIZE - GUARD_SIZE; } } else { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); - bsd_args.len = linux_args->len; + bsd_args.addr = (caddr_t)PTRIN(addr); + bsd_args.len = len; } - bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE; + bsd_args.pos = pos; #ifdef DEBUG if (ldebug(mmap)) -- cgit v1.1 From 72c2b241e0df95c69568e87e57e73171f83a3097 Mon Sep 17 00:00:00 2001 From: attilio Date: Fri, 6 Nov 2009 10:15:15 +0000 Subject: MFC r198868, r198950: Opteron rev E family of processor expose a bug where acq memory barriers can be broken, resulting in random breakages. Printout a warning message if affecred family and model are found. 
--- sys/amd64/amd64/identcpu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 2c1b804..19ddd96 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -607,6 +607,21 @@ print_AMD_info(void) printf(", %d lines/tag", (regs[2] >> 8) & 0x0f); print_AMD_l2_assoc((regs[2] >> 12) & 0x0f); } + + /* + * Opteron Rev E shows a bug as in very rare occasions a read memory + * barrier is not performed as expected if it is followed by a + * non-atomic read-modify-write instruction. + * As long as that bug pops up very rarely (intensive machine usage + * on other operating systems generally generates one unexplainable + * crash any 2 months) and as long as a model specific fix would be + * impratical at this stage, print out a warning string if the broken + * model and family are identified. + */ + if (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x20 && + CPUID_TO_MODEL(cpu_id) <= 0x3f) + printf("WARNING: This architecture revision has known SMP " + "hardware bugs which may cause random instability\n"); } static void -- cgit v1.1 From 7a40b8619c3f71cce2a269846280bdba81b22164 Mon Sep 17 00:00:00 2001 From: attilio Date: Fri, 6 Nov 2009 15:24:48 +0000 Subject: MFC r197070: Consolidate CPUID to CPU family/model macros for amd64 and i386 to reduce unnecessary #ifdef's for shared code between them. This MFC should unbreak the kernel build breakage introduced by r198977. Reported by: kib Pointy hat to: me --- sys/amd64/amd64/identcpu.c | 14 +++++++------- sys/amd64/amd64/initcpu.c | 4 ++-- sys/amd64/amd64/msi.c | 4 ++-- sys/amd64/include/specialreg.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 19ddd96..420dd03 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -371,21 +371,21 @@ printcpuinfo(void) switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if ((amd_pminfo & AMDPM_TSC_INVARIANT) || - AMD64_CPU_FAMILY(cpu_id) >= 0x10 || + CPUID_TO_FAMILY(cpu_id) >= 0x10 || cpu_id == 0x60fb2) tsc_is_invariant = 1; break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) || - (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xe) || - (AMD64_CPU_FAMILY(cpu_id) == 0xf && - AMD64_CPU_MODEL(cpu_id) >= 0x3)) + (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xe) || + (CPUID_TO_FAMILY(cpu_id) == 0xf && + CPUID_TO_MODEL(cpu_id) >= 0x3)) tsc_is_invariant = 1; break; case CPU_VENDOR_CENTAUR: - if (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf && + if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; break; diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 0037d66..7aaff82 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -154,8 +154,8 @@ initializecpu(void) pg_nx = PG_NX; } if (cpu_vendor_id == CPU_VENDOR_CENTAUR && - AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf) + CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf) init_via(); /* diff --git a/sys/amd64/amd64/msi.c b/sys/amd64/amd64/msi.c index 736b692..91a8cbb 100644 --- a/sys/amd64/amd64/msi.c +++ b/sys/amd64/amd64/msi.c @@ -275,8 +275,8 @@ msi_init(void) case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: - if (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf) 
+ if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 88ff734..d1f0c89 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -168,10 +168,10 @@ #define CPUID_FAMILY 0x00000f00 #define CPUID_EXT_MODEL 0x000f0000 #define CPUID_EXT_FAMILY 0x0ff00000 -#define AMD64_CPU_MODEL(id) \ +#define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ (((id) & CPUID_EXT_MODEL) >> 12)) -#define AMD64_CPU_FAMILY(id) \ +#define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ (((id) & CPUID_EXT_FAMILY) >> 20)) -- cgit v1.1 From c60a1c40ea3922c4c5f89319d756bf2fa348cd3f Mon Sep 17 00:00:00 2001 From: kensmith Date: Mon, 9 Nov 2009 21:39:42 +0000 Subject: Comment out the sbp(4) entry for GENERIC config files that contain it. There are known issues with this driver that are beyond what can be fixed for 8.0-RELEASE and the bugs can cause boot failure on some systems. It's not clear if it impacts all systems and there is interest in getting the problem fixed so for now just comment it out instead of remove it. Commit straight to stable/8, this is an 8.0-RELEASE issue. Head was left alone so work on it can continue there. Reviewed by: Primary misc. architecture maintainers (marcel, marius) --- sys/amd64/conf/GENERIC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 24300bd..d0f24e2 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -313,7 +313,7 @@ device udav # Davicom DM9601E USB # FireWire support device firewire # FireWire bus code -device sbp # SCSI over FireWire (Requires scbus and da) +#device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver -- cgit v1.1 From 890346338074885650a925acecc26c6214f9254d Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 17 Nov 2009 15:56:45 +0000 Subject: MFC 198043: Move the USB wireless drivers down into their own section next to the USB ethernet drivers. 
--- sys/amd64/conf/GENERIC | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index d0f24e2..f205106 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -288,10 +288,6 @@ device ukbd # Keyboard device ulpt # Printer device umass # Disks/Mass storage - Requires scbus and da device ums # Mouse -device rum # Ralink Technology RT2501USB wireless NICs -device uath # Atheros AR5523 wireless NICs -device ural # Ralink Technology RT2500USB wireless NICs -device zyd # ZyDAS zb1211/zb1211b wireless NICs device urio # Diamond Rio 500 MP3 player # USB Serial devices device uark # Technologies ARK3116 based serial adapters @@ -310,6 +306,11 @@ device cue # CATC USB Ethernet device kue # Kawasaki LSI USB Ethernet device rue # RealTek RTL8150 USB Ethernet device udav # Davicom DM9601E USB +# USB Wireless +device rum # Ralink Technology RT2501USB wireless NICs +device uath # Atheros AR5523 wireless NICs +device ural # Ralink Technology RT2500USB wireless NICs +device zyd # ZyDAS zb1211/zb1211b wireless NICs # FireWire support device firewire # FireWire bus code -- cgit v1.1 From 629ad8710b200ef57d28598657647a1306cf2b16 Mon Sep 17 00:00:00 2001 From: kuriyama Date: Sun, 22 Nov 2009 14:32:32 +0000 Subject: - MFC r199067,199215,199253 - Add hw.clflush_disable loader tunable to avoid panic (trap 9) at map_invalidate_cache_range() even if CPU is not Intel. - This tunable can be set to -1 (default), 0 and 1. -1 is same as current behavior, which automatically disable CLFLUSH on Intel CPUs without CPUID_SS (should be occured on Xen only). You can specify 1 when this panic happened on non-Intel CPUs (such as AMD's). Because disabling CLFLUSH may reduce performance, you can try with setting 0 on Intel CPUs without SS to use CLFLUSH feature. - Amd64 init_secondary() calls initializecpu() while curthread is still not properly set up. r199067 added the call to TUNABLE_INT_FETCH() to initializecpu() that results in hang because AP are started when kernel environment is already dynamic and thus needs to acquire mutex, that is too early in AP start sequence to work. Extract the code that should be executed only once, because it sets up global variables, from initializecpu() to initializecpucache(), and call the later only from hammer_time() executed on BSP. Now, TUNABLE_INT_FETCH() is done only once at BSP at the early boot stage. --- sys/amd64/amd64/initcpu.c | 22 +++++++++++++++++++++- sys/amd64/amd64/machdep.c | 1 + sys/amd64/include/md_var.h | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 7aaff82..c97ad3d 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -47,6 +47,12 @@ __FBSDID("$FreeBSD$"); static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); +/* + * -1: automatic (default) + * 0: keep enable CLFLUSH + * 1: force disable CLFLUSH + */ +static int hw_clflush_disable = -1; int cpu; /* Are we 386, 386sx, 486, etc? */ u_int cpu_feature; /* Feature flags */ @@ -157,6 +163,11 @@ initializecpu(void) CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) init_via(); +} + +void +initializecpucache() +{ /* * CPUID with %eax = 1, %ebx returns @@ -169,6 +180,15 @@ initializecpu(void) * XXXKIB: (temporary) hack to work around traps generated when * CLFLUSHing APIC registers window. 
*/ - if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS)) + TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); + if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS) && + hw_clflush_disable == -1) + cpu_feature &= ~CPUID_CLFSH; + /* + * Allow to disable CLFLUSH feature manually by + * hw.clflush_disable tunable. This may help Xen guest on some AMD + * CPUs. + */ + if (hw_clflush_disable == 1) cpu_feature &= ~CPUID_CLFSH; } diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 95db5d2..e4c51a3 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1667,6 +1667,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ + initializecpucache(); /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = thread0.td_kstack + \ diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index c66fc9f..15df851 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -89,6 +89,7 @@ void gs_load_fault(void) __asm(__STRING(gs_load_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void initializecpu(void); +void initializecpucache(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); void fpstate_drop(struct thread *td); int is_physical_memory(vm_paddr_t addr); -- cgit v1.1 From c75ccf4f6c42e9755a06177deed1a3b2aa9025ed Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 5 Dec 2009 20:37:46 +0000 Subject: MFC r197518: lindev(4) [1] is supposed to be a collection of linux-specific pseudo devices that we also support, just not by default (thus only LINT or module builds by default). While currently there is only "/dev/full" [2], we are planning to see more in the future. We may decide to change the module/dependency logic in the future should the list grow too long. This is not part of linux.ko as also non-linux binaries like kFreeBSD userland or ports can make use of this as well. Suggested by: rwatson [1] (name) Submitted by: ed [2] Discussed with: markm, ed, rwatson, kib (weeks ago) Reviewed by: rwatson, brueffer (prev. version) PR: kern/68961 --- sys/amd64/conf/NOTES | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 27fe068..5361224 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -503,3 +503,6 @@ options VM_KMEM_SIZE_SCALE # Enable NDIS binary driver support options NDISAPI device ndis + +# Linux-specific pseudo devices support +device lindev -- cgit v1.1 From 4b8cc441d499645fb9384830f45f987eda50ae9f Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 5 Dec 2009 20:43:15 +0000 Subject: MFC r197729: Make sure that the primary native brandinfo always gets added first and the native ia32 compat as middle (before other things). o(ld)brandinfo as well as third party like linux, kfreebsd, etc. stays on SI_ORDER_ANY coming last. The reason for this is only to make sure that even in case we would overflow the MAX_BRANDS sized array, the native FreeBSD brandinfo would still be there and the system would be operational. 
Reviewed by: kib --- sys/amd64/amd64/elf_machdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index d5e7a6e..dc7c8b9 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -89,7 +89,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_info); -- cgit v1.1 From 4f817226e165dbc93626909eb579a189bd4b6b67 Mon Sep 17 00:00:00 2001 From: avg Date: Tue, 8 Dec 2009 15:21:39 +0000 Subject: MFC r199184: reflect that pg_ps_enabled is a tunable --- sys/amd64/amd64/pmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d3d653d..70fc041 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -181,7 +181,7 @@ pt_entry_t pg_nx; SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; -SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, "Are large page mappings enabled?"); static u_int64_t KPTphys; /* phys addr of kernel level 1 */ -- cgit v1.1 From 8dde51c9b8bb821ae73fe850b7e6fd6d5d2bf998 Mon Sep 17 00:00:00 2001 From: avg Date: Tue, 8 Dec 2009 15:27:06 +0000 Subject: MFC r199968: x86 cpu features: add MOVBE reporting and flag --- sys/amd64/amd64/identcpu.c | 2 +- sys/amd64/include/specialreg.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 420dd03..3cd2f5e 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -259,7 +259,7 @@ printcpuinfo(void) "\024SSE4.1" "\025SSE4.2" "\026x2APIC" /* xAPIC Extensions */ - "\027" + "\027MOVBE" "\030POPCNT" "\031" "\032" diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index d1f0c89..8cadbcd 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -129,6 +129,7 @@ #define CPUID2_SSE41 0x00080000 #define CPUID2_SSE42 0x00100000 #define CPUID2_X2APIC 0x00200000 +#define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 /* -- cgit v1.1 From e317625370af5eb44b8855a15df69c7848fc1b45 Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 12 Dec 2009 20:06:25 +0000 Subject: MFC r199135: Extract the code that records syscall results in the frame into MD function cpu_set_syscall_retval(). --- sys/amd64/amd64/trap.c | 34 +--------------------------------- sys/amd64/amd64/vm_machdep.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 33 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index cfccf3c..5583c82 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -1004,39 +1004,7 @@ syscall(struct trapframe *frame) #endif } - switch (error) { - case 0: - frame->tf_rax = td->td_retval[0]; - frame->tf_rdx = td->td_retval[1]; - frame->tf_rflags &= ~PSL_C; - break; - - case ERESTART: - /* - * Reconstruct pc, we know that 'syscall' is 2 bytes. - * We have to do a full context restore so that %r10 - * (which was holding the value of %rcx) is restored for - * the next iteration. 
- */ - frame->tf_rip -= frame->tf_err; - frame->tf_r10 = frame->tf_rcx; - td->td_pcb->pcb_flags |= PCB_FULLCTX; - break; - - case EJUSTRETURN: - break; - - default: - if (p->p_sysent->sv_errsize) { - if (error >= p->p_sysent->sv_errsize) - error = -1; /* XXX */ - else - error = p->p_sysent->sv_errtbl[error]; - } - frame->tf_rax = error; - frame->tf_rflags |= PSL_C; - break; - } + cpu_set_syscall_retval(td, error); /* * Traced syscall. diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 51d1d62..6e56740 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -317,6 +317,45 @@ cpu_thread_free(struct thread *td) cpu_thread_clean(td); } +void +cpu_set_syscall_retval(struct thread *td, int error) +{ + + switch (error) { + case 0: + td->td_frame->tf_rax = td->td_retval[0]; + td->td_frame->tf_rdx = td->td_retval[1]; + td->td_frame->tf_rflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, we know that 'syscall' is 2 bytes. + * We have to do a full context restore so that %r10 + * (which was holding the value of %rcx) is restored + * for the next iteration. + */ + td->td_frame->tf_rip -= td->td_frame->tf_err; + td->td_frame->tf_r10 = td->td_frame->tf_rcx; + td->td_pcb->pcb_flags |= PCB_FULLCTX; + break; + + case EJUSTRETURN: + break; + + default: + if (td->td_proc->p_sysent->sv_errsize) { + if (error >= td->td_proc->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = td->td_proc->p_sysent->sv_errtbl[error]; + } + td->td_frame->tf_rax = error; + td->td_frame->tf_rflags |= PSL_C; + break; + } +} + /* * Initialize machine state (pcb and trap frame) for a new thread about to * upcall. Put enough state in the new thread's PCB to get it to go back -- cgit v1.1 From 62403394c0144ba68b98a015c199ceabd258a12b Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 19 Dec 2009 10:28:24 +0000 Subject: MFC r200444: For ia32 syscall(), call cpu_set_syscall_retval(). --- sys/amd64/amd64/vm_machdep.c | 6 +++++- sys/amd64/ia32/ia32_syscall.c | 30 +----------------------------- 2 files changed, 6 insertions(+), 30 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 6e56740..a99fdaa 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -330,10 +330,14 @@ cpu_set_syscall_retval(struct thread *td, int error) case ERESTART: /* - * Reconstruct pc, we know that 'syscall' is 2 bytes. + * Reconstruct pc, we know that 'syscall' is 2 bytes, + * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. + * We saved this in tf_err. * We have to do a full context restore so that %r10 * (which was holding the value of %rcx) is restored * for the next iteration. + * r10 restore is only required for freebsd/amd64 processes, + * but shall be innocent for any ia32 ABI. */ td->td_frame->tf_rip -= td->td_frame->tf_err; td->td_frame->tf_r10 = td->td_frame->tf_rcx; diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 4807248..5e20876 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -183,35 +183,7 @@ ia32_syscall(struct trapframe *frame) AUDIT_SYSCALL_EXIT(error, td); } - switch (error) { - case 0: - frame->tf_rax = td->td_retval[0]; - frame->tf_rdx = td->td_retval[1]; - frame->tf_rflags &= ~PSL_C; - break; - - case ERESTART: - /* - * Reconstruct pc, assuming lcall $X,y is 7 bytes, - * int 0x80 is 2 bytes. We saved this in tf_err. 
- */ - frame->tf_rip -= frame->tf_err; - break; - - case EJUSTRETURN: - break; - - default: - if (p->p_sysent->sv_errsize) { - if (error >= p->p_sysent->sv_errsize) - error = -1; /* XXX */ - else - error = p->p_sysent->sv_errtbl[error]; - } - frame->tf_rax = error; - frame->tf_rflags |= PSL_C; - break; - } + cpu_set_syscall_retval(td, error); /* * Traced syscall. -- cgit v1.1 From 6e07528a2ecb9c4786663c06930b2f0d59a50694 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 19 Dec 2009 10:38:28 +0000 Subject: MFC r200033: mca: improve status checking, recording and reporting --- sys/amd64/amd64/mca.c | 111 +++++++++++++++++++++++++++--------------------- sys/amd64/include/mca.h | 1 + 2 files changed, 63 insertions(+), 49 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index d291d00..7014f75 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -117,48 +117,6 @@ sysctl_mca_records(SYSCTL_HANDLER_ARGS) return (SYSCTL_OUT(req, &record, sizeof(record))); } -static struct mca_record * -mca_record_entry(int bank) -{ - struct mca_internal *rec; - uint64_t status; - u_int p[4]; - - status = rdmsr(MSR_MC_STATUS(bank)); - if (!(status & MC_STATUS_VAL)) - return (NULL); - - rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT | M_ZERO); - if (rec == NULL) { - printf("MCA: Unable to allocate space for an event.\n"); - return (NULL); - } - - /* Save exception information. */ - rec->rec.mr_status = status; - if (status & MC_STATUS_ADDRV) - rec->rec.mr_addr = rdmsr(MSR_MC_ADDR(bank)); - if (status & MC_STATUS_MISCV) - rec->rec.mr_misc = rdmsr(MSR_MC_MISC(bank)); - rec->rec.mr_tsc = rdtsc(); - rec->rec.mr_apic_id = PCPU_GET(apic_id); - - /* - * Clear machine check. Don't do this for uncorrectable - * errors so that the BIOS can see them. - */ - if (!(rec->rec.mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) { - wrmsr(MSR_MC_STATUS(bank), 0); - do_cpuid(0, p); - } - - mtx_lock_spin(&mca_lock); - STAILQ_INSERT_TAIL(&mca_records, rec, link); - mca_count++; - mtx_unlock_spin(&mca_lock); - return (&rec->rec); -} - static const char * mca_error_ttype(uint16_t mca_error) { @@ -219,11 +177,13 @@ mca_error_request(uint16_t mca_error) } /* Dump details about a single machine check. */ -static void -mca_log(struct mca_record *rec) +static void __nonnull(1) +mca_log(const struct mca_record *rec) { uint16_t mca_error; + printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + (long long)rec->mr_status); printf("MCA: CPU %d ", rec->mr_apic_id); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); @@ -329,6 +289,59 @@ mca_log(struct mca_record *rec) printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr); } +static int __nonnull(2) +mca_check_status(int bank, struct mca_record *rec) +{ + uint64_t status; + u_int p[4]; + + status = rdmsr(MSR_MC_STATUS(bank)); + if (!(status & MC_STATUS_VAL)) + return (0); + + /* Save exception information. */ + rec->mr_status = status; + rec->mr_bank = bank; + rec->mr_addr = 0; + if (status & MC_STATUS_ADDRV) + rec->mr_addr = rdmsr(MSR_MC_ADDR(bank)); + rec->mr_misc = 0; + if (status & MC_STATUS_MISCV) + rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); + rec->mr_tsc = rdtsc(); + rec->mr_apic_id = PCPU_GET(apic_id); + + /* + * Clear machine check. Don't do this for uncorrectable + * errors so that the BIOS can see them. 
+ */ + if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) { + wrmsr(MSR_MC_STATUS(bank), 0); + do_cpuid(0, p); + } + return (1); +} + +static void __nonnull(1) +mca_record_entry(const struct mca_record *record) +{ + struct mca_internal *rec; + + rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT); + if (rec == NULL) { + printf("MCA: Unable to allocate space for an event.\n"); + mca_log(record); + return; + } + + rec->rec = *record; + rec->logged = 0; + mtx_lock_spin(&mca_lock); + STAILQ_INSERT_TAIL(&mca_records, rec, link); + mca_count++; + mtx_unlock_spin(&mca_lock); +} + /* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are @@ -341,7 +354,7 @@ mca_log(struct mca_record *rec) static int mca_scan(int mcip) { - struct mca_record *rec; + struct mca_record rec; uint64_t mcg_cap, ucmask; int count, i, recoverable; @@ -354,13 +367,13 @@ mca_scan(int mcip) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - rec = mca_record_entry(i); - if (rec != NULL) { + if (mca_check_status(i, &rec)) { count++; - if (rec->mr_status & ucmask) { + if (rec.mr_status & ucmask) { recoverable = 0; - mca_log(rec); + mca_log(&rec); } + mca_record_entry(&rec); } } return (mcip ? recoverable : count); diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index c43d989..ddc3aeb 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -36,6 +36,7 @@ struct mca_record { uint64_t mr_misc; uint64_t mr_tsc; int mr_apic_id; + int mr_bank; }; #ifdef _KERNEL -- cgit v1.1 From 2d6460c70b9c9e42fa2165c305d435aabe264008 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 19 Dec 2009 10:44:26 +0000 Subject: MFC r200064: mca: small enhancements related to cpu quirks --- sys/amd64/amd64/mca.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 7014f75..0403de4 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -478,6 +479,8 @@ void mca_init(void) { uint64_t mcg_cap; + uint64_t ctl; + int skip; int i; /* MCE is required. */ @@ -495,15 +498,26 @@ mca_init(void) wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - /* - * Enable logging of all errors. For P6 - * processors, MC0_CTL is always enabled. - * - * XXX: Better CPU test needed here? - */ - if (!(i == 0 && (cpu_id & 0xf00) == 0x600)) - wrmsr(MSR_MC_CTL(i), 0xffffffffffffffffUL); + /* By default enable logging of all errors. */ + ctl = 0xffffffffffffffffUL; + skip = 0; + + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + /* + * For P6 models before Nehalem MC0_CTL is + * always enabled and reserved. + */ + if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6 + && CPUID_TO_MODEL(cpu_id) < 0x1a) + skip = 1; + } else if (cpu_vendor_id == CPU_VENDOR_AMD) { + /* BKDG for Family 10h: unset GartTblWkEn. */ + if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf) + ctl &= ~(1UL << 10); + } + if (!skip) + wrmsr(MSR_MC_CTL(i), ctl); /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } -- cgit v1.1 From d1f389d774670bd276da5e368fad88de21804cbb Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 19 Dec 2009 11:31:28 +0000 Subject: MFC r198507: Use kern_sigprocmask() instead of direct manipulation of td_sigmask to reschedule newly blocked signals. 
MFC r198590: Trapsignal() calls kern_sigprocmask() when delivering catched signal with proc lock held. MFC r198670: For trapsignal() and postsig(), kern_sigprocmask() is called with both process lock and curproc->p_sigacts->ps_mtx locked. Prevent lock recursion on ps_mtx in reschedule_signals(). --- sys/amd64/amd64/machdep.c | 8 ++------ sys/amd64/ia32/ia32_signal.c | 30 ++++++++---------------------- sys/amd64/linux32/linux32_sysvec.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 40 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index e4c51a3..c4130a4 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -415,7 +415,7 @@ sigreturn(td, uap) ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; - const ucontext_t *ucp; + ucontext_t *ucp; long rflags; int cs, error, ret; ksiginfo_t ksi; @@ -478,7 +478,6 @@ sigreturn(td, uap) td->td_pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; td->td_pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; - PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; @@ -486,10 +485,7 @@ sigreturn(td, uap) td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_flags |= PCB_FULLCTX; td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index d7c1dd5..10ec641 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -244,10 +244,8 @@ freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap) if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { - SIG_CANTMASK(uc.uc_sigmask); - PROC_LOCK(td->td_proc); - td->td_sigmask = uc.uc_sigmask; - PROC_UNLOCK(td->td_proc); + kern_sigprocmask(td, SIG_SETMASK, + &uc.uc_sigmask, NULL, 0); } } } @@ -273,10 +271,8 @@ freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap) if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { - SIG_CANTMASK(uc.uc_sigmask); - PROC_LOCK(td->td_proc); - td->td_sigmask = uc.uc_sigmask; - PROC_UNLOCK(td->td_proc); + kern_sigprocmask(td, SIG_SETMASK, + &uc.uc_sigmask, NULL, 0); } } } @@ -544,9 +540,8 @@ freebsd4_freebsd32_sigreturn(td, uap) } */ *uap; { struct ia32_ucontext4 uc; - struct proc *p = td->td_proc; struct trapframe *regs; - const struct ia32_ucontext4 *ucp; + struct ia32_ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; @@ -610,11 +605,7 @@ freebsd4_freebsd32_sigreturn(td, uap) regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; - PROC_LOCK(p); - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } @@ -631,9 +622,8 @@ freebsd32_sigreturn(td, uap) } */ *uap; { struct ia32_ucontext uc; - struct proc *p = td->td_proc; struct trapframe *regs; - const struct ia32_ucontext *ucp; + struct ia32_ucontext *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; @@ -702,11 +692,7 @@ freebsd32_sigreturn(td, uap) regs->tf_gs = ucp->uc_mcontext.mc_gs; regs->tf_flags = TF_HASSEGS; - PROC_LOCK(p); - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, 
SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 54a04ee..6e3e326 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -565,9 +565,9 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { - struct proc *p = td->td_proc; struct l_sigframe frame; struct trapframe *regs; + sigset_t bmask; l_sigset_t lmask; int eflags, i; ksiginfo_t ksi; @@ -623,11 +623,8 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; - PROC_LOCK(p); - linux_to_bsd_sigset(&lmask, &td->td_sigmask); - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + linux_to_bsd_sigset(&lmask, &bmask); + kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. @@ -666,9 +663,9 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { - struct proc *p = td->td_proc; struct l_ucontext uc; struct l_sigcontext *context; + sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; @@ -725,11 +722,8 @@ linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) return(EINVAL); } - PROC_LOCK(p); - linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); + kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context -- cgit v1.1 From dca5db95ead46cf3b3e992a4b6ce1b96273e1df5 Mon Sep 17 00:00:00 2001 From: avg Date: Mon, 21 Dec 2009 05:58:55 +0000 Subject: MFC r199969: amdsbwd: new driver for AMD SB600/SB7xx watchdog timer --- sys/amd64/conf/NOTES | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 5361224..a231d33 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -385,8 +385,10 @@ device asmc # Hardware watchdog timers: # # ichwd: Intel ICH watchdog timer +# amdsbwd: AMD SB7xx watchdog timer # device ichwd +device amdsbwd # # Temperature sensors: -- cgit v1.1 From 0b09bc897f9960c759f1437056dd1dae0a7f27b0 Mon Sep 17 00:00:00 2001 From: dougb Date: Tue, 29 Dec 2009 05:35:25 +0000 Subject: MFC r200594: Add INCLUDE_CONFIG_FILE, and a note in comments about how to also include the comments with CONFIGARGS --- sys/amd64/conf/DEFAULTS | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/DEFAULTS b/sys/amd64/conf/DEFAULTS index 78952ee..d6d39b6 100644 --- a/sys/amd64/conf/DEFAULTS +++ b/sys/amd64/conf/DEFAULTS @@ -21,3 +21,9 @@ options GEOM_PART_EBR options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR +# Store the plain version of the configuration file in the kernel itself. +# To store the entire file, including comments, put this in /etc/src.conf: +# CONFIGARGS= -C +# See config(8) for more details. +# +options INCLUDE_CONFIG_FILE # Include this file in kernel -- cgit v1.1 From cf0d4c606046680632b3e1b5711ae921a707be70 Mon Sep 17 00:00:00 2001 From: imp Date: Mon, 4 Jan 2010 21:33:10 +0000 Subject: Revert 201158. 
DEFAULTS isn't for this kind of thing.a --- sys/amd64/conf/DEFAULTS | 7 ------- 1 file changed, 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/DEFAULTS b/sys/amd64/conf/DEFAULTS index d6d39b6..1fb52b3 100644 --- a/sys/amd64/conf/DEFAULTS +++ b/sys/amd64/conf/DEFAULTS @@ -20,10 +20,3 @@ options GEOM_PART_BSD options GEOM_PART_EBR options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR - -# Store the plain version of the configuration file in the kernel itself. -# To store the entire file, including comments, put this in /etc/src.conf: -# CONFIGARGS= -C -# See config(8) for more details. -# -options INCLUDE_CONFIG_FILE # Include this file in kernel -- cgit v1.1 From e906e61885fb51fb5334cec9007bd7cd17507e99 Mon Sep 17 00:00:00 2001 From: brooks Date: Tue, 12 Jan 2010 06:00:56 +0000 Subject: MFC r201443: Add vlan(4) to all GENERIC kernels. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index f205106..6dc5c2c 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -265,6 +265,7 @@ device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. device loop # Network loopback device random # Entropy device device ether # Ethernet support +device vlan # 802.1Q VLAN support device tun # Packet tunnel. device pty # BSD-style compatibility pseudo ttys device md # Memory "disks" -- cgit v1.1 From aa96e9a5bce21018931dd47a420764621121186f Mon Sep 17 00:00:00 2001 From: kib Date: Fri, 15 Jan 2010 22:19:51 +0000 Subject: MFC r201890: Set md_ldt after md_ldt_sd is populated. --- sys/amd64/amd64/sys_machdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index 1cba8a2..bb81664 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -420,13 +420,14 @@ user_ldt_alloc(struct proc *p, int force) return (pldt); } - mdp->md_ldt = new_ldt; if (pldt != NULL) { bcopy(pldt->ldt_base, new_ldt->ldt_base, max_ldt_segment * sizeof(struct user_segment_descriptor)); user_ldt_derefl(pldt); } ssdtosyssd(&sldt, &p->p_md.md_ldt_sd); + atomic_store_rel_ptr((volatile uintptr_t *)&mdp->md_ldt, + (uintptr_t)new_ldt); if (p == curproc) set_user_ldt(mdp); -- cgit v1.1 From 258a09a63f93b76b1ccd2c3d907309d3556f50b7 Mon Sep 17 00:00:00 2001 From: imp Date: Mon, 18 Jan 2010 00:53:21 +0000 Subject: MFC r202019: Add INCLUDE_CONFIG_FILE in GENERIC on all non-embedded platforms. # This is the resolution of removing it from DEFAULTS... --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 6dc5c2c..e5a6955 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -75,6 +75,7 @@ options MAC # TrustedBSD MAC Framework options FLOWTABLE # per-cpu routing cache #options KDTRACE_FRAME # Ensure frames are compiled in #options KDTRACE_HOOKS # Kernel DTrace hooks +options INCLUDE_CONFIG_FILE # Include this file in kernel # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel -- cgit v1.1 From f870e8630e0e421cce286c928d950a148a741bd3 Mon Sep 17 00:00:00 2001 From: alc Date: Mon, 18 Jan 2010 21:17:03 +0000 Subject: MFC r202085 Simplify pmap_init(). Additionally, correct a harmless misbehavior on i386. 
--- sys/amd64/amd64/pmap.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 70fc041..b26cc68 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -626,7 +626,6 @@ pmap_page_init(vm_page_t m) void pmap_init(void) { - pd_entry_t *pd; vm_page_t mpte; vm_size_t s; int i, pv_npg; @@ -635,18 +634,13 @@ pmap_init(void) * Initialize the vm page array entries for the kernel pmap's * page table pages. */ - pd = pmap_pde(kernel_pmap, KERNBASE); for (i = 0; i < NKPT; i++) { - if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V)) - continue; - KASSERT((pd[i] & PG_V) != 0, - ("pmap_init: page table page is missing")); - mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME); + mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; - mpte->phys_addr = pd[i] & PG_FRAME; + mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* -- cgit v1.1 From ed3f8b6cbec7668d7c3684c50f6db189f85ee4d7 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 21 Jan 2010 15:10:20 +0000 Subject: MFC 202286: Update the ident for the XENHVM kernel config to match the filename. --- sys/amd64/conf/XENHVM | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index 1536e3c..f875f5a 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -19,7 +19,7 @@ # $FreeBSD$ cpu HAMMER -ident GENERIC +ident XENHVM # To statically compile in device wiring instead of /boot/device.hints #hints "GENERIC.hints" # Default places to look for devices. -- cgit v1.1 From 41fd8cafd516ff7948cdf8e99dbdb1e760f8df59 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 21 Jan 2010 17:54:29 +0000 Subject: MFC 198134,198149,198170,198171,198391,200948: Add a facility for associating optional descriptions with active interrupt handlers. This is primarily intended as a way to allow devices that use multiple interrupts (e.g. MSI) to meaningfully distinguish the various interrupt handlers. - Add a new BUS_DESCRIBE_INTR() method to the bus interface to associate a description with an active interrupt handler setup by BUS_SETUP_INTR. It has a default method (bus_generic_describe_intr()) which simply passes the request up to the parent device. - Add a bus_describe_intr() wrapper around BUS_DESCRIBE_INTR() that supports printf(9) style formatting using var args. - Reserve MAXCOMLEN bytes in the intr_handler structure to hold the name of an interrupt handler and copy the name passed to intr_event_add_handler() into that buffer instead of just saving the pointer to the name. - Add a new intr_event_describe_handler() which appends a description string to an interrupt handler's name. - Implement support for interrupt descriptions on amd64, i386, and sparc64 by having the nexus(4) driver supply a custom bus_describe_intr method that invokes a new intr_describe() MD routine which in turn looks up the associated interrupt event and invokes intr_event_describe_handler(). 
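
The bus_describe_intr() wrapper introduced by the commit above is meant to be called by a driver after bus_setup_intr() has returned its cookie. The fragment below is only an illustrative sketch, not part of the patch: the driver softc fields, the "tx_intr" handler, and the "txq%d" naming are invented for the example. It shows how a multi-vector (e.g. MSI-X) driver could tag each active handler so per-handler interrupt statistics carry distinguishable names.

	/*
	 * Hypothetical driver fragment; sc->irq_res[], sc->intr_tag[],
	 * sc->num_queues and tx_intr() are assumptions for illustration.
	 */
	for (i = 0; i < sc->num_queues; i++) {
		error = bus_setup_intr(dev, sc->irq_res[i],
		    INTR_TYPE_NET | INTR_MPSAFE, NULL, tx_intr,
		    &sc->queue[i], &sc->intr_tag[i]);
		if (error)
			return (error);
		/* printf(9)-style description appended to the handler name. */
		bus_describe_intr(dev, sc->irq_res[i], sc->intr_tag[i],
		    "txq%d", i);
	}
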
--- sys/amd64/amd64/intr_machdep.c | 17 +++++++++++++++++ sys/amd64/amd64/nexus.c | 12 ++++++++++++ sys/amd64/include/intr_machdep.h | 1 + 3 files changed, 30 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/intr_machdep.c b/sys/amd64/amd64/intr_machdep.c index 212ac0d..6ab80df 100644 --- a/sys/amd64/amd64/intr_machdep.c +++ b/sys/amd64/amd64/intr_machdep.c @@ -400,6 +400,23 @@ atpic_reset(void) } #endif +/* Add a description to an active interrupt handler. */ +int +intr_describe(u_int vector, void *ih, const char *descr) +{ + struct intsrc *isrc; + int error; + + isrc = intr_lookup_source(vector); + if (isrc == NULL) + return (EINVAL); + error = intr_event_describe_handler(isrc->is_event, ih, descr); + if (error) + return (error); + intrcnt_updatename(isrc); + return (0); +} + #ifdef DDB /* * Dump data about interrupt handlers diff --git a/sys/amd64/amd64/nexus.c b/sys/amd64/amd64/nexus.c index 5eafd3b..61cb587 100644 --- a/sys/amd64/amd64/nexus.c +++ b/sys/amd64/amd64/nexus.c @@ -92,6 +92,9 @@ static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif static int nexus_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); +static int nexus_describe_intr(device_t dev, device_t child, + struct resource *irq, void *cookie, + const char *descr); static int nexus_activate_resource(device_t, device_t, int, int, struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, @@ -135,6 +138,7 @@ static device_method_t nexus_methods[] = { DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), + DEVMETHOD(bus_describe_intr, nexus_describe_intr), DEVMETHOD(bus_get_resource_list, nexus_get_reslist), DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), @@ -479,6 +483,14 @@ nexus_config_intr(device_t dev, int irq, enum intr_trigger trig, return (intr_config_intr(irq, trig, pol)); } +static int +nexus_describe_intr(device_t dev, device_t child, struct resource *irq, + void *cookie, const char *descr) +{ + + return (intr_describe(rman_get_start(irq), cookie, descr)); +} + static struct resource_list * nexus_get_reslist(device_t dev, device_t child) { diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h index 634db19..6cd4eee 100644 --- a/sys/amd64/include/intr_machdep.h +++ b/sys/amd64/include/intr_machdep.h @@ -151,6 +151,7 @@ int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); +int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); u_int intr_next_cpu(void); struct intsrc *intr_lookup_source(int vector); -- cgit v1.1 From 8be646a544a7d87a9a3ceff771365482bb960a53 Mon Sep 17 00:00:00 2001 From: marcel Date: Fri, 22 Jan 2010 03:50:43 +0000 Subject: MFC rev. 202097: Use io(4) for I/O port access on ia64, rather than through sysarch(2). 
--- sys/amd64/amd64/io.c | 9 +++++++++ sys/amd64/include/iodev.h | 1 + 2 files changed, 10 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/io.c b/sys/amd64/amd64/io.c index 02d9c8d..09d6e89 100644 --- a/sys/amd64/amd64/io.c +++ b/sys/amd64/amd64/io.c @@ -76,3 +76,12 @@ ioclose(struct cdev *dev __unused, int flags __unused, int fmt __unused, return (0); } + +/* ARGSUSED */ +int +ioioctl(struct cdev *dev __unused, u_long cmd __unused, caddr_t data __unused, + int fflag __unused, struct thread *td __unused) +{ + + return (ENXIO); +} diff --git a/sys/amd64/include/iodev.h b/sys/amd64/include/iodev.h index 4b35d8b..1a0a17a 100644 --- a/sys/amd64/include/iodev.h +++ b/sys/amd64/include/iodev.h @@ -28,3 +28,4 @@ d_open_t ioopen; d_close_t ioclose; +d_ioctl_t ioioctl; -- cgit v1.1 From 250c6042c1a01d96dc2843c5fb8641674678684a Mon Sep 17 00:00:00 2001 From: gavin Date: Fri, 5 Feb 2010 08:52:51 +0000 Subject: Merge r202161 from head: Spell "Hz" correctly wherever it is user-visible. PR: bin/142566 Submitted by: N.J. Mann njm njm.me.uk --- sys/amd64/amd64/local_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 87bec91..98ed4df 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -448,7 +448,7 @@ lapic_setup_clock(void) panic("lapic: Divisor too big"); value /= 2; if (bootverbose) - printf("lapic: Divisor %lu, Frequency %lu hz\n", + printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, value); /* -- cgit v1.1 From 105ceef6e77714e5419b12c57bcc1fa98d485e01 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 6 Feb 2010 12:17:20 +0000 Subject: MFC r203160: add static qualifier to definition of a static function --- sys/amd64/amd64/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/msi.c b/sys/amd64/amd64/msi.c index 91a8cbb..6745ce2 100644 --- a/sys/amd64/amd64/msi.c +++ b/sys/amd64/amd64/msi.c @@ -288,7 +288,7 @@ msi_init(void) mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } -void +static void msi_create_source(void) { struct msi_intsrc *msi; -- cgit v1.1 From e896a698a55dac3d00b1d933e56575d08d06bf37 Mon Sep 17 00:00:00 2001 From: kib Date: Sun, 7 Feb 2010 11:37:38 +0000 Subject: MFC r202882: For i386, amd64 and ia32 on amd64 MD syscall(), reread syscall number and arguments after ptracestop(), if debugger modified anything in the process environment. --- sys/amd64/amd64/trap.c | 176 +++++++++++++++++++++++++----------------- sys/amd64/ia32/ia32_syscall.c | 149 +++++++++++++++++++++-------------- 2 files changed, 200 insertions(+), 125 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 5583c82..41ca758 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -884,95 +884,131 @@ dblfault_handler(struct trapframe *frame) panic("double fault"); } -/* - * syscall - system call request C handler - * - * A system call is essentially treated as a trap. 
- */ -void -syscall(struct trapframe *frame) -{ - caddr_t params; +struct syscall_args { + u_int code; struct sysent *callp; - struct thread *td = curthread; - struct proc *p = td->td_proc; - register_t orig_tf_rflags; - int error; - int narg; register_t args[8]; register_t *argp; - u_int code; - int reg, regcnt; - ksiginfo_t ksi; - - PCPU_INC(cnt.v_syscall); + int narg; +}; -#ifdef DIAGNOSTIC - if (ISPL(frame->tf_cs) != SEL_UPL) { - panic("syscall"); - /* NOT REACHED */ - } -#endif +static int +fetch_syscall_args(struct thread *td, struct syscall_args *sa) +{ + struct proc *p; + struct trapframe *frame; + caddr_t params; + int reg, regcnt, error; + p = td->td_proc; + frame = td->td_frame; reg = 0; regcnt = 6; - td->td_pticks = 0; - td->td_frame = frame; - if (td->td_ucred != p->p_ucred) - cred_update_thread(td); + params = (caddr_t)frame->tf_rsp + sizeof(register_t); - code = frame->tf_rax; - orig_tf_rflags = frame->tf_rflags; + sa->code = frame->tf_rax; if (p->p_sysent->sv_prepsyscall) { - (*p->p_sysent->sv_prepsyscall)(frame, (int *)args, &code, ¶ms); + (*p->p_sysent->sv_prepsyscall)(frame, (int *)sa->args, + &sa->code, ¶ms); } else { - if (code == SYS_syscall || code == SYS___syscall) { - code = frame->tf_rdi; + if (sa->code == SYS_syscall || sa->code == SYS___syscall) { + sa->code = frame->tf_rdi; reg++; regcnt--; } } - if (p->p_sysent->sv_mask) - code &= p->p_sysent->sv_mask; + sa->code &= p->p_sysent->sv_mask; - if (code >= p->p_sysent->sv_size) - callp = &p->p_sysent->sv_table[0]; + if (sa->code >= p->p_sysent->sv_size) + sa->callp = &p->p_sysent->sv_table[0]; else - callp = &p->p_sysent->sv_table[code]; + sa->callp = &p->p_sysent->sv_table[sa->code]; - narg = callp->sy_narg; - KASSERT(narg <= sizeof(args) / sizeof(args[0]), + sa->narg = sa->callp->sy_narg; + KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; - argp = &frame->tf_rdi; - argp += reg; - bcopy(argp, args, sizeof(args[0]) * regcnt); - if (narg > regcnt) { + sa->argp = &frame->tf_rdi; + sa->argp += reg; + bcopy(sa->argp, sa->args, sizeof(sa->args[0]) * regcnt); + if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); - error = copyin(params, &args[regcnt], - (narg - regcnt) * sizeof(args[0])); + error = copyin(params, &sa->args[regcnt], + (sa->narg - regcnt) * sizeof(sa->args[0])); } - argp = &args[0]; + sa->argp = &sa->args[0]; + /* + * This may result in two records if debugger modified + * registers or memory during sleep at stop/ptrace point. + */ #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) - ktrsyscall(code, narg, argp); + ktrsyscall(sa->code, sa->narg, sa->argp); #endif + return (error); +} - CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_name, code); +/* + * syscall - system call request C handler + * + * A system call is essentially treated as a trap. 
+ */ +void +syscall(struct trapframe *frame) +{ + struct thread *td; + struct proc *p; + struct syscall_args sa; + register_t orig_tf_rflags; + int error; + ksiginfo_t ksi; + PCPU_INC(cnt.v_syscall); + td = curthread; + p = td->td_proc; td->td_syscalls++; +#ifdef DIAGNOSTIC + if (ISPL(frame->tf_cs) != SEL_UPL) { + panic("syscall"); + /* NOT REACHED */ + } +#endif + + td->td_pticks = 0; + td->td_frame = frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + orig_tf_rflags = frame->tf_rflags; + if (p->p_flag & P_TRACED) { + PROC_LOCK(p); + td->td_dbgflags &= ~TDB_USERWR; + PROC_UNLOCK(p); + } + error = fetch_syscall_args(td, &sa); + + CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, + td->td_proc->p_pid, td->td_name, sa.code); + if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; - STOPEVENT(p, S_SCE, narg); - + STOPEVENT(p, S_SCE, sa.narg); PTRACESTOP_SC(p, td, S_PT_SCE); + if (td->td_dbgflags & TDB_USERWR) { + /* + * Reread syscall number and arguments if + * debugger modified registers or memory. + */ + error = fetch_syscall_args(td, &sa); + if (error != 0) + goto retval; + td->td_retval[1] = frame->tf_rdx; + } #ifdef KDTRACE_HOOKS /* @@ -980,13 +1016,13 @@ syscall(struct trapframe *frame) * callback and if there is a probe active for the * syscall 'entry', process the probe. */ - if (systrace_probe_func != NULL && callp->sy_entry != 0) - (*systrace_probe_func)(callp->sy_entry, code, callp, - args); + if (systrace_probe_func != NULL && sa.callp->sy_entry != 0) + (*systrace_probe_func)(sa.callp->sy_entry, sa.code, + sa.callp, sa.args); #endif - AUDIT_SYSCALL_ENTER(code, td); - error = (*callp->sy_call)(td, argp); + AUDIT_SYSCALL_ENTER(sa.code, td); + error = (*sa.callp->sy_call)(td, sa.argp); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ @@ -998,12 +1034,12 @@ syscall(struct trapframe *frame) * callback and if there is a probe active for the * syscall 'return', process the probe. */ - if (systrace_probe_func != NULL && callp->sy_return != 0) - (*systrace_probe_func)(callp->sy_return, code, callp, - args); + if (systrace_probe_func != NULL && sa.callp->sy_return != 0) + (*systrace_probe_func)(sa.callp->sy_return, sa.code, + sa.callp, sa.args); #endif } - + retval: cpu_set_syscall_retval(td, error); /* @@ -1022,14 +1058,16 @@ syscall(struct trapframe *frame) * Check for misbehavior. */ WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + syscallnames[sa.code] : "???"); KASSERT(td->td_critnest == 0, ("System call %s returning in a critical section", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???")); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + syscallnames[sa.code] : "???")); KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???", - td->td_locks)); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? 
+ syscallnames[sa.code] : "???", td->td_locks)); /* * Handle reschedule and other end-of-syscall issues @@ -1037,11 +1075,11 @@ syscall(struct trapframe *frame) userret(td, frame); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_name, code); + td->td_proc->p_pid, td->td_name, sa.code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) - ktrsysret(code, error, td->td_retval[0]); + ktrsysret(sa.code, error, td->td_retval[0]); #endif /* @@ -1049,7 +1087,7 @@ syscall(struct trapframe *frame) * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ - STOPEVENT(p, S_SCX, code); + STOPEVENT(p, S_SCX, sa.code); PTRACESTOP_SC(p, td, S_PT_SCX); } diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 5e20876..aa1ae6c 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -88,101 +88,136 @@ extern const char *freebsd32_syscallnames[]; void ia32_syscall(struct trapframe *frame); /* Called from asm code */ -void -ia32_syscall(struct trapframe *frame) -{ +struct ia32_syscall_args { + u_int code; caddr_t params; - int i; struct sysent *callp; - struct thread *td = curthread; - struct proc *p = td->td_proc; - register_t orig_tf_rflags; - int error; + u_int64_t args64[8]; int narg; +}; + +static int +fetch_ia32_syscall_args(struct thread *td, struct ia32_syscall_args *sa) +{ + struct proc *p; + struct trapframe *frame; u_int32_t args[8]; - u_int64_t args64[8]; - u_int code; - ksiginfo_t ksi; + int error, i; - PCPU_INC(cnt.v_syscall); - td->td_pticks = 0; - td->td_frame = frame; - if (td->td_ucred != p->p_ucred) - cred_update_thread(td); - params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); - code = frame->tf_rax; - orig_tf_rflags = frame->tf_rflags; + p = td->td_proc; + frame = td->td_frame; + + sa->params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); + sa->code = frame->tf_rax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ - (*p->p_sysent->sv_prepsyscall)(frame, args, &code, ¶ms); + (*p->p_sysent->sv_prepsyscall)(frame, args, &sa->code, + &sa->params); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ - if (code == SYS_syscall) { + if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ - code = fuword32(params); - params += sizeof(int); - } else if (code == SYS___syscall) { + sa->code = fuword32(sa->params); + sa->params += sizeof(int); + } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. 
*/ - code = fuword32(params); - params += sizeof(quad_t); + sa->code = fuword32(sa->params); + sa->params += sizeof(quad_t); } } - if (p->p_sysent->sv_mask) - code &= p->p_sysent->sv_mask; - - if (code >= p->p_sysent->sv_size) - callp = &p->p_sysent->sv_table[0]; + sa->code &= p->p_sysent->sv_mask; + if (sa->code >= p->p_sysent->sv_size) + sa->callp = &p->p_sysent->sv_table[0]; else - callp = &p->p_sysent->sv_table[code]; - - narg = callp->sy_narg; + sa->callp = &p->p_sysent->sv_table[sa->code]; + sa->narg = sa->callp->sy_narg; - /* - * copyin and the ktrsyscall()/ktrsysret() code is MP-aware - */ - if (params != NULL && narg != 0) - error = copyin(params, (caddr_t)args, - (u_int)(narg * sizeof(int))); + if (sa->params != NULL && sa->narg != 0) + error = copyin(sa->params, (caddr_t)args, + (u_int)(sa->narg * sizeof(int))); else error = 0; - for (i = 0; i < narg; i++) - args64[i] = args[i]; + for (i = 0; i < sa->narg; i++) + sa->args64[i] = args[i]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) - ktrsyscall(code, narg, args64); + ktrsyscall(sa->code, sa->narg, sa->args64); #endif + + return (error); +} + +void +ia32_syscall(struct trapframe *frame) +{ + struct thread *td; + struct proc *p; + struct ia32_syscall_args sa; + register_t orig_tf_rflags; + int error; + ksiginfo_t ksi; + + PCPU_INC(cnt.v_syscall); + td = curthread; + p = td->td_proc; + td->td_syscalls++; + + td->td_pticks = 0; + td->td_frame = frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + orig_tf_rflags = frame->tf_rflags; + if (p->p_flag & P_TRACED) { + PROC_LOCK(p); + td->td_dbgflags &= ~TDB_USERWR; + PROC_UNLOCK(p); + } + error = fetch_ia32_syscall_args(td, &sa); + CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_proc->p_comm, code); + td->td_proc->p_pid, td->td_name, sa.code); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; - STOPEVENT(p, S_SCE, narg); - + STOPEVENT(p, S_SCE, sa.narg); PTRACESTOP_SC(p, td, S_PT_SCE); + if (td->td_dbgflags & TDB_USERWR) { + /* + * Reread syscall number and arguments if + * debugger modified registers or memory. + */ + error = fetch_ia32_syscall_args(td, &sa); + if (error != 0) + goto retval; + td->td_retval[1] = frame->tf_rdx; + } - AUDIT_SYSCALL_ENTER(code, td); - error = (*callp->sy_call)(td, args64); + AUDIT_SYSCALL_ENTER(sa.code, td); + error = (*sa.callp->sy_call)(td, sa.args64); AUDIT_SYSCALL_EXIT(error, td); - } + /* Save the latest error return value. */ + td->td_errno = error; + } + retval: cpu_set_syscall_retval(td, error); /* @@ -201,14 +236,16 @@ ia32_syscall(struct trapframe *frame) * Check for misbehavior. */ WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???"); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + freebsd32_syscallnames[sa.code] : "???"); KASSERT(td->td_critnest == 0, ("System call %s returning in a critical section", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???")); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + freebsd32_syscallnames[sa.code] : "???")); KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???", - td->td_locks)); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? 
+ freebsd32_syscallnames[sa.code] : "???", td->td_locks)); /* * Handle reschedule and other end-of-syscall issues @@ -216,10 +253,10 @@ ia32_syscall(struct trapframe *frame) userret(td, frame); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_proc->p_comm, code); + td->td_proc->p_pid, td->td_proc->p_comm, sa.code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) - ktrsysret(code, error, td->td_retval[0]); + ktrsysret(sa.code, error, td->td_retval[0]); #endif /* @@ -227,7 +264,7 @@ ia32_syscall(struct trapframe *frame) * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ - STOPEVENT(p, S_SCX, code); + STOPEVENT(p, S_SCX, sa.code); PTRACESTOP_SC(p, td, S_PT_SCX); } -- cgit v1.1 From 05b666175c3574ab196b72aa01279fbccde0db29 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 2 Mar 2010 01:56:55 +0000 Subject: MFC x86emu/x86bios emulator and make previously i386 only dpms and vesa framebuffer driver, etc. work on FreeBSD/amd64. A significant amount of improvements were done by jkim@ during the recent months to make vesa(4) work better, over the initial code import. This work is based on OpenBSD's x86emu implementation and contributed by paradox and swell.k at gmail com. Hopefully I have stolen all their work to 8-STABLE :) All bugs in this commit are mine, as usual. --- sys/amd64/conf/NOTES | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index a231d33..159f12e 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -154,6 +154,17 @@ options AGP_DEBUG ##################################################################### # HARDWARE DEVICE CONFIGURATION +# To include support for VGA VESA video modes +options VESA + +# Turn on extra debugging checks and output for VESA support. +options VESA_DEBUG + +device dpms # DPMS suspend & resume via VESA BIOS + +# x86 real mode BIOS emulator, required by atkbdc/dpms/vesa +options X86BIOS + # # Optional devices: # @@ -213,6 +224,9 @@ options VGA_WIDTH90 # support 90 column modes # Debugging. options VGA_DEBUG +# Linear framebuffer driver for S3 VESA 1.2 cards. Works on top of VESA. +device s3pci + # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as -- cgit v1.1 From 39a08e2d4d047871df11c95a4989509bcede329c Mon Sep 17 00:00:00 2001 From: alc Date: Tue, 2 Mar 2010 16:29:08 +0000 Subject: MFC r204420 When running as a guest operating system, the FreeBSD kernel must assume that the virtual machine monitor has enabled machine check exceptions. Unfortunately, on AMD Family 10h processors the machine check hardware has a bug (Erratum 383) that can result in a false machine check exception when a superpage promotion occurs. Thus, I am disabling superpage promotion when the FreeBSD kernel is running as a guest operating system on an AMD Family 10h processor. 
--- sys/amd64/amd64/pmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index b26cc68..7bb81cc 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -654,6 +654,15 @@ pmap_init(void) pv_entry_high_water = 9 * (pv_entry_max / 10); /* + * Disable large page mappings by default if the kernel is running in + * a virtual machine on an AMD Family 10h processor. This is a work- + * around for Erratum 383. + */ + if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10) + pg_ps_enabled = 0; + + /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); -- cgit v1.1 From 9bc472d6f744e6faa934776114dd11efb219ced6 Mon Sep 17 00:00:00 2001 From: jhb Date: Mon, 8 Mar 2010 21:36:20 +0000 Subject: MFC 204518: Print the contents of the miscellaneous (MISC) register to the console if it is valid along with the other register values when a machine check is encountered. --- sys/amd64/amd64/mca.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 0403de4..b0e842a 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -288,6 +288,8 @@ mca_log(const struct mca_record *rec) printf("\n"); if (rec->mr_status & MC_STATUS_ADDRV) printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr); + if (rec->mr_status & MC_STATUS_MISCV) + printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); } static int __nonnull(2) -- cgit v1.1 From 5906cbf86b9bd4b4a3d60c8276db5df0a4742d72 Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 24 Mar 2010 09:45:17 +0000 Subject: MFC r204957: Fall back to wbinvd when region for CLFLUSH is >= 2MB. MFC r205334 (by avg): Fix a typo in a comment. --- sys/amd64/amd64/pmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 7bb81cc..0935506 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -941,7 +941,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) if (cpu_feature & CPUID_SS) ; /* If "Self Snoop" is supported, do nothing. */ - else if (cpu_feature & CPUID_CLFSH) { + else if ((cpu_feature & CPUID_CLFSH) != 0 && + eva - sva < 2 * 1024 * 1024) { /* * Otherwise, do per-cache line flush. Use the mfence @@ -958,7 +959,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) /* * No targeted cache flush methods are supported by CPU, - * globally invalidate cache as a last resort. + * or the supplied range is bigger than 2MB. + * Globally invalidate cache. */ pmap_invalidate_cache(); } -- cgit v1.1 From 52188d4fac4c52b1c94084a738e21504945c2a0b Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 25 Mar 2010 15:48:23 +0000 Subject: MFC 205013: Print out the family and model from the cpu_id. This is especially useful given the advent of the extended family and extended model fields. The values are printed in hex to match their common usage in documentation. 
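As a worked example of why the extended fields matter (the cpu_id value here is illustrative, not from the commit): an AMD Family 10h processor might report cpu_id = 0x100f42. The base family nibble is 0xf, so the extended family (0x01) is added, giving Family = 0x10; the extended model field is 0, so Model = 0x4; the low nibble is the Stepping, 2. The boot message therefore changes from just "Stepping = 2" to roughly:

	Family = 10  Model = 4  Stepping = 2

which matches the hex convention ("Family 10h") used in vendor documentation.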
--- sys/amd64/amd64/identcpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 3cd2f5e..b0da729 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -187,7 +187,9 @@ printcpuinfo(void) if (cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_CENTAUR) { - printf(" Stepping = %u", cpu_id & 0xf); + printf(" Family = %x", CPUID_TO_FAMILY(cpu_id)); + printf(" Model = %x", CPUID_TO_MODEL(cpu_id)); + printf(" Stepping = %u", cpu_id & CPUID_STEPPING); if (cpu_high > 0) { /* -- cgit v1.1 From e3fe54954e01dab6b1f05a15f0d43b54ce512210 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 13:01:30 +0000 Subject: MFC 205210,205448: Remove unneeded type specifiers from 64-bit constants. The compiler infers their natural type from the constants' values. --- sys/amd64/include/specialreg.h | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 8cadbcd..733f4d7 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -320,16 +320,16 @@ #define MTRR_N64K 8 /* numbers of fixed-size entries */ #define MTRR_N16K 16 #define MTRR_N4K 64 -#define MTRR_CAP_WC 0x0000000000000400UL -#define MTRR_CAP_FIXED 0x0000000000000100UL -#define MTRR_CAP_VCNT 0x00000000000000ffUL -#define MTRR_DEF_ENABLE 0x0000000000000800UL -#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400UL -#define MTRR_DEF_TYPE 0x00000000000000ffUL -#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000UL -#define MTRR_PHYSBASE_TYPE 0x00000000000000ffUL -#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000UL -#define MTRR_PHYSMASK_VALID 0x0000000000000800UL +#define MTRR_CAP_WC 0x0000000000000400 +#define MTRR_CAP_FIXED 0x0000000000000100 +#define MTRR_CAP_VCNT 0x00000000000000ff +#define MTRR_DEF_ENABLE 0x0000000000000800 +#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 +#define MTRR_DEF_TYPE 0x00000000000000ff +#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 +#define MTRR_PHYSBASE_TYPE 0x00000000000000ff +#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 +#define MTRR_PHYSMASK_VALID 0x0000000000000800 /* Performance Control Register (5x86 only). 
*/ #define PCR0 0x20 @@ -357,22 +357,22 @@ #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 -#define MCG_CTL_ENABLE 0xffffffffffffffffUL -#define MCG_CTL_DISABLE 0x0000000000000000UL +#define MCG_CTL_ENABLE 0xffffffffffffffff +#define MCG_CTL_DISABLE 0x0000000000000000 #define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4) #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) -#define MC_STATUS_MCA_ERROR 0x000000000000ffffUL -#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000UL -#define MC_STATUS_OTHER_INFO 0x01ffffff00000000UL -#define MC_STATUS_PCC 0x0200000000000000UL -#define MC_STATUS_ADDRV 0x0400000000000000UL -#define MC_STATUS_MISCV 0x0800000000000000UL -#define MC_STATUS_EN 0x1000000000000000UL -#define MC_STATUS_UC 0x2000000000000000UL -#define MC_STATUS_OVER 0x4000000000000000UL -#define MC_STATUS_VAL 0x8000000000000000UL +#define MC_STATUS_MCA_ERROR 0x000000000000ffff +#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 +#define MC_STATUS_OTHER_INFO 0x01ffffff00000000 +#define MC_STATUS_PCC 0x0200000000000000 +#define MC_STATUS_ADDRV 0x0400000000000000 +#define MC_STATUS_MISCV 0x0800000000000000 +#define MC_STATUS_EN 0x1000000000000000 +#define MC_STATUS_UC 0x2000000000000000 +#define MC_STATUS_OVER 0x4000000000000000 +#define MC_STATUS_VAL 0x8000000000000000 /* * The following four 3-byte registers control the non-cacheable regions. -- cgit v1.1 From 34dd3613f570b31797d837896d2c5c2c0bc61360 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 13:49:46 +0000 Subject: MFC 205214: - Extend the machine check record structure to include several fields useful for parsing model-specific and other fields in machine check events including the global machine check capabilities and status registers, CPU identification, and the FreeBSD CPU ID. - Report these added fields in the console log of a machine check so that a record structure can be reconstituted from the console messages. - Parse new architectural errors including memory controller errors. --- sys/amd64/amd64/mca.c | 52 +++++++++++++++++++++++++++++++++++++++--- sys/amd64/include/mca.h | 5 ++++ sys/amd64/include/specialreg.h | 12 ++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index b0e842a..76bee77 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -177,19 +177,46 @@ mca_error_request(uint16_t mca_error) return ("???"); } +static const char * +mca_error_mmtype(uint16_t mca_error) +{ + + switch ((mca_error & 0x70) >> 4) { + case 0x0: + return ("GEN"); + case 0x1: + return ("RD"); + case 0x2: + return ("WR"); + case 0x3: + return ("AC"); + case 0x4: + return ("MS"); + } + return ("???"); +} + /* Dump details about a single machine check. 
*/ static void __nonnull(1) mca_log(const struct mca_record *rec) { uint16_t mca_error; - printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: CPU %d ", rec->mr_apic_id); + printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); + printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, + rec->mr_cpu_id, rec->mr_apic_id); + printf("MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); - else + else { printf("COR "); + if (rec->mr_mcg_cap & MCG_CAP_TES_P) + printf("(%lld) ", ((long long)rec->mr_status & + MC_STATUS_COR_COUNT) >> 38); + } if (rec->mr_status & MC_STATUS_PCC) printf("PCC "); if (rec->mr_status & MC_STATUS_OVER) @@ -212,6 +239,9 @@ mca_log(const struct mca_record *rec) case 0x0004: printf("FRC error"); break; + case 0x0005: + printf("internal parity error"); + break; case 0x0400: printf("internal timer error"); break; @@ -236,6 +266,17 @@ mca_log(const struct mca_record *rec) break; } + /* Memory controller error. */ + if ((mca_error & 0xef80) == 0x0080) { + printf("%s channel ", mca_error_mmtype(mca_error)); + if ((mca_error & 0x000f) != 0x000f) + printf("%d", mca_error & 0x000f); + else + printf("??"); + printf(" memory error"); + break; + } + /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", @@ -313,6 +354,11 @@ mca_check_status(int bank, struct mca_record *rec) rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); + rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); + rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); + rec->mr_cpu_id = cpu_id; + rec->mr_cpu_vendor_id = cpu_vendor_id; + rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. 
Don't do this for uncorrectable diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index ddc3aeb..bc09480 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -37,6 +37,11 @@ struct mca_record { uint64_t mr_tsc; int mr_apic_id; int mr_bank; + uint64_t mr_mcg_cap; + uint64_t mr_mcg_status; + int mr_cpu_id; + int mr_cpu_vendor_id; + int mr_cpu; }; #ifdef _KERNEL diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 733f4d7..9253462 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -267,6 +267,7 @@ #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 @@ -352,8 +353,10 @@ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 +#define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 +#define MCG_CAP_SER_P 0x01000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 @@ -363,9 +366,14 @@ #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) +#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffff #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 #define MC_STATUS_OTHER_INFO 0x01ffffff00000000 +#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_CMCI_P */ #define MC_STATUS_PCC 0x0200000000000000 #define MC_STATUS_ADDRV 0x0400000000000000 #define MC_STATUS_MISCV 0x0800000000000000 @@ -373,6 +381,10 @@ #define MC_STATUS_UC 0x2000000000000000 #define MC_STATUS_OVER 0x4000000000000000 #define MC_STATUS_VAL 0x8000000000000000 +#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ +#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ +#define MC_CTL2_THRESHOLD 0x0000000000003fff +#define MC_CTL2_CMCI_EN 0x0000000040000000 /* * The following four 3-byte registers control the non-cacheable regions. -- cgit v1.1 From bfa70a9aba72beceb303770646fb483f9e024647 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 18:58:22 +0000 Subject: MFC 205332: Use the same policy for rejecting / not-reject ACPI tables with incorrect checksums as the base acpi(4) driver. This fixes a problem where the MADT parser would reject the MADT table during early boot causing the MP Table to be, but then the acpi(4) driver would attach and use non-SMP interrupt routing. 
--- sys/amd64/acpica/acpi_machdep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c index 0d866e8..ad5f854 100644 --- a/sys/amd64/acpica/acpi_machdep.c +++ b/sys/amd64/acpica/acpi_machdep.c @@ -187,8 +187,10 @@ map_table(vm_paddr_t pa, int offset, const char *sig) if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { if (bootverbose) printf("ACPI: Failed checksum for table %s\n", sig); +#if (ACPI_CHECKSUM_ABORT) table_unmap(table, length); return (NULL); +#endif } return (table); } -- cgit v1.1 From c78b160bc7863413615e23b4e9e811b8c7703023 Mon Sep 17 00:00:00 2001 From: trasz Date: Sat, 27 Mar 2010 14:58:28 +0000 Subject: MFC r202919: Fix array overflow. This routine is only called from procfs, which is not mounted by default, and I've been unable to trigger a panic without this fix applied anyway. Reviewed by: kib, cperciva --- sys/amd64/ia32/ia32_reg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/ia32/ia32_reg.c b/sys/amd64/ia32/ia32_reg.c index 83f6783..da5190f 100644 --- a/sys/amd64/ia32/ia32_reg.c +++ b/sys/amd64/ia32/ia32_reg.c @@ -213,8 +213,6 @@ fill_dbregs32(struct thread *td, struct dbreg32 *regs) err = fill_dbregs(td, &dr); for (i = 0; i < 8; i++) regs->dr[i] = dr.dr[i]; - for (i = 8; i < 16; i++) - regs->dr[i] = 0; return (err); } -- cgit v1.1 From 1d3f35048e647a7839adde4669ccc806e42b47c9 Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 27 Mar 2010 17:14:55 +0000 Subject: MFC r201813: In sys//conf/Makefile set TARGET to . That allows sys/conf/makeLINT.mk to only do certain things for certain architectures. Note that neither arm nor mips have the Makefile there, thus essentially not (yet) supporting LINT. This would enable them do add special treatment to sys/conf/makeLINT.mk as well chosing one of the many configurations as LINT. --- sys/amd64/conf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/Makefile b/sys/amd64/conf/Makefile index 2c006e9..1d2513f 100644 --- a/sys/amd64/conf/Makefile +++ b/sys/amd64/conf/Makefile @@ -1,3 +1,5 @@ # $FreeBSD$ +TARGET=amd64 + .include "${.CURDIR}/../../conf/makeLINT.mk" -- cgit v1.1 From 78bdfe798ddad0bb64dbd05f0eeb4620cd19028b Mon Sep 17 00:00:00 2001 From: attilio Date: Mon, 29 Mar 2010 15:39:17 +0000 Subject: MFC r199852, r202387, r202441, r202534: Handling all the three clocks with the LAPIC may lead to aliasing for softclock and profclock. Revert the change when the LAPIC started taking charge of all three of them. Sponsored by: Sandvine Incorporated --- sys/amd64/amd64/local_apic.c | 71 +++++++++++++++++++++++++++----------------- sys/amd64/include/apicvar.h | 8 ++++- sys/amd64/isa/clock.c | 15 +++++----- 3 files changed, 58 insertions(+), 36 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 98ed4df..0d04bbd 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -160,6 +160,9 @@ static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); struct pic lapic_pic = { .pic_resume = lapic_resume }; +static int lapic_allclocks; +TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); + static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { @@ -415,10 +418,11 @@ lapic_disable_pmc(void) /* * Called by cpu_initclocks() on the BSP to setup the local APIC timer so * that it can drive hardclock, statclock, and profclock. 
This function - * returns true if it is able to use the local APIC timer to drive the - * clocks and false if it is not able. + * returns a positive integer if it is convenient to use the local APIC + * for all the clocks, a negative integer if it is convenient to use the + * local APIC only for the hardclock and 0 if none of them can be handled. */ -int +enum lapic_clock lapic_setup_clock(void) { u_long value; @@ -426,10 +430,10 @@ lapic_setup_clock(void) /* Can't drive the timer without a local APIC. */ if (lapic == NULL) - return (0); + return (LAPIC_CLOCK_NONE); if (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0) - return (0); + return (LAPIC_CLOCK_NONE); /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; @@ -461,19 +465,27 @@ lapic_setup_clock(void) * (and profhz) run at hz. If 'hz' is below 1500 but above * 750, then we let the lapic timer run at 2 * 'hz'. If 'hz' * is below 750 then we let the lapic timer run at 4 * 'hz'. + * + * Please note that stathz and profhz are set only if all the + * clocks are handled through the local APIC. */ - if (hz >= 1500) + if (lapic_allclocks != 0) { + if (hz >= 1500) + lapic_timer_hz = hz; + else if (hz >= 750) + lapic_timer_hz = hz * 2; + else + lapic_timer_hz = hz * 4; + } else lapic_timer_hz = hz; - else if (hz >= 750) - lapic_timer_hz = hz * 2; - else - lapic_timer_hz = hz * 4; - if (lapic_timer_hz < 128) - stathz = lapic_timer_hz; - else - stathz = lapic_timer_hz / (lapic_timer_hz / 128); - profhz = lapic_timer_hz; lapic_timer_period = value / lapic_timer_hz; + if (lapic_allclocks != 0) { + if (lapic_timer_hz < 128) + stathz = lapic_timer_hz; + else + stathz = lapic_timer_hz / (lapic_timer_hz / 128); + profhz = lapic_timer_hz; + } /* * Start up the timer on the BSP. The APs will kick off their @@ -481,7 +493,7 @@ lapic_setup_clock(void) */ lapic_timer_periodic(lapic_timer_period); lapic_timer_enable_intr(); - return (1); + return (lapic_allclocks == 0 ? LAPIC_CLOCK_HARDCLOCK : LAPIC_CLOCK_ALL); } void @@ -784,20 +796,23 @@ lapic_handle_timer(struct trapframe *frame) else hardclock_cpu(TRAPF_USERMODE(frame)); } + if (lapic_allclocks != 0) { - /* Fire statclock at stathz. */ - la->la_stat_ticks += stathz; - if (la->la_stat_ticks >= lapic_timer_hz) { - la->la_stat_ticks -= lapic_timer_hz; - statclock(TRAPF_USERMODE(frame)); - } + /* Fire statclock at stathz. */ + la->la_stat_ticks += stathz; + if (la->la_stat_ticks >= lapic_timer_hz) { + la->la_stat_ticks -= lapic_timer_hz; + statclock(TRAPF_USERMODE(frame)); + } - /* Fire profclock at profhz, but only when needed. */ - la->la_prof_ticks += profhz; - if (la->la_prof_ticks >= lapic_timer_hz) { - la->la_prof_ticks -= lapic_timer_hz; - if (profprocs != 0) - profclock(TRAPF_USERMODE(frame), TRAPF_PC(frame)); + /* Fire profclock at profhz, but only when needed. */ + la->la_prof_ticks += profhz; + if (la->la_prof_ticks >= lapic_timer_hz) { + la->la_prof_ticks -= lapic_timer_hz; + if (profprocs != 0) + profclock(TRAPF_USERMODE(frame), + TRAPF_PC(frame)); + } } critical_exit(); } diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 9d6d538..8f15d84 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -157,6 +157,12 @@ #define APIC_BUS_PCI 2 #define APIC_BUS_MAX APIC_BUS_PCI +enum lapic_clock { + LAPIC_CLOCK_NONE, + LAPIC_CLOCK_HARDCLOCK, + LAPIC_CLOCK_ALL +}; + /* * An APIC enumerator is a psuedo bus driver that enumerates APIC's including * CPU's and I/O APIC's. 
@@ -224,7 +230,7 @@ int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); void lapic_set_tpr(u_int vector); void lapic_setup(int boot); -int lapic_setup_clock(void); +enum lapic_clock lapic_setup_clock(void); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index adc1743..bf379f3 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -91,7 +91,7 @@ static u_int32_t i8254_offset; static int (*i8254_pending)(struct intsrc *); static int i8254_ticked; static int using_atrtc_timer; -static int using_lapic_timer; +static enum lapic_clock using_lapic_timer = LAPIC_CLOCK_NONE; /* Values for timerX_state: */ #define RELEASED 0 @@ -160,7 +160,8 @@ clkintr(struct trapframe *frame) clkintr_pending = 0; mtx_unlock_spin(&clock_lock); } - KASSERT(!using_lapic_timer, ("clk interrupt enabled with lapic timer")); + KASSERT(using_lapic_timer == LAPIC_CLOCK_NONE, + ("clk interrupt enabled with lapic timer")); if (using_atrtc_timer) { #ifdef SMP @@ -422,7 +423,7 @@ set_i8254_freq(u_int freq, int intr_freq) i8254_timecounter.tc_frequency = freq; mtx_lock_spin(&clock_lock); i8254_freq = freq; - if (using_lapic_timer) + if (using_lapic_timer != LAPIC_CLOCK_NONE) new_i8254_real_max_count = 0x10000; else new_i8254_real_max_count = TIMER_DIV(intr_freq); @@ -485,7 +486,7 @@ cpu_initclocks() * that it can drive hardclock(). Otherwise, change the 8254 * timecounter to user a simpler algorithm. */ - if (!using_lapic_timer) { + if (using_lapic_timer == LAPIC_CLOCK_NONE) { intr_add_handler("clk", 0, (driver_filter_t *)clkintr, NULL, NULL, INTR_TYPE_CLK, NULL); i8254_intsrc = intr_lookup_source(0); @@ -508,7 +509,7 @@ cpu_initclocks() * kernel clocks, then setup the RTC to periodically interrupt to * drive statclock() and profclock(). */ - if (!using_lapic_timer) { + if (using_lapic_timer != LAPIC_CLOCK_ALL) { using_atrtc_timer = atrtc_setup_clock(); if (using_atrtc_timer) { /* Enable periodic interrupts from the RTC. */ @@ -532,7 +533,7 @@ void cpu_startprofclock(void) { - if (using_lapic_timer || !using_atrtc_timer) + if (using_lapic_timer == LAPIC_CLOCK_ALL || !using_atrtc_timer) return; atrtc_rate(RTCSA_PROF); psdiv = pscnt = psratio; @@ -542,7 +543,7 @@ void cpu_stopprofclock(void) { - if (using_lapic_timer || !using_atrtc_timer) + if (using_lapic_timer == LAPIC_CLOCK_ALL || !using_atrtc_timer) return; atrtc_rate(RTCSA_NOPROF); psdiv = pscnt = 1; -- cgit v1.1 From 5fd4298a64631d6515310c8b761c909817c24411 Mon Sep 17 00:00:00 2001 From: attilio Date: Tue, 30 Mar 2010 11:19:29 +0000 Subject: MFC r204641, r204753: Improving the clocks auto-tunning by firstly checking if the atrtc may be correctly initialized and just then assign to softclock/profclock. 
Sponsored by: Sandvine Incorporated --- sys/amd64/amd64/local_apic.c | 28 +++++++++++++++------------- sys/amd64/include/apicvar.h | 2 +- sys/amd64/isa/clock.c | 24 +++++++++++++++++++----- 3 files changed, 35 insertions(+), 19 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 0d04bbd..c274631 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -149,6 +149,7 @@ extern inthand_t IDTVEC(rsvd); volatile lapic_t *lapic; vm_paddr_t lapic_paddr; static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz; +static enum lapic_clock clockcoverage; static void lapic_enable(void); static void lapic_resume(struct pic *pic); @@ -160,9 +161,6 @@ static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); struct pic lapic_pic = { .pic_resume = lapic_resume }; -static int lapic_allclocks; -TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); - static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { @@ -423,17 +421,20 @@ lapic_disable_pmc(void) * local APIC only for the hardclock and 0 if none of them can be handled. */ enum lapic_clock -lapic_setup_clock(void) +lapic_setup_clock(enum lapic_clock srcsdes) { u_long value; int i; - /* Can't drive the timer without a local APIC. */ - if (lapic == NULL) - return (LAPIC_CLOCK_NONE); + /* lapic_setup_clock() should not be called with LAPIC_CLOCK_NONE. */ + MPASS(srcsdes != LAPIC_CLOCK_NONE); - if (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0) - return (LAPIC_CLOCK_NONE); + /* Can't drive the timer without a local APIC. */ + if (lapic == NULL || + (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0)) { + clockcoverage = LAPIC_CLOCK_NONE; + return (clockcoverage); + } /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; @@ -469,7 +470,7 @@ lapic_setup_clock(void) * Please note that stathz and profhz are set only if all the * clocks are handled through the local APIC. */ - if (lapic_allclocks != 0) { + if (srcsdes == LAPIC_CLOCK_ALL) { if (hz >= 1500) lapic_timer_hz = hz; else if (hz >= 750) @@ -479,7 +480,7 @@ lapic_setup_clock(void) } else lapic_timer_hz = hz; lapic_timer_period = value / lapic_timer_hz; - if (lapic_allclocks != 0) { + if (srcsdes == LAPIC_CLOCK_ALL) { if (lapic_timer_hz < 128) stathz = lapic_timer_hz; else @@ -493,7 +494,8 @@ lapic_setup_clock(void) */ lapic_timer_periodic(lapic_timer_period); lapic_timer_enable_intr(); - return (lapic_allclocks == 0 ? LAPIC_CLOCK_HARDCLOCK : LAPIC_CLOCK_ALL); + clockcoverage = srcsdes; + return (srcsdes); } void @@ -796,7 +798,7 @@ lapic_handle_timer(struct trapframe *frame) else hardclock_cpu(TRAPF_USERMODE(frame)); } - if (lapic_allclocks != 0) { + if (clockcoverage == LAPIC_CLOCK_ALL) { /* Fire statclock at stathz. 
*/ la->la_stat_ticks += stathz; diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 8f15d84..110ce81 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -230,7 +230,7 @@ int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); void lapic_set_tpr(u_int vector); void lapic_setup(int boot); -enum lapic_clock lapic_setup_clock(void); +enum lapic_clock lapic_setup_clock(enum lapic_clock srcsdes); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index bf379f3..e5c27d1 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -84,6 +84,9 @@ TUNABLE_INT("hw.i8254.freq", &i8254_freq); int i8254_max_count; static int i8254_real_max_count; +static int lapic_allclocks; +TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); + struct mtx clock_lock; static struct intsrc *i8254_intsrc; static u_int32_t i8254_lastcount; @@ -478,8 +481,22 @@ startrtclock() void cpu_initclocks() { + enum lapic_clock tlsca; + int tasc; + + /* Initialize RTC. */ + atrtc_start(); + tasc = atrtc_setup_clock(); + + /* + * If the atrtc successfully initialized and the users didn't force + * otherwise use the LAPIC in order to cater hardclock only, otherwise + * take in charge all the clock sources. + */ + tlsca = (lapic_allclocks == 0 && tasc != 0) ? LAPIC_CLOCK_HARDCLOCK : + LAPIC_CLOCK_ALL; + using_lapic_timer = lapic_setup_clock(tlsca); - using_lapic_timer = lapic_setup_clock(); /* * If we aren't using the local APIC timer to drive the kernel * clocks, setup the interrupt handler for the 8254 timer 0 so @@ -500,9 +517,6 @@ cpu_initclocks() set_i8254_freq(i8254_freq, hz); } - /* Initialize RTC. */ - atrtc_start(); - /* * If the separate statistics clock hasn't been explicility disabled * and we aren't already using the local APIC timer to drive the @@ -510,7 +524,7 @@ cpu_initclocks() * drive statclock() and profclock(). */ if (using_lapic_timer != LAPIC_CLOCK_ALL) { - using_atrtc_timer = atrtc_setup_clock(); + using_atrtc_timer = tasc; if (using_atrtc_timer) { /* Enable periodic interrupts from the RTC. */ intr_add_handler("rtc", 8, -- cgit v1.1 From 06b4c1f24a99a989afc61994d978942e183fe5e0 Mon Sep 17 00:00:00 2001 From: marcel Date: Wed, 31 Mar 2010 02:43:58 +0000 Subject: MFC rev 198341 and 198342: o Introduce vm_sync_icache() for making the I-cache coherent with the memory or D-cache, depending on the semantics of the platform. vm_sync_icache() is basically a wrapper around pmap_sync_icache(), that translates the vm_map_t argumument to pmap_t. o Introduce pmap_sync_icache() to all PMAP implementation. For powerpc it replaces the pmap_page_executable() function, added to solve the I-cache problem in uiomove_fromphys(). o In proc_rwmem() call vm_sync_icache() when writing to a page that has execute permissions. This assures that when breakpoints are written, the I-cache will be coherent and the process will actually hit the breakpoint. o This also fixes the Book-E PMAP implementation that was missing necessary locking while trying to deal with the I-cache coherency in pmap_enter() (read: mmu_booke_enter_locked). 
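To make the MI/MD split concrete, the debugger write path now ends with something like the following (a sketch of the caller, not the committed proc_rwmem() code; on amd64 the pmap_sync_icache() added below is deliberately an empty stub, since x86 hardware keeps the instruction cache coherent):

	/*
	 * After writing a breakpoint into the traced process, make the
	 * target's instruction cache coherent with the new bytes.
	 * vm_sync_icache() resolves the vm_map to its pmap and hands the
	 * range to the MD pmap_sync_icache().
	 */
	if (writing && error == 0)
		vm_sync_icache(&p->p_vmspace->vm_map, va, len);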
--- sys/amd64/amd64/pmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 0935506..5ff527f 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4756,6 +4756,11 @@ if (oldpmap) /* XXX FIXME */ critical_exit(); } +void +pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ +} + /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. -- cgit v1.1 From bc85e840fa31ff95bb72c293fbb52e6765cfcbb3 Mon Sep 17 00:00:00 2001 From: rnoland Date: Sun, 4 Apr 2010 15:42:52 +0000 Subject: MFC r203289,r203367 Enable MTRR on all VIA CPUs that claim support --- sys/amd64/amd64/amd64_mem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/amd64_mem.c b/sys/amd64/amd64/amd64_mem.c index d7959fd..e50d3e7 100644 --- a/sys/amd64/amd64/amd64_mem.c +++ b/sys/amd64/amd64/amd64_mem.c @@ -707,11 +707,8 @@ amd64_mem_drvinit(void *unused) switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: - break; case CPU_VENDOR_CENTAUR: - if (cpu_exthigh >= 0x80000008) - break; - /* FALLTHROUGH */ + break; default: return; } -- cgit v1.1 From bcba5d5ad835df6f565c7a9d63e8ebe99bd6db23 Mon Sep 17 00:00:00 2001 From: alc Date: Mon, 5 Apr 2010 16:11:42 +0000 Subject: MFC r204907, r204913, r205402, r205573, r205573 Implement AMD's recommended workaround for Erratum 383 on Family 10h processors. Enable machine check exceptions by default. --- sys/amd64/amd64/mca.c | 35 ++++++++++- sys/amd64/amd64/pmap.c | 134 ++++++++++++++++++++++++++++++++++++++--- sys/amd64/include/md_var.h | 1 + sys/amd64/include/specialreg.h | 1 + 4 files changed, 162 insertions(+), 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 76bee77..ccbab17 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -60,11 +60,20 @@ static int mca_count; /* Number of records stored. */ SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture"); -static int mca_enabled = 0; +static int mca_enabled = 1; TUNABLE_INT("hw.mca.enabled", &mca_enabled); SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, "Administrative toggle for machine check support"); +static int amd10h_L1TP = 1; +TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP); +SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, + "Administrative toggle for logging of level one TLB parity (L1TP) errors"); + +int workaround_erratum383; +SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0, + "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); + static STAILQ_HEAD(, mca_internal) mca_records; static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. */ @@ -527,7 +536,7 @@ void mca_init(void) { uint64_t mcg_cap; - uint64_t ctl; + uint64_t ctl, mask; int skip; int i; @@ -535,6 +544,15 @@ mca_init(void) if (!mca_enabled || !(cpu_feature & CPUID_MCE)) return; + /* + * On AMD Family 10h processors, unless logging of level one TLB + * parity (L1TP) errors is disabled, enable the recommended workaround + * for Erratum 383. + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP) + workaround_erratum383 = 1; + if (cpu_feature & CPUID_MCA) { if (PCPU_GET(cpuid) == 0) mca_setup(); @@ -545,6 +563,19 @@ mca_init(void) /* Enable MCA features. 
*/ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); + /* + * Disable logging of level one TLB parity (L1TP) errors by + * the data cache as an alternative workaround for AMD Family + * 10h Erratum 383. Unlike the recommended workaround, there + * is no performance penalty to this workaround. However, + * L1TP errors will go unreported. + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) { + mask = rdmsr(MSR_MC0_CTL_MASK); + if ((mask & (1UL << 5)) == 0) + wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5)); + } for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { /* By default enable logging of all errors. */ ctl = 0xffffffffffffffffUL; diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 5ff527f..516048d 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -7,7 +7,7 @@ * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. - * Copyright (c) 2005-2008 Alan L. Cox + * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -252,6 +252,9 @@ static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); +static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, + pd_entry_t newpde); +static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); @@ -654,13 +657,13 @@ pmap_init(void) pv_entry_high_water = 9 * (pv_entry_max / 10); /* - * Disable large page mappings by default if the kernel is running in - * a virtual machine on an AMD Family 10h processor. This is a work- - * around for Erratum 383. + * If the kernel is running in a virtual machine on an AMD Family 10h + * processor, then it must assume that MCA is enabled by the virtual + * machine monitor. */ if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x10) - pg_ps_enabled = 0; + workaround_erratum383 = 1; /* * Are large page mappings enabled? @@ -795,6 +798,45 @@ pmap_cache_bits(int mode, boolean_t is_pde) cache_bits |= PG_NC_PWT; return (cache_bits); } + +/* + * After changing the page size for the specified virtual address in the page + * table, flush the corresponding entries from the processor's TLB. Only the + * calling processor's TLB is affected. + * + * The calling thread must be pinned to a processor. + */ +static void +pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) +{ + u_long cr4; + + if ((newpde & PG_PS) == 0) + /* Demotion: flush a specific 2MB page mapping. */ + invlpg(va); + else if ((newpde & PG_G) == 0) + /* + * Promotion: flush every 4KB page mapping from the TLB + * because there are too many to flush individually. + */ + invltlb(); + else { + /* + * Promotion: flush every 4KB page mapping from the TLB, + * including any global (PG_G) mappings. + */ + cr4 = rcr4(); + load_cr4(cr4 & ~CR4_PGE); + /* + * Although preemption at this point could be detrimental to + * performance, it would not lead to an error. PG_G is simply + * ignored if CR4.PGE is clear. Moreover, in case this block + * is re-entered, the load_cr4() either above or below will + * modify CR4.PGE flushing the TLB. 
+ */ + load_cr4(cr4 | CR4_PGE); + } +} #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. @@ -891,6 +933,69 @@ pmap_invalidate_cache(void) smp_cache_flush(); sched_unpin(); } + +struct pde_action { + cpumask_t store; /* processor that updates the PDE */ + cpumask_t invalidate; /* processors that invalidate their TLB */ + vm_offset_t va; + pd_entry_t *pde; + pd_entry_t newpde; +}; + +static void +pmap_update_pde_action(void *arg) +{ + struct pde_action *act = arg; + + if (act->store == PCPU_GET(cpumask)) + pde_store(act->pde, act->newpde); +} + +static void +pmap_update_pde_teardown(void *arg) +{ + struct pde_action *act = arg; + + if ((act->invalidate & PCPU_GET(cpumask)) != 0) + pmap_update_pde_invalidate(act->va, act->newpde); +} + +/* + * Change the page size for the specified virtual address in a way that + * prevents any possibility of the TLB ever having two entries that map the + * same virtual address using different page sizes. This is the recommended + * workaround for Erratum 383 on AMD Family 10h processors. It prevents a + * machine check exception for a TLB state that is improperly diagnosed as a + * hardware error. + */ +static void +pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) +{ + struct pde_action act; + cpumask_t active, cpumask; + + sched_pin(); + cpumask = PCPU_GET(cpumask); + if (pmap == kernel_pmap) + active = all_cpus; + else + active = pmap->pm_active; + if ((active & PCPU_GET(other_cpus)) != 0) { + act.store = cpumask; + act.invalidate = active; + act.va = va; + act.pde = pde; + act.newpde = newpde; + smp_rendezvous_cpus(cpumask | active, + smp_no_rendevous_barrier, pmap_update_pde_action, + pmap_update_pde_teardown, &act); + } else { + pde_store(pde, newpde); + if ((active & cpumask) != 0) + pmap_update_pde_invalidate(va, newpde); + } + sched_unpin(); +} #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. @@ -928,6 +1033,15 @@ pmap_invalidate_cache(void) wbinvd(); } + +static void +pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) +{ + + pde_store(pde, newpde); + if (pmap == kernel_pmap || pmap->pm_active) + pmap_update_pde_invalidate(va, newpde); +} #endif /* !SMP */ static void @@ -2310,7 +2424,10 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ - pde_store(pde, newpde); + if (workaround_erratum383) + pmap_update_pde(pmap, va, pde, newpde); + else + pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. @@ -2926,7 +3043,10 @@ setpte: /* * Map the superpage. 
*/ - pde_store(pde, PG_PS | newpde); + if (workaround_erratum383) + pmap_update_pde(pmap, va, pde, PG_PS | newpde); + else + pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 15df851..2b43b37 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -61,6 +61,7 @@ extern char sigcode[]; extern int szsigcode; extern uint64_t *vm_page_dump; extern int vm_page_dump_size; +extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 9253462..86a08ce 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -506,6 +506,7 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 /* VIA ACE crypto featureset: for via_feature_rng */ #define VIA_HAS_RNG 1 /* cpu has RNG */ -- cgit v1.1 From c7d735a07c6450e83b752cd349171b48e34c7989 Mon Sep 17 00:00:00 2001 From: nwhitehorn Date: Wed, 7 Apr 2010 02:24:41 +0000 Subject: MFC r205014,205015: Provide groundwork for 32-bit binary compatibility on non-x86 platforms, for upcoming 64-bit PowerPC and MIPS support. This renames the COMPAT_IA32 option to COMPAT_FREEBSD32, removes some IA32-specific code from MI parts of the kernel and enhances the freebsd32 compatibility code to support big-endian platforms. This MFC is required for MFCs of later changes to the freebsd32 compatibility from HEAD. Requested by: kib --- sys/amd64/amd64/db_trace.c | 2 +- sys/amd64/amd64/exception.S | 2 +- sys/amd64/amd64/vm_machdep.c | 4 ++-- sys/amd64/conf/GENERIC | 2 +- sys/amd64/conf/NOTES | 4 ++-- sys/amd64/conf/XENHVM | 2 +- sys/amd64/include/elf.h | 1 + sys/amd64/include/reg.h | 9 +++++++++ sys/amd64/linux32/linux32_sysvec.c | 4 ++-- 9 files changed, 20 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c index 73ffac5..cba90f2 100644 --- a/sys/amd64/amd64/db_trace.c +++ b/sys/amd64/amd64/db_trace.c @@ -319,7 +319,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td) frame_type = INTERRUPT; else if (strcmp(name, "Xfast_syscall") == 0) frame_type = SYSCALL; -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 else if (strcmp(name, "Xint0x80_syscall") == 0) frame_type = SYSCALL; #endif diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 3d1a20e..1799b74 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -572,7 +572,7 @@ ENTRY(fork_trampoline) * included. 
*/ -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 .data .p2align 4 .text diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index a99fdaa..d6906ac 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -439,7 +439,7 @@ cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg, */ cpu_thread_clean(td); -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 if (td->td_proc->p_sysent->sv_flags & SV_ILP32) { /* * Set the trap frame to point at the beginning of the uts @@ -490,7 +490,7 @@ cpu_set_user_tls(struct thread *td, void *tls_base) if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS) return (EINVAL); -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 if (td->td_proc->p_sysent->sv_flags & SV_ILP32) { td->td_pcb->pcb_gsbase = (register_t)tls_base; return (0); diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e5a6955..e9f3c17 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,7 +54,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 159f12e..4b6debb 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -445,7 +445,7 @@ options PMAP_SHPGPERPROC=201 #XXX these 32 bit binaries is added. # Enable 32-bit runtime support for FreeBSD/i386 binaries. -options COMPAT_IA32 +options COMPAT_FREEBSD32 # Enable iBCS2 runtime support for SCO and ISC binaries #XXX#options IBCS2 @@ -456,7 +456,7 @@ options COMPAT_IA32 # Enable Linux ABI emulation #XXX#options COMPAT_LINUX -# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_IA32) +# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_FREEBSD32) options COMPAT_LINUX32 # Enable the linux-like proc filesystem support (requires COMPAT_LINUX32 diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index f875f5a..377276e 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -55,7 +55,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 diff --git a/sys/amd64/include/elf.h b/sys/amd64/include/elf.h index 88f4398..678f5d3 100644 --- a/sys/amd64/include/elf.h +++ b/sys/amd64/include/elf.h @@ -42,6 +42,7 @@ #include #define ELF_ARCH EM_X86_64 +#define ELF_ARCH32 EM_386 #define ELF_MACHINE_OK(x) ((x) == EM_X86_64) diff --git a/sys/amd64/include/reg.h b/sys/amd64/include/reg.h index 89211a3..4a83918 100644 --- a/sys/amd64/include/reg.h +++ b/sys/amd64/include/reg.h @@ -37,6 +37,10 @@ #ifndef _MACHINE_REG_H_ #define _MACHINE_REG_H_ +#if defined(_KERNEL) && !defined(_STANDALONE) +#include "opt_compat.h" +#endif + /* * Register set accessible via /proc/$pid/regs and PT_{SET,GET}REGS. 
*/ @@ -116,6 +120,11 @@ struct dbreg { #define DBREG_DRX(d,x) ((d)->dr[(x)]) /* reference dr0 - dr15 by register number */ +#ifdef COMPAT_FREEBSD32 +#include +#include +#endif + #ifdef _KERNEL /* * XXX these interfaces are MI, so they should be declared in a MI place. diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 6e3e326..d967ad7 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -34,8 +34,8 @@ __FBSDID("$FreeBSD$"); #include "opt_compat.h" -#ifndef COMPAT_IA32 -#error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!" +#ifndef COMPAT_FREEBSD32 +#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 -- cgit v1.1 From 9b7228a41e71c09c126a7ed8b5812a56e8d7029e Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 13 Apr 2010 10:23:03 +0000 Subject: MFC r206459: Handle a case when non-canonical address is loaded into the fsbase or gsbase MSR. --- sys/amd64/amd64/exception.S | 30 ++++++++++++++++++++++++++++-- sys/amd64/amd64/trap.c | 8 ++++++++ sys/amd64/include/md_var.h | 4 ++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 1799b74..69288f3 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -668,7 +668,8 @@ ld_fs: movw %ax,%fs movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx - wrmsr + .globl ld_fsbase +ld_fsbase: wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si @@ -685,7 +686,8 @@ ld_gs: movw %si,%gs movl $MSR_KGSBASE,%ecx movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx - wrmsr + .globl ld_gsbase +ld_gsbase: wrmsr 1: .globl ld_es ld_es: movw TF_ES(%rsp),%es .globl ld_ds @@ -798,6 +800,30 @@ gs_load_fault: call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti + + ALIGN_TEXT + .globl fsbase_load_fault +fsbase_load_fault: + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movq %rsp, %rdi + call trap + movq PCPU(CURTHREAD),%r8 + movq TD_PCB(%r8),%r8 + movq $0,PCB_FSBASE(%r8) + jmp doreti + + ALIGN_TEXT + .globl gsbase_load_fault +gsbase_load_fault: + popfq + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movq %rsp, %rdi + call trap + movq PCPU(CURTHREAD),%r8 + movq TD_PCB(%r8),%r8 + movq $0,PCB_GSBASE(%r8) + jmp doreti + #ifdef HWPMC_HOOKS ENTRY(end_exceptions) #endif diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 41ca758..4b5d8c7 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -563,6 +563,14 @@ trap(struct trapframe *frame) frame->tf_gs = _ugssel; goto out; } + if (frame->tf_rip == (long)ld_gsbase) { + frame->tf_rip = (long)gsbase_load_fault; + goto out; + } + if (frame->tf_rip == (long)ld_fsbase) { + frame->tf_rip = (long)fsbase_load_fault; + goto out; + } if (PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 2b43b37..88f3e1d 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -83,10 +83,14 @@ void ld_ds(void) __asm(__STRING(ld_ds)); void ld_es(void) __asm(__STRING(ld_es)); void ld_fs(void) __asm(__STRING(ld_fs)); void ld_gs(void) __asm(__STRING(ld_gs)); +void ld_fsbase(void) __asm(__STRING(ld_fsbase)); +void ld_gsbase(void) __asm(__STRING(ld_gsbase)); void ds_load_fault(void) __asm(__STRING(ds_load_fault)); void es_load_fault(void) __asm(__STRING(es_load_fault)); void fs_load_fault(void) __asm(__STRING(fs_load_fault)); 
void gs_load_fault(void) __asm(__STRING(gs_load_fault)); +void fsbase_load_fault(void) __asm(__STRING(fsbase_load_fault)); +void gsbase_load_fault(void) __asm(__STRING(gsbase_load_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void initializecpu(void); -- cgit v1.1 From 624d652fbb0bd336ef0fbf2a373b999101823a58 Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 14 Apr 2010 15:00:46 +0000 Subject: MFC 205851: Add a handler for the local APIC error interrupt. For now it just prints out the current value of the local APIC error register when the interrupt fires. --- sys/amd64/amd64/apic_vector.S | 12 +++++++++++ sys/amd64/amd64/local_apic.c | 47 ++++++++++++++++++++++++++----------------- sys/amd64/include/apicvar.h | 5 +++-- 3 files changed, 44 insertions(+), 20 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index cebafc8..df94a47 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -104,6 +104,18 @@ IDTVEC(timerint) MEXITCOUNT jmp doreti +/* + * Local APIC error interrupt handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(errorint) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + call lapic_handle_error + MEXITCOUNT + jmp doreti + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index c274631..8edc971 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -115,14 +115,12 @@ struct lapic { int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static lapics[MAX_APIC_ID + 1]; -/* XXX: should thermal be an NMI? */ - /* Global defaults for local APIC LVT entries. */ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ - { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ + { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ }; @@ -225,7 +223,10 @@ lapic_init(vm_paddr_t addr) /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_SYSIGT, SEL_KPL, 0); - /* XXX: error/thermal interrupts */ + /* Local APIC error interrupt. */ + setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_SYSIGT, SEL_KPL, 0); + + /* XXX: Thermal interrupt */ } /* @@ -278,7 +279,7 @@ lapic_dump(const char* str) lapic->id, lapic->version, lapic->ldr, lapic->dfr); printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); - printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pcm: 0x%08x\n", + printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); } @@ -326,7 +327,11 @@ lapic_setup(int boot) lapic_timer_enable_intr(); } - /* XXX: Error and thermal LVTs */ + /* Program error LVT and clear any existing errors. */ + lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error); + lapic->esr = 0; + + /* XXX: Thermal LVT */ intr_restore(eflags); } @@ -725,18 +730,6 @@ lapic_eoi(void) lapic->eoi = 0; } -/* - * Read the contents of the error status register. We have to write - * to the register first before reading from it. 
- */ -u_int -lapic_error(void) -{ - - lapic->esr = 0; - return (lapic->esr); -} - void lapic_handle_intr(int vector, struct trapframe *frame) { @@ -863,6 +856,24 @@ lapic_timer_enable_intr(void) lapic->lvt_timer = value; } +void +lapic_handle_error(void) +{ + u_int32_t esr; + + /* + * Read the contents of the error status register. Write to + * the register first before reading from it to force the APIC + * to update its value to indicate any errors that have + * occurred since the previous write to the register. + */ + lapic->esr = 0; + esr = lapic->esr; + + printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); + lapic_eoi(); +} + u_int apic_cpuid(u_int apic_id) { diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 110ce81..91bba99 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -179,7 +179,8 @@ struct apic_enumerator { inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), - IDTVEC(apic_isr7), IDTVEC(spuriousint), IDTVEC(timerint); + IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint), + IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; @@ -211,13 +212,13 @@ void lapic_disable_pmc(void); void lapic_dump(const char *str); int lapic_enable_pmc(void); void lapic_eoi(void); -u_int lapic_error(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); +void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); void lapic_reenable_pmc(void); -- cgit v1.1 From 1afdc2f1bbd14b72664ff8d709ab7fbc19afcc1f Mon Sep 17 00:00:00 2001 From: gibbs Date: Wed, 14 Apr 2010 17:01:29 +0000 Subject: MFC 204214: Enforce stronger bus-dma alignment semantics so bus-dma operates correctly with Xen's blkfront driver. 
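For illustration, a minimal standalone sketch (kernel context assumed; the helper name next_seg_size is made up for this note and is not part of the change) of the segment sizing rule the busdma change below enforces: when an address is going to be bounced, the natural page-bounded segment length is rounded up to the tag's alignment before being clipped to the remaining buffer length and the tag's maximum segment size, so bounce pages never produce segments that violate the alignment constraint.

    /*
     * Sketch of how the next DMA segment is sized after this change.
     * 'bouncing' is true when the physical address failed the tag's
     * filter and will be routed through a bounce page.
     */
    static bus_size_t
    next_seg_size(vm_offset_t curaddr, bus_size_t buflen,
        bus_size_t alignment, bus_size_t maxsegsz, int bouncing)
    {
        bus_size_t max_sgsize, sgsize;

        max_sgsize = MIN(buflen, maxsegsz);       /* MIN() from sys/param.h */
        sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
        if (bouncing)
            sgsize = roundup2(sgsize, alignment); /* roundup2() from sys/param.h */
        return (MIN(sgsize, max_sgsize));
    }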
--- sys/amd64/amd64/busdma_machdep.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/busdma_machdep.c b/sys/amd64/amd64/busdma_machdep.c index 3197d15..fae6ef3 100644 --- a/sys/amd64/amd64/busdma_machdep.c +++ b/sys/amd64/amd64/busdma_machdep.c @@ -239,8 +239,7 @@ bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment, newtag->alignment = alignment; newtag->boundary = boundary; newtag->lowaddr = trunc_page((vm_paddr_t)lowaddr) + (PAGE_SIZE - 1); - newtag->highaddr = trunc_page((vm_paddr_t)highaddr) + - (PAGE_SIZE - 1); + newtag->highaddr = trunc_page((vm_paddr_t)highaddr) + (PAGE_SIZE - 1); newtag->filter = filter; newtag->filterarg = filterarg; newtag->maxsize = maxsize; @@ -605,13 +604,18 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, vendaddr = (vm_offset_t)buf + buflen; while (vaddr < vendaddr) { + bus_size_t sg_len; + + sg_len = PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK); if (pmap) paddr = pmap_extract(pmap, vaddr); else paddr = pmap_kextract(vaddr); - if (run_filter(dmat, paddr) != 0) + if (run_filter(dmat, paddr) != 0) { + sg_len = roundup2(sg_len, dmat->alignment); map->pagesneeded++; - vaddr += (PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK)); + } + vaddr += sg_len; } CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded); } @@ -644,6 +648,8 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, bmask = ~(dmat->boundary - 1); for (seg = *segp; buflen > 0 ; ) { + bus_size_t max_sgsize; + /* * Get the physical address for this segment. */ @@ -655,11 +661,15 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, /* * Compute the segment size, and adjust counts. */ - sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK); - if (sgsize > dmat->maxsegsz) - sgsize = dmat->maxsegsz; - if (buflen < sgsize) - sgsize = buflen; + max_sgsize = MIN(buflen, dmat->maxsegsz); + sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK); + if (map->pagesneeded != 0 && run_filter(dmat, curaddr)) { + sgsize = roundup2(sgsize, dmat->alignment); + sgsize = MIN(sgsize, max_sgsize); + curaddr = add_bounce_page(dmat, map, vaddr, sgsize); + } else { + sgsize = MIN(sgsize, max_sgsize); + } /* * Make sure we don't cross any boundaries. @@ -670,9 +680,6 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, sgsize = (baddr - curaddr); } - if (map->pagesneeded != 0 && run_filter(dmat, curaddr)) - curaddr = add_bounce_page(dmat, map, vaddr, sgsize); - /* * Insert chunk into a segment, coalescing with * previous segment if possible. -- cgit v1.1 From ab69bb0ca5ba497377d209f7d6c92fbd4e4972fb Mon Sep 17 00:00:00 2001 From: fabient Date: Fri, 16 Apr 2010 15:43:24 +0000 Subject: MFC r206089, r206684: - Support for uncore counting events: one fixed PMC with the uncore domain clock, 8 programmable PMC. - Westmere based CPU (Xeon 5600, Corei7 980X) support. - New man pages with events list for core and uncore. - Updated Corei7 events with Intel 253669-033US December 2009 doc. There is some removed events in the documentation, they have been kept in the code but documented in the man page as obsolete. - Offcore response events can be setup with rsp token. 
Sponsored by: NETASQ --- sys/amd64/include/pmc_mdep.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/pmc_mdep.h b/sys/amd64/include/pmc_mdep.h index f233a51..4f16485 100644 --- a/sys/amd64/include/pmc_mdep.h +++ b/sys/amd64/include/pmc_mdep.h @@ -43,17 +43,20 @@ struct pmc_mdep; #include #include #include +#include /* * Intel processors implementing V2 and later of the Intel performance * measurement architecture have PMCs of the following classes: TSC, - * IAF and IAP. + * IAF, IAP, UCF and UCP. */ #define PMC_MDEP_CLASS_INDEX_TSC 0 #define PMC_MDEP_CLASS_INDEX_K8 1 #define PMC_MDEP_CLASS_INDEX_P4 1 #define PMC_MDEP_CLASS_INDEX_IAP 1 #define PMC_MDEP_CLASS_INDEX_IAF 2 +#define PMC_MDEP_CLASS_INDEX_UCP 3 +#define PMC_MDEP_CLASS_INDEX_UCF 4 /* * On the amd64 platform we support the following PMCs. @@ -63,12 +66,16 @@ struct pmc_mdep; * PIV Intel P4/HTT and P4/EMT64 * IAP Intel Core/Core2/Atom CPUs in 64 bits mode. * IAF Intel fixed-function PMCs in Core2 and later CPUs. + * UCP Intel Uncore programmable PMCs. + * UCF Intel Uncore fixed-function PMCs. */ union pmc_md_op_pmcallocate { struct pmc_md_amd_op_pmcallocate pm_amd; struct pmc_md_iaf_op_pmcallocate pm_iaf; struct pmc_md_iap_op_pmcallocate pm_iap; + struct pmc_md_ucf_op_pmcallocate pm_ucf; + struct pmc_md_ucp_op_pmcallocate pm_ucp; struct pmc_md_p4_op_pmcallocate pm_p4; uint64_t __pad[4]; }; @@ -83,6 +90,8 @@ union pmc_md_pmc { struct pmc_md_amd_pmc pm_amd; struct pmc_md_iaf_pmc pm_iaf; struct pmc_md_iap_pmc pm_iap; + struct pmc_md_ucf_pmc pm_ucf; + struct pmc_md_ucp_pmc pm_ucp; struct pmc_md_p4_pmc pm_p4; }; -- cgit v1.1 From a291c9ff146b6f7f473498e1c7de00afd8709c24 Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 17 Apr 2010 09:37:08 +0000 Subject: MFC r206623: ld_gs_base is executing with stack containing only the frame, temporary pushed %rflags has been popped already. --- sys/amd64/amd64/exception.S | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 69288f3..65c6452 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -815,7 +815,6 @@ fsbase_load_fault: ALIGN_TEXT .globl gsbase_load_fault gsbase_load_fault: - popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) movq %rsp, %rdi call trap -- cgit v1.1 From c61b9f564a268507195acc02310e9abd9d8431d8 Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 20 Apr 2010 08:19:43 +0000 Subject: MFC r206553: Change printf() calls to uprintf() for sigreturn() and trap() complaints about inacessible or wrong mcontext, and for dreaded "kernel trap with interrupts disabled" situation. The later is changed when trap is generated from user mode (shall never be ?). Normalize the messages to include both pid and thread name. 
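As an illustrative sketch of the resulting style (the variables are the ones already in scope in sigreturn(); this is not a new interface): the complaints are now delivered to the offending process' controlling terminal through uprintf() instead of the system console, and every message names both the pid and the thread, e.g.:

    /* Report a rejected user-supplied context to the process itself. */
    uprintf("pid %d (%s): sigreturn mc_flags %x\n",
        p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags);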
--- sys/amd64/amd64/machdep.c | 17 ++++++++++------- sys/amd64/amd64/trap.c | 2 +- sys/amd64/ia32/ia32_signal.c | 12 ++++++++---- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index c4130a4..7671376 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -422,13 +422,14 @@ sigreturn(td, uap) error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { - printf("sigreturn (pid %d): copyin failed\n", p->p_pid); + uprintf("pid %d (%s): sigreturn copyin failed\n", + p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { - printf("sigreturn (pid %d): mc_flags %x\n", p->p_pid, - ucp->uc_mcontext.mc_flags); + uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, + td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; @@ -447,8 +448,8 @@ sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("sigreturn (pid %d): rflags = 0x%lx\n", p->p_pid, - rflags); + uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, + td->td_name, rflags); return (EINVAL); } @@ -459,7 +460,8 @@ sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("sigreturn (pid %d): cs = 0x%x\n", p->p_pid, cs); + uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, + td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; @@ -471,7 +473,8 @@ sigreturn(td, uap) ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) { - printf("sigreturn (pid %d): set_fpcontext\n", p->p_pid); + uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", + p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 4b5d8c7..8492f4e 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -303,7 +303,7 @@ trap(struct trapframe *frame) * enabled later. */ if (ISPL(frame->tf_cs) == SEL_UPL) - printf( + uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 10ec641..2416988 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -565,7 +565,8 @@ freebsd4_freebsd32_sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("freebsd4_freebsd32_sigreturn: eflags = 0x%x\n", eflags); + uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n", + td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } @@ -576,7 +577,8 @@ freebsd4_freebsd32_sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("freebsd4_sigreturn: cs = 0x%x\n", cs); + uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", + td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; @@ -647,7 +649,8 @@ freebsd32_sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. 
*/ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("freebsd32_sigreturn: eflags = 0x%x\n", eflags); + uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n", + td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } @@ -658,7 +661,8 @@ freebsd32_sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("sigreturn: cs = 0x%x\n", cs); + uprintf("pid %d (%s): sigreturn cs = 0x%x\n", + td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; -- cgit v1.1 From 2647f6f35366535b7a3ede7e9b6d9f6ba5677711 Mon Sep 17 00:00:00 2001 From: attilio Date: Sat, 24 Apr 2010 00:49:19 +0000 Subject: MFC r206421: Default the machdep.lapic_allclocks to be enabled in order to cope with broken atrtc. Now if you want more correct stats on profhz and stathz it may be disabled by setting to 0. Sponsored by: Sandvine Incorporated --- sys/amd64/isa/clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index e5c27d1..6e5da8f 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -84,7 +84,7 @@ TUNABLE_INT("hw.i8254.freq", &i8254_freq); int i8254_max_count; static int i8254_real_max_count; -static int lapic_allclocks; +static int lapic_allclocks = 1; TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); struct mtx clock_lock; -- cgit v1.1 From 6834582d3b1e4ad8f5d6518a45a1f9911cb89dfe Mon Sep 17 00:00:00 2001 From: yongari Date: Mon, 26 Apr 2010 17:03:56 +0000 Subject: MFC r206625: Add driver for Silicon Integrated Systems SiS190/191 Fast/Gigabit Ethernet. This driver was written by Alexander Pohoyda and greatly enhanced by Nikolay Denev. I don't have these hardwares but this driver was tested by Nikolay Denev and xclin. Because SiS didn't release data sheet for this controller, programming information came from Linux driver and OpenSolaris. Unlike other open source driver for SiS190/191, sge(4) takes full advantage of TX/RX checksum offloading and does not require additional copy operation in RX handler. The controller seems to have advanced offloading features like VLAN hardware tag insertion/stripping, TCP segmentation offload(TSO) as well as jumbo frame support but these features are not available yet. Special thanks to xclin cs dot nctu dot edu dot tw> who sent fix for receiving VLAN oversized frames. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e9f3c17..a0edfca 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -224,6 +224,7 @@ device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') +device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) -- cgit v1.1 From 7550515288b109b9d22639e1ebd6ee88eb569f7a Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 27 Apr 2010 10:50:09 +0000 Subject: MFC r206992: As was done in r155238 for i386 and in r155239 for amd64, clear the carry flag for ia32 binary executed on amd64 host in get_mcontext(). 
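For context, a short sketch of why the flag matters (mirroring the native amd64 get_mcontext(); only the comment text here is new): the FreeBSD syscall return convention reports an error to userland by setting the carry flag, so a context captured with GET_MC_CLEAR_RET, which is meant to look like a successful syscall return, must clear PSL_C in the saved eflags in addition to zeroing the return registers:

    mcp->mc_eflags = tp->tf_rflags;
    if (flags & GET_MC_CLEAR_RET) {
        /* Make the interrupted syscall appear to have returned 0. */
        mcp->mc_eax = 0;
        mcp->mc_edx = 0;
        mcp->mc_eflags &= ~PSL_C;   /* carry clear == no error */
    }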
--- sys/amd64/ia32/ia32_signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 2416988..15ba947 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -141,9 +141,11 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) mcp->mc_esi = tp->tf_rsi; mcp->mc_ebp = tp->tf_rbp; mcp->mc_isp = tp->tf_rsp; + mcp->mc_eflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; + mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_rax; mcp->mc_edx = tp->tf_rdx; @@ -152,7 +154,6 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) mcp->mc_ecx = tp->tf_rcx; mcp->mc_eip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; - mcp->mc_eflags = tp->tf_rflags; mcp->mc_esp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); -- cgit v1.1 From ef43ecae04b397691ada6c71a499614122172876 Mon Sep 17 00:00:00 2001 From: thompsa Date: Thu, 29 Apr 2010 22:44:04 +0000 Subject: MFC r207077 Change USB_DEBUG to #ifdef and allow it to be turned off. Previously this had the illusion of a tunable setting but was always turned on regardless. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index a0edfca..999ccb7 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -281,6 +281,7 @@ device firmware # firmware assist module device bpf # Berkeley packet filter # USB support +options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) -- cgit v1.1 From 01384213a705f4a7e4d5f874b8868b4e8330508f Mon Sep 17 00:00:00 2001 From: imp Date: Sun, 2 May 2010 06:20:42 +0000 Subject: Move to the new way of specifying compat options. The backs out the FOO = BAR form, in favor of listing the mapping in a separate file for more compatibility with older versions of config. --- sys/amd64/conf/GENERIC | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 999ccb7..7288fef 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,7 +54,8 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_FREEBSD32 # Compatible with i386 binaries +#options COMPAT_FREEBSD32 # Compatible with i386 binaries +options COMPAT_IA32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -- cgit v1.1 From 6231a81f242496fceb9b9cf0b3c45863b96efc78 Mon Sep 17 00:00:00 2001 From: imp Date: Sun, 2 May 2010 06:24:17 +0000 Subject: Revert 207494: it was only for testing purposes. --- sys/amd64/conf/GENERIC | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 7288fef..999ccb7 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,8 +54,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. 
options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -#options COMPAT_FREEBSD32 # Compatible with i386 binaries -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -- cgit v1.1 From 3e52c1f894aa8fc2b8031ef275b1fb830a25bb6f Mon Sep 17 00:00:00 2001 From: kib Date: Thu, 6 May 2010 04:57:10 +0000 Subject: MFC r207570: Style and comment adjustements. --- sys/amd64/amd64/exception.S | 79 ++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 65c6452..41bf173 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -50,14 +50,14 @@ .bss .globl dtrace_invop_jump_addr .align 8 - .type dtrace_invop_jump_addr, @object - .size dtrace_invop_jump_addr, 8 + .type dtrace_invop_jump_addr,@object + .size dtrace_invop_jump_addr,8 dtrace_invop_jump_addr: .zero 8 .globl dtrace_invop_calltrap_addr .align 8 - .type dtrace_invop_calltrap_addr, @object - .size dtrace_invop_calltrap_addr, 8 + .type dtrace_invop_calltrap_addr,@object + .size dtrace_invop_calltrap_addr,8 dtrace_invop_calltrap_addr: .zero 8 #endif @@ -157,7 +157,6 @@ IDTVEC(align) * kernel from userland. Reenable interrupts if they were enabled * before the trap. This approximates SDT_SYS386TGT on the i386 port. */ - SUPERALIGN_TEXT .globl alltraps .type alltraps,@function @@ -211,16 +210,16 @@ alltraps_pushregs_no_rdi: * Set our jump address for the jump back in the event that * the breakpoint wasn't caused by DTrace at all. */ - movq $calltrap, dtrace_invop_calltrap_addr(%rip) + movq $calltrap,dtrace_invop_calltrap_addr(%rip) /* Jump to the code hooked in by DTrace. */ - movq dtrace_invop_jump_addr, %rax + movq dtrace_invop_jump_addr,%rax jmpq *dtrace_invop_jump_addr #endif .globl calltrap .type calltrap,@function calltrap: - movq %rsp, %rdi + movq %rsp,%rdi call trap MEXITCOUNT jmp doreti /* Handle any pending ASTs */ @@ -274,9 +273,11 @@ IDTVEC(dblfault) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? 
*/ jz 1f /* already running with kernel GS.base */ swapgs -1: movq %rsp, %rdi +1: + movq %rsp,%rdi call dblfault_handler -2: hlt +2: + hlt jmp 2b IDTVEC(page) @@ -369,7 +370,7 @@ IDTVEC(fast_syscall) movq %r15,TF_R15(%rsp) /* C preserved */ movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) - movq %rsp, %rdi + movq %rsp,%rdi call syscall movq PCPU(CURPCB),%rax andq $~PCB_FULLCTX,PCB_FLAGS(%rax) @@ -456,7 +457,7 @@ nmi_fromuserspace: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) - movq %rsp, %rdi + movq %rsp,%rdi call trap MEXITCOUNT #ifdef HWPMC_HOOKS @@ -555,9 +556,9 @@ nmi_restoreregs: iretq ENTRY(fork_trampoline) - movq %r12, %rdi /* function */ - movq %rbx, %rsi /* arg1 */ - movq %rsp, %rdx /* trapframe pointer */ + movq %r12,%rdi /* function */ + movq %rbx,%rsi /* arg1 */ + movq %rsp,%rdx /* trapframe pointer */ call fork_exit MEXITCOUNT jmp doreti /* Handle any ASTs */ @@ -628,7 +629,7 @@ doreti_ast: testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) je doreti_exit sti - movq %rsp, %rdi /* pass a pointer to the trapframe */ + movq %rsp,%rdi /* pass a pointer to the trapframe */ call ast jmp doreti_ast @@ -648,8 +649,8 @@ doreti_exit: * Do not reload segment registers for kernel. * Since we do not reload segments registers with sane * values on kernel entry, descriptors referenced by - * segments registers may be not valid. This is fatal - * for the usermode, but is innocent for the kernel. + * segments registers might be not valid. This is fatal + * for user mode, but is not a problem for the kernel. */ testb $SEL_RPL_MASK,TF_CS(%rsp) jz ld_regs @@ -662,14 +663,16 @@ do_segs: /* Restore %fs and fsbase */ movw TF_FS(%rsp),%ax .globl ld_fs -ld_fs: movw %ax,%fs +ld_fs: + movw %ax,%fs cmpw $KUF32SEL,%ax jne 1f movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx .globl ld_fsbase -ld_fsbase: wrmsr +ld_fsbase: + wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si @@ -678,7 +681,8 @@ ld_fsbase: wrmsr movl $MSR_GSBASE,%ecx rdmsr .globl ld_gs -ld_gs: movw %si,%gs +ld_gs: + movw %si,%gs wrmsr popfq cmpw $KUG32SEL,%si @@ -687,12 +691,17 @@ ld_gs: movw %si,%gs movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx .globl ld_gsbase -ld_gsbase: wrmsr -1: .globl ld_es -ld_es: movw TF_ES(%rsp),%es +ld_gsbase: + wrmsr +1: + .globl ld_es +ld_es: + movw TF_ES(%rsp),%es .globl ld_ds -ld_ds: movw TF_DS(%rsp),%ds -ld_regs:movq TF_RDI(%rsp),%rdi +ld_ds: + movw TF_DS(%rsp),%ds +ld_regs: + movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx @@ -711,7 +720,8 @@ ld_regs:movq TF_RDI(%rsp),%rdi jz 1f /* keep running with kernel GS.base */ cli swapgs -1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ +1: + addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ .globl doreti_iret doreti_iret: iretq @@ -738,7 +748,8 @@ doreti_iret_fault: testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti -1: movw %fs,TF_FS(%rsp) +1: + movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) @@ -768,7 +779,7 @@ doreti_iret_fault: .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUDSEL,TF_DS(%rsp) jmp doreti @@ -777,7 +788,7 @@ ds_load_fault: .globl es_load_fault es_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUDSEL,TF_ES(%rsp) jmp doreti @@ -786,7 +797,7 @@ es_load_fault: .globl fs_load_fault fs_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi 
call trap movw $KUF32SEL,TF_FS(%rsp) jmp doreti @@ -796,7 +807,7 @@ fs_load_fault: gs_load_fault: popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti @@ -805,7 +816,7 @@ gs_load_fault: .globl fsbase_load_fault fsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 @@ -816,7 +827,7 @@ fsbase_load_fault: .globl gsbase_load_fault gsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 -- cgit v1.1 From b0947a989bb83db0b51032c99bcf668b5a86817f Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 8 May 2010 12:40:38 +0000 Subject: MFC r207463: Remove debugging code that was not used once since commit. --- sys/amd64/amd64/trap.c | 86 +------------------------------------------------- 1 file changed, 1 insertion(+), 85 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 8492f4e..1fa3d32 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -172,52 +172,6 @@ SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, extern char *syscallnames[]; -/* #define DEBUG 1 */ -#ifdef DEBUG -static void -report_seg_fault(const char *segn, struct trapframe *frame) -{ - struct proc_ldt *pldt; - struct trapframe *pf; - - pldt = curproc->p_md.md_ldt; - printf("%d: %s load fault %lx %p %d\n", - curproc->p_pid, segn, frame->tf_err, - pldt != NULL ? pldt->ldt_base : NULL, - pldt != NULL ? pldt->ldt_refcnt : 0); - kdb_backtrace(); - pf = (struct trapframe *)frame->tf_rsp; - printf("rdi %lx\n", pf->tf_rdi); - printf("rsi %lx\n", pf->tf_rsi); - printf("rdx %lx\n", pf->tf_rdx); - printf("rcx %lx\n", pf->tf_rcx); - printf("r8 %lx\n", pf->tf_r8); - printf("r9 %lx\n", pf->tf_r9); - printf("rax %lx\n", pf->tf_rax); - printf("rbx %lx\n", pf->tf_rbx); - printf("rbp %lx\n", pf->tf_rbp); - printf("r10 %lx\n", pf->tf_r10); - printf("r11 %lx\n", pf->tf_r11); - printf("r12 %lx\n", pf->tf_r12); - printf("r13 %lx\n", pf->tf_r13); - printf("r14 %lx\n", pf->tf_r14); - printf("r15 %lx\n", pf->tf_r15); - printf("fs %x\n", pf->tf_fs); - printf("gs %x\n", pf->tf_gs); - printf("es %x\n", pf->tf_es); - printf("ds %x\n", pf->tf_ds); - printf("tno %x\n", pf->tf_trapno); - printf("adr %lx\n", pf->tf_addr); - printf("flg %x\n", pf->tf_flags); - printf("err %lx\n", pf->tf_err); - printf("rip %lx\n", pf->tf_rip); - printf("cs %lx\n", pf->tf_cs); - printf("rfl %lx\n", pf->tf_rflags); - printf("rsp %lx\n", pf->tf_rsp); - printf("ss %lx\n", pf->tf_ss); -} -#endif - /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry @@ -314,9 +268,7 @@ trap(struct trapframe *frame) */ printf("kernel trap %d with interrupts disabled\n", type); -#ifdef DEBUG - report_seg_fault("hlt", frame); -#endif + /* * We shouldn't enable interrupts while holding a * spin lock or servicing an NMI. 
@@ -532,33 +484,21 @@ trap(struct trapframe *frame) goto out; } if (frame->tf_rip == (long)ld_ds) { -#ifdef DEBUG - report_seg_fault("ds", frame); -#endif frame->tf_rip = (long)ds_load_fault; frame->tf_ds = _udatasel; goto out; } if (frame->tf_rip == (long)ld_es) { -#ifdef DEBUG - report_seg_fault("es", frame); -#endif frame->tf_rip = (long)es_load_fault; frame->tf_es = _udatasel; goto out; } if (frame->tf_rip == (long)ld_fs) { -#ifdef DEBUG - report_seg_fault("fs", frame); -#endif frame->tf_rip = (long)fs_load_fault; frame->tf_fs = _ufssel; goto out; } if (frame->tf_rip == (long)ld_gs) { -#ifdef DEBUG - report_seg_fault("gs", frame); -#endif frame->tf_rip = (long)gs_load_fault; frame->tf_gs = _ugssel; goto out; @@ -664,30 +604,6 @@ trap(struct trapframe *frame) ksi.ksi_addr = (void *)addr; trapsignal(td, &ksi); -#ifdef DEBUG -{ - register_t rg,rgk, rf; - - if (type <= MAX_TRAP_MSG) { - uprintf("fatal process exception: %s", - trap_msg[type]); - if ((type == T_PAGEFLT) || (type == T_PROTFLT)) - uprintf(", fault VA = 0x%lx", frame->tf_addr); - uprintf("\n"); - } - rf = rdmsr(0xc0000100); - rg = rdmsr(0xc0000101); - rgk = rdmsr(0xc0000102); - uprintf("pid %d TRAP %d rip %lx err %lx addr %lx cs %lx ss %lx ds %x " - "es %x fs %x fsbase %lx %lx gs %x gsbase %lx %lx %lx\n", - curproc->p_pid, type, frame->tf_rip, frame->tf_err, - frame->tf_addr, - frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, - frame->tf_fs, td->td_pcb->pcb_fsbase, rf, - frame->tf_gs, td->td_pcb->pcb_gsbase, rg, rgk); -} -#endif - user: userret(td, frame); mtx_assert(&Giant, MA_NOTOWNED); -- cgit v1.1 From 21d551ae0224e62964401c9da958f37d5a6cd5bb Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 8 May 2010 18:54:47 +0000 Subject: MFC r204051 (by imp): n64 has a different size for KINFO_PROC_SIZE. Approved by: imp MFC r207152: Move the constants specifying the size of struct kinfo_proc into machine-specific header files. Add KINFO_PROC32_SIZE for struct kinfo_proc32 for architectures providing COMPAT_FREEBSD32. Add CTASSERT for the size of struct kinfo_proc32. MFC r207269: Style: use #define instead of #define. --- sys/amd64/include/proc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 33d5181..acea4c0 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -53,6 +53,9 @@ struct mdproc { struct system_segment_descriptor md_ldt_sd; }; +#define KINFO_PROC_SIZE 1088 +#define KINFO_PROC32_SIZE 768 + #ifdef _KERNEL /* Get the current kernel thread stack usage. */ -- cgit v1.1 From ff311c2c9afe9a565c19f3e16e22cbff786b27cd Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 12 May 2010 09:34:10 +0000 Subject: MFC r207676: Add definitions for Intel AESNI CPUID bits and print the capabilities on boot. 
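As a usage sketch (the function name have_aesni is illustrative only; the macro and the cpu_feature2 word it tests are the ones handled in the change below), kernel code can check for the instruction set at runtime once CPU identification has run:

    #include <machine/specialreg.h>

    extern u_int cpu_feature2;      /* CPUID leaf 1 %ecx, filled in by identcpu.c */

    /* Sketch: true if the CPU advertises the AES-NI instructions. */
    static int
    have_aesni(void)
    {

        return ((cpu_feature2 & CPUID2_AESNI) != 0);
    }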
--- sys/amd64/amd64/identcpu.c | 4 ++-- sys/amd64/include/specialreg.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index b0da729..287c9c2 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -240,7 +240,7 @@ printcpuinfo(void) printf("\n Features2=0x%b", cpu_feature2, "\020" "\001SSE3" /* SSE3 */ - "\002" + "\002PCLMULQDQ" /* Carry-Less Mul Quadword */ "\003DTES64" /* 64-bit Debug Trace */ "\004MON" /* MONITOR/MWAIT Instructions */ "\005DS_CPL" /* CPL Qualified Debug Store */ @@ -264,7 +264,7 @@ printcpuinfo(void) "\027MOVBE" "\030POPCNT" "\031" - "\032" + "\032AESNI" /* AES Crypto*/ "\033XSAVE" "\034OSXSAVE" "\035" diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 86a08ce..895619c 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -113,6 +113,7 @@ #define CPUID_PBE 0x80000000 #define CPUID2_SSE3 0x00000001 +#define CPUID2_PCLMULQDQ 0x00000002 #define CPUID2_DTES64 0x00000004 #define CPUID2_MON 0x00000008 #define CPUID2_DS_CPL 0x00000010 @@ -131,6 +132,7 @@ #define CPUID2_X2APIC 0x00200000 #define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 +#define CPUID2_AESNI 0x02000000 /* * Important bits in the AMD extended cpuid flags -- cgit v1.1 From 539772c2021ceec4eb869d973b2537e528dc9bb5 Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 19 May 2010 09:30:41 +0000 Subject: MFC r207957: Remove unneeded overrides of the segment registers. --- sys/amd64/amd64/trap.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 1fa3d32..4891e9d 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -485,22 +485,18 @@ trap(struct trapframe *frame) } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; - frame->tf_ds = _udatasel; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; - frame->tf_es = _udatasel; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; - frame->tf_fs = _ufssel; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; - frame->tf_gs = _ugssel; goto out; } if (frame->tf_rip == (long)ld_gsbase) { -- cgit v1.1 From 857402565bb4044d8ad7fca73b6aa5b8195eda3b Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 19 May 2010 09:32:59 +0000 Subject: MFC r207958: Route all returns from the interrupts and faults through the doreti_iret labeled iretq instruction. MFC r208026: Do not use .extern. --- sys/amd64/amd64/apic_vector.S | 12 ++++++------ sys/amd64/amd64/exception.S | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index df94a47..4cfc18b 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -81,7 +81,7 @@ IDTVEC(spuriousint) /* No EOI cycle used here */ - iretq + jmp doreti_iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) @@ -135,7 +135,7 @@ IDTVEC(invltlb) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Single page TLB shootdown @@ -155,7 +155,7 @@ IDTVEC(invlpg) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Page range TLB shootdown. @@ -181,7 +181,7 @@ IDTVEC(invlrng) popq %rdx popq %rax - iretq + jmp doreti_iret /* * Invalidate cache. 
@@ -200,7 +200,7 @@ IDTVEC(invlcache) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Handler for IPIs sent via the per-cpu IPI bitmap. @@ -247,7 +247,7 @@ IDTVEC(cpususpend) call cpususpend_handler POP_FRAME - iretq + jmp doreti_iret /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 41bf173..0de197b 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -553,7 +553,7 @@ nmi_restoreregs: movq TF_R14(%rsp),%r14 movq TF_R15(%rsp),%r15 addq $TF_RIP,%rsp - iretq + jmp doreti_iret ENTRY(fork_trampoline) movq %r12,%rdi /* function */ -- cgit v1.1 From 827618ec09d24ec70e60ed16c4e245b5fc3b7f76 Mon Sep 17 00:00:00 2001 From: attilio Date: Tue, 1 Jun 2010 21:19:58 +0000 Subject: MFC r207329, r208716: - Extract the IODEV_PIO interface from ia64 and make it MI. - On i386 and amd64 the old behaviour is kept but multithreaded processes must use the new interface in order to work well. - Support for the other architectures is greatly improved. Sponsored by: Sandvine Incorporated Approved by: re (kib, bz) --- sys/amd64/amd64/io.c | 40 ++++++---------------------------------- sys/amd64/include/iodev.h | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 37 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/io.c b/sys/amd64/amd64/io.c index 09d6e89..c2d0d51 100644 --- a/sys/amd64/amd64/io.c +++ b/sys/amd64/amd64/io.c @@ -28,60 +28,32 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include - -#include -#include - #include +#include -/* ARGSUSED */ int -ioopen(struct cdev *dev __unused, int flags __unused, int fmt __unused, - struct thread *td) +iodev_open(struct thread *td) { - int error; - - error = priv_check(td, PRIV_IO); - if (error != 0) - return (error); - error = securelevel_gt(td->td_ucred, 0); - if (error != 0) - return (error); td->td_frame->tf_rflags |= PSL_IOPL; - return (0); } -/* ARGSUSED */ int -ioclose(struct cdev *dev __unused, int flags __unused, int fmt __unused, - struct thread *td) +iodev_close(struct thread *td) { - td->td_frame->tf_rflags &= ~PSL_IOPL; + td->td_frame->tf_rflags &= ~PSL_IOPL; return (0); } /* ARGSUSED */ int -ioioctl(struct cdev *dev __unused, u_long cmd __unused, caddr_t data __unused, - int fflag __unused, struct thread *td __unused) +iodev_ioctl(u_long cmd __unused, caddr_t data __unused) { - return (ENXIO); + return (ENOIOCTL); } diff --git a/sys/amd64/include/iodev.h b/sys/amd64/include/iodev.h index 1a0a17a..9f53cac 100644 --- a/sys/amd64/include/iodev.h +++ b/sys/amd64/include/iodev.h @@ -25,7 +25,22 @@ * * $FreeBSD$ */ +#ifndef _MACHINE_IODEV_H_ +#define _MACHINE_IODEV_H_ -d_open_t ioopen; -d_close_t ioclose; -d_ioctl_t ioioctl; +#ifdef _KERNEL +#include + +#define iodev_read_1 inb +#define iodev_read_2 inw +#define iodev_read_4 inl +#define iodev_write_1 outb +#define iodev_write_2 outw +#define iodev_write_4 outl + +int iodev_open(struct thread *td); +int iodev_close(struct thread *td); +int iodev_ioctl(u_long cmd, caddr_t data); + +#endif /* _KERNEL */ +#endif /* _MACHINE_IODEV_H_ */ -- cgit v1.1 From 8001a4e77e6e6359e73988a3515e468ade187b04 Mon Sep 17 00:00:00 2001 From: ken Date: Fri, 11 Jun 2010 19:17:36 +0000 Subject: MFC 199549, 199997, 204158, 207673, and 208901. 
Bring in a number of netfront changes: r199549 | jhb Remove commented out reference to if_watchdog and an assignment of zero to if_timer. Reviewed by: scottl r199997 | gibbs Add media ioctl support and link notifications so that devd will attempt to run dhclient on a netfront (xn) device that is setup for DHCP in /etc/rc.conf. PR: kern/136251 (fixed differently than the submitted patch) r204158 | kmacy - make printf conditional - fix witness warnings by making configuration lock a mutex r207673 | joel Switch to our preferred 2-clause BSD license. Approved by: kmacy r208901 | ken A number of netfront fixes and stability improvements: - Re-enable TSO. This was broken previously due to CSUM_TSO clearing the CSUM_TCP flag, so our checksum flags were incorrectly set going to the netback driver. That was fixed in r206844 in tcp_output.c, so we can turn TSO back on here. - Fix the way transmit slots are calculated, so that we can't overfill the ring. - Avoid sending packets with more fragments/segments than netback can handle. The Linux netback code can only handle packets of MAX_SKB_FRAGS, which turns out to be 18 on machines with 4K pages. We can easily generate packets with 32 or so fragments with TSO turned on. Right now the solution is just to drop the packets (since netback doesn't seem to handle it gracefully), but we should come up with a way to allow a driver to tell the TCP stack the maximum number of fragments it can handle in a single packet. - Fix the way the consumer is tracked in the receive path. It could get out of sync fairly easily. - Use standard Xen ring macros to make it clearer how netfront is using the rings. - Get rid of Linux-ish negative errno return values. - Added more documentation to the driver. - Refactored code to make it easier to read. - Some other minor fixes. Reviewed by: gibbs Sponsored by: Spectra Logic Approved by: re (bz) --- sys/amd64/include/xen/xenfunc.h | 31 +++++++++++++++---------------- sys/amd64/include/xen/xenvar.h | 26 ++++++++++++-------------- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h index b3a6672..d03d4f6 100644 --- a/sys/amd64/include/xen/xenfunc.h +++ b/sys/amd64/include/xen/xenfunc.h @@ -1,6 +1,5 @@ -/* - * - * Copyright (c) 2004,2005 Kip Macy +/*- + * Copyright (c) 2004, 2005 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -11,22 +10,22 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ */ - #ifndef _XEN_XENFUNC_H_ #define _XEN_XENFUNC_H_ diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h index 1433b76..d9dbc5d 100644 --- a/sys/amd64/include/xen/xenvar.h +++ b/sys/amd64/include/xen/xenvar.h @@ -1,29 +1,27 @@ -/* +/*- * Copyright (c) 2008 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ */ -- cgit v1.1 From d45b7f14ae6fa78882fa9ec3be976733ca4767b4 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 13 May 2011 04:54:01 +0000 Subject: Import of bhyve hypervisor and utilities, part 1. vmm.ko - kernel module for VT-x, VT-d and hypervisor control bhyve - user-space sequencer and i/o emulation vmmctl - dump of hypervisor register state libvmm - front-end to vmm.ko chardev interface bhyve was designed and implemented by Neel Natu. Thanks to the following folk from NetApp who helped to make this available: Joe CaraDonna Peter Snyder Jeff Heller Sandeep Mann Steve Miller Brian Pawlowski --- sys/amd64/include/specialreg.h | 1 + sys/amd64/include/vmm.h | 268 ++++++ sys/amd64/include/vmm_dev.h | 191 ++++ sys/amd64/vmm/amd/amdv.c | 247 ++++++ sys/amd64/vmm/intel/ept.c | 312 +++++++ sys/amd64/vmm/intel/ept.h | 42 + sys/amd64/vmm/intel/vmcs.c | 451 ++++++++++ sys/amd64/vmm/intel/vmcs.h | 324 +++++++ sys/amd64/vmm/intel/vmx.c | 1673 ++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/intel/vmx.h | 115 +++ sys/amd64/vmm/intel/vmx_controls.h | 92 ++ sys/amd64/vmm/intel/vmx_cpufunc.h | 199 +++++ sys/amd64/vmm/intel/vmx_genassym.c | 81 ++ sys/amd64/vmm/intel/vmx_msr.c | 172 ++++ sys/amd64/vmm/intel/vmx_msr.h | 78 ++ sys/amd64/vmm/intel/vmx_support.S | 204 +++++ sys/amd64/vmm/intel/vtd.c | 637 ++++++++++++++ sys/amd64/vmm/io/iommu.c | 230 +++++ sys/amd64/vmm/io/iommu.h | 67 ++ sys/amd64/vmm/io/ppt.c | 449 ++++++++++ sys/amd64/vmm/io/ppt.h | 40 + sys/amd64/vmm/io/vdev.c | 270 ++++++ sys/amd64/vmm/io/vdev.h | 84 ++ sys/amd64/vmm/io/vlapic.c | 812 +++++++++++++++++ sys/amd64/vmm/io/vlapic.h | 105 +++ sys/amd64/vmm/vmm.c | 737 ++++++++++++++++ sys/amd64/vmm/vmm_dev.c | 468 ++++++++++ sys/amd64/vmm/vmm_ipi.c | 103 +++ sys/amd64/vmm/vmm_ipi.h | 38 + sys/amd64/vmm/vmm_ktr.h | 51 ++ sys/amd64/vmm/vmm_lapic.c | 121 +++ sys/amd64/vmm/vmm_lapic.h | 64 ++ sys/amd64/vmm/vmm_mem.c | 413 +++++++++ sys/amd64/vmm/vmm_mem.h | 38 + sys/amd64/vmm/vmm_msr.c | 264 ++++++ sys/amd64/vmm/vmm_msr.h | 42 + sys/amd64/vmm/vmm_stat.c | 103 +++ sys/amd64/vmm/vmm_stat.h | 71 ++ sys/amd64/vmm/vmm_support.S | 42 + sys/amd64/vmm/vmm_util.c | 111 +++ sys/amd64/vmm/vmm_util.h | 40 + sys/amd64/vmm/x86.c | 113 +++ sys/amd64/vmm/x86.h | 62 ++ 43 files changed, 10025 insertions(+) create mode 100644 sys/amd64/include/vmm.h create mode 100644 sys/amd64/include/vmm_dev.h create mode 100644 sys/amd64/vmm/amd/amdv.c create mode 100644 sys/amd64/vmm/intel/ept.c create mode 100644 sys/amd64/vmm/intel/ept.h create mode 100644 sys/amd64/vmm/intel/vmcs.c create mode 100644 sys/amd64/vmm/intel/vmcs.h create mode 100644 sys/amd64/vmm/intel/vmx.c create mode 100644 sys/amd64/vmm/intel/vmx.h create mode 100644 sys/amd64/vmm/intel/vmx_controls.h create mode 100644 sys/amd64/vmm/intel/vmx_cpufunc.h create mode 100644 sys/amd64/vmm/intel/vmx_genassym.c create mode 100644 sys/amd64/vmm/intel/vmx_msr.c create mode 100644 sys/amd64/vmm/intel/vmx_msr.h create mode 100644 sys/amd64/vmm/intel/vmx_support.S create mode 
100644 sys/amd64/vmm/intel/vtd.c create mode 100644 sys/amd64/vmm/io/iommu.c create mode 100644 sys/amd64/vmm/io/iommu.h create mode 100644 sys/amd64/vmm/io/ppt.c create mode 100644 sys/amd64/vmm/io/ppt.h create mode 100644 sys/amd64/vmm/io/vdev.c create mode 100644 sys/amd64/vmm/io/vdev.h create mode 100644 sys/amd64/vmm/io/vlapic.c create mode 100644 sys/amd64/vmm/io/vlapic.h create mode 100644 sys/amd64/vmm/vmm.c create mode 100644 sys/amd64/vmm/vmm_dev.c create mode 100644 sys/amd64/vmm/vmm_ipi.c create mode 100644 sys/amd64/vmm/vmm_ipi.h create mode 100644 sys/amd64/vmm/vmm_ktr.h create mode 100644 sys/amd64/vmm/vmm_lapic.c create mode 100644 sys/amd64/vmm/vmm_lapic.h create mode 100644 sys/amd64/vmm/vmm_mem.c create mode 100644 sys/amd64/vmm/vmm_mem.h create mode 100644 sys/amd64/vmm/vmm_msr.c create mode 100644 sys/amd64/vmm/vmm_msr.h create mode 100644 sys/amd64/vmm/vmm_stat.c create mode 100644 sys/amd64/vmm/vmm_stat.h create mode 100644 sys/amd64/vmm/vmm_support.S create mode 100644 sys/amd64/vmm/vmm_util.c create mode 100644 sys/amd64/vmm/vmm_util.h create mode 100644 sys/amd64/vmm/x86.c create mode 100644 sys/amd64/vmm/x86.h (limited to 'sys/amd64') diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619c..c95fee0 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -297,6 +297,7 @@ */ #define APICBASE_RESERVED 0x000006ff #define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h new file mode 100644 index 0000000..0f4c356 --- /dev/null +++ b/sys/amd64/include/vmm.h @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vlapic; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct vm_exit *vmexit); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, + size_t length, vm_memattr_t attr, + int prot, boolean_t superpages_ok); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, + int type, int vector, + uint32_t code, int code_valid); +typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_func_t vmmmap; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_inject_event_t vminject; + vmi_inject_nmi_t vmnmi; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +struct vm *vm_create(const char *name); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid); +int vm_set_pinning(struct vm *vm, int vcpu, int cpuid); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_event(struct vm *vm, int vcpu, int type, + int vector, uint32_t error_code, int error_code_valid); +int vm_inject_nmi(struct vm *vm, int vcpu); +uint64_t *vm_guest_msrs(struct vm *vm, int cpu); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +void vm_activate_cpu(struct vm *vm, int vcpu); +cpumask_t vm_active_cpus(struct vm *vm); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
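The vmm_ops table declared above is the seam between the hardware-independent vmm code and a vendor backend: vmm_ops_intel and vmm_ops_amd are the two instances, and per-VM work is reached through the function pointers rather than by direct calls. A rough sketch of that dispatch follows; the selection of the table and the helper name are assumptions for illustration, not code from this import (the real indirection presumably lives in vmm.c, added later in this patch).

static struct vmm_ops *ops;	/* &vmm_ops_intel or &vmm_ops_amd, chosen at init */

static int
run_one_vcpu(void *vmi, int vcpu, register_t rip, struct vm_exit *vmexit)
{
	/* 'vmi' is the per-VM cookie returned earlier by ops->vminit(vm). */
	if (ops == NULL || ops->vmrun == NULL)
		return (ENXIO);
	return ((*ops->vmrun)(vmi, vcpu, rip, vmexit));
}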
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +#define VCPU_STOPPED 0 +#define VCPU_RUNNING 1 +void vm_set_run_state(struct vm *vm, int vcpu, int running); +int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); + +void *vcpu_stats(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +static cpumask_t __inline +vcpu_mask(int vcpuid) +{ + return ((cpumask_t)1 << vcpuid); +} + +#endif /* KERNEL */ + +#define VM_MAXCPU 8 /* maximum virtual cpus */ + +/* + * Identifiers for events that can be injected into the VM + */ +enum vm_event_type { + VM_EVENT_NONE, + VM_HW_INTR, + VM_NMI, + VM_HW_EXCEPTION, + VM_SW_INTR, + VM_PRIV_SW_EXCEPTION, + VM_SW_EXCEPTION, + VM_EVENT_MAX +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_LAST +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_MAX, +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; /* out is 0, in is 1 */ + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ + } inout; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int error; /* vmx inst error */ + uint32_t exit_reason; + uint64_t exit_qualification; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h new file mode 100644 index 0000000..1b143b5 --- /dev/null +++ b/sys/amd64/include/vmm_dev.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +void vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t hpa; /* out */ + vm_paddr_t gpa; /* in */ + size_t len; /* in */ +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_pin { + int vm_cpuid; + int host_cpuid; /* -1 to unpin */ +}; + +struct vm_run { + int cpuid; + uint64_t rip; /* start running here */ + struct vm_exit vm_exit; +}; + +struct vm_event { + int cpuid; + enum vm_event_type type; + int vector; + uint32_t error_code; + int error_code_valid; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + int vector; + int destcpu; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +enum { + IOCNUM_RUN, + IOCNUM_SET_PINNING, + IOCNUM_GET_PINNING, + IOCNUM_MAP_MEMORY, + IOCNUM_GET_MEMORY_SEG, + IOCNUM_SET_REGISTER, + IOCNUM_GET_REGISTER, + IOCNUM_SET_SEGMENT_DESCRIPTOR, + IOCNUM_GET_SEGMENT_DESCRIPTOR, + IOCNUM_INJECT_EVENT, + IOCNUM_LAPIC_IRQ, + IOCNUM_SET_CAPABILITY, + IOCNUM_GET_CAPABILITY, + IOCNUM_BIND_PPTDEV, + IOCNUM_UNBIND_PPTDEV, + IOCNUM_MAP_PPTDEV_MMIO, + IOCNUM_PPTDEV_MSI, + IOCNUM_INJECT_NMI, + IOCNUM_VM_STATS, + IOCNUM_VM_STAT_DESC, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SET_PINNING \ + _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) +#define VM_GET_PINNING \ + _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', 
IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EVENT \ + _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#endif diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 0000000..41e937a --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
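These ioctls are the raw chardev interface that, per the commit message, libvmm fronts for bhyve. A minimal user-space caller might look like the sketch below; the device path argument, the header install paths and the error handling are assumptions made for the example, while struct vm_run and VM_RUN come from the header above.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>	/* assumed install paths for the two new headers */
#include <machine/vmm_dev.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
run_vcpu0(const char *devpath, uint64_t entry_rip)
{
	struct vm_run vmrun;
	int fd;

	fd = open(devpath, O_RDWR);	/* device node created by vmm.ko */
	if (fd < 0)
		return (-1);

	vmrun.cpuid = 0;
	vmrun.rip = entry_rip;		/* start running here */
	if (ioctl(fd, VM_RUN, &vmrun) < 0) {
		close(fd);
		return (-1);
	}

	/* vm_exit tells the user-space sequencer why the guest stopped. */
	printf("exitcode %d, guest rip 0x%lx\n",
	    (int)vmrun.vm_exit.exitcode, (unsigned long)vmrun.vm_exit.rip);
	close(fd);
	return (0);
}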
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include "io/iommu.h" + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_inject_event(void *vmi, int vcpu, int type, int vector, + uint32_t error_code, int error_code_valid) +{ + + printf("amdv_inject_event: not implemented\n"); + return (EINVAL); +} + +static int +amdv_nmi(void *arg, int vcpu) +{ + + printf("amdv_nmi: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_inject_event, + amdv_nmi, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + 
amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, +}; diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 0000000..c9fca9d --- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include "vmx_cpufunc.h" +#include "vmx_msr.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for 
each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * XXX should we enforce this memory type by setting the ignore PAT + * bit to 1. 
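To make the size-selection loop above concrete, take a hypothetical call (values invented for illustration) with spok true, 2MB PDE superpages reported by ept_init(), no 1GB support, gpa = hpa = 0x200000 and length = 4MB:

    spshift starts at 12 + 3*9 = 39 (512GB): no such bit in page_sizes_mask, step down
    spshift = 30 (1GB): bit not set either, step down
    spshift = 21 (2MB): bit set, gpa and hpa are 2MB-aligned, length >= 2MB, stop

The page-table walk then stops at the PDE level (nlevels == 1, ptpshift == 21), EPT_PG_SUPERPAGE is set on that entry, and the function returns 1UL << 21; ept_vmmmap() subtracts 2MB from 'len' and loops once more for the second half of the region.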
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 0000000..013c330 --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 0000000..80d45cc --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include "vmcs.h" +#include "vmx_cpufunc.h" +#include "ept.h" +#include "vmx.h" + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. 
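The sign-bit escape hatch described above is what the VMCS_IDENT() macro later in vmcs.h packages up: OR-ing 0x80000000 into a raw field encoding makes the ident negative, and vmcs_getreg()/vmcs_setreg() simply strip the bit again. A small usage sketch (the helper name and the choice of field are only an example):

static int
read_guest_interruptibility(struct vmcs *vmcs, uint64_t *val)
{
	/*
	 * VMCS_IDENT(0x00004824) is 0x80004824, which is negative as an
	 * int, so the 'ident < 0' path below is taken and the raw
	 * encoding is recovered with (ident & 0x7fffffff) instead of
	 * going through vmcs_field_encoding().
	 */
	return (vmcs_getreg(vmcs, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY),
	    val));
}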
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + VMPTRLD(vmcs); + error = vmread(encoding, retval); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + VMPTRLD(vmcs); + error = vmwrite(encoding, val); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat; + uint32_t exc_bitmap; + + codesel = GSEL(GCODE_SEL, SEL_KPL); + datasel = GSEL(GDATA_SEL, SEL_KPL); + tsssel = GSEL(GPROC0_SEL, SEL_KPL); + + /* + * Make sure we have a "current" VMCS to work with. 
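One more note on the MSR save area registered by vmcs_set_msr_save() above: it is simply an array of the struct msr_entry records declared in vmcs.h, and because the VM-exit MSR-store pointer and the VM-entry MSR-load pointer aim at the same buffer, whatever the guest last had in those MSRs is exactly what is reloaded on the next entry. A sketch of wiring one up follows; the static allocation and the vtophys() call are assumptions made for the example, not code from this file.

static int
guest_msr_area_setup(struct vmcs *vmcs)
{
	static struct msr_entry g_area[] = {
		{ MSR_KGSBASE, 0, 0 },	/* index, reserved, initial value */
	};

	/* The MSR-store/load area pointers take a physical address. */
	return (vmcs_set_msr_save(vmcs, vtophys(g_area),
	    sizeof(g_area) / sizeof(g_area[0])));
}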
+ */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = rdmsr(MSR_PAT); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = rdmsr(MSR_EFER); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + cr0 = rcr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = rcr4(); + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + if (error != 0) + panic("vmcs_read(%u) error %d", encoding, error); + + return (val); +} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 0000000..c633a59 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval); +int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +uint64_t vmcs_read(uint32_t encoding); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) + +#endif /* _KERNEL */ + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_EPTP 0x0000201A + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 +#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define 
VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define 
EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC 44 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * VMCS interrupt information fields + */ +#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) +#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 0000000..ec181c4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,1673 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_msr.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vmx_msr.h" +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "x86.h" +#include "vmx_controls.h" + +#define CR4_VMXE (1UL << 13) + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_PAT | \ + VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define HANDLED 1 +#define UNHANDLED 0 + +MALLOC_DEFINE(M_VMX, "vmx", "vmx"); + +extern struct pcpu __pcpu[]; + +static int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static volatile u_int nextvpid; + +/* + * Virtual NMI blocking conditions. + * + * Some processor implementations also require NMI to be blocked if + * the STI_BLOCKING bit is set. It is possible to detect this at runtime + * based on the (exit_reason,exit_qual) tuple being set to + * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). + * + * We take the easy way out and also include STI_BLOCKING as one of the + * gating items for vNMI injection. 
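Putting the blocking conditions above to use: before a virtual NMI may be injected, the guest-interruptibility field has to be clear of every bit in the nmi_blocking_bits mask declared just below. The following is only a sketch of that gate; the helper name, the error values and the final injection write are illustrative rather than lifted from this file.

static int
nmi_injection_gate(void)
{
	uint64_t interruptibility;
	uint32_t info;

	if (vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility) != 0)
		return (EINVAL);

	/* Hold off while MOVSS, NMI or STI blocking is in effect. */
	if ((interruptibility & nmi_blocking_bits) != 0)
		return (EAGAIN);

	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= 2;			/* NMIs are delivered through vector 2 */
	return (vmwrite(VMCS_ENTRY_INTR_INFO, info));
}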
+ */ +static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | + VMCS_INTERRUPTIBILITY_NMI_BLOCKING | + VMCS_INTERRUPTIBILITY_STI_BLOCKING; + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC: + return "apic"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), 
"setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ + int error; + unsigned int regs[4]; + uint64_t fixed0, fixed1; + uint32_t tmp; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + do_cpuid(1, regs); + if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + 
printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + return (0); +} + +/* + * If this processor does not support VPIDs then simply return 0. + * + * Otherwise generate the next value of VPID to use. Any value is alright + * as long as it is non-zero. + * + * We always execute in VMX non-root context with EPT enabled. Thus all + * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This + * in turn means that multiple VMs can share the same VPID as long as + * they have distinct EPT page tables. + * + * XXX + * We should optimize this so that it returns VPIDs that are not in + * use. Then we will not unnecessarily invalidate mappings in + * vmx_set_pcpu_defaults() just because two or more vcpus happen to + * use the same 'vpid'. + */ +static uint16_t +vmx_vpid(void) +{ + uint16_t vpid = 0; + + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { + do { + vpid = atomic_fetchadd_int(&nextvpid, 1); + } while (vpid == 0); + } + + return (vpid); +} + +static int +vmx_setup_cr0_shadow(struct vmcs *vmcs) +{ + int error; + uint64_t mask, shadow; + + mask = cr0_ones_mask | cr0_zeros_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + if (error) + return (error); + + shadow = cr0_ones_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + if (error) + return (error); + + return (0); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid; + int i, error, guest_msr_count; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * Guest KGSBASE is saved and restored in the guest MSR save area. + * Host KGSBASE is restored before returning to userland from the pcb. + * There will be a window of time when we are executing in the host + * kernel context with a value of KGSBASE from the guest. This is ok + * because the value of KGSBASE is inconsequential in kernel context. 
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_KGSBASE) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest msr access"); + + for (i = 0; i < VM_MAXCPU; i++) { + vmx->vmcs[i].identifier = vmx_revision(); + error = vmclear(&vmx->vmcs[i]); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vpid = vmx_vpid(); + + error = vmcs_set_defaults(&vmx->vmcs[i], + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].request_nmi = 0; + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid; + + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(&vmx->vmcs[i], + vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); +#if 0 + printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", + __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, + vmxctx->guest_rcx, vmxctx->guest_rdx, handled); +#endif + + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled, int astpending) +{ +#ifdef KTR + VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); + + if (astpending) + VMM_CTR0(vmx->vm, vcpu, "astpending"); +#endif +} + +static int +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + int error, lastcpu; + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; + + vmxstate = &vmx->state[vcpu]; + lastcpu = vmxstate->lastcpu; + vmxstate->lastcpu = curcpu; + + if (lastcpu == curcpu) { + error = 0; + goto done; + } + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + if (error != 0) + goto done; + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +done: + return (error); +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); +} + +static int +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + int error; + uint64_t info, interruptibility; + + /* Bail out if no NMI requested */ + if (vmx->state[vcpu].request_nmi == 0) + return (0); + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_nmi: vmread(interruptibility) %d", + error); + } + if (interruptibility & nmi_blocking_bits) + goto nmiblocked; + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info |= IDT_NMI; + + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); + + VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vmx->state[vcpu].request_nmi = 0; + return (1); + +nmiblocked: + /* + * Set the NMI Window Exiting execution control so we can inject + * the virtual NMI as soon as blocking condition goes away. 
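+ *
+ * request_nmi is deliberately left set here; the EXIT_REASON_NMI_WINDOW
+ * handler only clears the window-exiting control, and the injection is
+ * retried on the next VM entry.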
+ */ + vmx_set_nmi_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + return (1); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu) +{ + int error, vector; + uint64_t info, rflags, interruptibility; + + const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; + +#if 1 + /* + * XXX + * If an event is being injected from userland then just return. + * For e.g. we may inject a breakpoint exception to cause the + * guest to enter the debugger so we can inspect its state. + */ + error = vmread(VMCS_ENTRY_INTR_INFO, &info); + if (error) + panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); + if (info & VMCS_INTERRUPTION_INFO_VALID) + return; +#endif + /* + * NMI injection has priority so deal with those first + */ + if (vmx_inject_nmi(vmx, vcpu)) + return; + + /* Ask the local apic for a vector to inject */ + vector = lapic_pending_intr(vmx->vm, vcpu); + if (vector < 0) + return; + + if (vector < 32 || vector > 255) + panic("vmx_inject_interrupts: invalid vector %d\n", vector); + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + error = vmread(VMCS_GUEST_RFLAGS, &rflags); + if (error) + panic("vmx_inject_interrupts: vmread(rflags) %d", error); + + if ((rflags & PSL_I) == 0) + goto cantinject; + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_interrupts: vmread(interruptibility) %d", + error); + } + if (interruptibility & HWINTR_BLOCKED) + goto cantinject; + + /* Inject the interrupt */ + info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info |= vector; + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); + + /* Update the Local APIC ISR */ + lapic_intr_accepted(vmx->vm, vcpu, vector); + + VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); +} + +static int +vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + int error; + uint64_t regval; + const struct vmxctx *vmxctx; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xff) != 0x00) + return (UNHANDLED); + + vmxctx = &vmx->ctx[vcpu]; + + /* + * We must use vmwrite() directly here because vmcs_setreg() will + * call vmclear(vmcs) as a side-effect which we certainly don't want. 
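+ *
+ * Bits 11:8 of the exit qualification encode the source register of
+ * the "mov to %cr0" (e.g. a value of 3 selects %rbx); the switch
+ * below maps that encoding onto the registers saved in the vmxctx.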
+ */ + switch ((exitqual >> 8) & 0xf) { + case 0: + regval = vmxctx->guest_rax; + break; + case 1: + regval = vmxctx->guest_rcx; + break; + case 2: + regval = vmxctx->guest_rdx; + break; + case 3: + regval = vmxctx->guest_rbx; + break; + case 4: + error = vmread(VMCS_GUEST_RSP, ®val); + if (error) { + panic("vmx_emulate_cr_access: " + "error %d reading guest rsp", error); + } + break; + case 5: + regval = vmxctx->guest_rbp; + break; + case 6: + regval = vmxctx->guest_rsi; + break; + case 7: + regval = vmxctx->guest_rdi; + break; + case 8: + regval = vmxctx->guest_r8; + break; + case 9: + regval = vmxctx->guest_r9; + break; + case 10: + regval = vmxctx->guest_r10; + break; + case 11: + regval = vmxctx->guest_r11; + break; + case 12: + regval = vmxctx->guest_r12; + break; + case 13: + regval = vmxctx->guest_r13; + break; + case 14: + regval = vmxctx->guest_r14; + break; + case 15: + regval = vmxctx->guest_r15; + break; + } + + regval |= cr0_ones_mask; + regval &= ~cr0_zeros_mask; + error = vmwrite(VMCS_GUEST_CR0, regval); + if (error) + panic("vmx_emulate_cr_access: error %d writing cr0", error); + + return (HANDLED); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int handled; + struct vmcs *vmcs; + struct vmxctx *vmxctx; + uint32_t eax, ecx, edx; + uint64_t qual; + + handled = 0; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + switch (vmexit->u.vmx.exit_reason) { + case EXIT_REASON_CR_ACCESS: + handled = vmx_emulate_cr_access(vmx, vcpu, qual); + break; + case EXIT_REASON_RDMSR: + ecx = vmxctx->guest_rcx; + handled = emulate_rdmsr(vmx->vm, vcpu, ecx); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } + break; + case EXIT_REASON_WRMSR: + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + handled = emulate_wrmsr(vmx->vm, vcpu, ecx, + (uint64_t)edx << 32 | eax); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } + break; + case EXIT_REASON_HLT: + vmexit->exitcode = VM_EXITCODE_HLT; + break; + case EXIT_REASON_MTF: + vmexit->exitcode = VM_EXITCODE_MTRAP; + break; + case EXIT_REASON_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmx_clear_int_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); + /* FALLTHRU */ + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + vmx_clear_nmi_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); + return (1); + case EXIT_REASON_INOUT: + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + break; + case EXIT_REASON_CPUID: + handled = vmx_handle_cpuid(vmxctx); + break; + default: + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel (for e.g. 'astpending' is set in the run loop). + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + int error, vie, rc, handled, astpending, loopstart; + uint32_t exit_reason; + struct vmx *vmx; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + + vmx = arg; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + loopstart = 1; + + /* + * XXX Can we avoid doing this every time we do a vm run? + */ + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) + panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); + + if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); + + if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) + panic("vmx_run: error %d setting up pcpu defaults", error); + + do { + lapic_timer_tick(vmx->vm, vcpu); + vmx_inject_interrupts(vmx, vcpu); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (loopstart) { + loopstart = 0; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* + * XXX locking? + * See comments in exception.S about checking for ASTs + * atomically while interrupts are disabled. But it is + * not clear that they apply in our case. 
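+ *
+ * The snapshot taken below is used to break out of the run loop
+ * ("while (handled && !astpending)"), so a pending AST simply forces
+ * a return to userland at the next VM exit.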
+ */ + astpending = curthread->td_flags & TDF_ASTPENDING; + + /* enable interrupts */ + enable_intr(); + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + handled = vmx_exit_process(vmx, vcpu, vmexit); + + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, + astpending); + } while (handled && !astpending); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); + + /* + * XXX + * We need to do this to ensure that any VMCS state cached by the + * processor is flushed to memory. We need to do this in case the + * VM moves to a different cpu the next time it runs. + * + * Can we avoid doing this? + */ + VMCLEAR(vmcs); + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.error = vie; + VMCLEAR(vmcs); + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int error; + struct vmx *vmx = arg; + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + struct vmx *vmx = arg; + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. 
+ * + * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error; + uint64_t ctls; + struct vmx *vmx = arg; + + /* + * XXX Allow caller to set contents of the guest registers saved in + * the 'vmxctx' even though the vcpu might be running. We need this + * specifically to support the rdmsr emulation that will set the + * %eax and %edx registers during vm exit processing. + */ + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. + * + * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, + int code_valid) +{ + int error; + uint32_t info; + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + static uint32_t type_map[VM_EVENT_MAX] = { + 0x1, /* VM_EVENT_NONE */ + 0x0, /* VM_HW_INTR */ + 0x2, /* VM_NMI */ + 0x3, /* VM_HW_EXCEPTION */ + 0x4, /* VM_SW_INTR */ + 0x5, /* VM_PRIV_SW_EXCEPTION */ + 0x6, /* VM_SW_EXCEPTION */ + }; + + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); + info |= VMCS_INTERRUPTION_INFO_VALID; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); + if (error != 0) + return (error); + + if (code_valid) { + error = vmcs_setreg(vmcs, + VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), + code); + } + return (error); +} + +static int +vmx_nmi(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + atomic_set_int(&vmx->state[vcpu].request_nmi, 1); + + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_inject, + vmx_nmi, + vmx_getcap, + vmx_setcap +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 0000000..69697f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; +}; + +struct vmxstate { + int request_nmi; + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + char msr_bitmap[PAGE_SIZE]; + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..31f29f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1 << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000..e9f6c6d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,199 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE(varname) \ + do { \ + __asm __volatile(" jnc 1f;" \ + " mov $1, %0;" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %0;" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %0;" \ + "3: nop" \ + :"=r" (varname)); \ + } while (0) + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline void +vmxoff(void) +{ + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t 
_res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 0000000..c4b1efc --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include "vmx.h" +#include "vmx_cpufunc.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); +ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); + +ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); + +ASSYM(VM_SUCCESS, VM_SUCCESS); +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); + +ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); +ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); +ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); +ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 0000000..1e9a837 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr >= 0x00000000 && msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 
0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..e6379a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_EPT_VPID_CAP 0x48C + +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E + +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D + +#define MSR_VMX_PROCBASED_CTLS2 0x48B + +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f + +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 + +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. 
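+ *
+ * Bitmap layout (see msr_bitmap_change_access()): read-intercept bits
+ * for MSRs 0x0-0x1fff start at offset 0, those for 0xc0000000-0xc0001fff
+ * at offset 1024, with the matching write-intercept bits located 2048
+ * bytes further in; a clear bit means the access does not cause a VM exit.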
+ */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 0000000..4d1bf1d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include "vmx_assym.s" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx' + */ +#define VMX_GUEST_RESTORE \ + /* \ + * Make sure that interrupts are disabled before restoring CR2. \ + * Otherwise there could be a page fault during the interrupt \ + * handler execution that would end up trashing CR2. \ + */ \ + cli; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
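/*
 * Illustrative userland analogue (not from this patch) of the control flow
 * described above: vmx_setjmp() returns 0 when called directly and a
 * non-zero code when control comes back from guest context via
 * vmx_longjmp()/vmx_resume()/vmx_launch().  Standard setjmp/longjmp stand
 * in for the hand-rolled register save/restore; the return codes below are
 * invented for the sketch.
 */
#include <setjmp.h>
#include <stdio.h>

#define RETURN_DIRECT	0	/* fell straight through the setjmp */
#define RETURN_LONGJMP	1	/* came back after a simulated vm exit */

static jmp_buf vcpu_ctx;

static void
fake_vmexit(void)
{
	/* Models the guest exiting: jump back into the host context. */
	longjmp(vcpu_ctx, RETURN_LONGJMP);
}

int
main(void)
{
	switch (setjmp(vcpu_ctx)) {
	case RETURN_DIRECT:
		printf("direct return: about to enter the guest\n");
		fake_vmexit();		/* never returns */
		break;
	case RETURN_LONGJMP:
		printf("back in the host after a vm exit\n");
		break;
	}
	return (0);
}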
+ */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + callq vmx_return +END(vmx_launch) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 0000000..24495a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1 << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1 << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static 
struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +/* + * Config space register definitions from the "Intel 5520 and 5500" datasheet. + */ +static int +tylersburg_vtd_ident(void) +{ + int units, nlbus; + uint16_t did, vid; + uint32_t miscsts, vtbar; + + const int bus = 0; + const int slot = 20; + const int func = 0; + + units = 0; + + vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2); + did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2); + if (vid != 0x8086 || did != 0x342E) + goto done; + + /* + * Check if this is a dual IOH configuration. + */ + miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4); + if (miscsts & (1 << 25)) + nlbus = pci_cfgregread(bus, slot, func, 0x160, 1); + else + nlbus = -1; + + vtbar = pci_cfgregread(bus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in legacy IOH is disabled!\n"); + + if (nlbus != -1) { + vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in non-legacy IOH is disabled!\n"); + } +done: + return (units); +} + +static drhd_ident_func_t drhd_ident_funcs[] = { + tylersburg_vtd_ident, + NULL +}; + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, 
units; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + + for (i = 0; drhd_ident_funcs[i] != NULL; i++) { + units = (*drhd_ident_funcs[i])(); + if (units > 0) + break; + } + + if (units <= 0) + return (ENXIO); + + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, int bus, int slot, int func) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + vtdmap = vtdmaps[0]; + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = (slot << 3 | func) * 2; + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %d/%d/%d is already owned by " + "domain %d", bus, slot, func, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, int bus, int slot, int func) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + ctxp = ctx_tables[bus]; + idx = (slot << 3 | func) * 2; + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. 
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Create a 'gpa' -> 'hpa' mapping + */ + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. 
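/*
 * Illustrative stand-alone sketch (not from this patch) of the AGAW/SAGAW
 * selection described above: derive the guest address width from the
 * highest address to be mapped, round it up to the 12 + 9*n form, then pick
 * the smallest hardware-supported width that covers it.  'sagaw_bits'
 * stands for the 5-bit SAGAW field of the VT-d capability register
 * (bit i => 30 + 9*i bits, 2 + i paging levels); the values in main() are
 * invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static int
pick_pt_levels(uint64_t maxaddr, int sagaw_bits)
{
	uint64_t addr;
	int gaw, agaw, sagaw, i;

	/* Smallest guest address width that covers 'maxaddr'. */
	addr = 0;
	for (gaw = 0; addr < maxaddr; gaw++)
		addr = 1ULL << gaw;

	/* Round up to an adjusted width of the form 12 + 9*n, capped at 64. */
	agaw = gaw;
	if ((gaw - 12) % 9 != 0)
		agaw = gaw + 9 - (gaw - 12) % 9;
	if (agaw > 64)
		agaw = 64;

	/* Smallest supported adjusted width that is >= agaw. */
	sagaw = 30;
	for (i = 0; i < 5; i++) {
		if ((sagaw_bits & (1 << i)) != 0 && sagaw >= agaw)
			return (2 + i);		/* page-table levels */
		sagaw += 9;
		if (sagaw > 64)
			sagaw = 64;
	}
	return (-1);				/* no supported width fits */
}

int
main(void)
{
	/* 4GB guest, hardware advertising 39-bit and 48-bit widths. */
	printf("levels: %d\n", pick_pt_levels(4ULL << 30, 0x6));
	return (0);
}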
+ */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_add_device, + vtd_remove_device, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 0000000..baf2447 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_util.h" +#include "iommu.h" + +static boolean_t iommu_avail; +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = TRUE; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = ptoa(Maxmem); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything else belongs to the host domain */ + iommu_add_device(host_domain, bus, slot, func); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_add_device(void *dom, int bus, int slot, int func) +{ + + IOMMU_ADD_DEVICE(dom, bus, slot, func); +} + +void +iommu_remove_device(void *dom, int bus, int slot, int func) +{ + + 
IOMMU_REMOVE_DEVICE(dom, bus, slot, func); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 0000000..e4f7229 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); +typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_add_device(void *dom, int bus, int slot, int func); +void iommu_remove_device(void *dom, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 0000000..dc2f326 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) +#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) +#define MAX_MSIMSGS 32 + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + int msg; +}; + +static struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + int vector; + int vcpu; + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; +} pptdevs[32]; + +static int num_pptdevs; + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if (vmm_is_pptdev(bus, slot, func) && + (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + return (0); + else + return (ENXIO); +} + +static int +ppt_attach(device_t dev) +{ + int n; + + if (num_pptdevs >= MAX_PPTDEVS) { + printf("ppt_attach: maximum number of pci passthrough devices " + "exceeded\n"); + return (ENXIO); + } + + n = num_pptdevs++; + pptdevs[n].dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + /* + * XXX check whether there are any pci passthrough devices assigned + * to guests before we allow this driver to detach. 
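/*
 * Illustrative stand-alone sketch (not from this patch) of the check the
 * XXX note above asks for: refuse to detach while any passthrough device
 * is still assigned to a guest.  The tiny table here is only a stand-in
 * for the driver's real pptdevs[] array.
 */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct fake_pptdev {
	void	*vm;		/* NULL when not assigned to a guest */
};

static struct fake_pptdev fake_pptdevs[4];
static int fake_num_pptdevs = 4;

static int
fake_ppt_detach(void)
{
	int i;

	for (i = 0; i < fake_num_pptdevs; i++) {
		if (fake_pptdevs[i].vm != NULL)
			return (EBUSY);	/* still owned by a guest */
	}
	return (0);
}

int
main(void)
{
	int dummy_vm;

	printf("detach, nothing assigned: %d\n", fake_ppt_detach());
	fake_pptdevs[2].vm = &dummy_vm;
	printf("detach, one device assigned: %d\n", fake_ppt_detach());
	return (0);
}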
+ */ + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + int i, b, s, f; + + for (i = 0; i < num_pptdevs; i++) { + dev = pptdevs[i].dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (&pptdevs[i]); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), bus, slot, func); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. 
+ */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + int i, bus, slot, func; + device_t dev; + + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) { + dev = pptdevs[i].dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + ppt_unassign_device(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + seg->hpa = hpa; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + int vec; + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + vec = ppt->msi.vector + pptarg->msg; + + if (ppt->vm != NULL) + (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +/* + * XXX + * When we try to free the MSI resource the kernel will bind the thread to + * the host cpu was originally handling the MSI. The function freeing the + * MSI vector (apic_free_vector()) will panic the kernel if the thread + * is already bound to a cpu. + * + * So, we temporarily unbind the vcpu thread before freeing the MSI resource. + */ +static void +PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) +{ + int pincpu = -1; + + vm_get_pinning(vm, vcpu, &pincpu); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, -1); + + ppt_teardown_msi(ppt); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, pincpu); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if ((destcpu >= VM_MAXCPU || destcpu < 0) || + (vector < 0 || vector > 255) || + (numvec < 0 || numvec > MAX_MSIMSGS)) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. 
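/*
 * Illustrative stand-alone sketch (not from this patch) of the negotiation
 * the allocation check below performs: MSI grants can come back with fewer
 * messages than requested (message counts are powers of two), so the
 * caller asks for 'numvec', inspects what it actually got, and returns the
 * allocation if it is not enough.  'fake_alloc_msi' is only a stand-in for
 * pci_alloc_msi(9).
 */
#include <errno.h>
#include <stdio.h>

static int
fake_alloc_msi(int *count)
{
	/* Pretend the device only supports 4 MSI messages. */
	if (*count > 4)
		*count = 4;
	return (0);
}

static int
setup_msi(int numvec)
{
	int tmp = numvec;

	if (fake_alloc_msi(&tmp) != 0)
		return (ENXIO);
	if (tmp != numvec) {
		/* The real code would call pci_release_msi() before bailing. */
		return (ENOSPC);
	}
	return (0);
}

int
main(void)
{
	printf("request 4 vectors: %d\n", setup_msi(4));	/* 0 */
	printf("request 8 vectors: %d\n", setup_msi(8));	/* ENOSPC */
	return (0);
}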
+ */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.vector = vector; + ppt->msi.vcpu = destcpu; + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. + */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].msg = i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + return (ENXIO); + } + + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 0000000..95f3ad0 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec); + +#endif diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c new file mode 100644 index 0000000..cd6c5d1 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct vdev_region *region, *found; + uint64_t region_base; + uint64_t 
region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h new file mode 100644 index 0000000..6feeba8 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 0000000..a21addf --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vdev.h" +#include "vlapic.h" + +#define VLAPIC_CTR0(vlapic, format) \ + VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic.irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic.isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) +#define VLAPIC_MAXLVT_ENTRIES (5) + +struct vlapic { + struct vm *vm; + int vcpuid; + + struct io_region *mmio; + struct vdev_ops *ops; + struct LAPIC apic; + + int esr_update; + + int divisor; + int ccr_ticks; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. 
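/*
 * Illustrative stand-alone sketch (not from this patch) of the Processor
 * Priority computation that the in-service stack feeds: PPR is the TPR or
 * the priority class of the highest in-service vector, whichever is higher
 * (Intel SDM Vol 3a, "Interrupt, Task and Processor Priority").
 */
#include <stdint.h>
#include <stdio.h>

#define PRIO(x)	((x) >> 4)	/* priority class = upper nibble */

static uint8_t
compute_ppr(uint8_t tpr, uint8_t isrv)
{
	if (PRIO(tpr) >= PRIO(isrv))
		return (tpr);
	return (isrv & 0xf0);
}

int
main(void)
{
	/* TPR 0x20 masks class 2 and below; vector 0x45 is class 4. */
	printf("ppr = 0x%02x\n", compute_ppr(0x20, 0x45));	/* 0x40 */
	printf("ppr = 0x%02x\n", compute_ppr(0x80, 0x45));	/* 0x80 */
	return (0);
}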
+ */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; +}; + +static void +vlapic_mask_lvts(uint32_t *lvts, int num_lvt) +{ + int i; + for (i = 0; i < num_lvt; i++) { + *lvts |= APIC_LVT_M; + lvts += 4; + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint64_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + return lapic->ccr_timer; +} + +static void +vlapic_update_errors(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->esr = 0; // XXX +} + +static void +vlapic_init_ipi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1); +} + +static int +vlapic_op_reset(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + + memset(lapic, 0, sizeof(*lapic)); + lapic->id = vlapic->vcpuid << 24; + lapic->apr = vlapic->vcpuid; + vlapic_init_ipi(vlapic); + + return 0; + +} + +static int +vlapic_op_init(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_register_region(vlapic->ops, vlapic, vlapic->mmio); + return vlapic_op_reset(dev); +} + +static int +vlapic_op_halt(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_unregister_region(vlapic, vlapic->mmio); + return 0; + +} + +void +vlapic_set_intr_ready(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr; + int idx; + + if (vector < 0 || vector >= 256) + panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + + idx = (vector / 32) * 4; + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); +} + +#define VLAPIC_BUS_FREQ tsc_freq +#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +static void +vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) +{ + uint32_t icr_timer; + + icr_timer = vlapic->apic.icr_timer; + + vlapic->ccr_ticks = ticks; + if (elapsed < icr_timer) + vlapic->apic.ccr_timer = icr_timer - elapsed; + else { + /* + * This can happen when the guest is trying to run its local + * apic timer higher that the setting of 'hz' in the host. + * + * We deal with this by running the guest local apic timer + * at the rate of the host's 'hz' setting. 
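/*
 * Minimal stand-alone sketch (not from this patch) of the clamping
 * described above: if more host ticks have elapsed than the guest
 * programmed into the initial count, the current count simply reads back
 * as zero instead of wrapping.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
ccr_after(uint32_t icr_timer, uint32_t elapsed)
{
	return (elapsed < icr_timer ? icr_timer - elapsed : 0);
}

int
main(void)
{
	printf("%u\n", ccr_after(1000, 250));	/* 750 ticks left */
	printf("%u\n", ccr_after(1000, 5000));	/* guest timer too fast: 0 */
	return (0);
}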
+ */ + vlapic->apic.ccr_timer = 0; + } +} + +static __inline uint32_t * +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = &vlapic->apic; + int i; + + if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) { + panic("vlapic_get_lvt: invalid LVT\n"); + } + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic.isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic.tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic.isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic.ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *isrptr; + int i, idx, bitpos; + + isrptr = &lapic->isr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. 
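+ * Those vectors all live in isr0, so the scan below runs from isr7
+ * down to isr1 and deliberately skips isr0.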
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << (bitpos - 1)); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +{ + return (*lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + int vector; + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vlapic_set_intr_ready(vlapic, vector); + } +} + +static int +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +{ + int i; + cpumask_t dmask, thiscpumask; + uint32_t dest, vec, mode; + + thiscpumask = vcpu_mask(vlapic->vcpuid); + + dmask = 0; + dest = icrval >> 32; + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + dmask = vcpu_mask(dest); + break; + case APIC_DEST_SELF: + dmask = thiscpumask; + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask; + break; + } + + for (i = 0; i < VM_MAXCPU; i++) { + if (dmask & vcpu_mask(i)) { + if (mode == APIC_DELMODE_FIXED) + lapic_set_intr(vlapic->vm, i, vec); + else + vm_inject_nmi(vlapic->vm, i); + } + } + + return (0); /* handled completely in the kernel */ + } + + /* + * XXX this assumes that the startup IPI always succeeds + */ + if (mode == APIC_DELMODE_STARTUP) + vm_activate_cpu(vlapic->vm, dest); + + /* + * This will cause a return to userland. + */ + return (1); +} + +int +vlapic_pending_intr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + return (vector); + } else + break; + } + } + VLAPIC_CTR0(vlapic, "no pending intr"); + return (-1); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. 
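+ *
+ * For example, accepting vector 0x41 clears bit 1 of irr2 and sets
+ * bit 1 of isr2: idx = (0x41 / 32) * 4 = 8 because consecutive
+ * IRR/ISR registers are 16 bytes (four uint32_t) apart.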
+ */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +int +vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int i; + + if (offset > sizeof(*lapic)) { + *data = 0; + return 0; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = lapic->tpr; + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + *data = *(reg); + break; + case APIC_OFFSET_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } + return 0; +} + +int +vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int retval; + + if (offset > sizeof(*lapic)) { + return 0; + } + + retval = 0; + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + break; + case APIC_OFFSET_TPR: + lapic->tpr = data & 0xff; + vlapic_update_ppr(vlapic); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + break; + case APIC_OFFSET_DFR: + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + break; + case APIC_OFFSET_ICR_LOW: + retval = lapic_process_icr(vlapic, data); + break; + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + data |= APIC_LVT_M; + } + *reg = data; + // vlapic_dump_lvt(offset, reg); + break; + case APIC_OFFSET_ICR: + lapic->icr_timer = data; + vlapic_start_timer(vlapic, 0); + break; + + case APIC_OFFSET_DCR: + lapic->dcr_timer = data; + vlapic->divisor = vlapic_timer_divisor(data); + break; + + case APIC_OFFSET_ESR: + vlapic_update_errors(vlapic); + break; + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +void +vlapic_timer_tick(struct vlapic *vlapic) +{ + int curticks, delta, periodic; + uint32_t ccr; + uint32_t decrement, remainder; + + curticks = ticks; + + /* Common case */ + delta = curticks - vlapic->ccr_ticks; + if (delta == 0) + return; + + /* Local APIC timer is disabled */ + if (vlapic->apic.icr_timer == 0) + return; + + /* One-shot mode and timer has already counted down to zero */ + periodic = vlapic_periodic_timer(vlapic); + if (!periodic && vlapic->apic.ccr_timer == 0) + return; + /* + * The 'curticks' and 'ccr_ticks' are out of sync by more than + * 2^31 ticks. We deal with this by restarting the timer. + */ + if (delta < 0) { + vlapic_start_timer(vlapic, 0); + return; + } + + ccr = vlapic->apic.ccr_timer; + decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + while (delta-- > 0) { + if (ccr <= decrement) { + remainder = decrement - ccr; + vlapic_fire_timer(vlapic); + if (periodic) { + vlapic_start_timer(vlapic, remainder); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + break; + } + } else + ccr -= decrement; + } + + vlapic->ccr_ticks = curticks; + vlapic->apic.ccr_timer = ccr; +} + +struct vdev_ops vlapic_dev_ops = { + .name = "vlapic", + .init = vlapic_op_init, + .reset = vlapic_op_reset, + .halt = vlapic_op_halt, + .memread = vlapic_op_mem_read, + .memwrite = vlapic_op_mem_write, +}; +static struct io_region vlapic_mmio[VM_MAXCPU]; + +struct vlapic * +vlapic_init(struct vm *vm, int vcpuid) +{ + struct vlapic *vlapic; + + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vm; + vlapic->vcpuid = vcpuid; + vlapic->ops = &vlapic_dev_ops; + + vlapic->mmio = vlapic_mmio + vcpuid; + vlapic->mmio->base = DEFAULT_APIC_BASE; + vlapic->mmio->len = PAGE_SIZE; + vlapic->mmio->attr = MMIO_READ|MMIO_WRITE; + vlapic->mmio->vcpu = vcpuid; + + vdev_register(&vlapic_dev_ops, vlapic); + + vlapic_op_init(vlapic); + + return (vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + vdev_unregister(vlapic); + free(vlapic, M_VLAPIC); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 0000000..861ea8c --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +#include "vdev.h" + +struct vm; + +/* + * Map of APIC Registers: Offset Description Access + */ +#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W +#define APIC_OFFSET_VER 0x30 // Local APIC Version R +#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W +#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R +#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R +#define APIC_OFFSET_EOI 0xB0 // EOI Register W +#define APIC_OFFSET_RRR 0xC0 // Remote read R +#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W +#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W +#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W +#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R +#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R +#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R +#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R +#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R +#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R +#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R +#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R +#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R +#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R +#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R +#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R +#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R +#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R +#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R +#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R +#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R +#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R +#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R +#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R +#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R +#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R +#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R +#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R +#define APIC_OFFSET_ESR 0x280 // Error Status Register R +#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W +#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. 
(32-63) R/W +#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W +#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) +#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) +#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W +#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W +#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W +#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W +#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R +#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +struct vlapic *vlapic_init(struct vm *vm, int vcpuid); +void vlapic_cleanup(struct vlapic *vlapic); + +int vlapic_op_mem_write(void* dev, uint64_t gpa, + opsize_t size, uint64_t data); + +int vlapic_op_mem_read(void* dev, uint64_t gpa, + opsize_t size, uint64_t *data); + +int vlapic_pending_intr(struct vlapic *vlapic); +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); +void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); +void vlapic_timer_tick(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 0000000..c93c31e --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,737 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include "vmm_mem.h" +#include "vmm_util.h" +#include +#include "vlapic.h" +#include "vmm_msr.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +struct vcpu { + int flags; + int pincpu; /* host cpuid this vcpu is bound to */ + int hostcpu; /* host cpuid this vcpu last ran on */ + uint64_t guest_msrs[VMM_MSR_NUM]; + struct vlapic *vlapic; + int vcpuid; + struct savefpu savefpu; /* guest fpu state */ + void *stats; +}; +#define VCPU_F_PINNED 0x0001 +#define VCPU_F_RUNNING 0x0002 + +#define VCPU_PINCPU(vm, vcpuid) \ + ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) + +#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) + +#define VCPU_PIN(vm, vcpuid, host_cpuid) \ +do { \ + vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ + vm->vcpu[vcpuid].pincpu = host_cpuid; \ +} while(0) + +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Mask of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpumask_t active_cpus; +}; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip, vmexit) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ + (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) +#define VMNMI(vmi, vcpu) \ + (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define fpu_stop_emulating() __asm("clts") + +static MALLOC_DEFINE(M_VM, "vm", "vm"); +CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static void +vcpu_cleanup(struct vcpu *vcpu) +{ + vlapic_cleanup(vcpu->vlapic); + vmm_stat_free(vcpu->stats); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu->hostcpu = -1; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = vlapic_init(vm, vcpu_id); + fpugetregs(curthread, &vcpu->savefpu); + vcpu->stats = vmm_stat_alloc(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_ipi_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_msr_init(); + + return (VMM_INIT()); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + iommu_init(); + error = vmm_init(); + break; + case MOD_UNLOAD: + vmmdev_cleanup(); + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * Execute the module load handler after the pci passthru driver has had + * a chance to claim devices. We need this information at the time we do + * iommu initialization. 
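+ * Hence the SI_SUB_CONFIGURE + 1 subsystem id passed to the
+ * DECLARE_MODULE() below, which orders vmm after device configuration.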
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +struct vm * +vm_create(const char *name) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (NULL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + guest_msrs_init(vm, i); + } + + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); + vm_activate_cpu(vm, BSP); + + return (vm); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + + ppt_unassign_all(vm); + + for (i = 0; i < vm->num_mem_segs; i++) + vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(&vm->vcpu[i]); + + iommu_destroy_domain(vm->iommu); + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_NONE, spok)); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +{ + int error; + vm_paddr_t hpa; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + /* + * find the hpa if already it was already vm_malloc'd. 
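+ * vm_gpa2hpa() returns (vm_paddr_t)-1 if no existing segment covers
+ * the range, in which case a new segment is allocated and mapped below.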
+ */ + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa != ((vm_paddr_t)-1)) + goto out; + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + hpa = vmm_mem_alloc(len); + if (hpa == 0) + return (ENOMEM); + + error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); + if (error) { + vmm_mem_free(hpa, len); + return (error); + } + + iommu_create_mapping(vm->iommu, gpa, hpa, len); + + vm->mem_segs[vm->num_mem_segs].gpa = gpa; + vm->mem_segs[vm->num_mem_segs].hpa = hpa; + vm->mem_segs[vm->num_mem_segs].len = len; + vm->num_mem_segs++; +out: + *ret_hpa = hpa; + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int i; + vm_paddr_t gpabase, gpalimit, hpabase; + + for (i = 0; i < vm->num_mem_segs; i++) { + hpabase = vm->mem_segs[i].hpa; + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa + len <= gpalimit) + return ((gpa - gpabase) + hpabase); + } + return ((vm_paddr_t)-1); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMSETREG(vm->cookie, vcpu, reg, val)); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *cpuid = VCPU_PINCPU(vm, vcpuid); + + return (0); +} + +int +vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) +{ + struct thread *td; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + td = curthread; /* XXXSMP only safe when muxing vcpus */ + + /* unpin */ + if (host_cpuid < 0) { + VCPU_UNPIN(vm, vcpuid); + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + return (0); + } + + if (CPU_ABSENT(host_cpuid)) + return (EINVAL); + + /* + * XXX we should check that 'host_cpuid' has not already been pinned + * by another vm. 
+ */ + thread_lock(td); + sched_bind(td, host_cpuid); + thread_unlock(td); + VCPU_PIN(vm, vcpuid, host_cpuid); + + return (0); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxrstor(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxsave(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + critical_enter(); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + pcb->pcb_full_iret = 1; + + vcpu->hostcpu = curcpu; + + fpuexit(curthread); + restore_guest_msrs(vm, vcpuid); + restore_guest_fpustate(vcpu); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + save_guest_fpustate(vcpu); + restore_host_msrs(vm, vcpuid); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + return (error); +} + +int +vm_inject_event(struct vm *vm, int vcpuid, int type, + int vector, uint32_t code, int code_valid) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) + return (EINVAL); + + if (vector < 0 || vector > 255) + return (EINVAL); + + return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); +} + +int +vm_inject_nmi(struct vm *vm, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = VMNMI(vm->cookie, vcpu); + vm_interrupt_hostcpu(vm, vcpu); + return (error); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +uint64_t * +vm_guest_msrs(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].guest_msrs); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, b, s, f, n; + char *val, *cp, *cp2; + + /* + * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + */ + found = 0; + cp = val = getenv("pptdevs"); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +void +vm_set_run_state(struct vm *vm, int vcpuid, int state) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (state == VCPU_RUNNING) { + if (vcpu->flags & VCPU_F_RUNNING) { + panic("vm_set_run_state: %s[%d] is already running", + vm_name(vm), vcpuid); + } + vcpu->flags |= VCPU_F_RUNNING; + } else { + if ((vcpu->flags & 
VCPU_F_RUNNING) == 0) { + panic("vm_set_run_state: %s[%d] is already stopped", + vm_name(vm), vcpuid); + } + vcpu->flags &= ~VCPU_F_RUNNING; + } +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +{ + int retval, hostcpu; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + if (vcpu->flags & VCPU_F_RUNNING) { + retval = VCPU_RUNNING; + hostcpu = vcpu->hostcpu; + } else { + retval = VCPU_STOPPED; + hostcpu = -1; + } + + if (cpuptr) + *cpuptr = hostcpu; + + return (retval); +} + +void +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid >= 0 && vcpuid < VM_MAXCPU) + vm->active_cpus |= vcpu_mask(vcpuid); +} + +cpumask_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c new file mode 100644 index 0000000..cf443fc --- /dev/null +++ b/sys/amd64/vmm/vmm_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "io/ppt.h" +#include + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; +}; +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (sc->cdev == cdev) + break; + } + + return (sc); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c; + vm_paddr_t hpa, gpa; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa2hpa(sc->vm, gpa, c); + if (hpa == (vm_paddr_t)-1) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else + error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } + + mtx_unlock(&vmmdev_mtx); + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc* vmsegdesc; + struct vm_pin *vmpin; + struct vm_run *vmrun; + struct vm_event *vmevent; + struct vm_lapic_irq *vmirq; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (ENXIO); + } + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_SET_PINNING: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EVENT: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. 
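+ * For example, the VM_RUN handler reads vmrun->cpuid, so 'cpuid' must
+ * remain the leading field of 'struct vm_run' for this to work.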
+ */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + if (vcpu_is_running(sc->vm, vcpu, NULL)) { + error = EBUSY; + goto done; + } + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); + mtx_unlock(&vmmdev_mtx); + + error = vm_run(sc->vm, vmrun); + + mtx_lock(&vmmdev_mtx); + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); + break; + case VM_STAT_DESC: { + const char *desc; + statdesc = (struct vm_stat_desc *)data; + desc = vmm_stat_desc(statdesc->index); + if (desc != NULL) { + error = 0; + strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); + } else + error = EINVAL; + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->destcpu, pptmsi->vector, + pptmsi->numvec); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EVENT: + vmevent = (struct vm_event *)data; + error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, + vmevent->vector, + vmevent->error_code, + vmevent->error_code_valid); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_SET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, + vmpin->host_cpuid); + break; + case VM_GET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, + &vmpin->host_cpuid); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->hpa = seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + 
break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + default: + error = ENOTTY; + break; + } +done: + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) { + *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(struct vmmdev_softc *sc) +{ + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + /* + * XXX must stop virtual machine instances that may be still + * running and cleanup their state. + */ + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + destroy_dev(sc->cdev); + vm_destroy(sc->vm); + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + vmmdev_destroy(sc); + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct vmmdev_softc *sc; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup(buf); + if (sc != NULL) { + mtx_unlock(&vmmdev_mtx); + return (EEXIST); + } + + vm = vm_create(buf); + if (vm == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "vmm/%s", buf); + sc->cdev->si_drv1 = sc; + SLIST_INSERT_HEAD(&head, sc, link); + + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +void +vmmdev_cleanup(void) +{ + struct vmmdev_softc *sc, *sc2; + + mtx_lock(&vmmdev_mtx); + + SLIST_FOREACH_SAFE(sc, &head, link, sc2) + vmmdev_destroy(sc); + + mtx_unlock(&vmmdev_mtx); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c new file mode 100644 index 0000000..c8e795b --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "vmm_ipi.h" + +extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); + +/* + * The default is to use the IPI_AST to interrupt a vcpu. + */ +static int ipinum = IPI_AST; + +CTASSERT(APIC_SPURIOUS_INT == 255); + +void +vmm_ipi_init(void) +{ + int idx; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as our IPI vector. We install the 'justreturn' handler at that + * vector and use it to interrupt the vcpus. + * + * We do this because the IPI_AST is heavyweight and saves all + * registers in the trapframe. This is overkill for our use case + * which is simply to EOI the interrupt and return. + */ + idx = APIC_SPURIOUS_INT; + while (--idx >= APIC_IPI_INTS) { + ip = &idt[idx]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { + ipinum = idx; + setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + SEL_KPL, 0); + break; + } + } + + if (ipinum != IPI_AST && bootverbose) { + printf("vmm_ipi_init: installing ipi handler to interrupt " + "vcpus at vector %d\n", ipinum); + } +} + +void +vmm_ipi_cleanup(void) +{ + if (ipinum != IPI_AST) + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) + ipi_selected((cpumask_t)1 << hostcpu, ipinum); +} diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h new file mode 100644 index 0000000..7ab94bf --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +struct vm; + +void vmm_ipi_init(void); +void vmm_ipi_cleanup(void); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + +#endif diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h new file mode 100644 index 0000000..e691c61 --- /dev/null +++ b/sys/amd64/vmm/vmm_ktr.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#define KTR_VMM KTR_GEN + +#define VMM_CTR0(vm, vcpuid, format) \ +CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu) + +#define VMM_CTR1(vm, vcpuid, format, p1) \ +CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1)) + +#define VMM_CTR2(vm, vcpuid, format, p1, p2) \ +CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2)) + +#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2), (p3)) +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 0000000..8704fcf --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +int +lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_pending_intr(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + return (vlapic_pending_intr(vlapic)); +} + +void +lapic_intr_accepted(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_intr_accepted(vlapic, vector); +} + +int +lapic_set_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + vlapic_set_intr_ready(vlapic, vector); + + vm_interrupt_hostcpu(vm, cpu); + + return (0); +} + +void +lapic_timer_tick(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_timer_tick(vlapic); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 0000000..815b2f7 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); +int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +void lapic_timer_tick(struct vm *vm, int cpu); + +/* + * Returns a vector between 32 and 255 if an interrupt is pending in the + * IRR that can be delivered based on the current state of ISR and TPR. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + * + * Returns -1 if there is no eligible vector that can be delivered to the + * guest at this time. + */ +int lapic_pending_intr(struct vm *vm, int cpu); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'lapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void lapic_intr_accepted(struct vm *vm, int cpu, int vector); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c new file mode 100644 index 0000000..9ce1e80 --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "vmm_util.h" +#include "vmm_mem.h" + +static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); + +#define MB (1024 * 1024) +#define GB (1024 * MB) + +#define VMM_MEM_MAXSEGS 64 + +/* protected by vmm_mem_mtx */ +static struct { + vm_paddr_t base; + vm_size_t length; +} vmm_mem_avail[VMM_MEM_MAXSEGS]; + +static int vmm_mem_nsegs; + +static vm_paddr_t maxaddr; + +static struct mtx vmm_mem_mtx; + +/* + * Steal any memory that was deliberately hidden from FreeBSD either by + * the use of MAXMEM kernel config option or the hw.physmem loader tunable. + */ +static int +vmm_mem_steal_memory(void) +{ + int nsegs; + caddr_t kmdp; + uint32_t smapsize; + uint64_t base, length; + struct bios_smap *smapbase, *smap, *smapend; + + /* + * Borrowed from hammer_time() and getmemsize() in machdep.c + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); + if (smapbase == NULL) + panic("No BIOS smap info from loader!"); + + smapsize = *((uint32_t *)smapbase - 1); + smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + + nsegs = 0; + for (smap = smapbase; smap < smapend; smap++) { + /* + * XXX + * Assuming non-overlapping, monotonically increasing + * memory segments. + */ + if (smap->type != SMAP_TYPE_MEMORY) + continue; + if (smap->length == 0) + break; + + base = roundup(smap->base, NBPDR); + length = rounddown(smap->length, NBPDR); + + /* Skip this segment if FreeBSD is using all of it. */ + if (base + length <= ptoa(Maxmem)) + continue; + + /* + * If FreeBSD is using part of this segment then adjust + * 'base' and 'length' accordingly. + */ + if (base < ptoa(Maxmem)) { + uint64_t used; + used = roundup(ptoa(Maxmem), NBPDR) - base; + base += used; + length -= used; + } + + if (length == 0) + continue; + + vmm_mem_avail[nsegs].base = base; + vmm_mem_avail[nsegs].length = length; + + if (base + length > maxaddr) + maxaddr = base + length; + + if (0 && bootverbose) { + printf("vmm_mem_populate: index %d, base 0x%0lx, " + "length %ld\n", + nsegs, vmm_mem_avail[nsegs].base, + vmm_mem_avail[nsegs].length); + } + + nsegs++; + if (nsegs >= VMM_MEM_MAXSEGS) { + printf("vmm_mem_populate: maximum number of vmm memory " + "segments reached!\n"); + return (ENOSPC); + } + } + + vmm_mem_nsegs = nsegs; + + return (0); +} + +static void +vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +{ + vm_paddr_t addr, remaining; + int pdpi, pdi, superpage_size; + pml4_entry_t *pml4p; + pdp_entry_t *pdp; + pd_entry_t *pd; + uint64_t page_attr_bits; + + if (end >= NBPML4) + panic("Cannot map memory beyond %ldGB", NBPML4 / GB); + + /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ + if (0 && vmm_supports_1G_pages()) + superpage_size = NBPDP; + else + superpage_size = NBPDR; + + /* + * Get the page directory pointer page that contains the direct + * map address mappings. 
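As a concrete illustration of the rounding in vmm_mem_steal_memory() above, assume the loader capped FreeBSD at 3GB (ptoa(Maxmem) == 3GB) and the SMAP reports a usable segment covering [2GB, 6GB). The numbers are invented for the example; the macros are the ones used in the function.

    /* NBPDR is 2MB on amd64; GB and MB are the constants defined above. */
    uint64_t base   = roundup(2UL * GB, NBPDR);     /* 2GB, already aligned */
    uint64_t length = rounddown(4UL * GB, NBPDR);   /* 4GB */
    uint64_t used;

    /* FreeBSD already owns [2GB, 3GB), so trim that part off. */
    used    = roundup(3UL * GB, NBPDR) - base;      /* 1GB */
    base   += used;                                 /* stolen range now starts at 3GB */
    length -= used;                                 /* 3GB recorded in vmm_mem_avail[] */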
+ */ + pml4p = kernel_pmap->pm_pml4; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); + + page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; + addr = start; + while (addr < end) { + remaining = end - addr; + pdpi = addr / NBPDP; + if (superpage_size == NBPDP && + remaining >= NBPDP && + addr % NBPDP == 0) { + /* + * If there isn't a mapping for this address then + * create one but if there is one already make sure + * it matches what we expect it to be. + */ + if (pdp[pdpi] == 0) { + pdp[pdpi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 1GB page at " + "pdpi %d\n", addr, pdpi); + } + } else { + pdp_entry_t pdpe = pdp[pdpi]; + if ((pdpe & ~PAGE_MASK) != addr || + (pdpe & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pdpe, addr); + } + } + addr += NBPDP; + } else { + if (remaining < NBPDR) { + panic("vmm_mem_populate: remaining (%ld) must " + "be greater than NBPDR (%d)\n", + remaining, NBPDR); + } + if (pdp[pdpi] == 0) { + /* + * XXX we lose this memory forever because + * we do not keep track of the virtual address + * that would be required to free this page. + */ + pd = malloc(PAGE_SIZE, M_VMM_MEM, + M_WAITOK | M_ZERO); + if ((uintptr_t)pd & PAGE_MASK) { + panic("vmm_mem_populate: page directory" + "page not aligned on %d " + "boundary\n", PAGE_SIZE); + } + pdp[pdpi] = vtophys(pd); + pdp[pdpi] |= PG_RW | PG_V | PG_U; + if (0 && bootverbose) { + printf("Creating page directory " + "at pdp index %d for 0x%016lx\n", + pdpi, addr); + } + } + pdi = (addr % NBPDP) / NBPDR; + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); + + /* + * Create a new mapping if one doesn't already exist + * or validate it if it does. + */ + if (pd[pdi] == 0) { + pd[pdi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 2MB page at " + "pdpi %d, pdi %d\n", + addr, pdpi, pdi); + } + } else { + pd_entry_t pde = pd[pdi]; + if ((pde & ~PAGE_MASK) != addr || + (pde & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pde, addr); + } + } + addr += NBPDR; + } + } +} + +static int +vmm_mem_populate(void) +{ + int seg, error; + vm_paddr_t start, end; + + /* populate the vmm_mem_avail[] array */ + error = vmm_mem_steal_memory(); + if (error) + return (error); + + /* + * Now map the memory that was hidden from FreeBSD in + * the direct map VA space. 
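For clarity, the index arithmetic used by vmm_mem_direct_map() above, worked through with a made-up address (on amd64, NBPDP is 1GB and NBPDR is 2MB):

    /*
     * Illustrative numbers only: addr = 5GB + 6MB.
     * With 1GB superpages the address must also be NBPDP-aligned and at
     * least NBPDP bytes must remain, otherwise the 2MB path is taken.
     */
    vm_paddr_t addr = 5UL * GB + 6 * MB;
    int pdpi = addr / NBPDP;                /* 5: sixth PDP entry in the DMAP slot */
    int pdi  = (addr % NBPDP) / NBPDR;      /* 3: fourth 2MB PDE inside that 1GB    */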
+ */ + for (seg = 0; seg < vmm_mem_nsegs; seg++) { + start = vmm_mem_avail[seg].base; + end = start + vmm_mem_avail[seg].length; + if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { + panic("start (0x%016lx) and end (0x%016lx) must be " + "aligned on a %dMB boundary\n", + start, end, NBPDR / MB); + } + vmm_mem_direct_map(start, end); + } + + return (0); +} + +int +vmm_mem_init(void) +{ + int error; + + mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); + + error = vmm_mem_populate(); + if (error) + return (error); + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + int i; + vm_paddr_t addr; + + if ((size & PDRMASK) != 0) { + panic("vmm_mem_alloc: size 0x%0lx must be " + "aligned on a 0x%0x boundary\n", size, NBPDR); + } + + addr = 0; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].length >= size) { + addr = vmm_mem_avail[i].base; + vmm_mem_avail[i].base += size; + vmm_mem_avail[i].length -= size; + /* remove a zero length segment */ + if (vmm_mem_avail[i].length == 0) { + memmove(&vmm_mem_avail[i], + &vmm_mem_avail[i + 1], + (vmm_mem_nsegs - (i + 1)) * + sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + } + break; + } + } + mtx_unlock(&vmm_mem_mtx); + + return (addr); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + int i; + + if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " + "aligned on a 0x%0x boundary\n", base, length, NBPDR); + } + + mtx_lock(&vmm_mem_mtx); + + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].base > base) + break; + } + + if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) + panic("vmm_mem_free: cannot free any more segments"); + + /* Create a new segment at index 'i' */ + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], + (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); + + vmm_mem_avail[i].base = base; + vmm_mem_avail[i].length = length; + + vmm_mem_nsegs++; + +coalesce_some_more: + for (i = 0; i < vmm_mem_nsegs - 1; i++) { + if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == + vmm_mem_avail[i + 1].base) { + vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], + (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + goto coalesce_some_more; + } + } + + mtx_unlock(&vmm_mem_mtx); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (maxaddr); +} + +void +vmm_mem_dump(void) +{ + int i; + vm_paddr_t base; + vm_size_t length; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + base = vmm_mem_avail[i].base; + length = vmm_mem_avail[i].length; + printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); + } + mtx_unlock(&vmm_mem_mtx); +} diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 0000000..ef1bf1a --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
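A short usage sketch of the allocator just shown. The caller and the requested size are hypothetical; the NBPDR (2MB) alignment requirement, the panic on misaligned sizes, and the 0 return on failure come from the code above. Handing the range back with vmm_mem_free(hpa, len) re-inserts and coalesces it with neighbouring segments.

    static int
    guest_segment_alloc(size_t len, vm_paddr_t *hpap)
    {
            vm_paddr_t hpa;

            /* 'len' must be a multiple of NBPDR or vmm_mem_alloc() panics. */
            hpa = vmm_mem_alloc(len);
            if (hpa == 0)
                    return (ENOMEM);        /* no free segment was large enough */
            *hpap = hpa;
            return (0);
    }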
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); +void vmm_mem_dump(void); + +#endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c new file mode 100644 index 0000000..152aa7b --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_msr.h" + +#define VMM_MSR_F_EMULATE 0x01 +#define VMM_MSR_F_READONLY 0x02 + +struct vmm_msr { + int num; + int flags; + uint64_t hostval; +}; + +static struct vmm_msr vmm_msr[] = { + { MSR_LSTAR, 0 }, + { MSR_CSTAR, 0 }, + { MSR_STAR, 0 }, + { MSR_SF_MASK, 0 }, + { MSR_APICBASE, VMM_MSR_F_EMULATE }, + { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, + { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, +}; + +#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0])) +CTASSERT(VMM_MSR_NUM >= vmm_msr_num); + +#define readonly_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0) + +#define emulated_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) + +void +vmm_msr_init(void) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + /* + * XXX this assumes that the value of the host msr does not + * change after we have cached it. + */ + vmm_msr[i].hostval = rdmsr(vmm_msr[i].num); + } +} + +void +guest_msrs_init(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + switch (vmm_msr[i].num) { + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + case MSR_SF_MASK: + case MSR_BIOS_SIGN: + case MSR_MCG_CAP: + guest_msrs[i] = 0; + break; + case MSR_APICBASE: + guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | + APICBASE_X2APIC; + if (cpu == 0) + guest_msrs[i] |= APICBASE_BSP; + break; + default: + panic("guest_msrs_init: missing initialization for msr " + "0x%0x", vmm_msr[i].num); + } + } +} + +static boolean_t +x2apic_msr(u_int num) +{ + + if (num >= 0x800 && num <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +static boolean_t +x2apic_msr_id(u_int num) +{ + return (num == 0x802); +} + +static int +msr_num_to_idx(u_int num) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) + if (vmm_msr[i].num == num) + return (i); + + return (-1); +} + +int +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +{ + int handled, idx; + uint64_t *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) + return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + if (!readonly_msr(idx)) { + guest_msrs = vm_guest_msrs(vm, cpu); + + /* Stash the value */ + guest_msrs[idx] = val; + + /* Update processor state for non-emulated MSRs */ + if (!emulated_msr(idx)) + wrmsr(vmm_msr[idx].num, val); + } + + handled = 1; +done: + return (handled); +} + +int +emulate_rdmsr(struct vm *vm, int cpu, u_int num) +{ + int error, handled, idx; + uint32_t eax, edx; + uint64_t result, *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) { + handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), + &result); + /* + * The version ID needs to be massaged + */ + if (x2apic_msr_id(num)) { + result = result >> 24; + } + goto done; + } + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + guest_msrs = vm_guest_msrs(vm, cpu); + result = guest_msrs[idx]; + + /* + * If this is not an emulated msr register make sure that the processor + * state matches our cached state. 
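The x2APIC MSR block maps one MSR to each 16-byte APIC register, which is why the offset conversion above is a simple shift. A few worked values, using only the helper defined in this file:

    /*
     * (msr - 0x800) << 4:
     *   0x802 (ID)  -> offset 0x020
     *   0x808 (TPR) -> offset 0x080
     *   0x80b (EOI) -> offset 0x0b0
     */
    u_int regoff = x2apic_msr_to_regoff(0x808);     /* 0x80, the TPR register */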
+ */ + if (!emulated_msr(idx) && (rdmsr(num) != result)) { + panic("emulate_rdmsr: msr 0x%0x has inconsistent cached " + "(0x%016lx) and actual (0x%016lx) values", num, + result, rdmsr(num)); + } + + handled = 1; + +done: + if (handled) { + eax = result; + edx = result >> 32; + error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); + if (error) + panic("vm_set_register(rax) error %d", error); + error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx); + if (error) + panic("vm_set_register(rdx) error %d", error); + } + return (handled); +} + +void +restore_guest_msrs(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, guest_msrs[i]); + } +} + +void +restore_host_msrs(struct vm *vm, int cpu) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h new file mode 100644 index 0000000..1e15787 --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MSR_H_ +#define _VMM_MSR_H_ + +#define VMM_MSR_NUM 16 +struct vm; + +void vmm_msr_init(void); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +void guest_msrs_init(struct vm *vm, int cpu); +void restore_host_msrs(struct vm *vm, int cpu); +void restore_guest_msrs(struct vm *vm, int cpu); + +#endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c new file mode 100644 index 0000000..e6f5c48 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include "vmm_stat.h" + +static int vstnum; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +void +vmm_stat_init(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vstnum >= MAX_VMM_STAT_TYPES) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vstnum; + vsttab[vstnum++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + int i; + uint64_t *stats; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vstnum; i++) + buf[i] = stats[i]; + *num_stats = vstnum; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + u_long size; + + size = vstnum * sizeof(uint64_t); + + return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK)); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +const char * +vmm_stat_desc(int index) +{ + + if (index >= 0 && index < vstnum) + return (vsttab[index]->desc); + else + return (NULL); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 0000000..7c075a6 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_TYPES 64 /* arbitrary */ + +struct vmm_stat_type { + const char *desc; /* description of statistic */ + int index; /* position in the stats buffer */ +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, desc) \ + struct vmm_stat_type type[1] = { \ + { desc, -1 } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +const char *vmm_stat_desc(int index); + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats = vcpu_stats(vm, vcpu); + if (vst->index >= 0) + stats[vst->index] += x; +#endif +} + +#endif diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S new file mode 100644 index 0000000..2afc608 --- /dev/null +++ b/sys/amd64/vmm/vmm_support.S @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define LOCORE + +#include + +#define LA_EOI 0xB0 + + .text + SUPERALIGN_TEXT +IDTVEC(justreturn) + pushq %rax + movq lapic, %rax + movl $0, LA_EOI(%rax) + popq %rax + iretq diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 0000000..f245f92 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
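A hypothetical use of the statistics machinery above: define a counter and bump it from an exit path. Only VMM_STAT_DEFINE() and vmm_stat_incr() come from the header; the counter name and call site are invented, and the increment compiles away unless VMM_KEEP_STATS is defined.

    VMM_STAT_DEFINE(VMEXIT_EXAMPLE, "hypothetical example exit counter");

    static void
    record_example_exit(struct vm *vm, int vcpu)
    {
            vmm_stat_incr(vm, vcpu, VMEXIT_EXAMPLE, 1);
    }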
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include +#include +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 0000000..7f82332 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
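The DUMP_REG()/DUMP_SEG() macros above rely on stringizing and token pasting; one expansion of each, written out for clarity (tf is the trapframe pointer passed to dump_trapframe()):

    /* DUMP_REG(rip) expands, roughly, to: */
    printf("rip" "\t\t0x%016lx\n", (long)(tf->tf_rip));

    /* DUMP_SEG(cs) expands, roughly, to: */
    printf("cs" "\t\t0x%04x\n", (unsigned)(tf->tf_cs));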
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 0000000..45c4c53 --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include + +#include "x86.h" + +int +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + unsigned int func, regs[4]; + + func = *eax; + + cpuid_count(*eax, *ecx, regs); + + switch(func) { + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_0000_0004: + case CPUID_0000_000A: + break; + + case CPUID_8000_0000: + case CPUID_8000_0001: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0007: + case CPUID_8000_0008: + + break; + + case CPUID_0000_0001: + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_0000_0001_APICID_MASK); + /* + * XXX fixme for MP case, set apicid properly for cpu. + */ + regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX capability. + * Advertise x2APIC capability. 
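A sketch of how a CPUID intercept might funnel guest register state through x86_emulate_cpuid(). The handler shape and the zero-fill policy for unknown leaves are assumptions; only the function's in/out parameter contract and its 0/1 return value are taken from the code.

    static void
    handle_cpuid_exit(uint64_t *grax, uint64_t *grbx, uint64_t *grcx,
        uint64_t *grdx)
    {
            uint32_t eax, ebx, ecx, edx;

            eax = *grax;            /* leaf */
            ecx = *grcx;            /* sub-leaf */
            ebx = edx = 0;

            if (x86_emulate_cpuid(&eax, &ebx, &ecx, &edx) == 0) {
                    /* leaf not in the allow list; present it as all zeroes */
                    eax = ebx = ecx = edx = 0;
            }
            *grax = eax;
            *grbx = ebx;
            *grcx = ecx;
            *grdx = edx;
    }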
+ */ + regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] |= CPUID2_X2APIC; + + /* + * Machine check handling is done in the host. + * Hide MTRR capability. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + break; + + case CPUID_0000_000B: + /* + * XXXSMP fixme + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = 0; + break; + + default: + return (0); + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + return (1); +} + diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 0000000..bc4f8a4 --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx); + +#endif -- cgit v1.1 From f15f5629368bc3b043d8dfbe6b60d8261e97874d Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 14 May 2011 18:37:24 +0000 Subject: bhyve import part 2 of 2, guest kernel changes. This branch is now considered frozen: future bhyve development will take place in a branch off -CURRENT. sys/dev/bvm/bvm_console.c sys/dev/bvm/bvm_dbg.c - simple console driver/gdb debug port used for bringup. 
supported by user-space bhyve executable sys/conf/options.amd64 sys/amd64/amd64/minidump_machdep.c - allow NKPT to be set in the kernel config file sys/amd64/conf/GENERIC - mptable config options; bhyve user-space executable creates an mptable with number of CPUs, and optional vendor extension - add bvm console/debug - set NKPT to 512 to allow loading of large RAM disks from the loader - include kdb/gdb sys/amd64/amd64/local_apic.c sys/amd64/amd64/apic_vector.S sys/amd64/include/specialreg.h - if x2apic mode available, use MSRs to access the local APIC, otherwise fall back to 'classic' MMIO mode sys/amd64/amd64/mp_machdep.c - support AP spinup on CPU models that don't have real-mode support by overwriting the real-mode page with a message that supplies the bhyve user-space executable with enough information to start the AP directly in 64-bit mode. sys/amd64/amd64/vm_machdep.c - insert pause statements into cpu shutdown busy-wait loops sys/dev/blackhole/blackhole.c sys/modules/blackhole/Makefile - boot-time loadable module that claims all PCI bus/slot/funcs specified in an env var that are to be used for PCI passthrough sys/amd64/amd64/intr_machdep.c - allow round-robin assignment of device interrupts to CPUs to be disabled from the loader sys/amd64/include/bus.h - convert string ins/outs instructions to loops of individual in/out since bhyve doesn't support these yet sys/kern/subr_bus.c - if the device was no created with a fixed devclass, then remove it's association with the devclass it was associated with during probe. Otherwise, new drivers do not get a chance to probe/attach since the device will stay married to the first driver that it probed successfully but failed to attach. Sponsored by: NetApp, Inc. --- sys/amd64/amd64/apic_vector.S | 53 ++-- sys/amd64/amd64/intr_machdep.c | 6 + sys/amd64/amd64/local_apic.c | 494 ++++++++++++++++++++++++++++++++----- sys/amd64/amd64/minidump_machdep.c | 1 + sys/amd64/amd64/mp_machdep.c | 43 ++++ sys/amd64/amd64/vm_machdep.c | 11 +- sys/amd64/conf/GENERIC | 11 + sys/amd64/include/bus.h | 60 +++-- sys/amd64/include/specialreg.h | 33 ++- 9 files changed, 610 insertions(+), 102 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 4cfc18b..6e9aa79 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -55,7 +55,14 @@ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ FAKE_MCOUNT(TF_RIP(%rsp)) ; \ movq lapic, %rdx ; /* pointer to local APIC */ \ + testq %rdx, %rdx; \ + jnz 3f; \ + movl $MSR_APIC_ISR ## index, %ecx; \ + rdmsr; \ + jmp 4f; \ +3: ; \ movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ +4: ; \ bsrl %eax, %eax ; /* index of highset set bit in ISR */ \ jz 2f ; \ addl $(32 * index),%eax ; \ @@ -117,6 +124,26 @@ IDTVEC(errorint) jmp doreti #ifdef SMP + +/* + * We assume that %rax is being saved/restored outside of this macro + */ +#define DO_EOI \ + movq lapic, %rax; \ + testq %rax, %rax; \ + jz 8f; \ + movl $0, LA_EOI(%rax); \ + jmp 9f; \ +8:; \ + pushq %rcx; \ + pushq %rdx; \ + xorl %edx, %edx; /* eax is already zero */ \ + movl $MSR_APIC_EOI, %ecx; \ + wrmsr; \ + popq %rdx; \ + popq %rcx; \ +9: + /* * Global address space TLB shootdown. 
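The DO_EOI assembly macro above mirrors the C-level pattern this commit applies throughout local_apic.c (the lapic_eoi() hunk later in this patch is the same logic); roughly:

    void
    lapic_eoi(void)
    {
            if (x2apic)                     /* x2APIC mode detected at lapic_init() */
                    wrmsr(MSR_APIC_EOI, 0);
            else
                    lapic->eoi = 0;         /* classic memory-mapped register window */
    }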
*/ @@ -128,8 +155,7 @@ IDTVEC(invltlb) movq %cr3, %rax /* invalidate the TLB */ movq %rax, %cr3 - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -148,8 +174,7 @@ IDTVEC(invlpg) movq smp_tlb_addr1, %rax invlpg (%rax) /* invalidate single page */ - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -173,8 +198,7 @@ IDTVEC(invlrng) cmpq %rax, %rdx jb 1b - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -193,8 +217,7 @@ IDTVEC(invlcache) wbinvd - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -210,9 +233,8 @@ IDTVEC(invlcache) IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME - movq lapic, %rdx - movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ - + DO_EOI + FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler @@ -227,8 +249,7 @@ IDTVEC(ipi_intr_bitmap_handler) IDTVEC(cpustop) PUSH_FRAME - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI call cpustop_handler jmp doreti @@ -241,8 +262,7 @@ IDTVEC(cpustop) IDTVEC(cpususpend) PUSH_FRAME - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI call cpususpend_handler @@ -259,7 +279,6 @@ IDTVEC(cpususpend) IDTVEC(rendezvous) PUSH_FRAME call smp_rendezvous_action - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/intr_machdep.c b/sys/amd64/amd64/intr_machdep.c index 6ab80df..941cecf 100644 --- a/sys/amd64/amd64/intr_machdep.c +++ b/sys/amd64/amd64/intr_machdep.c @@ -78,6 +78,8 @@ static STAILQ_HEAD(, pic) pics; #ifdef SMP static int assign_cpu; +static int round_robin_interrupts = 1; +TUNABLE_INT("round_robin_interrupts", &round_robin_interrupts); #endif static int intr_assign_cpu(void *arg, u_char cpu); @@ -460,6 +462,10 @@ intr_next_cpu(void) if (!assign_cpu) return (cpu_apic_ids[0]); + /* All interrupts go to the BSP if not allowed to round robin */ + if (!round_robin_interrupts) + return (cpu_apic_ids[0]); + mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu]; do { diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 8edc971..f5c2938 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -148,6 +148,7 @@ volatile lapic_t *lapic; vm_paddr_t lapic_paddr; static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz; static enum lapic_clock clockcoverage; +static int x2apic; static void lapic_enable(void); static void lapic_resume(struct pic *pic); @@ -156,6 +157,36 @@ static void lapic_timer_oneshot(u_int count); static void lapic_timer_periodic(u_int count); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); +static uint32_t lapic_version(void); +static uint32_t lapic_ldr(void); +static uint32_t lapic_dfr(void); +static uint32_t lapic_lvt_lint0(void); +static void lapic_set_lvt_lint0(uint32_t value); +static uint32_t lapic_lvt_lint1(void); +static void lapic_set_lvt_lint1(uint32_t value); +static uint32_t lapic_tpr(void); +static uint32_t lapic_svr(void); +static void lapic_set_svr(uint32_t value); +static uint32_t lapic_lvt_timer(void); +static void lapic_set_lvt_timer(uint32_t value); +static uint32_t lapic_lvt_thermal(void); +static uint32_t lapic_lvt_error(void); +static void lapic_set_lvt_error(uint32_t value); +static uint32_t 
lapic_lvt_pcint(void); +static void lapic_set_lvt_pcint(uint32_t value); +static uint32_t lapic_esr(void); +static void lapic_set_esr(uint32_t value); +static uint32_t lapic_ccr_timer(void); +static void lapic_set_dcr_timer(uint32_t value); +static void lapic_set_icr_timer(uint32_t value); +uint32_t lapic_irr(int num); +uint32_t lapic_tmr(int num); +uint32_t lapic_isr(int num); +static uint32_t lapic_icr_lo(void); +static void lapic_set_icr_lo(uint32_t value); +static uint32_t lapic_icr_hi(void); +static void lapic_set_icr_hi(uint32_t value); +static boolean_t lapic_missing(void); struct pic lapic_pic = { .pic_resume = lapic_resume }; @@ -206,12 +237,20 @@ lvt_mode(struct lapic *la, u_int pin, uint32_t value) void lapic_init(vm_paddr_t addr) { - - /* Map the local APIC and setup the spurious interrupt handler. */ - KASSERT(trunc_page(addr) == addr, - ("local APIC not aligned on a page boundary")); - lapic = pmap_mapdev(addr, sizeof(lapic_t)); - lapic_paddr = addr; + if ((cpu_feature2 & CPUID2_X2APIC) != 0 && + (rdmsr(MSR_APICBASE) & APICBASE_X2APIC) != 0) { + x2apic = 1; + if (bootverbose) + printf("Local APIC access using x2APIC MSRs\n"); + } else { + /* + * Map the local APIC and setup the spurious interrupt handler. + */ + KASSERT(trunc_page(addr) == addr, + ("local APIC not aligned on a page boundary")); + lapic = pmap_mapdev(addr, sizeof(lapic_t)); + lapic_paddr = addr; + } setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); /* Perform basic initialization of the BSP's local APIC. */ @@ -276,12 +315,12 @@ lapic_dump(const char* str) printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n", - lapic->id, lapic->version, lapic->ldr, lapic->dfr); + lapic_id(), lapic_version(), lapic_ldr(), lapic_dfr()); printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", - lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); + lapic_lvt_lint0(), lapic_lvt_lint1(), lapic_tpr(), lapic_svr()); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", - lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, - lapic->lvt_pcint); + lapic_lvt_timer(), lapic_lvt_thermal(), lapic_lvt_error(), + lapic_lvt_pcint()); } void @@ -295,7 +334,7 @@ lapic_setup(int boot) la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); eflags = intr_disable(); - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); @@ -304,15 +343,15 @@ lapic_setup(int boot) lapic_enable(); /* Program LINT[01] LVT entries. */ - lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); - lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); + lapic_set_lvt_lint0(lvt_mode(la, LVT_LINT0, lapic_lvt_lint0())); + lapic_set_lvt_lint1(lvt_mode(la, LVT_LINT1, lapic_lvt_lint1())); /* Program the PMC LVT entry if present. */ if (maxlvt >= LVT_PMC) - lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); + lapic_set_lvt_pcint(lvt_mode(la, LVT_PMC, lapic_lvt_pcint())); /* Program timer LVT and setup handler. */ - lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer); + lapic_set_lvt_timer(lvt_mode(la, LVT_TIMER, lapic_lvt_timer())); if (boot) { snprintf(buf, sizeof(buf), "cpu%d: timer", PCPU_GET(cpuid)); intrcnt_add(buf, &la->la_timer_count); @@ -328,8 +367,8 @@ lapic_setup(int boot) } /* Program error LVT and clear any existing errors. 
*/ - lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error); - lapic->esr = 0; + lapic_set_lvt_error(lvt_mode(la, LVT_ERROR, lapic_lvt_error())); + lapic_set_esr(0); /* XXX: Thermal LVT */ @@ -342,9 +381,9 @@ lapic_reenable_pmc(void) #ifdef HWPMC_HOOKS uint32_t value; - value = lapic->lvt_pcint; + value = lapic_lvt_pcint(); value &= ~APIC_LVT_M; - lapic->lvt_pcint = value; + lapic_set_lvt_pcint(value); #endif } @@ -355,7 +394,7 @@ lapic_update_pmc(void *dummy) struct lapic *la; la = &lapics[lapic_id()]; - lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); + lapic_set_lvt_pcint(lvt_mode(la, LVT_PMC, lapic_lvt_pcint())); } #endif @@ -366,11 +405,11 @@ lapic_enable_pmc(void) u_int32_t maxlvt; /* Fail if the local APIC is not present. */ - if (lapic == NULL) + if (lapic_missing()) return (0); /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return (0); @@ -400,11 +439,11 @@ lapic_disable_pmc(void) u_int32_t maxlvt; /* Fail if the local APIC is not present. */ - if (lapic == NULL) + if (lapic_missing()) return; /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return; @@ -435,7 +474,7 @@ lapic_setup_clock(enum lapic_clock srcsdes) MPASS(srcsdes != LAPIC_CLOCK_NONE); /* Can't drive the timer without a local APIC. */ - if (lapic == NULL || + if (lapic_missing() || (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0)) { clockcoverage = LAPIC_CLOCK_NONE; return (clockcoverage); @@ -449,7 +488,7 @@ lapic_setup_clock(enum lapic_clock srcsdes) lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(APIC_TIMER_MAX_COUNT); DELAY(2000000); - value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer; + value = APIC_TIMER_MAX_COUNT - lapic_ccr_timer(); if (value != APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; @@ -509,9 +548,9 @@ lapic_disable(void) uint32_t value; /* Software disable the local APIC. */ - value = lapic->svr; + value = lapic_svr(); value &= ~APIC_SVR_SWEN; - lapic->svr = value; + lapic_set_svr(value); } static void @@ -520,10 +559,10 @@ lapic_enable(void) u_int32_t value; /* Program the spurious vector to enable the local APIC. */ - value = lapic->svr; + value = lapic_svr(); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); - lapic->svr = value; + lapic_set_svr(value); } /* Reset the local APIC on the BSP during resume. 
*/ @@ -534,19 +573,342 @@ lapic_resume(struct pic *pic) lapic_setup(0); } +static uint32_t +lapic_version(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_VERSION)); + else + return (lapic->version); +} + +static uint32_t +lapic_ldr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LDR)); + else + return (lapic->ldr); +} + +static uint32_t +lapic_dfr(void) +{ + + if (x2apic) + return (0xffffffff); /* DFR not available in x2APIC mode */ + else + return (lapic->dfr); +} + +static uint32_t +lapic_lvt_lint0(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_LINT0)); + else + return (lapic->lvt_lint0); +} + +static void +lapic_set_lvt_lint0(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_LINT0, value); + else + lapic->lvt_lint0 = value; +} + +static uint32_t +lapic_lvt_lint1(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_LINT1)); + else + return (lapic->lvt_lint1); +} + +static void +lapic_set_lvt_lint1(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_LINT1, value); + else + lapic->lvt_lint1 = value; +} + +static uint32_t +lapic_tpr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_TPR)); + else + return (lapic->tpr); +} + +static uint32_t +lapic_svr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_SVR)); + else + return (lapic->svr); +} + +static void +lapic_set_svr(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_SVR, value); + else + lapic->svr = value; +} + +static uint32_t +lapic_lvt_timer(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_TIMER)); + else + return (lapic->lvt_timer); +} + +static void +lapic_set_lvt_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_TIMER, value); + else + lapic->lvt_timer = value; +} + +static uint32_t +lapic_lvt_thermal(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_THERMAL)); + else + return (lapic->lvt_thermal); +} + +static uint32_t +lapic_lvt_error(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_ERROR)); + else + return (lapic->lvt_error); +} + +static void +lapic_set_lvt_error(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_ERROR, value); + else + lapic->lvt_error = value; +} + +static uint32_t +lapic_lvt_pcint(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_PCINT)); + else + return (lapic->lvt_pcint); +} + +static void +lapic_set_lvt_pcint(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_PCINT, value); + else + lapic->lvt_pcint = value; +} + +static uint32_t +lapic_esr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_ESR)); + else + return (lapic->esr); +} + +static void +lapic_set_esr(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_ESR, value); + else + lapic->esr = value; +} + +static uint32_t +lapic_ccr_timer(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_CCR_TIMER)); + else + return (lapic->ccr_timer); +} + +static void +lapic_set_dcr_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_DCR_TIMER, value); + else + lapic->dcr_timer = value; +} + +static void +lapic_set_icr_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_ICR_TIMER, value); + else + lapic->icr_timer = value; +} + +uint32_t +lapic_tmr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_tmr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_TMR0 + num; + return (rdmsr(msr)); + } else { + regptr = &lapic->tmr0; + return (regptr[num * 4]); + } +} + +uint32_t +lapic_irr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_irr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_IRR0 + num; + 
return (rdmsr(msr)); + } else { + regptr = &lapic->irr0; + return (regptr[num * 4]); + } +} + +uint32_t +lapic_isr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_isr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_ISR0 + num; + return (rdmsr(msr)); + } else { + regptr = &lapic->isr0; + return (regptr[num * 4]); + } +} + +static uint32_t icr_hi_stashed[MAXCPU]; + +static uint32_t +lapic_icr_lo(void) +{ + + if (x2apic) + return (0); + else + return (lapic->icr_lo); +} + +static void +lapic_set_icr_lo(uint32_t value) +{ + + if (x2apic) { + wrmsr(MSR_APIC_ICR, + (uint64_t)icr_hi_stashed[curcpu] << 32 | value); + } else + lapic->icr_lo = value; +} + +static uint32_t +lapic_icr_hi(void) +{ + + if (x2apic) + return (0); + else + return (lapic->icr_hi); +} + +static void +lapic_set_icr_hi(uint32_t value) +{ + if (x2apic) + icr_hi_stashed[curcpu] = value >> APIC_ID_SHIFT; /* XXX */ + else + lapic->icr_hi = value; +} + +static boolean_t +lapic_missing(void) +{ + + if (x2apic == 0 && lapic == NULL) + return (TRUE); + else + return (FALSE); +} + int lapic_id(void) { - KASSERT(lapic != NULL, ("local APIC is not mapped")); - return (lapic->id >> APIC_ID_SHIFT); + if (x2apic) + return (rdmsr(MSR_APIC_ID)); + else + return (lapic->id >> APIC_ID_SHIFT); } int lapic_intr_pending(u_int vector) { - volatile u_int32_t *irr; - /* * The IRR registers are an array of 128-bit registers each of * which only describes 32 interrupts in the low 32 bits.. Thus, @@ -556,8 +918,7 @@ lapic_intr_pending(u_int vector) * modulus the vector by 32 to determine the individual bit to * test. */ - irr = &lapic->irr0; - return (irr[(vector / 32) * 4] & 1 << (vector % 32)); + return (lapic_irr(vector / 32) & 1 << (vector % 32)); } void @@ -713,13 +1074,19 @@ void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR - lapic->tpr = vector; + if (x2apic) + wrmsr(MSR_APIC_TPR, vector); + else + lapic->tpr = vector; #else u_int32_t tpr; - tpr = lapic->tpr & ~APIC_TPR_PRIO; + tpr = lapic_tpr() & ~APIC_TPR_PRIO; tpr |= vector; - lapic->tpr = tpr; + if (x2apic) + wrmsr(MSR_APIC_TPR, tpr); + else + lapic->tpr = tpr; #endif } @@ -727,7 +1094,10 @@ void lapic_eoi(void) { - lapic->eoi = 0; + if (x2apic) + wrmsr(MSR_APIC_EOI, 0); + else + lapic->eoi = 0; } void @@ -819,7 +1189,7 @@ lapic_timer_set_divisor(u_int divisor) KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) / sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor)); - lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1]; + lapic_set_dcr_timer(lapic_timer_divisors[ffs(divisor) - 1]); } static void @@ -827,11 +1197,11 @@ lapic_timer_oneshot(u_int count) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT; - lapic->lvt_timer = value; - lapic->icr_timer = count; + lapic_set_lvt_timer(value); + lapic_set_icr_timer(count); } static void @@ -839,11 +1209,11 @@ lapic_timer_periodic(u_int count) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_PERIODIC; - lapic->lvt_timer = value; - lapic->icr_timer = count; + lapic_set_lvt_timer(value); + lapic_set_icr_timer(count); } static void @@ -851,9 +1221,9 @@ lapic_timer_enable_intr(void) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVT_M; - lapic->lvt_timer = value; + lapic_set_lvt_timer(value); } void @@ -867,8 +1237,8 @@ 
lapic_handle_error(void) * to update its value to indicate any errors that have * occurred since the previous write to the register. */ - lapic->esr = 0; - esr = lapic->esr; + lapic_set_esr(0); + esr = lapic_esr(); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); @@ -1115,17 +1485,17 @@ DB_SHOW_COMMAND(lapic, db_show_lapic) uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); - v = lapic->version; + v = lapic_version(); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); - v = lapic->svr; + v = lapic_svr(); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); - db_printf("TPR = %02x\n", lapic->tpr); + db_printf("TPR = %02x\n", lapic_tpr()); #define dump_field(prefix, index) \ - dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index, \ + dump_mask(__XSTRING(prefix ## index), lapic_ ## prefix(index), \ index * 32) db_printf("In-service Interrupts:\n"); @@ -1300,7 +1670,7 @@ lapic_ipi_wait(int delay) } else incr = 1; for (x = 0; x < delay; x += incr) { - if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) + if ((lapic_icr_lo() & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } @@ -1313,7 +1683,7 @@ lapic_ipi_raw(register_t icrlo, u_int dest) register_t value, eflags; /* XXX: Need more sanity checking of icrlo? */ - KASSERT(lapic != NULL, ("%s called too early", __func__)); + KASSERT(!lapic_missing(), ("%s called too early", __func__)); KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, @@ -1322,17 +1692,17 @@ lapic_ipi_raw(register_t icrlo, u_int dest) /* Set destination in ICR HI register if it is being used. */ eflags = intr_disable(); if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { - value = lapic->icr_hi; + value = lapic_icr_hi(); value &= ~APIC_ID_MASK; value |= dest << APIC_ID_SHIFT; - lapic->icr_hi = value; + lapic_set_icr_hi(value); } /* Program the contents of the IPI and dispatch it. */ - value = lapic->icr_lo; + value = lapic_icr_lo(); value &= APIC_ICRLO_RESV_MASK; value |= icrlo; - lapic->icr_lo = value; + lapic_set_icr_lo(value); intr_restore(eflags); } @@ -1409,7 +1779,7 @@ lapic_ipi_vectored(u_int vector, int dest) printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until mesage is sent without a timeout. */ - while (lapic->icr_lo & APIC_DELSTAT_PEND) + while (lapic_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c index a9af809..4377c81 100644 --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -27,6 +27,7 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_pmap.h" #include #include #include diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 0ef8017..8f8825d 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,26 @@ struct cpu_info { int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; +/* + * Trampoline for hypervisor direct 64-bit jump. 
+ * + * 0 - signature for guest->host verification + * 8 - virtual address of this page + * 16 - instruction virtual address + * 24 - stack pointer virtual address + * 32 - CR3, physical address of kernel page table + * 40 - 24-byte area for null/code/data GDT entries + */ +#define MP_V64T_SIG 0xcafebabecafebabeULL +struct mp_v64tramp { + uint64_t mt_sig; + uint64_t mt_virt; + uint64_t mt_eip; + uint64_t mt_rsp; + uint64_t mt_cr3; + uint64_t mt_gdtr[3]; +}; + /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; @@ -873,6 +893,29 @@ start_all_aps(void) bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; + /* + * If running in a VM that doesn't support the unrestricted + * guest 16-bit mode, forget most of the above and create + * the data block that allows the hypervisor to direct-jump + * into 64-bit mode. Copy this over the top of the 16-bit + * bootstrap. The startup-IPI informs the hypervisor which + * physical page this data block lies in. The hypervisor + * will then use the block to initialise register state of + * the AP in an almost identical fashion to how it builds + * the BSP initial register state. + */ + if (testenv("hw.use_bvm_mptramp")) { + struct mp_v64tramp mv; + + bzero(&mv, sizeof(mv)); + mv.mt_sig = MP_V64T_SIG; + mv.mt_virt = (uint64_t) va; + mv.mt_eip = (uint64_t) init_secondary; + mv.mt_rsp = (uint64_t) bootSTK; + mv.mt_cr3 = KPML4phys; + bcopy(&mv, (void *) va, sizeof(mv)); + } + /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index d6906ac..fe2e256 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -507,8 +507,10 @@ cpu_reset_proxy() { cpu_reset_proxy_active = 1; - while (cpu_reset_proxy_active == 1) + while (cpu_reset_proxy_active == 1) { + ia32_pause(); ; /* Wait for other cpu to see that we've started */ + } stop_cpus((1< 0) { + *addr = inb(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -290,9 +294,13 @@ bus_space_read_multi_2(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, u_int16_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - insw(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + *addr = inw(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -311,9 +319,13 @@ bus_space_read_multi_4(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, u_int32_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - insl(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + *addr = inl(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -533,9 +545,13 @@ bus_space_write_multi_1(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int8_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsb(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outb(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -554,9 +570,13 @@ bus_space_write_multi_2(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int16_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsw(bsh + offset, addr, count); - else { + if 
(tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outw(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -575,9 +595,13 @@ bus_space_write_multi_4(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int32_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsl(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outl(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index c95fee0..42653cc 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -292,12 +292,41 @@ #define MSR_MC4_ADDR 0x412 #define MSR_MC4_MISC 0x413 +/* X2APIC MSRs */ +#define MSR_APIC_ID 0x802 +#define MSR_APIC_VERSION 0x803 +#define MSR_APIC_TPR 0x808 +#define MSR_APIC_EOI 0x80b +#define MSR_APIC_LDR 0x80d +#define MSR_APIC_SVR 0x80f +#define MSR_APIC_ISR0 0x810 +#define MSR_APIC_ISR1 0x811 +#define MSR_APIC_ISR2 0x812 +#define MSR_APIC_ISR3 0x813 +#define MSR_APIC_ISR4 0x814 +#define MSR_APIC_ISR5 0x815 +#define MSR_APIC_ISR6 0x816 +#define MSR_APIC_ISR7 0x817 +#define MSR_APIC_TMR0 0x818 +#define MSR_APIC_IRR0 0x820 +#define MSR_APIC_ESR 0x828 +#define MSR_APIC_ICR 0x830 +#define MSR_APIC_LVT_TIMER 0x832 +#define MSR_APIC_LVT_THERMAL 0x833 +#define MSR_APIC_LVT_PCINT 0x834 +#define MSR_APIC_LVT_LINT0 0x835 +#define MSR_APIC_LVT_LINT1 0x836 +#define MSR_APIC_LVT_ERROR 0x837 +#define MSR_APIC_ICR_TIMER 0x838 +#define MSR_APIC_CCR_TIMER 0x839 +#define MSR_APIC_DCR_TIMER 0x83e + /* * Constants related to MSR's. */ -#define APICBASE_RESERVED 0x000006ff +#define APICBASE_RESERVED 0x000002ff #define APICBASE_BSP 0x00000100 -#define APICBASE_X2APIC 0x00000400 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 -- cgit v1.1 From 6d6dbef9d4de7a3f8785d8db43c17cf992065785 Mon Sep 17 00:00:00 2001 From: jhb Date: Sun, 15 May 2011 02:09:12 +0000 Subject: Enable handling of 1GB pages in the direct map since HEAD supports those. Submitted by: neel --- sys/amd64/vmm/vmm_mem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 9ce1e80..764a6e9 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -168,8 +168,7 @@ vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) if (end >= NBPML4) panic("Cannot map memory beyond %ldGB", NBPML4 / GB); - /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ - if (0 && vmm_supports_1G_pages()) + if (vmm_supports_1G_pages()) superpage_size = NBPDP; else superpage_size = NBPDR; -- cgit v1.1 From 5a44aef8a38d88c0d5b265a72c5d90ef2d73cf95 Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 19 May 2011 21:53:25 +0000 Subject: Changes to allow the GENERIC+bhye kernel built from this branch to run as a 1/2 CPU guest on an 8.1 bhyve host. bhyve/inout.c inout.h fbsdrun.c - Rather than exiting on accesses to unhandled i/o ports, emulate hardware by returning -1 on reads and ignoring writes to unhandled ports. Support the previous mode by allowing a 'strict' parameter to be set from the command line. The 8.1 guest kernel was vastly cut down from GENERIC and had no ISA devices. Booting GENERIC exposes a massive amount of random touching of i/o ports (hello syscons/vga/atkbdc). 
bhyve/consport.c dev/bvm/bvm_console.c - implement a simplistic signature for the bvm console by returning 'bv' for an inw on the port. Also, set the priority of the console to CN_REMOTE if the signature was returned. This works better in an environment where multiple consoles are in the kernel (hello syscons) bhyve/rtc.c - return 0 for the access to RTC_EQUIPMENT (yes, you syscons) amd64/vmm/x86.c x86.h - hide a bunch more CPUID leaf 1 bits from the guest to prevent cpufreq drivers from probing. The next step will be to move CPUID handling completely into user-space. This will allow the full spectrum of changes from presenting a lowest-common-denominator CPU type/feature set, to exposing (almost) everything that the host can support. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/x86.c | 21 +++++++++++++++++++-- sys/amd64/vmm/x86.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 45c4c53..f6b38e0 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -75,13 +75,19 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); /* - * Don't expose VMX capability. + * Don't expose VMX, SpeedStep or TME capability. * Advertise x2APIC capability. */ - regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] &= ~(CPUID_0000_0001_FEAT0_VMX | CPUID2_EST | + CPUID2_TM2); regs[2] |= CPUID2_X2APIC; /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* * Machine check handling is done in the host. * Hide MTRR capability. */ @@ -89,6 +95,17 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) break; + case CPUID_0000_0006: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + case CPUID_0000_000B: /* * XXXSMP fixme diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index bc4f8a4..b437d61 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -34,6 +34,7 @@ #define CPUID_0000_0002 (0x2) #define CPUID_0000_0003 (0x3) #define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) #define CPUID_0000_000A (0xA) #define CPUID_0000_000B (0xB) #define CPUID_8000_0000 (0x80000000) -- cgit v1.1 From e24f5ed9f2b1a7f169a57fcc109ed55bd5ef8954 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 20 May 2011 02:08:05 +0000 Subject: Avoid unnecessary sign extension when promoted to a 64-bit integer. This was benign because the interruption info field is a 32-bit quantity and the hardware guarantees that the upper 32-bits are all zeros. But it did make reading the objdump output very confusing. --- sys/amd64/vmm/intel/vmcs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index c633a59..853c9c6 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -304,7 +304,7 @@ uint64_t vmcs_read(uint32_t encoding); /* * VMCS interrupt information fields */ -#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_VALID (1U << 31) #define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) #define VMCS_INTERRUPTION_INFO_NMI (2 << 8) -- cgit v1.1 From 3930d0afcfdc805e3c97fcc29a2c1ffbd8b31c7c Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 20 May 2011 03:23:09 +0000 Subject: Fix a long standing bug in VMXCTX_GUEST_RESTORE(). 
There was an assumption by the "callers" of this macro that on "return" the %rsp will be pointing to the 'vmxctx'. The macro was not doing this and thus when trying to restore host state on an error from "vmlaunch" or "vmresume" we were treating the memory locations on the host stack as 'struct vmxctx'. This led to all sorts of weird bugs like double faults or invalid instruction faults. This bug is exposed by the -O2 option used to compile the kernel module. With the -O2 flag the compiler will optimize the following piece of code: int loopstart = 1; ... if (loopstart) { loopstart = 0; vmx_launch(); } else vmx_resume(); into this: vmx_launch(); Since vmx_launch() and vmx_resume() are declared to be __dead2 functions the compiler is free to do this. The compiler has no way to know that the functions return indirectly through vmx_setjmp(). This optimization in turn leads us to trigger the bug in VMXCTX_GUEST_RESTORE(). With this change we can boot a 8.1 guest on a 9.0 host. Reported by: jhb@ --- sys/amd64/vmm/intel/vmx.c | 8 ++++---- sys/amd64/vmm/intel/vmx.h | 4 ++++ sys/amd64/vmm/intel/vmx_genassym.c | 1 + sys/amd64/vmm/intel/vmx_support.S | 22 ++++++++++++++++++---- 4 files changed, 27 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ec181c4..44eae67 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1189,7 +1189,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) static int vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) { - int error, vie, rc, handled, astpending, loopstart; + int error, vie, rc, handled, astpending; uint32_t exit_reason; struct vmx *vmx; struct vmxctx *vmxctx; @@ -1198,7 +1198,7 @@ vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) vmx = arg; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; - loopstart = 1; + vmxctx->launched = 0; /* * XXX Can we avoid doing this every time we do a vm run? 
@@ -1232,8 +1232,8 @@ vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) #endif switch (rc) { case VMX_RETURN_DIRECT: - if (loopstart) { - loopstart = 0; + if (vmxctx->launched == 0) { + vmxctx->launched = 1; vmx_launch(vmxctx); } else vmx_resume(vmxctx); diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 69697f8..61d72a8 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -34,6 +34,9 @@ #define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ struct vmxctx { + register_t tmpstk[32]; /* vmx_return() stack */ + register_t tmpstktop; + register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; @@ -63,6 +66,7 @@ struct vmxctx { * XXX todo debug registers and fpu state */ + int launched; /* vmcs launch state */ int launch_error; }; diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c index c4b1efc..c5b5bf9 100644 --- a/sys/amd64/vmm/intel/vmx_genassym.c +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "vmx_cpufunc.h" +ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop)); ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index 4d1bf1d..8bdba86 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -31,15 +31,23 @@ #include "vmx_assym.s" /* - * Assumes that %rdi holds a pointer to the 'vmxctx' + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ /* \ - * Make sure that interrupts are disabled before restoring CR2. \ - * Otherwise there could be a page fault during the interrupt \ - * handler execution that would end up trashing CR2. \ + * Disable interrupts before updating %rsp. The location that \ + * %rsp points to is a 'vmxctx' and not a real stack so we \ + * don't want an interrupt handler to trash it. \ */ \ cli; \ + movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ @@ -148,6 +156,8 @@ ENTRY(vmx_longjmp) movq %rsp,%rdi movq $VMX_RETURN_LONGJMP,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_longjmp) @@ -174,6 +184,8 @@ ENTRY(vmx_resume) /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ movq %rsp,%rdi movq $VMX_RETURN_VMRESUME,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_resume) @@ -200,5 +212,7 @@ ENTRY(vmx_launch) /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ movq %rsp,%rdi movq $VMX_RETURN_VMLAUNCH,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_launch) -- cgit v1.1 From 63af589b2ca587eebab8b8b1c05cc910d72a6a56 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 2 Jun 2011 13:49:19 +0000 Subject: Add a 'show vmcs' DDB command to dump state about the current CPU's current VMCS. 
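
For readers unfamiliar with the DDB plumbing used below: kernel debugger "show" verbs are declared with the DB_SHOW_COMMAND() macro and invoked from the db> prompt (here, "show vmcs"). The following is a minimal sketch of that registration pattern, with an illustrative command name only; the real handler is the db_show_vmcs() added in the diff below.

    #include <sys/param.h>
    #include <ddb/ddb.h>

    /* Illustrative only: declares a "show example" verb in DDB. */
    DB_SHOW_COMMAND(example, db_show_example)
    {
            /* 'have_addr'/'addr' carry the optional argument from db>. */
            if (have_addr)
                    db_printf("arg: %#lx\n", (long)addr);
            else
                    db_printf("no argument\n");
    }
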
--- sys/amd64/vmm/intel/vmcs.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/intel/vmcs.h | 2 + sys/amd64/vmm/intel/vmx.c | 2 +- 3 files changed, 99 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 80d45cc..8c53465 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -26,6 +26,8 @@ * $FreeBSD$ */ +#include "opt_ddb.h" + #include __FBSDID("$FreeBSD$"); @@ -45,6 +47,10 @@ __FBSDID("$FreeBSD$"); #include "ept.h" #include "vmx.h" +#ifdef DDB +#include +#endif + static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { @@ -449,3 +455,93 @@ vmcs_read(uint32_t encoding) return (val); } + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 853c9c6..be2f29c 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding); #endif /* _KERNEL */ +#define VMCS_INITIAL 0xffffffffffffffff + #define VMCS_IDENT(encoding) ((encoding) | 0x80000000) /* * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
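
The exit interruption-information word that db_show_vmcs() decodes above is a packed field: vector in bits 7:0, event type in bits 10:8, an error-code-valid bit, and a valid bit in bit 31. A stand-alone decoder along the same lines might look like the sketch below; it is for illustration and is not code from this commit.

    /* Sketch: decode a VM-exit interruption information word. */
    static void
    decode_intr_info(uint32_t info)
    {
            if ((info & VMCS_INTERRUPTION_INFO_VALID) == 0)
                    return;                 /* no event was being delivered */
            printf("vector %u, type %u%s\n",
                info & 0xff,                /* bits 7:0  - vector */
                (info >> 8) & 0x7,          /* bits 10:8 - 0 ext intr, 2 NMI,
                                               3 HW exception, 4 SW exception */
                (info & 0x800) ? ", error code valid" : "");
    }
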
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 44eae67..805d035 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -110,7 +110,7 @@ MALLOC_DEFINE(M_VMX, "vmx", "vmx"); extern struct pcpu __pcpu[]; -static int vmxon_enabled[MAXCPU]; +int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; -- cgit v1.1 From acc044270835dd36a1405fca8d586d5008514a39 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 2 Jun 2011 14:04:07 +0000 Subject: Some tweaks to the CPUID support: - Don't always pass the cpuid request to the current CPU as some nodes we will emulate purely in software. - Pass in the APIC ID of the virtual CPU so we can return the proper APIC ID. - Always report a completely flat topology with no SMT or multicore. - Report the CPUID2_HV feature and implement support for the 0x40000000 CPUID level. - Use existing constants from when possible and use cpu_feature2 when checking for VMX support. --- sys/amd64/vmm/intel/vmx.c | 11 ++++--- sys/amd64/vmm/x86.c | 76 ++++++++++++++++++++++++++++++++++++----------- sys/amd64/vmm/x86.h | 2 +- 3 files changed, 65 insertions(+), 24 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 805d035..73d60c2 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -418,13 +419,11 @@ static int vmx_init(void) { int error; - unsigned int regs[4]; uint64_t fixed0, fixed1; uint32_t tmp; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ - do_cpuid(1, regs); - if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } @@ -705,7 +704,7 @@ vmx_vminit(struct vm *vm) } static int -vmx_handle_cpuid(struct vmxctx *vmxctx) +vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx) { int handled, func; @@ -713,7 +712,7 @@ vmx_handle_cpuid(struct vmxctx *vmxctx) handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), - (uint32_t*)(&vmxctx->guest_rdx)); + (uint32_t*)(&vmxctx->guest_rdx), vcpu); #if 0 printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, @@ -1148,7 +1147,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); break; case EXIT_REASON_CPUID: - handled = vmx_handle_cpuid(vmxctx); + handled = vmx_handle_cpuid(vcpu, vmxctx); break; default: break; diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index f6b38e0..93c21d7 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -30,27 +30,51 @@ __FBSDID("$FreeBSD$"); #include +#include #include +#include #include #include "x86.h" +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "BHyVE BHyVE "; + int -x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, + uint32_t vcpu_id) { unsigned int func, regs[4]; func = *eax; - cpuid_count(*eax, *ecx, regs); + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. 
+ */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } - switch(func) { + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { case CPUID_0000_0000: case CPUID_0000_0002: case CPUID_0000_0003: - case CPUID_0000_0004: case CPUID_0000_000A: + cpuid_count(*eax, *ecx, regs); break; case CPUID_8000_0000: @@ -61,26 +85,24 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) case CPUID_8000_0006: case CPUID_8000_0007: case CPUID_8000_0008: - + cpuid_count(*eax, *ecx, regs); break; case CPUID_0000_0001: + do_cpuid(1, regs); + /* * Override the APIC ID only in ebx */ - regs[1] &= ~(CPUID_0000_0001_APICID_MASK); - /* - * XXX fixme for MP case, set apicid properly for cpu. - */ - regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* * Don't expose VMX, SpeedStep or TME capability. - * Advertise x2APIC capability. + * Advertise x2APIC capability and Hypervisor guest. */ - regs[2] &= ~(CPUID_0000_0001_FEAT0_VMX | CPUID2_EST | - CPUID2_TM2); - regs[2] |= CPUID2_X2APIC; + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] |= CPUID2_X2APIC | CPUID2_HV; /* * Hide thermal monitoring @@ -93,6 +115,21 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) */ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + /* + * Disable multi-core. + */ + regs[1] &= ~CPUID_HTT_CORES; + regs[3] &= ~CPUID_HTT; + break; + + case CPUID_0000_0004: + do_cpuid(4, regs); + + /* + * Do not expose topology. + */ + regs[0] &= 0xffff8000; + regs[0] |= 0x04008000; break; case CPUID_0000_0006: @@ -108,16 +145,22 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) case CPUID_0000_000B: /* - * XXXSMP fixme * Processor topology enumeration */ regs[0] = 0; regs[1] = 0; regs[2] = *ecx & 0xff; - regs[3] = 0; + regs[3] = vcpu_id; break; + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id, ®s[2], 4); + bcopy(bhyve_id, ®s[3], 4); + break; default: + /* XXX: Leaf 5? */ return (0); } @@ -127,4 +170,3 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) *edx = regs[3]; return (1); } - diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index b437d61..d672831 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -58,6 +58,6 @@ #define CPUID_0000_0001_FEAT0_VMX (1<<5) int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx); + uint32_t *edx, uint32_t vcpu_id); #endif -- cgit v1.1 From 8c8399924a8001cc92bcb6bf2fb8f959ed5bc722 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 6 Jul 2011 21:40:48 +0000 Subject: Get rid of redundant initialization of 'dmask'. It was being re-initialized shortly afterwards. 
--- sys/amd64/vmm/io/vlapic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 0a14127..cf7cb0d 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -443,8 +443,6 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) cpuset_t dmask; uint32_t dest, vec, mode; - CPU_ZERO(&dmask); - dest = icrval >> 32; vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; -- cgit v1.1 From 6e4718b6d1bec5c2ecfc16c6fb78a1ceead6a735 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 26 Sep 2011 07:05:40 +0000 Subject: Kernel configuration for a bhyve guest. --- sys/amd64/conf/BHYVE | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 sys/amd64/conf/BHYVE (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE new file mode 100644 index 0000000..de36445 --- /dev/null +++ b/sys/amd64/conf/BHYVE @@ -0,0 +1,345 @@ +# +# BHYVE -- Kernel configuration file for FreeBSD/amd64 bhyve guest +# +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD: projects/bhyve/sys/amd64/conf/GENERIC 221914 2011-05-14 20:35:01Z jhb $ + +cpu HAMMER +ident BHYVE + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_ULE # ULE scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Control Transmission Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MD_ROOT # MD is a potential root device +options NFSCL # New Network Filesystem Client +options NFSD # New Network Filesystem Server +options NFSLOCKD # Network Lock Manager +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_PART_GPT # GUID Partition Tables. 
+options GEOM_LABEL # Provides labelization +options COMPAT_FREEBSD32 # Compatible with i386 binaries +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options COMPAT_FREEBSD7 # Compatible with FreeBSD7 +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options STACK # stack(9) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +#options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options AUDIT # Security event auditing +options MAC # TrustedBSD MAC Framework +#options KDTRACE_FRAME # Ensure frames are compiled in +#options KDTRACE_HOOKS # Kernel DTrace hooks +options INCLUDE_CONFIG_FILE # Include this file in kernel + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. +options DEADLKRES # Enable the deadlock resolver +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones + +# Make an SMP-capable kernel by default +options SMP # Symmetric MultiProcessor Kernel + +# CPU frequency control +#device cpufreq + +# Bus support. +#device acpi +device pci + +# Floppy drives +#device fdc + +# ATA controllers +#device ahci # AHCI-compatible SATA controllers +#device ata # Legacy ATA/SATA controllers +#options ATA_CAM # Handle legacy controllers with CAM +#options ATA_STATIC_ID # Static device numbering +#device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA +#device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA + +# SCSI Controllers +#device ahc # AHA2940 and onboard AIC7xxx devices +#options AHC_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~128k to driver. +#device ahd # AHA39320/29320 and onboard AIC79xx devices +#options AHD_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~215k to driver. +#device amd # AMD 53C974 (Tekram DC-390(T)) +#device hptiop # Highpoint RocketRaid 3xxx series +#device isp # Qlogic family +#device ispfw # Firmware for QLogic HBAs- normally a module +#device mpt # LSI-Logic MPT-Fusion +#device mps # LSI-Logic MPT-Fusion 2 +#device ncr # NCR/Symbios Logic +#device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') +#device trm # Tekram DC395U/UW/F DC315U adapters + +#device adv # Advansys SCSI adapters +#device adw # Advansys wide SCSI adapters +#device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. 
+#device bt # Buslogic/Mylex MultiMaster SCSI adapters + +# ATA/SCSI peripherals +#device scbus # SCSI bus (required for ATA/SCSI) +#device ch # SCSI media changers +#device da # Direct Access (disks) +#device sa # Sequential Access (tape etc) +#device cd # CD +#device pass # Passthrough device (direct ATA/SCSI access) +#device ses # SCSI Environmental Services (and SAF-TE) + +# RAID controllers interfaced to the SCSI subsystem +#device amr # AMI MegaRAID +#device arcmsr # Areca SATA II RAID +#XXX it is not 64-bit clean, -scottl +#device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID +#device ciss # Compaq Smart RAID 5* +#device dpt # DPT Smartcache III, IV - See NOTES for options +#device hptmv # Highpoint RocketRAID 182x +#device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx +#device iir # Intel Integrated RAID +#device ips # IBM (Adaptec) ServeRAID +#device mly # Mylex AcceleRAID/eXtremeRAID +#device twa # 3ware 9000 series PATA/SATA RAID + +# RAID controllers +#device aac # Adaptec FSA RAID +#device aacp # SCSI passthrough for aac (requires CAM) +#device ida # Compaq Smart RAID +#device mfi # LSI MegaRAID SAS +#device mlx # Mylex DAC960 family +#XXX pointer/int warnings +#device pst # Promise Supertrak SX6000 +#device twe # 3ware ATA RAID + +# atkbdc0 controls both the keyboard and the PS/2 mouse +#device atkbdc # AT keyboard controller +#device atkbd # AT keyboard +#device psm # PS/2 mouse + +#device kbdmux # keyboard multiplexer + +#device vga # VGA video card driver + +#device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console +#device sc +#options SC_PIXEL_MODE # add support for the raster text mode + +#device agp # support several AGP chipsets + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +#device cbb # cardbus (yenta) bridge +#device pccard # PC Card (16-bit) bus +#device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +#device uart # Generic UART driver + +# Parallel port +#device ppc +#device ppbus # Parallel port bus (required) +#device lpt # Printer +#device plip # TCP/IP over parallel +#device ppi # Parallel port interface device +#device vpo # Requires scbus and da + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to sio, uart and/or ppc drivers): +#device puc + +# PCI Ethernet NICs. +#device bxe # Broadcom BCM57710/BCM57711/BCM57711E 10Gb Ethernet +#device de # DEC/Intel DC21x4x (``Tulip'') +#device em # Intel PRO/1000 Gigabit Ethernet Family +#device igb # Intel PRO/1000 PCIE Server Gigabit Family +#device ixgbe # Intel PRO/10GbE PCIE Ethernet Family +#device le # AMD Am7900 LANCE and Am79C9xx PCnet +#device ti # Alteon Networks Tigon I/II gigabit Ethernet +#device txp # 3Com 3cR990 (``Typhoon'') +#device vx # 3Com 3c590, 3c595 (``Vortex'') + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
+#device miibus # MII bus support +#device ae # Attansic/Atheros L2 FastEthernet +#device age # Attansic/Atheros L1 Gigabit Ethernet +#device alc # Atheros AR8131/AR8132 Ethernet +#device ale # Atheros AR8121/AR8113/AR8114 Ethernet +#device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet +#device bfe # Broadcom BCM440x 10/100 Ethernet +#device bge # Broadcom BCM570xx Gigabit Ethernet +#device dc # DEC/Intel 21143 and various workalikes +#device et # Agere ET1310 10/100/Gigabit Ethernet +#device fxp # Intel EtherExpress PRO/100B (82557, 82558) +#device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet +#device lge # Level 1 LXT1001 gigabit Ethernet +#device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet +#device nfe # nVidia nForce MCP on-board Ethernet +#device nge # NatSemi DP83820 gigabit Ethernet +#device nve # nVidia nForce MCP on-board Ethernet Networking +#device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') +#device re # RealTek 8139C+/8169/8169S/8110S +#device rl # RealTek 8129/8139 +#device sf # Adaptec AIC-6915 (``Starfire'') +#device sge # Silicon Integrated Systems SiS190/191 +#device sis # Silicon Integrated Systems SiS 900/SiS 7016 +#device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet +#device ste # Sundance ST201 (D-Link DFE-550TX) +#device stge # Sundance/Tamarack TC9021 gigabit Ethernet +#device tl # Texas Instruments ThunderLAN +#device tx # SMC EtherPower II (83c170 ``EPIC'') +#device vge # VIA VT612x gigabit Ethernet +#device vr # VIA Rhine, Rhine II +#device wb # Winbond W89C840F +#device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') + +# ISA Ethernet NICs. pccard NICs included. +#device cs # Crystal Semiconductor CS89x0 NIC +# 'device ed' requires 'device miibus' +#device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards +#device ex # Intel EtherExpress Pro/10 and Pro/10+ +#device ep # Etherlink III based cards +#device fe # Fujitsu MB8696x based cards +#device sn # SMC's 9000 series of Ethernet chips +#device xe # Xircom pccard Ethernet + +# Wireless NIC cards +#device wlan # 802.11 support +#options IEEE80211_DEBUG # enable debug msgs +#options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's +#options IEEE80211_SUPPORT_MESH # enable 802.11s draft support +#device wlan_wep # 802.11 WEP support +#device wlan_ccmp # 802.11 CCMP support +#device wlan_tkip # 802.11 TKIP support +#device wlan_amrr # AMRR transmit rate control algorithm +#device an # Aironet 4500/4800 802.11 wireless NICs. +#device ath # Atheros NIC's +#device ath_pci # Atheros pci/cardbus glue +#device ath_hal # pci/cardbus chip support +#options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors +#device ath_rate_sample # SampleRate tx rate control for ath +#device bwi # Broadcom BCM430x/BCM431x wireless NICs. +#device bwn # Broadcom BCM43xx wireless NICs. +#device ipw # Intel 2100 wireless NICs. +#device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. +#device iwn # Intel 4965/1000/5000/6000 wireless NICs. +#device malo # Marvell Libertas wireless NICs. +#device mwl # Marvell 88W8363 802.11n wireless NICs. +#device ral # Ralink Technology RT2500 wireless NICs. +#device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. +#device wpi # Intel 3945ABG wireless NICs. + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device vlan # 802.1Q VLAN support +device tun # Packet tunnel. 
+device pty # BSD-style compatibility pseudo ttys +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + +# USB support +#options USB_DEBUG # enable debug msgs +#device uhci # UHCI PCI->USB interface +#device ohci # OHCI PCI->USB interface +#device ehci # EHCI PCI->USB interface (USB 2.0) +#device usb # USB Bus (required) +#device udbp # USB Double Bulk Pipe devices (needs netgraph) +#device uhid # "Human Interface Devices" +#device ukbd # Keyboard +#device ulpt # Printer +#device umass # Disks/Mass storage - Requires scbus and da +#device ums # Mouse +#device urio # Diamond Rio 500 MP3 player +# USB Serial devices +#device u3g # USB-based 3G modems (Option, Huawei, Sierra) +#device uark # Technologies ARK3116 based serial adapters +#device ubsa # Belkin F5U103 and compatible serial adapters +#device uftdi # For FTDI usb serial adapters +#device uipaq # Some WinCE based devices +#device uplcom # Prolific PL-2303 serial adapters +#device uslcom # SI Labs CP2101/CP2102 serial adapters +#device uvisor # Visor and Palm devices +#device uvscom # USB serial support for DDI pocket's PHS +# USB Ethernet, requires miibus +#device aue # ADMtek USB Ethernet +#device axe # ASIX Electronics USB Ethernet +#device cdce # Generic USB over Ethernet +#device cue # CATC USB Ethernet +#device kue # Kawasaki LSI USB Ethernet +#device rue # RealTek RTL8150 USB Ethernet +#device udav # Davicom DM9601E USB +# USB Wireless +#device rum # Ralink Technology RT2501USB wireless NICs +#device run # Ralink Technology RT2700/RT2800/RT3000 NICs. +#device uath # Atheros AR5523 wireless NICs +#device upgt # Conexant/Intersil PrismGT wireless NICs. +#device ural # Ralink Technology RT2500USB wireless NICs +#device urtw # Realtek RTL8187B/L wireless NICs +#device zyd # ZyDAS zb1211/zb1211b wireless NICs + +# FireWire support +#device firewire # FireWire bus code +#device sbp # SCSI over FireWire (Requires scbus and da) +#device fwe # Ethernet over FireWire (non-standard!) +#device fwip # IP over FireWire (RFC 2734,3146) +#device dcons # Dumb console driver +#device dcons_crom # Configuration ROM for dcons + +device bvmconsole # brain dead simple bvm console +device bvmdebug # brain dead simple bvm gdb pipe + +device mptable +options NKPT=256 -- cgit v1.1 From d08191b4175ebda3e5ac2fabbe62e2bdf139a201 Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 24 Dec 2011 19:39:02 +0000 Subject: Add support for running as a nested hypervisor under VMWare Fusion, on systems with VT-x/EPT (e.g. Sandybridge Macbooks). This will most likely work on VMWare Workstation8/Player4 as well. See the VMWare app note at: http://communities.vmware.com/docs/DOC-8970 Fusion doesn't propagate the PAT MSR auto save-restore entry/exit control bits. Deal with this by noting that fact and setting up the PAT MSR to essentially be a no-op - it is init'd to power-on default, and a software shadow copy maintained. Since it is treated as a no-op, o/s settings are essentially ignored. This may not give correct results, but since the hypervisor is running nested, a number of bets are already off. On a quad-core/HT-enabled 'MacBook8,2', nested VMs with 1/2/4 vCPUs were fired up. 
The more nested vCPUs the worse the performance, unless the VMs were started up in multiplexed mode where things worked perfectly up to the limit of 8 vCPUs. Reviewed by: neel --- sys/amd64/vmm/intel/vmx.c | 72 +++++++++++++++++++++++++++++++++++------------ sys/amd64/vmm/vmm_msr.c | 37 ++++++++++++++++++++++++ sys/amd64/vmm/vmm_msr.h | 1 + 3 files changed, 92 insertions(+), 18 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 73d60c2..f8d5833 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -85,17 +86,22 @@ __FBSDID("$FreeBSD$"); #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 -#define VM_EXIT_CTLS_ONE_SETTING \ +#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ (VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT | \ VM_EXIT_LOAD_EFER) + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS +#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER + #define VM_ENTRY_CTLS_ONE_SETTING \ - (VM_ENTRY_LOAD_PAT | \ - VM_ENTRY_LOAD_EFER) + (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ + VM_ENTRY_LOAD_PAT) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_INTO_SMM | \ @@ -122,6 +128,8 @@ static uint64_t cr4_ones_mask, cr4_zeros_mask; static volatile u_int nextvpid; +static int vmx_no_patmsr; + /* * Virtual NMI blocking conditions. * @@ -476,16 +484,39 @@ vmx_init(void) VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { - printf("vmx_init: processor does not support desired " - "exit controls\n"); - return (error); + /* Try again without the PAT MSR bits */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, + MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING_NO_PAT, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } else { + if (bootverbose) + printf("vmm: PAT MSR access not supported\n"); + guest_msr_valid(MSR_PAT); + vmx_no_patmsr = 1; + } } /* Check support for VM-entry controls */ - error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, - VM_ENTRY_CTLS_ONE_SETTING, - VM_ENTRY_CTLS_ZERO_SETTING, - &entry_ctls); + if (!vmx_no_patmsr) { + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, + MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + } else { + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, + MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + } + if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); @@ -646,18 +677,23 @@ vmx_vminit(struct vm *vm) * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. - * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. 
*/ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_KGSBASE) || - guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT)) + guest_msr_rw(vmx, MSR_EFER)) panic("vmx_vminit: error setting guest msr access"); + /* + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. However, if running on a system with no + * MSR_PAT save/restore support, leave access disabled so accesses + * will be trapped. + */ + if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest pat msr access"); + for (i = 0; i < VM_MAXCPU; i++) { vmx->vmcs[i].identifier = vmx_revision(); error = vmclear(&vmx->vmcs[i]); diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index 99ac293..31bfcab 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #define VMM_MSR_F_EMULATE 0x01 #define VMM_MSR_F_READONLY 0x02 +#define VMM_MSR_F_INVALID 0x04 struct vmm_msr { int num; @@ -54,6 +55,7 @@ static struct vmm_msr vmm_msr[] = { { MSR_CSTAR, 0 }, { MSR_STAR, 0 }, { MSR_SF_MASK, 0 }, + { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID }, { MSR_APICBASE, VMM_MSR_F_EMULATE }, { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, @@ -68,6 +70,9 @@ CTASSERT(VMM_MSR_NUM >= vmm_msr_num); #define emulated_msr(idx) \ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) +#define invalid_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0) + void vmm_msr_init(void) { @@ -108,6 +113,16 @@ guest_msrs_init(struct vm *vm, int cpu) if (cpu == 0) guest_msrs[i] |= APICBASE_BSP; break; + case MSR_PAT: + guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + break; default: panic("guest_msrs_init: missing initialization for msr " "0x%0x", vmm_msr[i].num); @@ -165,6 +180,9 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) if (idx < 0) goto done; + if (invalid_msr(idx)) + goto done; + if (!readonly_msr(idx)) { guest_msrs = vm_guest_msrs(vm, cpu); @@ -206,6 +224,9 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) if (idx < 0) goto done; + if (invalid_msr(idx)) + goto done; + guest_msrs = vm_guest_msrs(vm, cpu); result = guest_msrs[idx]; @@ -263,3 +284,19 @@ restore_host_msrs(struct vm *vm, int cpu) wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); } } + +/* + * Must be called by the CPU-specific code before any guests are + * created + */ +void +guest_msr_valid(int msr) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (vmm_msr[i].num == msr && invalid_msr(i)) { + vmm_msr[i].flags &= ~VMM_MSR_F_INVALID; + } + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h index 1e15787..8a1fda3 100644 --- a/sys/amd64/vmm/vmm_msr.h +++ b/sys/amd64/vmm/vmm_msr.h @@ -36,6 +36,7 @@ void vmm_msr_init(void); int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); void guest_msrs_init(struct vm *vm, int cpu); +void guest_msr_valid(int msr); void restore_host_msrs(struct vm *vm, int cpu); void restore_guest_msrs(struct vm *vm, int cpu); -- cgit v1.1 From bd54a55c5a9baae09ba63f311e15c0d30f9ca683 Mon Sep 17 00:00:00 2001 From: emaste Date: Tue, 6 Mar 2012 21:13:12 +0000 
Subject: Remove duplicated license text. --- sys/amd64/vmm/io/vlapic.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index cf7cb0d..61adef9 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -26,34 +26,6 @@ * $FreeBSD$ */ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - #include __FBSDID("$FreeBSD$"); -- cgit v1.1 From 9f0c999f8126597eb572b80056df88335dbd0070 Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 28 Apr 2012 16:28:00 +0000 Subject: MSI-x interrupt support for PCI pass-thru devices. Includes instruction emulation for memory r/w access. This opens the door for io-apic, local apic, hpet timer, and legacy device emulation. Submitted by: ryan dot berryhill at sandvine dot com Reviewed by: grehan Obtained from: Sandvine --- sys/amd64/include/vmm.h | 6 +- sys/amd64/include/vmm_dev.h | 14 ++++ sys/amd64/vmm/intel/vmcs.h | 1 + sys/amd64/vmm/intel/vmx.c | 4 + sys/amd64/vmm/io/ppt.c | 181 +++++++++++++++++++++++++++++++++++++++++++- sys/amd64/vmm/io/ppt.h | 3 +- sys/amd64/vmm/io/vlapic.c | 1 + sys/amd64/vmm/vmm_dev.c | 9 +++ 8 files changed, 213 insertions(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 26646fb..1ad01c6 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -227,7 +227,8 @@ enum vm_exitcode { VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, - VM_EXITCODE_MAX, + VM_EXITCODE_PAGING, + VM_EXITCODE_MAX }; struct vm_exit { @@ -243,6 +244,9 @@ struct vm_exit { uint16_t port; uint32_t eax; /* valid for out */ } inout; + struct { + uint64_t cr3; + } paging; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. 
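
The new VM_EXITCODE_PAGING exit defined above carries only the guest %cr3 for now. A consumer of the exit structure would dispatch on it roughly as in the sketch below; the handler name is hypothetical, and the real handling of nested-paging faults lives outside this diff.

    /* Sketch only: handle_paging_fault() is a made-up name. */
    switch (vmexit->exitcode) {
    case VM_EXITCODE_PAGING:
            handle_paging_fault(vm, vcpu, vmexit->u.paging.cr3);
            break;
    default:
            break;
    }
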
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 1b143b5..d1a50d6 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -108,6 +108,17 @@ struct vm_pptdev_msi { int destcpu; }; +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint32_t msg; + uint32_t vector_control; + uint64_t addr; +}; + struct vm_nmi { int cpuid; }; @@ -143,6 +154,7 @@ enum { IOCNUM_UNBIND_PPTDEV, IOCNUM_MAP_PPTDEV_MMIO, IOCNUM_PPTDEV_MSI, + IOCNUM_PPTDEV_MSIX, IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, @@ -182,6 +194,8 @@ enum { _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index be2f29c..a7cf4f6 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #endif /* _KERNEL */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index f8d5833..4bbcea8 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) case EXIT_REASON_CPUID: handled = vmx_handle_cpuid(vcpu, vmxctx); break; + case EXIT_REASON_EPT_FAULT: + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.cr3 = vmcs_guest_cr3(); + break; default: break; } diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fcb36ad..ace2877 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$"); #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 +MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); + struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; - int msg; + int vec; + int vcpu; }; static struct pptdev { @@ -75,6 +79,16 @@ static struct pptdev { void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; + + struct { + int num_msgs; + int startrid; + int msix_table_rid; + struct resource *msix_table_res; + struct resource **res; + void **cookie; + struct pptintr_arg *arg; + } msix; } pptdevs[32]; static int num_pptdevs; @@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt) ppt->msi.num_msgs = 0; } +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + int rid; + struct resource *res; + void *cookie; + + rid = ppt->msix.startrid + idx; + res = ppt->msix.res[idx]; + cookie = ppt->msix.cookie[idx]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msix.res[idx] = NULL; + ppt->msix.cookie[idx] = NULL; +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + int i, error; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.msix_table_res) { + bus_release_resource(ppt->dev, SYS_RES_MEMORY, 
+ ppt->msix.msix_table_rid, + ppt->msix.msix_table_res); + ppt->msix.msix_table_res = NULL; + ppt->msix.msix_table_rid = 0; + } + + free(ppt->msix.res, M_PPTMSIX); + free(ppt->msix.cookie, M_PPTMSIX); + free(ppt->msix.arg, M_PPTMSIX); + + error = pci_release_msi(ppt->dev); + if (error) + printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); + + ppt->msix.num_msgs = 0; +} + int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { @@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func) return (EBUSY); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); ppt->vm = NULL; return (0); @@ -309,10 +375,10 @@ pptintr(void *arg) pptarg = arg; ppt = pptarg->pptdev; - vec = ppt->msi.vector + pptarg->msg; + vec = pptarg->vec; if (ppt->vm != NULL) - (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec); else { /* * XXX @@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, break; ppt->msi.arg[i].pptdev = ppt; - ppt->msi.arg[i].msg = i; + ppt->msi.arg[i].vec = vector + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, @@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, return (0); } + +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) +{ + struct pptdev *ppt; + struct pci_devinfo *dinfo; + int numvec, vector_count, rid, error; + size_t res_size, cookie_size, arg_size; + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + dinfo = device_get_ivars(ppt->dev); + if (!dinfo) + return (ENXIO); + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (!ppt->msix.msix_table_res) { + ppt->msix.res = NULL; + ppt->msix.cookie = NULL; + ppt->msix.arg = NULL; + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (ppt->msix.msix_table_res == NULL) + return (ENOSPC); + + ppt->msix.msix_table_rid = rid; + + vector_count = numvec = pci_msix_count(ppt->dev); + + error = pci_alloc_msix(ppt->dev, &numvec); + if (error) + return (error); + else if (vector_count != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.startrid = 1; + + res_size = numvec * sizeof(ppt->msix.res[0]); + cookie_size = numvec * sizeof(ppt->msix.cookie[0]); + arg_size = numvec * sizeof(ppt->msix.arg[0]); + + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); + if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || + ppt->msix.arg == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); + } + bzero(ppt->msix.res, res_size); + bzero(ppt->msix.cookie, cookie_size); + bzero(ppt->msix.arg, arg_size); + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + /* Allocate the IRQ resource */ + ppt->msix.cookie[idx] = NULL; + rid = ppt->msix.startrid + idx; + ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, 
RF_ACTIVE); + if (ppt->msix.res[idx] == NULL) + return (ENXIO); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].vec = msg; + ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF; + + /* Setup the MSI-X interrupt */ + error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msix.arg[idx], + &ppt->msix.cookie[idx]); + + if (error != 0) { + bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); + ppt->msix.cookie[idx] = NULL; + ppt->msix.res[idx] = NULL; + return (ENXIO); + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + + return (0); +} + diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h index 95f3ad0..63c8228 100644 --- a/sys/amd64/vmm/io/ppt.h +++ b/sys/amd64/vmm/io/ppt.h @@ -36,5 +36,6 @@ int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec); - +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 61adef9..6c91128 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid) void vlapic_cleanup(struct vlapic *vlapic) { + vlapic_op_halt(vlapic); vdev_unregister(vlapic); free(vlapic, M_VLAPIC); } diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 8f124a5..571c37c 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; + struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; @@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, pptmsi->destcpu, pptmsi->vector, pptmsi->numvec); break; + case VM_PPTDEV_MSIX: + pptmsix = (struct vm_pptdev_msix *)data; + error = ppt_setup_msix(sc->vm, pptmsix->vcpu, + pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, + pptmsix->msg, pptmsix->vector_control, + pptmsix->addr); + break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, -- cgit v1.1 From fc13a01d538ded0843702a871a58cba4147b6037 Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 3 May 2012 05:04:37 +0000 Subject: Until the issue of how to handle guest XCR0 state is resolved, prevent CURRENT guests from hitting unhandled xsetbv exits by hiding the xsave/osxsave/avx cpuid2 bits. 
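Illustrative only, not part of this commit: a minimal guest-side sketch of what the masking means in practice. With the bits below cleared, CPUID.1:ECX reports XSAVE, OSXSAVE and AVX as absent inside the VM; the bit positions used (26, 27, 28) are the standard architectural ones, everything else here is just an example program.

    /* Guest-side probe sketch; not part of the commit. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t eax = 1, ebx, ecx, edx;

            __asm__ __volatile__("cpuid"
                : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));

            /* CPUID.1:ECX bits 26/27/28 are XSAVE/OSXSAVE/AVX. */
            printf("xsave=%u osxsave=%u avx=%u\n",
                (ecx >> 26) & 1, (ecx >> 27) & 1, (ecx >> 28) & 1);
            return (0);
    }

Run inside a bhyve guest, all three values should print as 0 while this workaround is in place.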
--- sys/amd64/vmm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 93c21d7..669fa4b 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -105,6 +105,13 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, regs[2] |= CPUID2_X2APIC | CPUID2_HV; /* + * Hide xsave/osxsave/avx until the FPU save/restore + * issues are resolved + */ + regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | + CPUID2_AVX); + + /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); -- cgit v1.1 From add4e182f64694d0cd5926bbe0e8008e85eccece Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 25 Jul 2012 00:21:16 +0000 Subject: Verify that VMX operation has been enabled by BIOS before executing the VMXON instruction. Reported by "s vas" on freebsd-virtualization@ --- sys/amd64/vmm/intel/vmx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 4bbcea8..df28fe9 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -427,7 +427,7 @@ static int vmx_init(void) { int error; - uint64_t fixed0, fixed1; + uint64_t fixed0, fixed1, feature_control; uint32_t tmp; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ @@ -436,6 +436,16 @@ vmx_init(void) return (ENXIO); } + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). + */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & 0x5) != 0x5) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, -- cgit v1.1 From d40b98f60b39af7a76f82076ee5e53f389dc3cba Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 4 Aug 2012 02:06:55 +0000 Subject: Force certain bits in %cr4 to be hard-wired to '1' or '0' from a guest's perspective. If we don't do this some guest OSes (e.g. Linux) will reset the CR4_VMXE bit in %cr4 with disastrous consequences. 
Reported by: grehan --- sys/amd64/vmm/intel/vmx.c | 68 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index df28fe9..be58444 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -627,23 +627,38 @@ vmx_vpid(void) } static int -vmx_setup_cr0_shadow(struct vmcs *vmcs) +vmx_setup_cr_shadow(int which, struct vmcs *vmcs) { - int error; - uint64_t mask, shadow; + int error, mask_ident, shadow_ident; + uint64_t mask_value, shadow_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + shadow_value = cr0_ones_mask; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + shadow_value = cr4_ones_mask; + } - mask = cr0_ones_mask | cr0_zeros_mask; - error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); - shadow = cr0_ones_mask; - error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); if (error) return (error); return (0); } +#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) +#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) static void * vmx_vminit(struct vm *vm) @@ -744,6 +759,12 @@ vmx_vminit(struct vm *vm) panic("vmcs_set_msr_save error %d", error); error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); } return (vmx); @@ -1031,12 +1052,16 @@ cantinject: static int vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { - int error; - uint64_t regval; + int error, cr, vmcs_guest_cr; + uint64_t regval, ones_mask, zeros_mask; const struct vmxctx *vmxctx; - /* We only handle mov to %cr0 at this time */ - if ((exitqual & 0xff) != 0x00) + /* We only handle mov to %cr0 or %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + cr = exitqual & 0xf; + if (cr != 0 && cr != 4) return (UNHANDLED); vmxctx = &vmx->ctx[vcpu]; @@ -1100,11 +1125,22 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) break; } - regval |= cr0_ones_mask; - regval &= ~cr0_zeros_mask; - error = vmwrite(VMCS_GUEST_CR0, regval); - if (error) - panic("vmx_emulate_cr_access: error %d writing cr0", error); + if (cr == 0) { + ones_mask = cr0_ones_mask; + zeros_mask = cr0_zeros_mask; + vmcs_guest_cr = VMCS_GUEST_CR0; + } else { + ones_mask = cr4_ones_mask; + zeros_mask = cr4_zeros_mask; + vmcs_guest_cr = VMCS_GUEST_CR4; + } + regval |= ones_mask; + regval &= ~zeros_mask; + error = vmwrite(vmcs_guest_cr, regval); + if (error) { + panic("vmx_emulate_cr_access: error %d writing cr%d", + error, cr); + } return (HANDLED); } -- cgit v1.1 From 66c8120152f661ab4690b86ac87beeb00cc887e5 Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 4 Aug 2012 04:30:26 +0000 Subject: Include 'device uart' in the guest kernel. 
--- sys/amd64/conf/BHYVE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE index de36445..89c8ea2 100644 --- a/sys/amd64/conf/BHYVE +++ b/sys/amd64/conf/BHYVE @@ -176,7 +176,7 @@ device pci #device cardbus # CardBus (32-bit) bus # Serial (COM) ports -#device uart # Generic UART driver +device uart # Generic UART driver # Parallel port #device ppc -- cgit v1.1 From 6c5ad005bed33e80c94460b6694d199348dac472 Mon Sep 17 00:00:00 2001 From: grehan Date: Sun, 26 Aug 2012 01:41:41 +0000 Subject: Add sysctls to display the total and free amount of hard-wired mem for VMs # sysctl hw.vmm hw.vmm.mem_free: 2145386496 hw.vmm.mem_total: 2145386496 Submitted by: Takeshi HASEGAWA hasegaw at gmail com --- sys/amd64/vmm/vmm_dev.c | 19 +++++++++++++++++++ sys/amd64/vmm/vmm_mem.c | 24 ++++++++++++++++++++++++ sys/amd64/vmm/vmm_mem.h | 3 +++ 3 files changed, 46 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 571c37c..116b5f1 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include "vmm_lapic.h" #include "vmm_stat.h" +#include "vmm_mem.h" #include "io/ppt.h" #include @@ -458,6 +459,24 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); +static int +sysctl_vmm_mem_total(SYSCTL_HANDLER_ARGS) +{ + size_t val = vmm_mem_get_mem_total(); + return sysctl_handle_long(oidp, &val, 0, req); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_total, CTLTYPE_LONG | CTLFLAG_RD, + 0, 0, sysctl_vmm_mem_total, "LU", "Amount of Total memory"); + +static int +sysctl_vmm_mem_free(SYSCTL_HANDLER_ARGS) +{ + size_t val = vmm_mem_get_mem_free(); + return sysctl_handle_long(oidp, &val, 0, req); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_free, CTLTYPE_LONG | CTLFLAG_RD, + 0, 0, sysctl_vmm_mem_free, "LU", "Amount of Free memory"); + void vmmdev_init(void) { diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 764a6e9..54f98ac 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -63,6 +63,7 @@ static struct { } vmm_mem_avail[VMM_MEM_MAXSEGS]; static int vmm_mem_nsegs; +size_t vmm_mem_total_bytes; static vm_paddr_t maxaddr; @@ -96,6 +97,7 @@ vmm_mem_steal_memory(void) smapsize = *((uint32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + vmm_mem_total_bytes = 0; nsegs = 0; for (smap = smapbase; smap < smapend; smap++) { /* @@ -131,6 +133,7 @@ vmm_mem_steal_memory(void) vmm_mem_avail[nsegs].base = base; vmm_mem_avail[nsegs].length = length; + vmm_mem_total_bytes += length; if (base + length > maxaddr) maxaddr = base + length; @@ -344,6 +347,27 @@ vmm_mem_alloc(size_t size) return (addr); } +size_t +vmm_mem_get_mem_total(void) +{ + return vmm_mem_total_bytes; +} + +size_t +vmm_mem_get_mem_free(void) +{ + size_t length = 0; + int i; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + length += vmm_mem_avail[i].length; + } + mtx_unlock(&vmm_mem_mtx); + + return(length); +} + void vmm_mem_free(vm_paddr_t base, size_t length) { diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index ef1bf1a..a83e9be 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -35,4 +35,7 @@ void vmm_mem_free(vm_paddr_t start, size_t size); vm_paddr_t vmm_mem_maxaddr(void); void vmm_mem_dump(void); +size_t vmm_mem_get_mem_total(void); +size_t 
vmm_mem_get_mem_free(void); + #endif -- cgit v1.1 From c0caea8c2fc75a9ca5f5a67dd11462ef6542afc2 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 21 Sep 2012 03:09:23 +0000 Subject: Restructure the x2apic access code in preparation for supporting memory mapped access to the local apic. The vlapic code is now aware of the mode that the guest is using to access the local apic. Reviewed by: grehan@ --- sys/amd64/vmm/io/vlapic.c | 34 +++++++++++++++++++-- sys/amd64/vmm/io/vlapic.h | 3 ++ sys/amd64/vmm/vmm_lapic.c | 78 +++++++++++++++++++++++++++++++++++++++-------- sys/amd64/vmm/vmm_lapic.h | 6 ++-- sys/amd64/vmm/vmm_msr.c | 46 +++------------------------- 5 files changed, 108 insertions(+), 59 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 6c91128..f1d363f 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -86,6 +87,8 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define VLAPIC_VERSION (16) #define VLAPIC_MAXLVT_ENTRIES (5) +#define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) + struct vlapic { struct vm *vm; int vcpuid; @@ -107,6 +110,8 @@ struct vlapic { */ uint8_t isrvec_stk[ISRVEC_STK_SIZE]; int isrvec_stk_top; + + uint64_t msr_apicbase; }; static void @@ -161,7 +166,6 @@ vlapic_op_reset(void* dev) struct LAPIC *lapic = &vlapic->apic; memset(lapic, 0, sizeof(*lapic)); - lapic->id = vlapic->vcpuid << 24; lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); @@ -542,7 +546,10 @@ vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) switch(offset) { case APIC_OFFSET_ID: - *data = lapic->id; + if (x2apic(vlapic)) + *data = vlapic->vcpuid; + else + *data = vlapic->vcpuid << 24; break; case APIC_OFFSET_VER: *data = lapic->version; @@ -631,7 +638,6 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) switch(offset) { case APIC_OFFSET_ID: - lapic->id = data; break; case APIC_OFFSET_TPR: lapic->tpr = data & 0xff; @@ -760,6 +766,14 @@ vlapic_init(struct vm *vm, int vcpuid) vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | + APICBASE_ENABLED | + APICBASE_X2APIC; + + if (vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -782,3 +796,17 @@ vlapic_cleanup(struct vlapic *vlapic) vdev_unregister(vlapic); free(vlapic, M_VLAPIC); } + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +void +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) +{ + + vlapic->msr_apicbase = val; +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index 861ea8c..cecd4d3 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -102,4 +102,7 @@ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); void vlapic_timer_tick(struct vlapic *vlapic); +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); + #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 4aca087..13550b4 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -33,20 +33,18 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include "vmm_ipi.h" #include 
"vmm_lapic.h" #include "vlapic.h" -int -lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +static int +lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) { int handled; - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) handled = 1; else @@ -55,15 +53,11 @@ lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) return (handled); } -int -lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +static int +lapic_read(struct vlapic *vlapic, u_int offset, uint64_t *rv) { int handled; - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) handled = 1; else @@ -120,3 +114,63 @@ lapic_timer_tick(struct vm *vm, int cpu) vlapic_timer_tick(vlapic); } + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) +{ + int handled; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + handled = 1; + } else + handled = lapic_read(vlapic, x2apic_msr_to_regoff(msr), rval); + + return (handled); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) +{ + int handled; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + vlapic_set_apicbase(vlapic, val); + handled = 1; + } else + handled = lapic_write(vlapic, x2apic_msr_to_regoff(msr), val); + + return (handled); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 815b2f7..60f7696 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -31,8 +31,10 @@ struct vm; -int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); -int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); + void lapic_timer_tick(struct vm *vm, int cpu); /* diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index 31bfcab..bc67f98 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -34,7 +34,6 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include #include "vmm_lapic.h" @@ -56,7 +55,6 @@ static struct vmm_msr vmm_msr[] = { { MSR_STAR, 0 }, { MSR_SF_MASK, 0 }, { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID }, - { MSR_APICBASE, VMM_MSR_F_EMULATE }, { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, }; @@ -107,12 +105,6 @@ guest_msrs_init(struct vm *vm, int cpu) case MSR_MCG_CAP: guest_msrs[i] = 0; break; - case MSR_APICBASE: - guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | - APICBASE_X2APIC; - if (cpu == 0) - guest_msrs[i] |= APICBASE_BSP; - break; case MSR_PAT: guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | @@ -130,29 +122,6 @@ guest_msrs_init(struct vm *vm, int cpu) } } -static boolean_t -x2apic_msr(u_int num) -{ - - if (num >= 0x800 && num <= 0xBFF) - return (TRUE); - else - return (FALSE); -} - -static u_int -x2apic_msr_to_regoff(u_int msr) -{ - - return ((msr - 0x800) << 4); -} - -static 
boolean_t -x2apic_msr_id(u_int num) -{ - return (num == 0x802); -} - static int msr_num_to_idx(u_int num) { @@ -173,8 +142,8 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) handled = 0; - if (x2apic_msr(num)) - return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + if (lapic_msr(num)) + return (lapic_wrmsr(vm, cpu, num, val)); idx = msr_num_to_idx(num); if (idx < 0) @@ -208,15 +177,8 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) handled = 0; - if (x2apic_msr(num)) { - handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), - &result); - /* - * The version ID needs to be massaged - */ - if (x2apic_msr_id(num)) { - result = result >> 24; - } + if (lapic_msr(num)) { + handled = lapic_rdmsr(vm, cpu, num, &result); goto done; } -- cgit v1.1 From 34b672cc8af9ef3fbee45a3c9cc28a7e30c9ef16 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 24 Sep 2012 19:32:24 +0000 Subject: Stash the 'vm_exit' information in each 'struct vcpu'. There is no functional change at this time but this paves the way for vm exit handler functions to easily modify the exit reason going forward. --- sys/amd64/include/vmm.h | 4 ++-- sys/amd64/vmm/amd/amdv.c | 2 +- sys/amd64/vmm/intel/vmx.c | 5 ++++- sys/amd64/vmm/vmm.c | 23 ++++++++++++++++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 1ad01c6..61faf56 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -43,8 +43,7 @@ struct vlapic; typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, - struct vm_exit *vmexit); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, @@ -112,6 +111,7 @@ int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); void vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); /* * Return 1 if device indicated by bus/slot/func is supposed to be a diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 6844cc0..674337d 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm) } static int -amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +amdv_vmrun(void *arg, int vcpu, register_t rip) { printf("amdv_vmrun: not implemented\n"); diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index be58444..88f870c 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1272,19 +1272,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) } static int -vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +vmx_run(void *arg, int vcpu, register_t rip) { int error, vie, rc, handled, astpending; uint32_t exit_reason; struct vmx *vmx; struct vmxctx *vmxctx; struct vmcs *vmcs; + struct vm_exit *vmexit; vmx = arg; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vmxctx->launched = 0; + vmexit = vm_exitinfo(vmx->vm, vcpu); + /* * XXX Can we avoid doing this every time we do a vm run? 
*/ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 62cc2a2..d896f6d 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -72,6 +72,7 @@ struct vcpu { int vcpuid; struct savefpu *guestfpu; /* guest fpu state */ void *stats; + struct vm_exit exitinfo; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 @@ -110,8 +111,8 @@ static struct vmm_ops *ops; #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip, vmexit) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMRUN(vmi, vcpu, rip) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) @@ -164,6 +165,19 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->stats = vmm_stat_alloc(); } +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + static int vmm_init(void) { @@ -545,12 +559,15 @@ vm_run(struct vm *vm, struct vm_run *vmrun) restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); - error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + /* copy the exit information */ + bcopy(&vcpu->exitinfo, &vmrun->vm_exit, sizeof(struct vm_exit)); + critical_exit(); return (error); -- cgit v1.1 From c34be7b811ad199e64f66db339e7f64c773ca0a7 Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 02:33:25 +0000 Subject: Add an explicit exit code 'SPINUP_AP' to tell the controlling process that an AP needs to be activated by spinning up an execution context for it. The local apic emulation is now completely done in the hypervisor and it will detect writes to the ICR_LO register that try to bring up the AP. In response to such writes it will return to userspace with an exit code of SPINUP_AP. Reviewed by: grehan --- sys/amd64/include/vmm.h | 5 ++++ sys/amd64/vmm/intel/vmx.c | 8 +++++++ sys/amd64/vmm/io/vlapic.c | 59 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 67 insertions(+), 5 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 61faf56..e841963 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -228,6 +228,7 @@ enum vm_exitcode { VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, + VM_EXITCODE_SPINUP_AP, VM_EXITCODE_MAX }; @@ -260,6 +261,10 @@ struct vm_exit { uint32_t code; /* ecx value */ uint64_t wval; } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; } u; }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 88f870c..6689013 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1253,6 +1253,14 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; + + /* + * Special case for spinning up an AP - exit to userspace to + * give the controlling process a chance to intercept and + * spin up a thread for the AP. 
+ */ + if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) + handled = 0; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index f1d363f..9b7d3cb 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -89,6 +89,12 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + struct vlapic { struct vm *vm; int vcpuid; @@ -112,6 +118,7 @@ struct vlapic { int isrvec_stk_top; uint64_t msr_apicbase; + enum boot_state boot_state; }; static void @@ -168,6 +175,11 @@ vlapic_op_reset(void* dev) memset(lapic, 0, sizeof(*lapic)); lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ return 0; @@ -418,6 +430,8 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) int i; cpuset_t dmask; uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; dest = icrval >> 32; vec = icrval & APIC_VECTOR_MASK; @@ -452,11 +466,46 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) return (0); /* handled completely in the kernel */ } - /* - * XXX this assumes that the startup IPI always succeeds - */ - if (mode == APIC_DELMODE_STARTUP) - vm_activate_cpu(vlapic->vm, dest); + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + return (0); + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = dest; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + + /* + * XXX this assumes that the startup IPI always succeeds + */ + vlapic2->boot_state = BS_RUNNING; + vm_activate_cpu(vlapic2->vm, dest); + + return (0); + } + } /* * This will cause a return to userland. -- cgit v1.1 From ebdd69568d7fa97153aa47a86afe367476a0a1de Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 19:08:51 +0000 Subject: Add ioctls to control the X2APIC capability exposed by the virtual machine to the guest. At the moment this simply sets the state in the 'vcpu' instance but there is no code that acts upon these settings. 
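A hedged userspace sketch of how a control process could exercise the new ioctls. The struct, enum and ioctl names are taken from the diff below; the /dev/vmm/<name> device node and the <machine/vmm.h> / <machine/vmm_dev.h> header locations are assumptions about the usual bhyve layout, not something this commit establishes, and as the message notes the value is only recorded in the vcpu at this stage.

    /*
     * Sketch only: the device path and header locations are assumed,
     * not defined by this commit.
     */
    #include <sys/ioctl.h>
    #include <machine/vmm.h>        /* enum x2apic_state (assumed location) */
    #include <machine/vmm_dev.h>    /* VM_SET/GET_X2APIC_STATE (assumed location) */
    #include <fcntl.h>
    #include <stdio.h>

    int
    main(void)
    {
            struct vm_x2apic x2apic;
            int fd;

            fd = open("/dev/vmm/testvm", O_RDWR);   /* assumed device node */
            if (fd < 0) {
                    perror("open");
                    return (1);
            }

            x2apic.cpuid = 0;
            x2apic.state = X2APIC_DISABLED;
            if (ioctl(fd, VM_SET_X2APIC_STATE, &x2apic) == -1)
                    perror("VM_SET_X2APIC_STATE");

            if (ioctl(fd, VM_GET_X2APIC_STATE, &x2apic) == 0)
                    printf("vcpu0 x2apic state: %d\n", (int)x2apic.state);

            return (0);
    }
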
--- sys/amd64/include/vmm.h | 11 +++++++++++ sys/amd64/include/vmm_dev.h | 11 +++++++++++ sys/amd64/vmm/vmm.c | 27 +++++++++++++++++++++++++++ sys/amd64/vmm/vmm_dev.c | 12 ++++++++++++ 4 files changed, 61 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index e841963..0b3a29c 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -40,6 +40,8 @@ struct vm_exit; struct vm_run; struct vlapic; +enum x2apic_state; + typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ @@ -109,6 +111,8 @@ uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); void vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); @@ -205,6 +209,13 @@ enum vm_cap_type { VM_CAP_MAX }; +enum x2apic_state { + X2APIC_ENABLED, + X2APIC_AVAILABLE, + X2APIC_DISABLED, + X2APIC_STATE_LAST +}; + /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index d1a50d6..fc64fd8 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -136,6 +136,11 @@ struct vm_stat_desc { char desc[128]; /* out */ }; +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + enum { IOCNUM_RUN, IOCNUM_SET_PINNING, @@ -158,6 +163,8 @@ enum { IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, + IOCNUM_SET_X2APIC_STATE, + IOCNUM_GET_X2APIC_STATE, }; #define VM_RUN \ @@ -202,4 +209,8 @@ enum { _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d896f6d..29dbe67 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -73,6 +73,7 @@ struct vcpu { struct savefpu *guestfpu; /* guest fpu state */ void *stats; struct vm_exit exitinfo; + enum x2apic_state x2apic_state; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 @@ -163,6 +164,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); + vcpu->x2apic_state = X2APIC_ENABLED; } struct vm_exit * @@ -745,3 +747,28 @@ vcpu_stats(struct vm *vm, int vcpuid) return (vm->vcpu[vcpuid].stats); } + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state < 0 || state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + return (0); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 116b5f1..686ddec 100644 --- 
a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -163,6 +163,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; + struct vm_x2apic *x2apic; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); @@ -185,6 +186,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: + case VM_SET_X2APIC_STATE: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. @@ -335,6 +337,16 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, vmcap->captype, vmcap->capval); break; + case VM_SET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_set_x2apic_state(sc->vm, + x2apic->cpuid, x2apic->state); + break; + case VM_GET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_get_x2apic_state(sc->vm, + x2apic->cpuid, &x2apic->state); + break; default: error = ENOTTY; break; -- cgit v1.1 From bc269b51afe43aab28df7ea0d543c167bb7c7d2e Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 22:31:35 +0000 Subject: Add support for trapping MMIO writes to local apic registers and emulating them. The default behavior is still to present the local apic to the guest in the x2apic mode. --- sys/amd64/vmm/intel/vmcs.h | 10 + sys/amd64/vmm/intel/vmx.c | 74 +++++-- sys/amd64/vmm/io/vlapic.c | 39 +++- sys/amd64/vmm/vmm.c | 2 +- sys/amd64/vmm/vmm_instruction_emul.c | 385 +++++++++++++++++++++++++++++++++++ sys/amd64/vmm/vmm_instruction_emul.h | 91 +++++++++ sys/amd64/vmm/vmm_lapic.c | 71 +++++++ sys/amd64/vmm/vmm_lapic.h | 3 + sys/amd64/vmm/x86.c | 22 +- sys/amd64/vmm/x86.h | 4 +- 10 files changed, 676 insertions(+), 25 deletions(-) create mode 100644 sys/amd64/vmm/vmm_instruction_emul.c create mode 100644 sys/amd64/vmm/vmm_instruction_emul.h (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index a7cf4f6..84532f4 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -66,6 +66,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) #endif /* _KERNEL */ @@ -324,4 +325,13 @@ uint64_t vmcs_read(uint32_t encoding); */ #define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 6689013..ed0996e 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include "vmm_lapic.h" #include "vmm_msr.h" @@ -60,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "x86.h" #include "vmx_controls.h" +#include "vmm_instruction_emul.h" #define CR4_VMXE (1UL << 13) @@ -771,21 +774,17 @@ vmx_vminit(struct vm *vm) } static int -vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx) +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; - handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), - 
(uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), - (uint32_t*)(&vmxctx->guest_rdx), vcpu); -#if 0 - printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", - __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, - vmxctx->guest_rcx, vmxctx->guest_rdx, handled); -#endif - + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } @@ -1146,13 +1145,54 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) } static int +vmx_lapic_fault(struct vm *vm, int cpu, + uint64_t gpa, uint64_t rip, uint64_t cr3, uint64_t ept_qual) +{ + int read, write, handled; + + /* + * For this to be a legitimate access to the local apic: + * - the GPA in the local apic page + * - the GPA must be aligned on a 16 byte boundary + */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) + return (UNHANDLED); + + if ((gpa & 0xF) != 0) + return (UNHANDLED); + + /* EPT violation on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (UNHANDLED); + + /* EPT violation must be a read fault or a write fault but not both */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read ^ write) == 0) + return (UNHANDLED); + + /* + * The EPT violation must have been caused by accessing a guest-physical + * address that is a translation of a guest-linear address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (UNHANDLED); + } + + handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, rip, cr3); + + return (handled); +} + +static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int handled; struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual; + uint64_t qual, gpa, cr3; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1229,11 +1269,17 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); break; case EXIT_REASON_CPUID: - handled = vmx_handle_cpuid(vcpu, vmxctx); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: - vmexit->exitcode = VM_EXITCODE_PAGING; - vmexit->u.paging.cr3 = vmcs_guest_cr3(); + gpa = vmcs_gpa(); + cr3 = vmcs_guest_cr3(); + handled = vmx_lapic_fault(vmx->vm, vcpu, + gpa, vmexit->rip, cr3, qual); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.cr3 = cr3; + } break; default: break; diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9b7d3cb..aedc692 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -87,7 +87,7 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define VLAPIC_VERSION (16) #define VLAPIC_MAXLVT_ENTRIES (5) -#define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 
1 : 0) enum boot_state { BS_INIT, @@ -433,7 +433,10 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) struct vlapic *vlapic2; struct vm_exit *vmexit; - dest = icrval >> 32; + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; @@ -703,8 +706,18 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) lapic->svr = data; break; case APIC_OFFSET_ICR_LOW: + if (!x2apic(vlapic)) { + data &= 0xffffffff; + data |= (uint64_t)lapic->icr_hi << 32; + } retval = lapic_process_icr(vlapic, data); break; + case APIC_OFFSET_ICR_HI: + if (!x2apic(vlapic)) { + retval = 0; + lapic->icr_hi = data; + } + break; case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: reg = vlapic_get_lvt(vlapic, offset); if (!(lapic->svr & APIC_SVR_ENABLE)) { @@ -810,19 +823,26 @@ static struct io_region vlapic_mmio[VM_MAXCPU]; struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { + int err; + enum x2apic_state state; struct vlapic *vlapic; + err = vm_get_x2apic_state(vm, vcpuid, &state); + if (err) + panic("vlapic_set_apicbase: err %d fetching x2apic state", err); + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; - vlapic->msr_apicbase = DEFAULT_APIC_BASE | - APICBASE_ENABLED | - APICBASE_X2APIC; + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; + if (state == X2APIC_ENABLED) + vlapic->msr_apicbase |= APICBASE_X2APIC; + vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -856,6 +876,15 @@ vlapic_get_apicbase(struct vlapic *vlapic) void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) { + int err; + enum x2apic_state state; + + err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state); + if (err) + panic("vlapic_set_apicbase: err %d fetching x2apic state", err); + + if (state == X2APIC_DISABLED) + val &= ~APICBASE_X2APIC; vlapic->msr_apicbase = val; } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 29dbe67..764ffbb 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -160,11 +160,11 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; + vcpu->x2apic_state = X2APIC_ENABLED; vcpu->vlapic = vlapic_init(vm, vcpu_id); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); - vcpu->x2apic_state = X2APIC_ENABLED; } struct vm_exit * diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000..fe01d69 --- /dev/null +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -0,0 +1,385 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "vmm_instruction_emul.h" + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static void +vie_init(struct vie *vie) +{ + + bzero(vie, sizeof(struct vie)); + + vie->op_size = VIE_OP_SIZE_32BIT; + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->operand_register = VM_REG_LAST; +} + +static int +gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, + uint64_t *gpa, uint64_t *gpaend) +{ + vm_paddr_t hpa; + int nlevels, ptpshift, ptpindex; + uint64_t *ptpbase, pte, pgsize; + + /* + * XXX assumes 64-bit guest with 4 page walk levels + */ + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE); + if (hpa == -1) + goto error; + + ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa); + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) + goto error; + + if (pte & PG_PS) { + if (pgsize > 1 * GB) + goto error; + else + break; + } + + ptpphys = pte; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); + *gpaend = pte + pgsize; + return (0); + +error: + return (-1); +} + +void +vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, + struct vie *vie) +{ + int n, err; + uint64_t hpa, gpa, gpaend; + + /* + * XXX cache previously fetched instructions using 'rip' as the tag + */ + + vie_init(vie); + + /* + * Copy up to 15 bytes of the instruction stream into 'vie' + */ + while (vie->num_valid < VIE_INST_SIZE) { + err = gla2gpa(vm, rip, cr3, &gpa, &gpaend); + if (err) + break; + + n = min(VIE_INST_SIZE - vie->num_valid, gpaend - gpa); + + hpa = vm_gpa2hpa(vm, gpa, n); + if (hpa == -1) + break; + + bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n); + + rip += n; + vie->num_valid += n; + } +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + if (vie->num_processed >= vie->num_valid) 
+ panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid); + + vie->num_processed++; +} + +static int +decode_rex(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + if (x >= 0x40 && x <= 0x4F) { + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + + vie_advance(vie); + } + + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + static const uint8_t flags[256] = { + [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM, + [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG, + [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM, + }; + + if (vie_peek(vie, &x)) + return (-1); + + vie->opcode_byte = x; + vie->opcode_flags = flags[x]; + + vie_advance(vie); + + if (vie->opcode_flags == 0) + return (-1); + else + return (0); +} + +/* + * XXX assuming 32-bit or 64-bit guest + */ +static int +decode_modrm(struct vie *vie) +{ + uint8_t x; + + if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB addressing not supported yet */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + return (-1); + + vie->base_register = gpr_map[vie->rm]; + + if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG)) + vie->operand_register = gpr_map[vie->reg]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + vie->base_register = VM_REG_LAST; /* no base */ + } + break; + } + + /* calculate the operand size */ + if (vie->rex_w) + vie->op_size = VIE_OP_SIZE_64BIT; + + if (vie->opcode_flags & VIE_F_FROM_IMM) + vie->imm_bytes = 4; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int32_t signed32; + } u; + + if ((n = vie->imm_bytes) == 0) + return (0); + + if (n != 4) + panic("decode_immediate: invalid imm_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + vie->immediate = u.signed32; /* sign-extended */ + + return (0); +} + +int +vmm_decode_instruction(struct vie *vie) +{ + if (decode_rex(vie)) + 
return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + return (0); +} diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h new file mode 100644 index 0000000..94937f2 --- /dev/null +++ b/sys/amd64/vmm/vmm_instruction_emul.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +enum vie_op_size { + VIE_OP_SIZE_32BIT, /* default */ + VIE_OP_SIZE_64BIT, + VIE_OP_SIZE_8BIT +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; + + uint8_t rex_w:1, + rex_r:1, + rex_x:1, + rex_b:1; + + uint8_t mod:2, + reg:4, + rm:4; + + + uint8_t opcode_byte; + uint16_t opcode_flags; + uint8_t disp_bytes; + uint8_t imm_bytes; + + int num_valid; + int num_processed; + + enum vm_reg_name base_register; + enum vm_reg_name index_register; + enum vm_reg_name operand_register; + + int op_size; + int64_t displacement; + int64_t immediate; +}; + +#define VIE_F_HAS_MODRM (1 << 0) +#define VIE_F_FROM_RM (1 << 1) +#define VIE_F_FROM_REG (1 << 2) +#define VIE_F_TO_RM (1 << 3) +#define VIE_F_TO_REG (1 << 4) +#define VIE_F_FROM_IMM (1 << 5) + +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +struct vm; + +void vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, + struct vie *vie); + +int vmm_decode_instruction(struct vie *vie); + +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 13550b4..0d797e6 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vlapic.h" +#include "vmm_instruction_emul.h" static int lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) @@ -174,3 +175,73 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) return (handled); } + +int +lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, + uint64_t rip, uint64_t cr3) +{ + int 
handled, error; + uint64_t val; + struct vie vie; + struct vlapic *vlapic; + + const int UNHANDLED = 0; + + vlapic = vm_lapic(vm, cpu); + + vmm_fetch_instruction(vm, rip, cr3, &vie); + + if (vmm_decode_instruction(&vie) != 0) + return (UNHANDLED); + + /* Only 32-bit accesses to local apic */ + if (vie.op_size != VIE_OP_SIZE_32BIT) + return (UNHANDLED); + + /* + * XXX + * The operand register in which we store the result of the + * read must be a GPR that we can modify even if the vcpu + * is "running". All the GPRs qualify except for %rsp. + * + * This is a limitation of the vm_set_register() API + * and can be fixed if necessary. + */ + if (vie.operand_register == VM_REG_GUEST_RSP) + return (UNHANDLED); + + if (read) { + if ((vie.opcode_flags & VIE_F_TO_REG) == 0) + return (UNHANDLED); + + if (vie.operand_register >= VM_REG_LAST) + return (UNHANDLED); + + handled = lapic_read(vlapic, offset, &val); + if (handled) { + error = vm_set_register(vm, cpu, vie.operand_register, + val); + if (error) + panic("lapic_mmio: error %d setting gpr %d", + error, vie.operand_register); + } + } else { + if ((vie.opcode_flags & VIE_F_FROM_REG) && + (vie.operand_register < VM_REG_LAST)) { + error = vm_get_register(vm, cpu, vie.operand_register, + &val); + if (error) { + panic("lapic_mmio: error %d getting gpr %d", + error, vie.operand_register); + } + } else if (vie.opcode_flags & VIE_F_FROM_IMM) { + val = vie.immediate; + } else { + return (UNHANDLED); + } + + handled = lapic_write(vlapic, offset, val); + } + + return (handled); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 60f7696..7bba4e3 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -35,6 +35,9 @@ boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); +int lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, + uint64_t rip, uint64_t cr3); + void lapic_timer_tick(struct vm *vm, int cpu); /* diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 669fa4b..47ba975 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -29,13 +29,17 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include +#include #include #include #include +#include + #include "x86.h" #define CPUID_VM_HIGH 0x40000000 @@ -43,10 +47,12 @@ __FBSDID("$FreeBSD$"); static const char bhyve_id[12] = "BHyVE BHyVE "; int -x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, - uint32_t vcpu_id) +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { + int error; unsigned int func, regs[4]; + enum x2apic_state x2apic_state; func = *eax; @@ -91,6 +97,12 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, case CPUID_0000_0001: do_cpuid(1, regs); + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + /* * Override the APIC ID only in ebx */ @@ -102,7 +114,11 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, * Advertise x2APIC capability and Hypervisor guest. 
*/ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); - regs[2] |= CPUID2_X2APIC | CPUID2_HV; + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; /* * Hide xsave/osxsave/avx until the FPU save/restore diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index d672831..d19e1d8 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -57,7 +57,7 @@ */ #define CPUID_0000_0001_FEAT0_VMX (1<<5) -int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx, uint32_t vcpu_id); +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); #endif -- cgit v1.1 From 5dbc1ca26acaa3175dae7b9d0c45151fba0275ab Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 26 Sep 2012 00:06:17 +0000 Subject: Add an option "-a" to present the local apic in the XAPIC mode instead of the default X2APIC mode to the guest. --- sys/amd64/vmm/io/vlapic.c | 22 +++++++++++++--------- sys/amd64/vmm/io/vlapic.h | 3 +++ sys/amd64/vmm/vmm.c | 4 +++- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index aedc692..1e8a4e8 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -823,14 +823,8 @@ static struct io_region vlapic_mmio[VM_MAXCPU]; struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { - int err; - enum x2apic_state state; struct vlapic *vlapic; - err = vm_get_x2apic_state(vm, vcpuid, &state); - if (err) - panic("vlapic_set_apicbase: err %d fetching x2apic state", err); - vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; @@ -840,9 +834,6 @@ vlapic_init(struct vm *vm, int vcpuid) if (vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; - if (state == X2APIC_ENABLED) - vlapic->msr_apicbase |= APICBASE_X2APIC; - vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -888,3 +879,16 @@ vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) vlapic->msr_apicbase = val; } + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_ENABLED) + vlapic->msr_apicbase |= APICBASE_X2APIC; + else + vlapic->msr_apicbase &= ~APICBASE_X2APIC; +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index cecd4d3..f43289d 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -88,6 +88,8 @@ struct vm; */ #define ISRVEC_STK_SIZE (16 + 1) +enum x2apic_state; + struct vlapic *vlapic_init(struct vm *vm, int vcpuid); void vlapic_cleanup(struct vlapic *vlapic); @@ -104,5 +106,6 @@ void vlapic_timer_tick(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 764ffbb..db2f9b8 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -160,8 +160,8 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; - vcpu->x2apic_state = X2APIC_ENABLED; vcpu->vlapic = vlapic_init(vm, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); @@ -770,5 +770,7 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state 
state) vm->vcpu[vcpuid].x2apic_state = state; + vlapic_set_x2apic_state(vm, vcpuid, state); + return (0); } -- cgit v1.1 From b65259b285734eec4d40fe639b4e84a6f4bf9f02 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 27 Sep 2012 00:27:58 +0000 Subject: Intel VT-x provides the length of the instruction at the time of the nested page table fault. Use this when fetching the instruction bytes from the guest memory. Also modify the lapic_mmio() API so that a decoded instruction is fed into it instead of having it fetch the instruction bytes from the guest. This is useful for hardware assists like SVM that provide the faulting instruction as part of the vmexit. --- sys/amd64/vmm/intel/vmx.c | 16 +++++++++++++--- sys/amd64/vmm/vmm_instruction_emul.c | 22 ++++++++++++++-------- sys/amd64/vmm/vmm_instruction_emul.h | 4 ++-- sys/amd64/vmm/vmm_lapic.c | 33 +++++++++++++-------------------- sys/amd64/vmm/vmm_lapic.h | 4 ++-- 5 files changed, 44 insertions(+), 35 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ed0996e..a2c8e76 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1146,9 +1146,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) static int vmx_lapic_fault(struct vm *vm, int cpu, - uint64_t gpa, uint64_t rip, uint64_t cr3, uint64_t ept_qual) + uint64_t gpa, uint64_t rip, int inst_length, + uint64_t cr3, uint64_t ept_qual) { int read, write, handled; + struct vie vie; /* * For this to be a legitimate access to the local apic: @@ -1180,7 +1182,14 @@ vmx_lapic_fault(struct vm *vm, int cpu, return (UNHANDLED); } - handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, rip, cr3); + /* Fetch, decode and emulate the faulting instruction */ + if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0) + return (UNHANDLED); + + if (vmm_decode_instruction(&vie) != 0) + return (UNHANDLED); + + handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie); return (handled); } @@ -1275,7 +1284,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) gpa = vmcs_gpa(); cr3 = vmcs_guest_cr3(); handled = vmx_lapic_fault(vmx->vm, vcpu, - gpa, vmexit->rip, cr3, qual); + gpa, vmexit->rip, vmexit->inst_length, + cr3, qual); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index fe01d69..66af72c 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -128,9 +128,9 @@ error: return (-1); } -void -vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, - struct vie *vie) +int +vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, + uint64_t cr3, struct vie *vie) { int n, err; uint64_t hpa, gpa, gpaend; @@ -139,17 +139,18 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, * XXX cache previously fetched instructions using 'rip' as the tag */ + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid length %d", inst_length); + vie_init(vie); - /* - * Copy up to 15 bytes of the instruction stream into 'vie' - */ - while (vie->num_valid < VIE_INST_SIZE) { + /* Copy the instruction into 'vie' */ + while (vie->num_valid < inst_length) { err = gla2gpa(vm, rip, cr3, &gpa, &gpaend); if (err) break; - n = min(VIE_INST_SIZE - vie->num_valid, gpaend - gpa); + n = min(inst_length - vie->num_valid, gpaend - gpa); hpa = vm_gpa2hpa(vm, gpa, n); if (hpa == -1) @@ -160,6 +161,11 @@ 
vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, rip += n; vie->num_valid += n; } + + if (vie->num_valid == inst_length) + return (0); + else + return (-1); } static int diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h index 94937f2..1fa9e2b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.h +++ b/sys/amd64/vmm/vmm_instruction_emul.h @@ -83,8 +83,8 @@ struct vie { struct vm; -void vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, - struct vie *vie); +int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, + uint64_t cr3, struct vie *vie); int vmm_decode_instruction(struct vie *vie); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 0d797e6..ace6010 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -177,25 +177,18 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) } int -lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, - uint64_t rip, uint64_t cr3) +lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie) { int handled, error; uint64_t val; - struct vie vie; struct vlapic *vlapic; const int UNHANDLED = 0; vlapic = vm_lapic(vm, cpu); - vmm_fetch_instruction(vm, rip, cr3, &vie); - - if (vmm_decode_instruction(&vie) != 0) - return (UNHANDLED); - /* Only 32-bit accesses to local apic */ - if (vie.op_size != VIE_OP_SIZE_32BIT) + if (vie->op_size != VIE_OP_SIZE_32BIT) return (UNHANDLED); /* @@ -207,35 +200,35 @@ lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, * This is a limitation of the vm_set_register() API * and can be fixed if necessary. */ - if (vie.operand_register == VM_REG_GUEST_RSP) + if (vie->operand_register == VM_REG_GUEST_RSP) return (UNHANDLED); if (read) { - if ((vie.opcode_flags & VIE_F_TO_REG) == 0) + if ((vie->opcode_flags & VIE_F_TO_REG) == 0) return (UNHANDLED); - if (vie.operand_register >= VM_REG_LAST) + if (vie->operand_register >= VM_REG_LAST) return (UNHANDLED); handled = lapic_read(vlapic, offset, &val); if (handled) { - error = vm_set_register(vm, cpu, vie.operand_register, + error = vm_set_register(vm, cpu, vie->operand_register, val); if (error) panic("lapic_mmio: error %d setting gpr %d", - error, vie.operand_register); + error, vie->operand_register); } } else { - if ((vie.opcode_flags & VIE_F_FROM_REG) && - (vie.operand_register < VM_REG_LAST)) { - error = vm_get_register(vm, cpu, vie.operand_register, + if ((vie->opcode_flags & VIE_F_FROM_REG) && + (vie->operand_register < VM_REG_LAST)) { + error = vm_get_register(vm, cpu, vie->operand_register, &val); if (error) { panic("lapic_mmio: error %d getting gpr %d", - error, vie.operand_register); + error, vie->operand_register); } - } else if (vie.opcode_flags & VIE_F_FROM_IMM) { - val = vie.immediate; + } else if (vie->opcode_flags & VIE_F_FROM_IMM) { + val = vie->immediate; } else { return (UNHANDLED); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 7bba4e3..e9ff8fd 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -30,13 +30,13 @@ #define _VMM_LAPIC_H_ struct vm; +struct vie; boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); -int lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, - uint64_t rip, uint64_t cr3); +int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); void lapic_timer_tick(struct vm *vm, int cpu); -- cgit v1.1 From 
bc87f08e9822e6446dc91b0451317740259de95c Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 29 Sep 2012 01:15:45 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. In this case vm_malloc() was using vm_gpa2hpa() to indirectly infer whether or not the address range had already been allocated. Replace this instead with an explicit API 'vm_gpa_available()' that returns TRUE if a page is available for allocation in guest physical address space. --- sys/amd64/include/vmm.h | 2 +- sys/amd64/vmm/vmm.c | 60 +++++++++++++++++++++++++++++++++++++++++-------- sys/amd64/vmm/vmm_dev.c | 2 +- 3 files changed, 53 insertions(+), 11 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 0b3a29c..bb2f778 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -89,7 +89,7 @@ extern struct vmm_ops vmm_ops_amd; struct vm *vm_create(const char *name); void vm_destroy(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index db2f9b8..06109b1 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -315,20 +315,63 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) VM_PROT_NONE, spok)); } +/* + * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + */ +static boolean_t +vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +{ + int i; + vm_paddr_t gpabase, gpalimit; + + if (gpa & PAGE_MASK) + panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (FALSE); + } + + return (TRUE); +} + int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { - int error; - vm_paddr_t hpa; + int error, available, allocated; + vm_paddr_t g, hpa; const boolean_t spok = TRUE; /* superpage mappings are ok */ + + if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + return (EINVAL); + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_gpa_available(vm, g)) + available++; + else + allocated++; + + g += PAGE_SIZE; + } + /* - * find the hpa if already it was already vm_malloc'd. + * If there are some allocated and some available pages in the address + * range then it is an error. */ - hpa = vm_gpa2hpa(vm, gpa, len); - if (hpa != ((vm_paddr_t)-1)) - goto out; + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. 
+ */ + if (allocated && available == 0) + return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); @@ -350,8 +393,7 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) vm->mem_segs[vm->num_mem_segs].hpa = hpa; vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; -out: - *ret_hpa = hpa; + return (0); } diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 686ddec..b504e6b 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -295,7 +295,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; case VM_MAP_MEMORY: seg = (struct vm_memory_segment *)data; - error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + error = vm_malloc(sc->vm, seg->gpa, seg->len); break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; -- cgit v1.1 From 3e50e0220bcda77b0a8e06a5f6095a206368e01b Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 3 Oct 2012 00:46:30 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. Rewrite vm_gpa2hpa() to get the GPA to HPA mapping by querying the nested page tables. --- sys/amd64/include/vmm.h | 11 +++-- sys/amd64/vmm/amd/amdv.c | 15 ++++-- sys/amd64/vmm/intel/ept.c | 116 +++++++++++++++++++++++++++++++++++++++------- sys/amd64/vmm/intel/ept.h | 3 +- sys/amd64/vmm/intel/vmx.c | 3 +- sys/amd64/vmm/vmm.c | 31 ++++++------- 6 files changed, 134 insertions(+), 45 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index bb2f778..be22eec 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -47,9 +47,11 @@ typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, - size_t length, vm_memattr_t attr, - int prot, boolean_t superpages_ok); +typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, + vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, + boolean_t superpages_ok); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -72,7 +74,8 @@ struct vmm_ops { vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_func_t vmmmap; + vmi_mmap_set_func_t vmmmap_set; + vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 674337d..b50f972 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -78,11 +78,19 @@ amdv_vmcleanup(void *arg) } static int -amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, +amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, int prot, boolean_t spok) { - printf("amdv_vmmmap: not implemented\n"); + printf("amdv_vmmmap_set: not implemented\n"); + return (EINVAL); +} + +static vm_paddr_t +amdv_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + + printf("amdv_vmmmap_get: not implemented\n"); return (EINVAL); } @@ -157,7 +165,8 @@ struct vmm_ops vmm_ops_amd = { amdv_vminit, amdv_vmrun, amdv_vmcleanup, - amdv_vmmmap, + 
amdv_vmmmap_set, + amdv_vmmmap_get, amdv_getreg, amdv_setreg, amdv_getdesc, diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c index c9fca9d..4f91601 100644 --- a/sys/amd64/vmm/intel/ept.c +++ b/sys/amd64/vmm/intel/ept.c @@ -115,6 +115,40 @@ ept_init(void) return (0); } +#if 0 +static void +ept_dump(uint64_t *ptp, int nlevels) +{ + int i, t, tabs; + uint64_t *ptpnext, ptpval; + + if (--nlevels < 0) + return; + + tabs = 3 - nlevels; + for (t = 0; t < tabs; t++) + printf("\t"); + printf("PTP = %p\n", ptp); + + for (i = 0; i < 512; i++) { + ptpval = ptp[i]; + + if (ptpval == 0) + continue; + + for (t = 0; t < tabs; t++) + printf("\t"); + printf("%3d 0x%016lx\n", i, ptpval); + + if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { + ptpnext = (uint64_t *) + PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + ept_dump(ptpnext, nlevels); + } + } +} +#endif + static size_t ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, vm_prot_t prot, boolean_t spok) @@ -179,29 +213,64 @@ ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, "mismatch\n", gpa, ptpshift); } - /* Do the mapping */ - ptp[ptpindex] = hpa; + if (prot != VM_PROT_NONE) { + /* Do the mapping */ + ptp[ptpindex] = hpa; - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; - /* - * XXX should we enforce this memory type by setting the ignore PAT - * bit to 1. - */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + /* + * XXX should we enforce this memory type by setting the + * ignore PAT bit to 1. 
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + } else { + /* Remove the mapping */ + ptp[ptpindex] = 0; + } return (1UL << ptpshift); } +static vm_paddr_t +ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) +{ + int nlevels, ptpshift, ptpindex; + uint64_t ptpval, hpabase, pgmask; + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + ptpval = ptp[ptpindex]; + + /* Cannot make progress beyond this point */ + if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) + break; + + if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { + pgmask = (1UL << ptpshift) - 1; + hpabase = ptpval & ~pgmask; + return (hpabase | (gpa & pgmask)); + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + } + + return ((vm_paddr_t)-1); +} + static void ept_free_pt_entry(pt_entry_t pte) { @@ -276,8 +345,8 @@ ept_vmcleanup(struct vmx *vmx) } int -ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) { size_t n; struct vmx *vmx = arg; @@ -293,6 +362,17 @@ ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, return (0); } +vm_paddr_t +ept_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + vm_paddr_t hpa; + struct vmx *vmx; + + vmx = arg; + hpa = ept_lookup_mapping(vmx->pml4ept, gpa); + return (hpa); +} + static void invept_single_context(void *arg) { diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h index 013c330..2d7258d 100644 --- a/sys/amd64/vmm/intel/ept.h +++ b/sys/amd64/vmm/intel/ept.h @@ -35,8 +35,9 @@ struct vmx; #define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) int ept_init(void); -int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, +int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); void ept_invalidate_mappings(u_long ept_pml4); void ept_vmcleanup(struct vmx *vmx); #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index a2c8e76..3fbe5a1 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1813,7 +1813,8 @@ struct vmm_ops vmm_ops_intel = { vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap, + ept_vmmmap_set, + ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 06109b1..62bb753 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -115,8 +115,12 @@ static struct vmm_ops *ops; #define VMRUN(vmi, vcpu, rip) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? \ + (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ + ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -302,8 +306,8 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); } int @@ -311,8 +315,8 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_NONE, spok)); + return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, + VM_PROT_NONE, spok)); } /* @@ -380,8 +384,8 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (hpa == 0) return (ENOMEM); - error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, - VM_PROT_ALL, spok); + error = VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); if (error) { vmm_mem_free(hpa, len); return (error); @@ -400,17 +404,8 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { - int i; - vm_paddr_t gpabase, gpalimit, hpabase; - for (i = 0; i < vm->num_mem_segs; i++) { - hpabase = vm->mem_segs[i].hpa; - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa + len <= gpalimit) - return ((gpa - gpabase) + hpabase); - } - return ((vm_paddr_t)-1); + return (VMMMAP_GET(vm->cookie, gpa)); } int -- cgit v1.1 From 77ab4804ac42198ff996def6bc2d7acc841626a5 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 3 Oct 2012 01:18:51 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. Add check to vm_gpa2hpa() that the range indicated by [gpa,gpa+len) is all contained within a single 4KB page. 
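The bound added to vm_gpa2hpa() by this commit is plain page arithmetic. The following standalone userland sketch (not the committed kernel code; the function name, the macro definition and the assert-based harness are assumptions for illustration) shows the same single-page check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define rounddown(x, y)	(((x) / (y)) * (y))	/* mirrors the sys/param.h macro */

/*
 * Return 1 if the byte range [gpa, gpa + len) fits inside a single
 * 4KB page, 0 otherwise.  The kernel change panics on violation; the
 * arithmetic is the same.
 */
static int
range_in_one_page(uint64_t gpa, uint64_t len)
{
	uint64_t nextpage;

	nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
	return (len <= nextpage - gpa);
}

int
main(void)
{
	assert(range_in_one_page(0x1000, 4096));	/* exactly one page */
	assert(range_in_one_page(0x1ff8, 8));		/* ends on the page boundary */
	assert(!range_in_one_page(0x1ff8, 9));		/* spills into the next page */
	printf("single-page range checks passed\n");
	return (0);
}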
--- sys/amd64/vmm/vmm.c | 5 +++++ sys/amd64/vmm/vmm_instruction_emul.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 62bb753..3dabbd6 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -404,6 +404,11 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { + vm_paddr_t nextpage; + + nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); return (VMMMAP_GET(vm->cookie, gpa)); } diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 66af72c..7ef4dbb 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -133,7 +133,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { int n, err; - uint64_t hpa, gpa, gpaend; + uint64_t hpa, gpa, gpaend, off; /* * XXX cache previously fetched instructions using 'rip' as the tag @@ -150,7 +150,8 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, if (err) break; - n = min(inst_length - vie->num_valid, gpaend - gpa); + off = gpa & PAGE_MASK; + n = min(inst_length - vie->num_valid, PAGE_SIZE - off); hpa = vm_gpa2hpa(vm, gpa, n); if (hpa == -1) -- cgit v1.1 From 18dd2c0d511c600e708ac8f756e8e51151b43656 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 4 Oct 2012 02:27:14 +0000 Subject: Change vm_malloc() to map pages in the guest physical address space in 4KB chunks. This breaks the assumption that the entire memory segment is contiguously allocated in the host physical address space. This also paves the way to satisfy the 4KB page allocations by requesting free pages from the VM subsystem as opposed to hard-partitioning host memory at boot time. 
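To make the page-at-a-time strategy described above concrete, here is a minimal userland model (all names — toy_vm_malloc, gpa_map, MAX_PAGES — are invented for illustration, and calloc stands in for vmm_mem_alloc(); this is not the committed code): allocate and map one 4KB page per iteration, and unwind on failure the way vm_free_mem_seg() does.

#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define MAX_PAGES	1024

/* Toy "nested page table": guest page index -> host backing page. */
static void *gpa_map[MAX_PAGES];

/*
 * Back the guest range [gpa, gpa + len) one 4KB page at a time so the
 * host pages need not be contiguous.  On failure, free whatever was
 * mapped so far before returning.
 */
static int
toy_vm_malloc(uint64_t gpa, size_t len)
{
	size_t done;
	void *page;

	if ((gpa % PAGE_SIZE) != 0 || (len % PAGE_SIZE) != 0 || len == 0)
		return (-1);
	if ((gpa + len) / PAGE_SIZE > MAX_PAGES)
		return (-1);

	for (done = 0; done < len; done += PAGE_SIZE) {
		page = calloc(1, PAGE_SIZE);	/* stand-in for vmm_mem_alloc() */
		if (page == NULL)
			goto rollback;
		gpa_map[(gpa + done) / PAGE_SIZE] = page;
	}
	return (0);

rollback:
	while (done > 0) {
		done -= PAGE_SIZE;
		free(gpa_map[(gpa + done) / PAGE_SIZE]);
		gpa_map[(gpa + done) / PAGE_SIZE] = NULL;
	}
	return (-1);
}

int
main(void)
{
	return (toy_vm_malloc(0x100000, 16 * PAGE_SIZE) == 0 ? 0 : 1);
}

The kernel version additionally updates the per-page EPT and IOMMU mappings on each iteration, which the toy model omits.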
--- sys/amd64/include/vmm_dev.h | 1 - sys/amd64/vmm/io/ppt.c | 1 - sys/amd64/vmm/vmm.c | 61 +++++++++++++++++++++++++++++++++++---------- sys/amd64/vmm/vmm_dev.c | 2 +- sys/amd64/vmm/vmm_mem.c | 8 +++--- 5 files changed, 53 insertions(+), 20 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index fc64fd8..42ad236 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -35,7 +35,6 @@ void vmmdev_cleanup(void); #endif struct vm_memory_segment { - vm_paddr_t hpa; /* out */ vm_paddr_t gpa; /* in */ size_t len; /* in */ }; diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index ace2877..e81fdbc 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -356,7 +356,6 @@ ppt_map_mmio(struct vm *vm, int bus, int slot, int func, if (error == 0) { seg->gpa = gpa; seg->len = len; - seg->hpa = hpa; } return (error); } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 3dabbd6..7bd3f7f 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -275,6 +275,28 @@ vm_create(const char *name) return (vm); } +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + vm_paddr_t hpa; + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (vm_paddr_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; + } + + bzero(seg, sizeof(struct vm_memory_segment)); +} + void vm_destroy(struct vm *vm) { @@ -283,7 +305,9 @@ vm_destroy(struct vm *vm) ppt_unassign_all(vm); for (i = 0; i < vm->num_mem_segs; i++) - vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + vm_free_mem_seg(vm, &vm->mem_segs[i]); + + vm->num_mem_segs = 0; for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(&vm->vcpu[i]); @@ -345,6 +369,7 @@ int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { int error, available, allocated; + struct vm_memory_segment *seg; vm_paddr_t g, hpa; const boolean_t spok = TRUE; /* superpage mappings are ok */ @@ -380,22 +405,32 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); - hpa = vmm_mem_alloc(len); - if (hpa == 0) - return (ENOMEM); + seg = &vm->mem_segs[vm->num_mem_segs]; - error = VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, - VM_PROT_ALL, spok); - if (error) { - vmm_mem_free(hpa, len); - return (error); + seg->gpa = gpa; + seg->len = 0; + while (seg->len < len) { + hpa = vmm_mem_alloc(PAGE_SIZE); + if (hpa == 0) { + error = ENOMEM; + break; + } + + error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, + VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); + if (error) + break; + + iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); + + seg->len += PAGE_SIZE; } - iommu_create_mapping(vm->iommu, gpa, hpa, len); + if (seg->len != len) { + vm_free_mem_seg(vm, seg); + return (error); + } - vm->mem_segs[vm->num_mem_segs].gpa = gpa; - vm->mem_segs[vm->num_mem_segs].hpa = hpa; - vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; return (0); diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index b504e6b..91edbe8 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -299,7 +299,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; - seg->hpa = seg->len = 0; + seg->len = 0; 
(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); error = 0; break; diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 54f98ac..eb05b9d 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -318,9 +318,9 @@ vmm_mem_alloc(size_t size) int i; vm_paddr_t addr; - if ((size & PDRMASK) != 0) { + if ((size & PAGE_MASK) != 0) { panic("vmm_mem_alloc: size 0x%0lx must be " - "aligned on a 0x%0x boundary\n", size, NBPDR); + "aligned on a 0x%0x boundary\n", size, PAGE_SIZE); } addr = 0; @@ -373,9 +373,9 @@ vmm_mem_free(vm_paddr_t base, size_t length) { int i; - if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + if ((base & PAGE_MASK) != 0 || (length & PAGE_MASK) != 0) { panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " - "aligned on a 0x%0x boundary\n", base, length, NBPDR); + "aligned on a 0x%0x boundary\n", base, length, PAGE_SIZE); } mtx_lock(&vmm_mem_mtx); -- cgit v1.1 From ca6e3cf9305492be70c87be05119c96a49cbecf9 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 8 Oct 2012 23:41:26 +0000 Subject: Allocate memory pages for the guest from the host's free page queue. It is no longer necessary to hard-partition the memory between the host and guests at boot time. --- sys/amd64/vmm/amd/amdv.c | 17 ++ sys/amd64/vmm/intel/vtd.c | 50 +++++- sys/amd64/vmm/io/iommu.c | 51 +++++- sys/amd64/vmm/io/iommu.h | 8 + sys/amd64/vmm/vmm.c | 33 +++- sys/amd64/vmm/vmm_dev.c | 18 --- sys/amd64/vmm/vmm_mem.c | 386 +++++----------------------------------------- sys/amd64/vmm/vmm_mem.h | 4 - 8 files changed, 193 insertions(+), 374 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index b50f972..020743f 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -230,6 +230,14 @@ amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, return (0); } +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + static void amd_iommu_add_device(void *domain, int bus, int slot, int func) { @@ -244,6 +252,13 @@ amd_iommu_remove_device(void *domain, int bus, int slot, int func) printf("amd_iommu_remove_device: not implemented\n"); } +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + struct iommu_ops iommu_ops_amd = { amd_iommu_init, amd_iommu_cleanup, @@ -252,6 +267,8 @@ struct iommu_ops iommu_ops_amd = { amd_iommu_create_domain, amd_iommu_destroy_domain, amd_iommu_create_mapping, + amd_iommu_remove_mapping, amd_iommu_add_device, amd_iommu_remove_device, + amd_iommu_invalidate_tlb, }; diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index 24495a9..ef0e9bc 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -444,8 +444,12 @@ vtd_remove_device(void *arg, int bus, int slot, int func) } } +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + static uint64_t -vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) { struct domain *dom; int i, spshift, ptpshift, ptpindex, nlevels; @@ -513,16 +517,50 @@ vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); /* - * Create a 'gpa' -> 'hpa' mapping + * Update the 'gpa' -> 'hpa' mapping */ - ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + if (remove) { + 
ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; - if (nlevels > 0) - ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } return (1UL << ptpshift); } +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + static void * vtd_create_domain(vm_paddr_t maxaddr) { @@ -632,6 +670,8 @@ struct iommu_ops iommu_ops_intel = { vtd_create_domain, vtd_destroy_domain, vtd_create_mapping, + vtd_remove_mapping, vtd_add_device, vtd_remove_device, + vtd_invalidate_tlb, }; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c index baf2447..c8447cc 100644 --- a/sys/amd64/vmm/io/iommu.c +++ b/sys/amd64/vmm/io/iommu.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include "vmm_util.h" +#include "vmm_mem.h" #include "iommu.h" static boolean_t iommu_avail; @@ -90,6 +91,16 @@ IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) return (len); /* XXX */ } +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + static __inline void IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) { @@ -107,6 +118,14 @@ IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) } static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void IOMMU_ENABLE(void) { @@ -146,13 +165,13 @@ iommu_init(void) /* * Create a domain for the devices owned by the host */ - maxaddr = ptoa(Maxmem); + maxaddr = vmm_mem_maxaddr(); host_domain = IOMMU_CREATE_DOMAIN(maxaddr); if (host_domain == NULL) panic("iommu_init: unable to create a host domain"); /* - * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to * the host */ iommu_create_mapping(host_domain, 0, 0, maxaddr); @@ -216,6 +235,27 @@ iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) } void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void iommu_add_device(void *dom, int bus, int slot, int func) { @@ -228,3 +268,10 @@ iommu_remove_device(void *dom, int bus, int slot, int func) IOMMU_REMOVE_DEVICE(dom, bus, slot, func); } + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h index e4f7229..d5c1d6e 100644 --- a/sys/amd64/vmm/io/iommu.h +++ b/sys/amd64/vmm/io/iommu.h @@ -37,8 +37,11 @@ typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); typedef void (*iommu_destroy_domain_t)(void *domain); typedef 
uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); +typedef void (*iommu_invalidate_tlb_t)(void *dom); struct iommu_ops { iommu_init_func_t init; /* module wide */ @@ -49,8 +52,10 @@ struct iommu_ops { iommu_create_domain_t create_domain; /* domain-specific */ iommu_destroy_domain_t destroy_domain; iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; iommu_add_device_t add_device; iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; }; extern struct iommu_ops iommu_ops_intel; @@ -58,10 +63,13 @@ extern struct iommu_ops iommu_ops_amd; void iommu_init(void); void iommu_cleanup(void); +void *iommu_host_domain(void); void *iommu_create_domain(vm_paddr_t maxaddr); void iommu_destroy_domain(void *dom); void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); void iommu_add_device(void *dom, int bus, int slot, int func); void iommu_remove_device(void *dom, int bus, int slot, int func); +void iommu_invalidate_tlb(void *domain); #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 7bd3f7f..bcd322a 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -280,6 +280,9 @@ vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) { size_t len; vm_paddr_t hpa; + void *host_domain; + + host_domain = iommu_host_domain(); len = 0; while (len < seg->len) { @@ -289,11 +292,24 @@ vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) "associated with gpa 0x%016lx", seg->gpa + len); } + /* + * Remove the 'gpa' to 'hpa' mapping in VMs domain. + * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. + */ + iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); + iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); + vmm_mem_free(hpa, PAGE_SIZE); len += PAGE_SIZE; } + /* + * Invalidate cached translations associated with 'vm->iommu' since + * we have now moved some pages from it. + */ + iommu_invalidate_tlb(vm->iommu); + bzero(seg, sizeof(struct vm_memory_segment)); } @@ -371,6 +387,7 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) int error, available, allocated; struct vm_memory_segment *seg; vm_paddr_t g, hpa; + void *host_domain; const boolean_t spok = TRUE; /* superpage mappings are ok */ @@ -405,8 +422,11 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); + host_domain = iommu_host_domain(); + seg = &vm->mem_segs[vm->num_mem_segs]; + error = 0; seg->gpa = gpa; seg->len = 0; while (seg->len < len) { @@ -421,16 +441,27 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (error) break; + /* + * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. + * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. + */ + iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); seg->len += PAGE_SIZE; } - if (seg->len != len) { + if (error) { vm_free_mem_seg(vm, seg); return (error); } + /* + * Invalidate cached translations associated with 'host_domain' since + * we have now moved some pages from it. 
+ */ + iommu_invalidate_tlb(host_domain); + vm->num_mem_segs++; return (0); diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 91edbe8..66f5184 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -471,24 +471,6 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); -static int -sysctl_vmm_mem_total(SYSCTL_HANDLER_ARGS) -{ - size_t val = vmm_mem_get_mem_total(); - return sysctl_handle_long(oidp, &val, 0, req); -} -SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_total, CTLTYPE_LONG | CTLFLAG_RD, - 0, 0, sysctl_vmm_mem_total, "LU", "Amount of Total memory"); - -static int -sysctl_vmm_mem_free(SYSCTL_HANDLER_ARGS) -{ - size_t val = vmm_mem_get_mem_free(); - return sysctl_handle_long(oidp, &val, 0, req); -} -SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_free, CTLTYPE_LONG | CTLFLAG_RD, - 0, 0, sysctl_vmm_mem_free, "LU", "Amount of Free memory"); - void vmmdev_init(void) { diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index eb05b9d..8745339 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -36,9 +36,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#include +#include #include #include @@ -49,265 +52,21 @@ __FBSDID("$FreeBSD$"); #include "vmm_util.h" #include "vmm_mem.h" -static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); +SYSCTL_DECL(_hw_vmm); -#define MB (1024 * 1024) -#define GB (1024 * MB) - -#define VMM_MEM_MAXSEGS 64 - -/* protected by vmm_mem_mtx */ -static struct { - vm_paddr_t base; - vm_size_t length; -} vmm_mem_avail[VMM_MEM_MAXSEGS]; - -static int vmm_mem_nsegs; -size_t vmm_mem_total_bytes; - -static vm_paddr_t maxaddr; - -static struct mtx vmm_mem_mtx; - -/* - * Steal any memory that was deliberately hidden from FreeBSD either by - * the use of MAXMEM kernel config option or the hw.physmem loader tunable. - */ -static int -vmm_mem_steal_memory(void) -{ - int nsegs; - caddr_t kmdp; - uint32_t smapsize; - uint64_t base, length; - struct bios_smap *smapbase, *smap, *smapend; - - /* - * Borrowed from hammer_time() and getmemsize() in machdep.c - */ - kmdp = preload_search_by_type("elf kernel"); - if (kmdp == NULL) - kmdp = preload_search_by_type("elf64 kernel"); - - smapbase = (struct bios_smap *)preload_search_info(kmdp, - MODINFO_METADATA | MODINFOMD_SMAP); - if (smapbase == NULL) - panic("No BIOS smap info from loader!"); - - smapsize = *((uint32_t *)smapbase - 1); - smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); - - vmm_mem_total_bytes = 0; - nsegs = 0; - for (smap = smapbase; smap < smapend; smap++) { - /* - * XXX - * Assuming non-overlapping, monotonically increasing - * memory segments. - */ - if (smap->type != SMAP_TYPE_MEMORY) - continue; - if (smap->length == 0) - break; - - base = roundup(smap->base, NBPDR); - length = rounddown(smap->length, NBPDR); - - /* Skip this segment if FreeBSD is using all of it. */ - if (base + length <= ptoa(Maxmem)) - continue; - - /* - * If FreeBSD is using part of this segment then adjust - * 'base' and 'length' accordingly. 
- */ - if (base < ptoa(Maxmem)) { - uint64_t used; - used = roundup(ptoa(Maxmem), NBPDR) - base; - base += used; - length -= used; - } - - if (length == 0) - continue; - - vmm_mem_avail[nsegs].base = base; - vmm_mem_avail[nsegs].length = length; - vmm_mem_total_bytes += length; - - if (base + length > maxaddr) - maxaddr = base + length; - - if (0 && bootverbose) { - printf("vmm_mem_populate: index %d, base 0x%0lx, " - "length %ld\n", - nsegs, vmm_mem_avail[nsegs].base, - vmm_mem_avail[nsegs].length); - } - - nsegs++; - if (nsegs >= VMM_MEM_MAXSEGS) { - printf("vmm_mem_populate: maximum number of vmm memory " - "segments reached!\n"); - return (ENOSPC); - } - } - - vmm_mem_nsegs = nsegs; - - return (0); -} +static u_long pages_allocated; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD, + &pages_allocated, 0, "4KB pages allocated"); static void -vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +update_pages_allocated(int howmany) { - vm_paddr_t addr, remaining; - int pdpi, pdi, superpage_size; - pml4_entry_t *pml4p; - pdp_entry_t *pdp; - pd_entry_t *pd; - uint64_t page_attr_bits; - - if (end >= NBPML4) - panic("Cannot map memory beyond %ldGB", NBPML4 / GB); - - if (vmm_supports_1G_pages()) - superpage_size = NBPDP; - else - superpage_size = NBPDR; - - /* - * Get the page directory pointer page that contains the direct - * map address mappings. - */ - pml4p = kernel_pmap->pm_pml4; - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); - - page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; - addr = start; - while (addr < end) { - remaining = end - addr; - pdpi = addr / NBPDP; - if (superpage_size == NBPDP && - remaining >= NBPDP && - addr % NBPDP == 0) { - /* - * If there isn't a mapping for this address then - * create one but if there is one already make sure - * it matches what we expect it to be. - */ - if (pdp[pdpi] == 0) { - pdp[pdpi] = addr | page_attr_bits; - if (0 && bootverbose) { - printf("vmm_mem_populate: mapping " - "0x%lx with 1GB page at " - "pdpi %d\n", addr, pdpi); - } - } else { - pdp_entry_t pdpe = pdp[pdpi]; - if ((pdpe & ~PAGE_MASK) != addr || - (pdpe & page_attr_bits) != page_attr_bits) { - panic("An invalid mapping 0x%016lx " - "already exists for 0x%016lx\n", - pdpe, addr); - } - } - addr += NBPDP; - } else { - if (remaining < NBPDR) { - panic("vmm_mem_populate: remaining (%ld) must " - "be greater than NBPDR (%d)\n", - remaining, NBPDR); - } - if (pdp[pdpi] == 0) { - /* - * XXX we lose this memory forever because - * we do not keep track of the virtual address - * that would be required to free this page. - */ - pd = malloc(PAGE_SIZE, M_VMM_MEM, - M_WAITOK | M_ZERO); - if ((uintptr_t)pd & PAGE_MASK) { - panic("vmm_mem_populate: page directory" - "page not aligned on %d " - "boundary\n", PAGE_SIZE); - } - pdp[pdpi] = vtophys(pd); - pdp[pdpi] |= PG_RW | PG_V | PG_U; - if (0 && bootverbose) { - printf("Creating page directory " - "at pdp index %d for 0x%016lx\n", - pdpi, addr); - } - } - pdi = (addr % NBPDP) / NBPDR; - pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); - - /* - * Create a new mapping if one doesn't already exist - * or validate it if it does. 
- */ - if (pd[pdi] == 0) { - pd[pdi] = addr | page_attr_bits; - if (0 && bootverbose) { - printf("vmm_mem_populate: mapping " - "0x%lx with 2MB page at " - "pdpi %d, pdi %d\n", - addr, pdpi, pdi); - } - } else { - pd_entry_t pde = pd[pdi]; - if ((pde & ~PAGE_MASK) != addr || - (pde & page_attr_bits) != page_attr_bits) { - panic("An invalid mapping 0x%016lx " - "already exists for 0x%016lx\n", - pde, addr); - } - } - addr += NBPDR; - } - } -} - -static int -vmm_mem_populate(void) -{ - int seg, error; - vm_paddr_t start, end; - - /* populate the vmm_mem_avail[] array */ - error = vmm_mem_steal_memory(); - if (error) - return (error); - - /* - * Now map the memory that was hidden from FreeBSD in - * the direct map VA space. - */ - for (seg = 0; seg < vmm_mem_nsegs; seg++) { - start = vmm_mem_avail[seg].base; - end = start + vmm_mem_avail[seg].length; - if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { - panic("start (0x%016lx) and end (0x%016lx) must be " - "aligned on a %dMB boundary\n", - start, end, NBPDR / MB); - } - vmm_mem_direct_map(start, end); - } - - return (0); + pages_allocated += howmany; /* XXX locking? */ } int vmm_mem_init(void) { - int error; - - mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); - - error = vmm_mem_populate(); - if (error) - return (error); return (0); } @@ -315,122 +74,61 @@ vmm_mem_init(void) vm_paddr_t vmm_mem_alloc(size_t size) { - int i; - vm_paddr_t addr; + int flags; + vm_page_t m; + vm_paddr_t pa; - if ((size & PAGE_MASK) != 0) { - panic("vmm_mem_alloc: size 0x%0lx must be " - "aligned on a 0x%0x boundary\n", size, PAGE_SIZE); - } + if (size != PAGE_SIZE) + panic("vmm_mem_alloc: invalid allocation size %lu", size); - addr = 0; + flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO; - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - if (vmm_mem_avail[i].length >= size) { - addr = vmm_mem_avail[i].base; - vmm_mem_avail[i].base += size; - vmm_mem_avail[i].length -= size; - /* remove a zero length segment */ - if (vmm_mem_avail[i].length == 0) { - memmove(&vmm_mem_avail[i], - &vmm_mem_avail[i + 1], - (vmm_mem_nsegs - (i + 1)) * - sizeof(vmm_mem_avail[0])); - vmm_mem_nsegs--; - } + while (1) { + /* + * XXX need policy to determine when to back off the allocation + */ + m = vm_page_alloc(NULL, 0, flags); + if (m == NULL) + VM_WAIT; + else break; - } } - mtx_unlock(&vmm_mem_mtx); - - return (addr); -} - -size_t -vmm_mem_get_mem_total(void) -{ - return vmm_mem_total_bytes; -} -size_t -vmm_mem_get_mem_free(void) -{ - size_t length = 0; - int i; + pa = VM_PAGE_TO_PHYS(m); + + if ((m->flags & PG_ZERO) == 0) + pagezero((void *)PHYS_TO_DMAP(pa)); - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - length += vmm_mem_avail[i].length; - } - mtx_unlock(&vmm_mem_mtx); + update_pages_allocated(1); - return(length); + return (pa); } void vmm_mem_free(vm_paddr_t base, size_t length) { - int i; + vm_page_t m; - if ((base & PAGE_MASK) != 0 || (length & PAGE_MASK) != 0) { - panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " - "aligned on a 0x%0x boundary\n", base, length, PAGE_SIZE); + if (base & PAGE_MASK) { + panic("vmm_mem_free: base 0x%0lx must be aligned on a " + "0x%0x boundary\n", base, PAGE_SIZE); } - mtx_lock(&vmm_mem_mtx); - - for (i = 0; i < vmm_mem_nsegs; i++) { - if (vmm_mem_avail[i].base > base) - break; - } - - if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) - panic("vmm_mem_free: cannot free any more segments"); - - /* Create a new segment at index 'i' */ - memmove(&vmm_mem_avail[i + 1], 
&vmm_mem_avail[i], - (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); - - vmm_mem_avail[i].base = base; - vmm_mem_avail[i].length = length; + if (length != PAGE_SIZE) + panic("vmm_mem_free: invalid length %lu", length); - vmm_mem_nsegs++; + m = PHYS_TO_VM_PAGE(base); + m->wire_count--; + vm_page_free(m); + atomic_subtract_int(&cnt.v_wire_count, 1); -coalesce_some_more: - for (i = 0; i < vmm_mem_nsegs - 1; i++) { - if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == - vmm_mem_avail[i + 1].base) { - vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; - memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], - (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); - vmm_mem_nsegs--; - goto coalesce_some_more; - } - } - - mtx_unlock(&vmm_mem_mtx); + update_pages_allocated(-1); } vm_paddr_t vmm_mem_maxaddr(void) { - return (maxaddr); -} - -void -vmm_mem_dump(void) -{ - int i; - vm_paddr_t base; - vm_size_t length; - - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - base = vmm_mem_avail[i].base; - length = vmm_mem_avail[i].length; - printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); - } - mtx_unlock(&vmm_mem_mtx); + return (ptoa(Maxmem)); } diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index a83e9be..7d45c74 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -33,9 +33,5 @@ int vmm_mem_init(void); vm_paddr_t vmm_mem_alloc(size_t size); void vmm_mem_free(vm_paddr_t start, size_t size); vm_paddr_t vmm_mem_maxaddr(void); -void vmm_mem_dump(void); - -size_t vmm_mem_get_mem_total(void); -size_t vmm_mem_get_mem_free(void); #endif -- cgit v1.1 From d09cf38e2541aaf20c762b633d757f620e4ae745 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 11 Oct 2012 19:28:07 +0000 Subject: Deliver the MSI to the correct guest virtual cpu. Prior to this change the MSI was being delivered unconditionally to vcpu 0 regardless of how the guest programmed the MSI delivery. --- sys/amd64/vmm/io/ppt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index e81fdbc..d6fef9a 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -71,8 +71,6 @@ static struct pptdev { struct vm_memory_segment mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ - int vector; - int vcpu; int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; @@ -478,8 +476,6 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, } } - ppt->msi.vector = vector; - ppt->msi.vcpu = destcpu; ppt->msi.startrid = startrid; /* @@ -497,6 +493,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].vec = vector + i; + ppt->msi.arg[i].vcpu = destcpu; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, -- cgit v1.1 From 97c20149fa1e35f2bcc34d8b7058467aa8d51d80 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 11 Oct 2012 19:39:54 +0000 Subject: Fix warnings generated by 'debug.witness.watch' during VM creation and destruction for calling malloc() with M_WAITOK while holding a mutex. Do not allow vmm.ko to be unloaded until all virtual machines are destroyed. 
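The witness fix in this commit follows a standard pattern: perform the sleeping allocation with no locks held, then re-check for a racing duplicate once the lock is taken and back out if someone else won. A hedged userland sketch of that pattern (a pthread mutex stands in for the kernel mtx; the struct and function names are invented for illustration, not the committed code):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct softc {
	char name[32];
	struct softc *next;
};

static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct softc *head;

/* Must be called with list_mtx held. */
static struct softc *
lookup_locked(const char *name)
{
	struct softc *sc;

	for (sc = head; sc != NULL; sc = sc->next)
		if (strcmp(sc->name, name) == 0)
			return (sc);
	return (NULL);
}

static int
create(const char *name)
{
	struct softc *sc;

	/* Allocate while holding no locks (the allocator may sleep). */
	sc = calloc(1, sizeof(*sc));
	if (sc == NULL)
		return (-1);
	strncpy(sc->name, name, sizeof(sc->name) - 1);

	pthread_mutex_lock(&list_mtx);
	if (lookup_locked(name) != NULL) {
		pthread_mutex_unlock(&list_mtx);
		free(sc);	/* lost the race; undo outside the lock */
		return (-1);
	}
	sc->next = head;
	head = sc;
	pthread_mutex_unlock(&list_mtx);
	return (0);
}

int
main(void)
{
	/* Second create of the same name must fail as a duplicate. */
	return (create("beavis") == 0 && create("beavis") != 0 ? 0 : 1);
}

The same commit has MOD_UNLOAD fail with EBUSY while any virtual machine still exists, which is why vmmdev_cleanup() below is reduced to an emptiness check on the softc list.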
--- sys/amd64/include/vmm_dev.h | 2 +- sys/amd64/vmm/vmm.c | 10 +++--- sys/amd64/vmm/vmm_dev.c | 86 +++++++++++++++++++++++++++------------------ 3 files changed, 59 insertions(+), 39 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 42ad236..79f893d 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -31,7 +31,7 @@ #ifdef _KERNEL void vmmdev_init(void); -void vmmdev_cleanup(void); +int vmmdev_cleanup(void); #endif struct vm_memory_segment { diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index bcd322a..019b9a8 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -219,10 +219,12 @@ vmm_handler(module_t mod, int what, void *arg) error = vmm_init(); break; case MOD_UNLOAD: - vmmdev_cleanup(); - iommu_cleanup(); - vmm_ipi_cleanup(); - error = VMM_CLEANUP(); + error = vmmdev_cleanup(); + if (error == 0) { + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + } break; default: error = 0; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 66f5184..1eba226 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -88,18 +88,11 @@ vmmdev_lookup(const char *name) static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { - struct vmmdev_softc *sc; - #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif - SLIST_FOREACH(sc, &head, link) { - if (sc->cdev == cdev) - break; - } - - return (sc); + return (cdev->si_drv1); } static int @@ -114,6 +107,8 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) error = 0; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); + if (sc == NULL) + error = ENXIO; while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; @@ -380,20 +375,25 @@ vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, } static void -vmmdev_destroy(struct vmmdev_softc *sc) +vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink) { -#ifdef notyet /* XXX kernel is not compiled with invariants */ - mtx_assert(&vmmdev_mtx, MA_OWNED); -#endif - /* * XXX must stop virtual machine instances that may be still * running and cleanup their state. 
*/ - SLIST_REMOVE(&head, sc, vmmdev_softc, link); - destroy_dev(sc->cdev); - vm_destroy(sc->vm); + if (sc->cdev) + destroy_dev(sc->cdev); + + if (sc->vm) + vm_destroy(sc->vm); + + if (unlink) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + free(sc, M_VMMDEV); } @@ -409,14 +409,22 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) if (error != 0 || req->newptr == NULL) return (error); + /* + * XXX TODO if any process has this device open then fail + */ + mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL) { mtx_unlock(&vmmdev_mtx); return (EINVAL); } - vmmdev_destroy(sc); + + sc->cdev->si_drv1 = NULL; mtx_unlock(&vmmdev_mtx); + + vmmdev_destroy(sc, TRUE); + return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, @@ -436,7 +444,7 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { int error; struct vm *vm; - struct vmmdev_softc *sc; + struct vmmdev_softc *sc, *sc2; char buf[VM_MAX_NAMELEN]; strlcpy(buf, "beavis", sizeof(buf)); @@ -445,27 +453,37 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) return (error); mtx_lock(&vmmdev_mtx); - sc = vmmdev_lookup(buf); - if (sc != NULL) { - mtx_unlock(&vmmdev_mtx); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) return (EEXIST); - } vm = vm_create(buf); - if (vm == NULL) { - mtx_unlock(&vmmdev_mtx); + if (vm == NULL) return (EINVAL); - } sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) + SLIST_INSERT_HEAD(&head, sc, link); + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc, FALSE); + return (EEXIST); + } + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); sc->cdev->si_drv1 = sc; - SLIST_INSERT_HEAD(&head, sc, link); - mtx_unlock(&vmmdev_mtx); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, @@ -477,15 +495,15 @@ vmmdev_init(void) mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); } -void +int vmmdev_cleanup(void) { - struct vmmdev_softc *sc, *sc2; - - mtx_lock(&vmmdev_mtx); + int error; - SLIST_FOREACH_SAFE(sc, &head, link, sc2) - vmmdev_destroy(sc); + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; - mtx_unlock(&vmmdev_mtx); + return (error); } -- cgit v1.1 From e3e8a520e280f32230da3ddfa4c5260fea0e15a1 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 12 Oct 2012 18:32:44 +0000 Subject: Provide per-vcpu locks instead of relying on a single big lock. This also gets rid of all the witness.watch warnings related to calling malloc(M_WAITOK) while holding a mutex. 
Reviewed by: grehan --- sys/amd64/include/vmm.h | 19 ++++++---- sys/amd64/vmm/intel/vmx.c | 4 +-- sys/amd64/vmm/io/ppt.c | 2 ++ sys/amd64/vmm/vmm.c | 91 +++++++++++++++++++++++++++++++---------------- sys/amd64/vmm/vmm_dev.c | 67 ++++++++++++++++++++++++---------- sys/amd64/vmm/vmm_ipi.c | 24 ++++--------- sys/amd64/vmm/vmm_ipi.h | 3 +- 7 files changed, 134 insertions(+), 76 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index be22eec..4dfdd04 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -130,19 +130,24 @@ int vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); -#define VCPU_STOPPED 0 -#define VCPU_RUNNING 1 -void vm_set_run_state(struct vm *vm, int vcpu, int running); -int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); +enum vcpu_state { + VCPU_IDLE, + VCPU_RUNNING, + VCPU_CANNOT_RUN, +}; -void *vcpu_stats(struct vm *vm, int vcpu); +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu); static int __inline -vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +vcpu_is_running(struct vm *vm, int vcpu) { - return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); + return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING); } +void *vcpu_stats(struct vm *vm, int vcpu); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + #endif /* KERNEL */ #define VM_MAXCPU 8 /* maximum virtual cpus */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 3fbe5a1..6a1dbed 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1568,7 +1568,7 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause * the subsequent vmlaunch/vmresume to fail. */ - if (vcpu_is_running(vmx->vm, vcpu, NULL)) + if (vcpu_is_running(vmx->vm, vcpu)) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); @@ -1596,7 +1596,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause * the subsequent vmlaunch/vmresume to fail. */ - if (vcpu_is_running(vmx->vm, vcpu, NULL)) + if (vcpu_is_running(vmx->vm, vcpu)) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index d6fef9a..3044fc5 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -53,6 +53,8 @@ __FBSDID("$FreeBSD$"); #include "iommu.h" #include "ppt.h" +/* XXX locking */ + #define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 019b9a8..8d8f143 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -65,6 +66,8 @@ struct vlapic; struct vcpu { int flags; + enum vcpu_state state; + struct mtx mtx; int pincpu; /* host cpuid this vcpu is bound to */ int hostcpu; /* host cpuid this vcpu last ran on */ uint64_t guest_msrs[VMM_MSR_NUM]; @@ -76,7 +79,6 @@ struct vcpu { enum x2apic_state x2apic_state; }; #define VCPU_F_PINNED 0x0001 -#define VCPU_F_RUNNING 0x0002 #define VCPU_PINCPU(vm, vcpuid) \ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? 
vm->vcpu[vcpuid].pincpu : -1) @@ -89,6 +91,10 @@ do { \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_DEF) +#define vcpu_lock(v) mtx_lock(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock(&((v)->mtx)) + #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { @@ -162,7 +168,8 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu = &vm->vcpu[vcpu_id]; - vcpu->hostcpu = -1; + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vlapic = vlapic_init(vm, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); @@ -667,11 +674,13 @@ vm_run(struct vm *vm, struct vm_run *vmrun) pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); - vcpu->hostcpu = curcpu; - restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); + + vcpu->hostcpu = curcpu; error = VMRUN(vm->cookie, vcpuid, vmrun->rip); + vcpu->hostcpu = NOCPU; + save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); @@ -787,9 +796,10 @@ vm_iommu_domain(struct vm *vm) return (vm->iommu); } -void -vm_set_run_state(struct vm *vm, int vcpuid, int state) +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state) { + int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) @@ -797,43 +807,42 @@ vm_set_run_state(struct vm *vm, int vcpuid, int state) vcpu = &vm->vcpu[vcpuid]; - if (state == VCPU_RUNNING) { - if (vcpu->flags & VCPU_F_RUNNING) { - panic("vm_set_run_state: %s[%d] is already running", - vm_name(vm), vcpuid); - } - vcpu->flags |= VCPU_F_RUNNING; + vcpu_lock(vcpu); + + /* + * The following state transitions are allowed: + * IDLE -> RUNNING -> IDLE + * IDLE -> CANNOT_RUN -> IDLE + */ + if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) || + (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) { + error = 0; + vcpu->state = state; } else { - if ((vcpu->flags & VCPU_F_RUNNING) == 0) { - panic("vm_set_run_state: %s[%d] is already stopped", - vm_name(vm), vcpuid); - } - vcpu->flags &= ~VCPU_F_RUNNING; + error = EBUSY; } + + vcpu_unlock(vcpu); + + return (error); } -int -vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid) { - int retval, hostcpu; struct vcpu *vcpu; + enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; - if (vcpu->flags & VCPU_F_RUNNING) { - retval = VCPU_RUNNING; - hostcpu = vcpu->hostcpu; - } else { - retval = VCPU_STOPPED; - hostcpu = -1; - } - if (cpuptr) - *cpuptr = hostcpu; + vcpu_lock(vcpu); + state = vcpu->state; + vcpu_unlock(vcpu); - return (retval); + return (state); } void @@ -884,3 +893,25 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) return (0); } + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpuid) +{ + int hostcpu; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + /* + * XXX racy but the worst case is that we'll send an unnecessary IPI + * to the 'hostcpu'. + * + * We cannot use vcpu_is_running() here because it acquires vcpu->mtx + * which is not allowed inside a critical section. 
+ */ + hostcpu = vcpu->hostcpu; + if (hostcpu == NOCPU || hostcpu == curcpu) + return; + + ipi_cpu(hostcpu, vmm_ipinum); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 1eba226..0150ebd 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -88,9 +88,6 @@ vmmdev_lookup(const char *name) static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { -#ifdef notyet /* XXX kernel is not compiled with invariants */ - mtx_assert(&vmmdev_mtx, MA_OWNED); -#endif return (cdev->si_drv1); } @@ -141,7 +138,8 @@ static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { - int error, vcpu; + int error, vcpu, state_changed; + enum vcpu_state new_state; struct vmmdev_softc *sc; struct vm_memory_segment *seg; struct vm_register *vmreg; @@ -160,12 +158,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; - mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); - if (sc == NULL) { - mtx_unlock(&vmmdev_mtx); + if (sc == NULL) return (ENXIO); - } + + vcpu = -1; + state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. @@ -181,6 +179,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: /* * XXX fragile, handle with care @@ -192,11 +191,42 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, goto done; } - if (vcpu_is_running(sc->vm, vcpu, NULL)) { - error = EBUSY; + if (cmd == VM_RUN) + new_state = VCPU_RUNNING; + else + new_state = VCPU_CANNOT_RUN; + + error = vcpu_set_state(sc->vm, vcpu, new_state); + if (error) + goto done; + + state_changed = 1; + break; + + case VM_MAP_PPTDEV_MMIO: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_MEMORY: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = 0; + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); goto done; } + + state_changed = 2; break; + default: break; } @@ -204,14 +234,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; - - vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); - mtx_unlock(&vmmdev_mtx); - error = vm_run(sc->vm, vmrun); - - mtx_lock(&vmmdev_mtx); - vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); break; case VM_STAT_DESC: { const char *desc; @@ -346,9 +369,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = ENOTTY; break; } -done: - mtx_unlock(&vmmdev_mtx); + if (state_changed == 1) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); + } else if (state_changed == 2) { + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); + } + +done: return (error); } diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c index 055f86f..643d326 100644 --- a/sys/amd64/vmm/vmm_ipi.c +++ b/sys/amd64/vmm/vmm_ipi.c @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include "vmm_ipi.h" @@ -48,7 +47,7 @@ extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); /* * The default is to use the IPI_AST to interrupt a vcpu. 
*/ -static int ipinum = IPI_AST; +int vmm_ipinum = IPI_AST; CTASSERT(APIC_SPURIOUS_INT == 255); @@ -73,31 +72,22 @@ vmm_ipi_init(void) ip = &idt[idx]; func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func == (uintptr_t)&IDTVEC(rsvd)) { - ipinum = idx; - setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + vmm_ipinum = idx; + setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT, SEL_KPL, 0); break; } } - if (ipinum != IPI_AST && bootverbose) { + if (vmm_ipinum != IPI_AST && bootverbose) { printf("vmm_ipi_init: installing ipi handler to interrupt " - "vcpus at vector %d\n", ipinum); + "vcpus at vector %d\n", vmm_ipinum); } } void vmm_ipi_cleanup(void) { - if (ipinum != IPI_AST) - setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); -} - -void -vm_interrupt_hostcpu(struct vm *vm, int vcpu) -{ - int hostcpu; - - if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) - ipi_cpu(hostcpu, ipinum); + if (vmm_ipinum != IPI_AST) + setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h index 7ab94bf..91552e3 100644 --- a/sys/amd64/vmm/vmm_ipi.h +++ b/sys/amd64/vmm/vmm_ipi.h @@ -31,8 +31,9 @@ struct vm; +extern int vmm_ipinum; + void vmm_ipi_init(void); void vmm_ipi_cleanup(void); -void vm_interrupt_hostcpu(struct vm *vm, int vcpu); #endif -- cgit v1.1 From 8fb5b5f8de608d18362583be1e90150aab0b4d33 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 12 Oct 2012 23:12:19 +0000 Subject: Add the guest physical address and r/w/x bits to the paging exit in preparation for a rework of bhyve MMIO handling. Reviewed by: neel Obtained from: NetApp --- sys/amd64/include/vmm.h | 2 ++ sys/amd64/vmm/intel/vmx.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 4dfdd04..d0dfb04 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -266,6 +266,8 @@ struct vm_exit { } inout; struct { uint64_t cr3; + uint64_t gpa; + int rwx; } paging; /* * VMX specific payload. Used when there is no "better" diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 6a1dbed..81969ea 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1289,6 +1289,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.rwx = qual & 0x7; } break; default: -- cgit v1.1 From 26dd051c2cb82a04c38681e6726fef1fa8287c0d Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 20 Oct 2012 08:23:05 +0000 Subject: Calculate the number of host ticks until the next guest timer interrupt. This information will be used in conjunction with guest "HLT exiting" to yield the thread hosting the virtual cpu. 
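
As a rough illustration of the calculation this change introduces (the helper below is a stand-in written for this log, not code from the patch): the per-tick decrement corresponds to (VLAPIC_BUS_FREQ / divisor) / hz in vlapic_timer_tick(), and the return value is what a later change in this series passes to a timed sleep before re-entering the guest.

#include <stdint.h>

/*
 * Estimate, in whole host clock ticks, how long until the guest's local
 * APIC timer counts down to zero.  Mirrors the "(ccr / decrement) + 1"
 * expression in the patch; the real code additionally returns 0 when the
 * interrupt fired during this call and -1 when the timer is disabled.
 */
static int
host_ticks_until_guest_timer(uint32_t ccr, uint32_t decrement_per_tick)
{
	if (decrement_per_tick == 0)	/* defensive; hz and the APIC bus */
		decrement_per_tick = 1;	/* frequency keep this nonzero */
	return ((int)(ccr / decrement_per_tick) + 1);
}
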
Obtained from: NetApp --- sys/amd64/vmm/io/vlapic.c | 113 +++++++++++++++++++++++++--------------------- sys/amd64/vmm/io/vlapic.h | 2 +- sys/amd64/vmm/vmm_lapic.c | 4 +- sys/amd64/vmm/vmm_lapic.h | 2 +- 4 files changed, 65 insertions(+), 56 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 1e8a4e8..911ed64 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -121,6 +121,31 @@ struct vlapic { enum boot_state boot_state; }; +#define VLAPIC_BUS_FREQ tsc_freq + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + static void vlapic_mask_lvts(uint32_t *lvts, int num_lvt) { @@ -175,6 +200,7 @@ vlapic_op_reset(void* dev) memset(lapic, 0, sizeof(*lapic)); lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); + vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ @@ -218,32 +244,6 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector) VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); } -#define VLAPIC_BUS_FREQ tsc_freq -#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) - -static int -vlapic_timer_divisor(uint32_t dcr) -{ - switch (dcr & 0xB) { - case APIC_TDCR_2: - return (2); - case APIC_TDCR_4: - return (4); - case APIC_TDCR_8: - return (8); - case APIC_TDCR_16: - return (16); - case APIC_TDCR_32: - return (32); - case APIC_TDCR_64: - return (64); - case APIC_TDCR_128: - return (128); - default: - panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); - } -} - static void vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) { @@ -755,59 +755,68 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) return (retval); } -void +int vlapic_timer_tick(struct vlapic *vlapic) { - int curticks, delta, periodic; + int curticks, delta, periodic, fired; uint32_t ccr; - uint32_t decrement, remainder; + uint32_t decrement, leftover; +restart: curticks = ticks; - - /* Common case */ delta = curticks - vlapic->ccr_ticks; - if (delta == 0) - return; /* Local APIC timer is disabled */ if (vlapic->apic.icr_timer == 0) - return; + return (-1); /* One-shot mode and timer has already counted down to zero */ periodic = vlapic_periodic_timer(vlapic); if (!periodic && vlapic->apic.ccr_timer == 0) - return; + return (-1); /* * The 'curticks' and 'ccr_ticks' are out of sync by more than * 2^31 ticks. We deal with this by restarting the timer. */ if (delta < 0) { vlapic_start_timer(vlapic, 0); - return; + goto restart; } - ccr = vlapic->apic.ccr_timer; + fired = 0; decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + + vlapic->ccr_ticks = curticks; + ccr = vlapic->apic.ccr_timer; + while (delta-- > 0) { - if (ccr <= decrement) { - remainder = decrement - ccr; - vlapic_fire_timer(vlapic); - if (periodic) { - vlapic_start_timer(vlapic, remainder); - ccr = vlapic->apic.ccr_timer; - } else { - /* - * One-shot timer has counted down to zero. 
- */ - ccr = 0; - break; - } - } else + if (ccr > decrement) { ccr -= decrement; + continue; + } + + /* Trigger the local apic timer interrupt */ + vlapic_fire_timer(vlapic); + if (periodic) { + leftover = decrement - ccr; + vlapic_start_timer(vlapic, leftover); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + } + fired = 1; + break; } - vlapic->ccr_ticks = curticks; vlapic->apic.ccr_timer = ccr; + + if (!fired) + return ((ccr / decrement) + 1); + else + return (0); } struct vdev_ops vlapic_dev_ops = { diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index f43289d..00de019 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -102,7 +102,7 @@ int vlapic_op_mem_read(void* dev, uint64_t gpa, int vlapic_pending_intr(struct vlapic *vlapic); void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); -void vlapic_timer_tick(struct vlapic *vlapic); +int vlapic_timer_tick(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index ace6010..bb22122 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -106,14 +106,14 @@ lapic_set_intr(struct vm *vm, int cpu, int vector) return (0); } -void +int lapic_timer_tick(struct vm *vm, int cpu) { struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); - vlapic_timer_tick(vlapic); + return (vlapic_timer_tick(vlapic)); } static boolean_t diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index e9ff8fd..59fc016 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -38,7 +38,7 @@ int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); -void lapic_timer_tick(struct vm *vm, int cpu); +int lapic_timer_tick(struct vm *vm, int cpu); /* * Returns a vector between 32 and 255 if an interrupt is pending in the -- cgit v1.1 From a74007510aa98cb51b2d7cc4056a994e3bf64763 Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 23 Oct 2012 02:20:42 +0000 Subject: Test for AST pending with interrupts disabled right before entering the guest. If an IPI was delivered to this cpu before interrupts were disabled then return right away via vmx_setjmp() with a return value of VMX_RETURN_AST. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 65 +++++++++++++++++++++++++------------- sys/amd64/vmm/intel/vmx.h | 2 ++ sys/amd64/vmm/intel/vmx_genassym.c | 7 ++++ sys/amd64/vmm/intel/vmx_support.S | 40 +++++++++++++++++++---- 4 files changed, 86 insertions(+), 28 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 81969ea..11b8c9f 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -290,6 +290,8 @@ vmx_setjmp_rc2str(int rc) return "vmresume"; case VMX_RETURN_VMLAUNCH: return "vmlaunch"; + case VMX_RETURN_AST: + return "ast"; default: return "unknown"; } @@ -798,15 +800,20 @@ vmx_run_trace(struct vmx *vmx, int vcpu) static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, - int handled, int astpending) + int handled) { #ifdef KTR VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); +#endif +} - if (astpending) - VMM_CTR0(vmx->vm, vcpu, "astpending"); +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } @@ -981,19 +988,19 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu) const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; -#if 1 /* - * XXX - * If an event is being injected from userland then just return. - * For e.g. we may inject a breakpoint exception to cause the - * guest to enter the debugger so we can inspect its state. + * If there is already an interrupt pending then just return. + * + * This could happen if an interrupt was injected on a prior + * VM entry but the actual entry into guest mode was aborted + * because of a pending AST. */ error = vmread(VMCS_ENTRY_INTR_INFO, &info); if (error) panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); if (info & VMCS_INTERRUPTION_INFO_VALID) return; -#endif + /* * NMI injection has priority so deal with those first */ @@ -1301,7 +1308,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the - * kernel (for e.g. 'astpending' is set in the run loop). + * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* @@ -1352,6 +1359,7 @@ vmx_run(void *arg, int vcpu, register_t rip) vmxctx = &vmx->ctx[vcpu]; vmxctx->launched = 0; + astpending = 0; vmexit = vm_exitinfo(vmx->vm, vcpu); /* @@ -1395,6 +1403,9 @@ vmx_run(void *arg, int vcpu, register_t rip) break; case VMX_RETURN_LONGJMP: break; /* vm exit */ + case VMX_RETURN_AST: + astpending = 1; + break; case VMX_RETURN_VMRESUME: vie = vmcs_instruction_error(); if (vmxctx->launch_error == VM_FAIL_INVALID || @@ -1417,14 +1428,6 @@ vmx_run(void *arg, int vcpu, register_t rip) panic("vmx_setjmp returned %d", rc); } - /* - * XXX locking? - * See comments in exception.S about checking for ASTs - * atomically while interrupts are disabled. But it is - * not clear that they apply in our case. - */ - astpending = curthread->td_flags & TDF_ASTPENDING; - /* enable interrupts */ enable_intr(); @@ -1434,11 +1437,18 @@ vmx_run(void *arg, int vcpu, register_t rip) vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + if (astpending) { + handled = 1; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmx_astpending_trace(vmx, vcpu, rip); + break; + } + handled = vmx_exit_process(vmx, vcpu, vmexit); + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); - vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, - astpending); - } while (handled && !astpending); + } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS @@ -1646,7 +1656,7 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, int code_valid) { int error; - uint32_t info; + uint64_t info; struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; @@ -1660,6 +1670,17 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 0x6, /* VM_SW_EXCEPTION */ }; + /* + * If there is already an exception pending to be delivered to the + * vcpu then just return. 
+ */ + error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info); + if (error) + return (error); + + if (info & VMCS_INTERRUPTION_INFO_VALID) + return (EAGAIN); + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); info |= VMCS_INTERRUPTION_INFO_VALID; error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 61d72a8..d4c90fa 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -101,12 +101,14 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); #define VMX_RETURN_LONGJMP 1 #define VMX_RETURN_VMRESUME 2 #define VMX_RETURN_VMLAUNCH 3 +#define VMX_RETURN_AST 4 /* * vmx_setjmp() returns: * - 0 when it returns directly * - 1 when it returns from vmx_longjmp * - 2 when it returns from vmx_resume (which would only be in the error case) * - 3 when it returns from vmx_launch (which would only be in the error case) + * - 4 when it returns from vmx_resume or vmx_launch because of AST pending */ int vmx_setjmp(struct vmxctx *ctx); void vmx_longjmp(void); /* returns via vmx_setjmp */ diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c index c5b5bf9..823a05d 100644 --- a/sys/amd64/vmm/intel/vmx_genassym.c +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -80,3 +81,9 @@ ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); +ASSYM(VMX_RETURN_AST, VMX_RETURN_AST); + +ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); +ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); +ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); +ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index 8bdba86..4ba582a 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -31,6 +31,32 @@ #include "vmx_assym.s" /* + * Disable interrupts before updating %rsp in VMX_CHECK_AST or + * VMX_GUEST_RESTORE. + * + * The location that %rsp points to is a 'vmxctx' and not a + * real stack so we don't want an interrupt handler to trash it + */ +#define VMX_DISABLE_INTERRUPTS cli + +/* + * If the thread hosting the vcpu has an ast pending then take care of it + * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. + * + * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts + * are disabled. + */ +#define VMX_CHECK_AST \ + movq PCPU(CURTHREAD),%rax; \ + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: + +/* * Assumes that %rdi holds a pointer to the 'vmxctx'. * * On "return" all registers are updated to reflect guest state. The two @@ -41,12 +67,6 @@ * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ - /* \ - * Disable interrupts before updating %rsp. The location that \ - * %rsp points to is a 'vmxctx' and not a real stack so we \ - * don't want an interrupt handler to trash it. \ - */ \ - cli; \ movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ @@ -169,6 +189,10 @@ END(vmx_longjmp) * through vmx_setjmp() with a return value of 2. 
*/ ENTRY(vmx_resume) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + /* * Restore guest state that is not automatically loaded from the vmcs. */ @@ -197,6 +221,10 @@ END(vmx_resume) * through vmx_setjmp() with a return value of 3. */ ENTRY(vmx_launch) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + /* * Restore guest state that is not automatically loaded from the vmcs. */ -- cgit v1.1 From 583a9ef76d9ec8f3bb8e7927281cfe79fc0c0584 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 24 Oct 2012 02:54:21 +0000 Subject: Maintain state regarding NMI delivery to guest vcpu in VT-x independent manner. Also add a stats counter to count the number of NMIs delivered per vcpu. Obtained from: NetApp --- sys/amd64/include/vmm.h | 4 ++-- sys/amd64/vmm/amd/amdv.c | 9 --------- sys/amd64/vmm/intel/vmx.c | 16 ++-------------- sys/amd64/vmm/intel/vmx.h | 1 - sys/amd64/vmm/vmm.c | 49 +++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 45 insertions(+), 34 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index d0dfb04..8f78b8f 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -63,7 +63,6 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, int type, int vector, uint32_t code, int code_valid); -typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); @@ -81,7 +80,6 @@ struct vmm_ops { vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_inject_event_t vminject; - vmi_inject_nmi_t vmnmi; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; }; @@ -110,6 +108,8 @@ int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_inject_event(struct vm *vm, int vcpu, int type, int vector, uint32_t error_code, int error_code_valid); int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 020743f..dc071d3 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -136,14 +136,6 @@ amdv_inject_event(void *vmi, int vcpu, int type, int vector, } static int -amdv_nmi(void *arg, int vcpu) -{ - - printf("amdv_nmi: not implemented\n"); - return (EINVAL); -} - -static int amdv_getcap(void *arg, int vcpu, int type, int *retval) { @@ -172,7 +164,6 @@ struct vmm_ops vmm_ops_amd = { amdv_getdesc, amdv_setdesc, amdv_inject_event, - amdv_nmi, amdv_getcap, amdv_setcap }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 11b8c9f..16acfff 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -751,7 +751,6 @@ vmx_vminit(struct vm *vm) vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; - vmx->state[i].request_nmi = 0; vmx->state[i].lastcpu = -1; vmx->state[i].vpid = vpid; @@ -940,7 +939,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) uint64_t info, interruptibility; /* Bail out if no NMI requested */ - if (vmx->state[vcpu].request_nmi == 0) + if (!vm_nmi_pending(vmx->vm, vcpu)) return (0); error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); @@ -965,7 +964,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ - 
vmx->state[vcpu].request_nmi = 0; + vm_nmi_clear(vmx->vm, vcpu); return (1); nmiblocked: @@ -1696,16 +1695,6 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, } static int -vmx_nmi(void *arg, int vcpu) -{ - struct vmx *vmx = arg; - - atomic_set_int(&vmx->state[vcpu].request_nmi, 1); - - return (0); -} - -static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; @@ -1843,7 +1832,6 @@ struct vmm_ops vmm_ops_intel = { vmx_getdesc, vmx_setdesc, vmx_inject, - vmx_nmi, vmx_getcap, vmx_setcap }; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index d4c90fa..c7cd567 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -76,7 +76,6 @@ struct vmxcap { }; struct vmxstate { - int request_nmi; int lastcpu; /* host cpu that this 'vcpu' last ran on */ uint16_t vpid; }; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 8d8f143..6c6df21 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -77,6 +77,7 @@ struct vcpu { void *stats; struct vm_exit exitinfo; enum x2apic_state x2apic_state; + int nmi_pending; }; #define VCPU_F_PINNED 0x0001 @@ -137,8 +138,6 @@ static struct vmm_ops *ops; (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) -#define VMNMI(vmi, vcpu) \ - (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ @@ -710,17 +709,51 @@ vm_inject_event(struct vm *vm, int vcpuid, int type, return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } +VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + int -vm_inject_nmi(struct vm *vm, int vcpu) +vm_inject_nmi(struct vm *vm, int vcpuid) { - int error; + struct vcpu *vcpu; - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); - error = VMNMI(vm->cookie, vcpu); - vm_interrupt_hostcpu(vm, vcpu); - return (error); + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vm_interrupt_hostcpu(vm, vcpuid); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } int -- cgit v1.1 From 80aee5fb8aa2abb172630d47efcb1f8f26f6bcc4 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 25 Oct 2012 04:08:26 +0000 Subject: Hide the monitor/mwait instruction capability from the guest until we know how to properly intercept it. Obtained from: NetApp --- sys/amd64/vmm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 47ba975..ca0d785 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -128,6 +128,12 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, CPUID2_AVX); /* + * Hide monitor/mwait until we know how to deal with + * these instructions. 
+ */ + regs[2] &= ~CPUID2_MON; + + /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); -- cgit v1.1 From bcb3589583c269dcc88504fcf7c0dedc7c03f123 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 25 Oct 2012 04:29:21 +0000 Subject: If the guest vcpu wants to idle then use that opportunity to relinquish the host cpu to the scheduler until the guest is ready to run again. This implies that the host cpu utilization will now closely mirror the actual load imposed by the guest vcpu. Also, the vcpu mutex now needs to be of type MTX_SPIN since we need to acquire it inside a critical section. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 21 ++++++++-- sys/amd64/vmm/vmm.c | 97 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 23 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 16acfff..2052dc9 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -159,6 +159,8 @@ static int cap_monitor_trap; /* statistics */ static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); +static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); +static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted"); #ifdef KTR static const char * @@ -1203,11 +1205,11 @@ vmx_lapic_fault(struct vm *vm, int cpu, static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int handled; + int error, handled; struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual, gpa, cr3; + uint64_t qual, gpa, cr3, intr_info; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1240,7 +1242,20 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) } break; case EXIT_REASON_HLT: - vmexit->exitcode = VM_EXITCODE_HLT; + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + /* + * If there is an event waiting to be injected then there is + * no need to 'hlt'. 
+ */ + error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); + if (error) + panic("vmx_exit_process: vmread(intrinfo) %d", error); + + if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { + handled = 1; + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); + } else + vmexit->exitcode = VM_EXITCODE_HLT; break; case EXIT_REASON_MTF: vmexit->exitcode = VM_EXITCODE_MTRAP; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6c6df21..8bc9581 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" +#include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" @@ -92,9 +93,9 @@ do { \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) -#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_DEF) -#define vcpu_lock(v) mtx_lock(&((v)->mtx)) -#define vcpu_unlock(v) mtx_unlock(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define VM_MAX_MEMORY_SEGMENTS 2 @@ -651,13 +652,16 @@ save_guest_fpustate(struct vcpu *vcpu) fpu_start_emulating(); } +static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + int vm_run(struct vm *vm, struct vm_run *vmrun) { - int error, vcpuid; + int error, vcpuid, sleepticks, t; struct vcpu *vcpu; struct pcb *pcb; - uint64_t tscval; + uint64_t tscval, rip; + struct vm_exit *vme; vcpuid = vmrun->cpuid; @@ -665,7 +669,9 @@ vm_run(struct vm *vm, struct vm_run *vmrun) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; - + vme = &vmrun->vm_exit; + rip = vmrun->rip; +restart: critical_enter(); tscval = rdtsc(); @@ -677,7 +683,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) restore_guest_fpustate(vcpu); vcpu->hostcpu = curcpu; - error = VMRUN(vm->cookie, vcpuid, vmrun->rip); + error = VMRUN(vm->cookie, vcpuid, rip); vcpu->hostcpu = NOCPU; save_guest_fpustate(vcpu); @@ -686,10 +692,52 @@ vm_run(struct vm *vm, struct vm_run *vmrun) vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); /* copy the exit information */ - bcopy(&vcpu->exitinfo, &vmrun->vm_exit, sizeof(struct vm_exit)); + bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit)); critical_exit(); + /* + * Oblige the guest's desire to 'hlt' by sleeping until the vcpu + * is ready to run. + */ + if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) { + vcpu_lock(vcpu); + + /* + * Figure out the number of host ticks until the next apic + * timer interrupt in the guest. + */ + sleepticks = lapic_timer_tick(vm, vcpuid); + + /* + * If the guest local apic timer is disabled then sleep for + * a long time but not forever. + */ + if (sleepticks < 0) + sleepticks = hz; + + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. + * + * These interrupts could have happened any time after we + * returned from VMRUN() and before we grabbed the vcpu lock. 
+ */ + if (!vm_nmi_pending(vm, vcpuid) && + lapic_pending_intr(vm, vcpuid) < 0) { + if (sleepticks <= 0) + panic("invalid sleepticks %d", sleepticks); + t = ticks; + msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + + vcpu_unlock(vcpu); + + rip = vme->rip + vme->inst_length; + goto restart; + } + return (error); } @@ -709,7 +757,7 @@ vm_inject_event(struct vm *vm, int vcpuid, int type, return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } -VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); +static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) @@ -935,16 +983,25 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; - /* - * XXX racy but the worst case is that we'll send an unnecessary IPI - * to the 'hostcpu'. - * - * We cannot use vcpu_is_running() here because it acquires vcpu->mtx - * which is not allowed inside a critical section. - */ + vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; - if (hostcpu == NOCPU || hostcpu == curcpu) - return; - - ipi_cpu(hostcpu, vmm_ipinum); + if (hostcpu == NOCPU) { + /* + * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then + * the host thread must be sleeping waiting for an event to + * kick the vcpu out of 'hlt'. + * + * XXX this is racy because the condition exists right before + * and after calling VMRUN() in vm_run(). The wakeup() is + * benign in this case. + */ + if (vcpu->state == VCPU_RUNNING) + wakeup_one(vcpu); + } else { + if (vcpu->state != VCPU_RUNNING) + panic("invalid vcpu state %d", vcpu->state); + if (hostcpu != curcpu) + ipi_cpu(hostcpu, vmm_ipinum); + } + vcpu_unlock(vcpu); } -- cgit v1.1 From cbd59fc940c5caaf0cde3410c8772176220fd1a1 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 26 Oct 2012 03:12:40 +0000 Subject: Unconditionally enable fpu emulation by setting CR0.TS in the host after the guest does a vm exit. This allows us to trap any fpu access in the host context while the fpu still has "dirty" state belonging to the guest. Reported by: "s vas" on freebsd-virtualization@ Obtained from: NetApp --- sys/amd64/vmm/intel/vmcs.c | 10 +++++++++- sys/amd64/vmm/vmm.c | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 8c53465..26ac5f8 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -367,7 +367,15 @@ vmcs_set_defaults(struct vmcs *vmcs, goto done; /* Load the control registers */ - cr0 = rcr0(); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + cr0 = rcr0() | CR0_TS; if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 8bc9581..6efc01f 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -640,14 +640,27 @@ restore_guest_fpustate(struct vcpu *vcpu) /* flush host state to the pcb */ fpuexit(curthread); + + /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. 
+ */ + fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest FPU state */ + fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } -- cgit v1.1 From dc37578ed255be09cc4e4fcd2ebf48781c91eabc Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 26 Oct 2012 22:32:26 +0000 Subject: Set the valid field of the newly allocated field as all other vm page allocators do. This fixes a panic when a virtio block device is mounted as root, with the host system dying in vm_page_dirty with invalid bits. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/vmm_mem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 8745339..04f99b1 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -99,6 +99,7 @@ vmm_mem_alloc(size_t size) if ((m->flags & PG_ZERO) == 0) pagezero((void *)PHYS_TO_DMAP(pa)); + m->valid = VM_PAGE_BITS_ALL; update_pages_allocated(1); -- cgit v1.1 From 9631d598ccea6dd526400bad0a438a10c8294542 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 29 Oct 2012 01:51:24 +0000 Subject: Corral all the host state associated with the virtual machine into its own file. This state is independent of the type of hardware assist used so there is really no need for it to be in Intel-specific code. Obtained from: NetApp --- sys/amd64/vmm/intel/vmcs.c | 30 +++++------ sys/amd64/vmm/intel/vmx.c | 11 ++-- sys/amd64/vmm/vmm.c | 2 + sys/amd64/vmm/vmm_host.c | 124 +++++++++++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/vmm_host.h | 75 +++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 24 deletions(-) create mode 100644 sys/amd64/vmm/vmm_host.c create mode 100644 sys/amd64/vmm/vmm_host.h (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 26ac5f8..a5784dd 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmcs.h" #include "vmx_cpufunc.h" #include "ept.h" @@ -314,12 +315,12 @@ vmcs_set_defaults(struct vmcs *vmcs, { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat; + uint64_t eptp, pat, fsbase, idtrbase; uint32_t exc_bitmap; - codesel = GSEL(GCODE_SEL, SEL_KPL); - datasel = GSEL(GDATA_SEL, SEL_KPL); - tsssel = GSEL(GPROC0_SEL, SEL_KPL); + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); /* * Make sure we have a "current" VMCS to work with. @@ -357,29 +358,22 @@ vmcs_set_defaults(struct vmcs *vmcs, /* Host state */ /* Initialize host IA32_PAT MSR */ - pat = rdmsr(MSR_PAT); + pat = vmm_get_host_pat(); if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) goto done; /* Load the IA32_EFER MSR */ - efer = rdmsr(MSR_EFER); + efer = vmm_get_host_efer(); if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) goto done; /* Load the control registers */ - /* - * We always want CR0.TS to be set when the processor does a VM exit. - * - * With emulation turned on unconditionally after a VM exit, we are - * able to trap inadvertent use of the FPU until the guest FPU state - * has been safely squirreled away. 
- */ - cr0 = rcr0() | CR0_TS; + cr0 = vmm_get_host_cr0(); if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; - cr4 = rcr4(); + cr4 = vmm_get_host_cr4() | CR4_VMXE; if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) goto done; @@ -411,10 +405,12 @@ vmcs_set_defaults(struct vmcs *vmcs, * Note that we exclude %gs, tss and gdtr here because their base * address is pcpu specific. */ - if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; - if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; /* instruction pointer */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2052dc9..ace2683 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmm_lapic.h" #include "vmm_msr.h" #include "vmm_ktr.h" @@ -64,8 +65,6 @@ __FBSDID("$FreeBSD$"); #include "vmx_controls.h" #include "vmm_instruction_emul.h" -#define CR4_VMXE (1UL << 13) - #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ @@ -118,8 +117,6 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_VMX, "vmx", "vmx"); -extern struct pcpu __pcpu[]; - int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); @@ -836,15 +833,15 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); - error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); if (error != 0) goto done; - error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); if (error != 0) goto done; - error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); if (error != 0) goto done; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6efc01f..eae9ccc 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include @@ -196,6 +197,7 @@ vmm_init(void) { int error; + vmm_host_state_init(); vmm_ipi_init(); error = vmm_mem_init(); diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c new file mode 100644 index 0000000..8dfef73 --- /dev/null +++ b/sys/amd64/vmm/vmm_host.c @@ -0,0 +1,124 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; + +void +vmm_host_state_init(void) +{ + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + vmm_host_cr4 = rcr4(); +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_datasel(void) +{ + + return (GSEL(GDATA_SEL, SEL_KPL)); + +} + +uint64_t +vmm_get_host_codesel(void) +{ + + return (GSEL(GCODE_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_tsssel(void) +{ + + return (GSEL(GPROC0_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_fsbase(void) +{ + + return (0); +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + + return (r_idt.rd_base); +} diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h new file mode 100644 index 0000000..839f54a --- /dev/null +++ b/sys/amd64/vmm/vmm_host.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef _KERNEL +#error "no user-servicable parts inside" +#endif + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + + return ((uint64_t)PCPU_GET(tssp)); +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + + return ((uint64_t)&gdt[NGDT * curcpu]); +} + +struct pcpu; +extern struct pcpu __pcpu[]; + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + + return ((uint64_t)&__pcpu[curcpu]); +} + +#endif -- cgit v1.1 From aee862ac3fd36264249b7160eaecaeacab119ac3 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 29 Oct 2012 23:58:15 +0000 Subject: Convert VMCS_ENTRY_INTR_INFO field into a vmcs identifier before passing it to vmcs_getreg(). Without this conversion vmcs_getreg() will return EINVAL. In particular this prevented injection of the breakpoint exception into the guest via the "-B" option to /usr/sbin/bhyve which is hugely useful when debugging guest hangs. This was broken in r241921. Pointy hat: me Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ace2683..7a9cfb8 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1685,7 +1685,7 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, * If there is already an exception pending to be delivered to the * vcpu then just return. */ - error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info); + error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); if (error) return (error); -- cgit v1.1 From 091578815ab0408c9aa2133e259263351101a008 Mon Sep 17 00:00:00 2001 From: grehan Date: Tue, 6 Nov 2012 02:43:41 +0000 Subject: Fix issue found with clang build. Avoid code insertion by the compiler between inline asm statements that would in turn modify the flags value set by the first asm, and used by the second. Solve by making the common error block a string that can be pulled into the first inline asm, and using symbolic labels for asm variables. bhyve can now build/run fine when compiled with clang. 
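
To make the hazard and the fix easier to see outside of the VMX code, here is a small self-contained amd64 example in the same style; the 'add' instruction and the values are made up for illustration, and only the pattern taken from the patch matters: the flags test is folded into the same asm statement as the instruction that sets the flags, using a shared string and symbolic [names].

#include <stdio.h>

/*
 * Same shape as VMX_SET_ERROR_CODE after the fix: a bare string that is
 * concatenated into the asm statement which sets the flags, so the
 * compiler cannot schedule flag-clobbering code in between.
 */
#define SET_ERROR_CODE				\
	"	jnc 1f;"			\
	"	mov $1, %[error];"		\
	"	jmp 3f;"			\
	"1:	jnz 2f;"			\
	"	mov $2, %[error];"		\
	"	jmp 3f;"			\
	"2:	mov $0, %[error];"		\
	"3:"

static int
add_and_classify(unsigned long a, unsigned long b)
{
	int error;

	/* 'add' sets CF/ZF; the classification runs in the same asm. */
	__asm __volatile("add %[b], %[a];"
	    SET_ERROR_CODE
	    : [error] "=r" (error), [a] "+r" (a)
	    : [b] "r" (b)
	    : "cc");
	return (error);
}

int
main(void)
{
	/* Expected output on amd64: 0 (no flags), 1 (carry), 2 (zero). */
	printf("%d %d %d\n", add_and_classify(1, 2),
	    add_and_classify(~0UL, 1), add_and_classify(0, 0));
	return (0);
}
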
Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx_cpufunc.h | 77 ++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 29 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h index e9f6c6d..2e66443 100644 --- a/sys/amd64/vmm/intel/vmx_cpufunc.h +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -42,18 +42,15 @@ struct vmcs; #define VM_SUCCESS 0 #define VM_FAIL_INVALID 1 #define VM_FAIL_VALID 2 -#define VMX_SET_ERROR_CODE(varname) \ - do { \ - __asm __volatile(" jnc 1f;" \ - " mov $1, %0;" /* CF: error = 1 */ \ - " jmp 3f;" \ - "1: jnz 2f;" \ - " mov $2, %0;" /* ZF: error = 2 */ \ - " jmp 3f;" \ - "2: mov $0, %0;" \ - "3: nop" \ - :"=r" (varname)); \ - } while (0) +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" /* returns 0 on success and non-zero on failure */ static __inline int @@ -63,8 +60,12 @@ vmxon(char *region) uint64_t addr; addr = vtophys(region); - __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); } @@ -76,21 +77,26 @@ vmclear(struct vmcs *vmcs) uint64_t addr; addr = vtophys(vmcs); - __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); return (error); } static __inline void vmxoff(void) { + __asm __volatile("vmxoff"); } static __inline void vmptrst(uint64_t *addr) { - __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); } static __inline int @@ -100,8 +106,11 @@ vmptrld(struct vmcs *vmcs) uint64_t addr; addr = vtophys(vmcs); - __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); return (error); } @@ -110,9 +119,11 @@ vmwrite(uint64_t reg, uint64_t val) { int error; - __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); - - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); return (error); } @@ -122,9 +133,11 @@ vmread(uint64_t r, uint64_t *addr) { int error; - __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); - - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [r] "r" (r), [addr] "m" (*addr) + : "memory"); return (error); } @@ -170,9 +183,12 @@ invvpid(uint64_t type, struct invvpid_desc desc) { int error; - __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); - VMX_SET_ERROR_CODE(error); if (error) panic("invvpid error %d", error); } @@ -190,9 +206,12 @@ invept(uint64_t type, struct invept_desc desc) { int error; - __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + __asm 
__volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); - VMX_SET_ERROR_CODE(error); if (error) panic("invept error %d", error); } -- cgit v1.1 From 5a600cdfe44adae619eca970bef1539b3ac6ae35 Mon Sep 17 00:00:00 2001 From: grehan Date: Tue, 20 Nov 2012 06:01:03 +0000 Subject: Handle CPUID leaf 0x7 now that FreeBSD is using it. Return 0's for now. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/x86.c | 1 + sys/amd64/vmm/x86.h | 1 + 2 files changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index ca0d785..94abe09 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -162,6 +162,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; case CPUID_0000_0006: + case CPUID_0000_0007: /* * Handle the access, but report 0 for * all options diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index d19e1d8..368e967 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -35,6 +35,7 @@ #define CPUID_0000_0003 (0x3) #define CPUID_0000_0004 (0x4) #define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) #define CPUID_0000_000A (0xA) #define CPUID_0000_000B (0xB) #define CPUID_8000_0000 (0x80000000) -- cgit v1.1 From 575baa2d8a6961e9a82ca9272a78d3c01cfcbdf1 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 22 Nov 2012 00:08:20 +0000 Subject: Get rid of redundant comparision which is guaranteed to be "true" for unsigned integers. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx_msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index 1e9a837..2aba63c 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -148,7 +148,7 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access) { int byte, bit; - if (msr >= 0x00000000 && msr <= 0x00001FFF) + if (msr <= 0x00001FFF) byte = msr / 8; else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) byte = 1024 + (msr - 0xC0000000) / 8; -- cgit v1.1 From d8bfa0f5754e3a60b0d0e1d425a6038be554e73d Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 22 Nov 2012 04:07:18 +0000 Subject: Fix a bug in the MSI-X resource allocation for PCI passthrough devices. In the case where the underlying host had disabled MSI-X via the "hw.pci.enable_msix" tunable, the ppt_setup_msix() function would fail and return an error without properly cleaning up. This in turn would cause a page fault on the next boot of the guest. Fix this by calling ppt_teardown_msix() in all the error return paths. 
Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 63 +++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 3044fc5..fdf136b 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -247,7 +247,7 @@ ppt_teardown_msix_intr(struct pptdev *ppt, int idx) static void ppt_teardown_msix(struct pptdev *ppt) { - int i, error; + int i; if (ppt->msix.num_msgs == 0) return; @@ -267,9 +267,7 @@ ppt_teardown_msix(struct pptdev *ppt) free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); - error = pci_release_msi(ppt->dev); - if (error) - printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); + pci_release_msi(ppt->dev); ppt->msix.num_msgs = 0; } @@ -519,7 +517,7 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, { struct pptdev *ppt; struct pci_devinfo *dinfo; - int numvec, vector_count, rid, error; + int numvec, alloced, rid, error; size_t res_size, cookie_size, arg_size; ppt = ppt_find(bus, slot, func); @@ -538,48 +536,39 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, * Allocate the IRQ resources * Set up some variables in ppt->msix */ - if (!ppt->msix.msix_table_res) { - ppt->msix.res = NULL; - ppt->msix.cookie = NULL; - ppt->msix.arg = NULL; - - rid = dinfo->cfg.msix.msix_table_bar; - ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, - &rid, RF_ACTIVE); - if (ppt->msix.msix_table_res == NULL) - return (ENOSPC); - - ppt->msix.msix_table_rid = rid; - - vector_count = numvec = pci_msix_count(ppt->dev); - - error = pci_alloc_msix(ppt->dev, &numvec); - if (error) - return (error); - else if (vector_count != numvec) { - pci_release_msi(ppt->dev); - return (ENOSPC); - } - - ppt->msix.num_msgs = numvec; + if (ppt->msix.num_msgs == 0) { + numvec = pci_msix_count(ppt->dev); + if (numvec <= 0) + return (EINVAL); ppt->msix.startrid = 1; + ppt->msix.num_msgs = numvec; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); - ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); - ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); - ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); - if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || - ppt->msix.arg == NULL) { + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, + M_WAITOK | M_ZERO); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, + SYS_RES_MEMORY, &rid, RF_ACTIVE); + + if (ppt->msix.msix_table_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } - bzero(ppt->msix.res, res_size); - bzero(ppt->msix.cookie, cookie_size); - bzero(ppt->msix.arg, arg_size); + ppt->msix.msix_table_rid = rid; + + alloced = numvec; + error = pci_alloc_msix(ppt->dev, &alloced); + if (error || alloced != numvec) { + ppt_teardown_msix(ppt); + return (error == 0 ? ENOSPC: error); + } } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { -- cgit v1.1 From 36ab9a2e1ab7d2b1884270275584f989cfd65e2b Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 00:02:17 +0000 Subject: Revamp the x86 instruction emulation in bhyve. 
On a nested page table fault the hypervisor will: - fetch the instruction using the guest %rip and %cr3 - decode the instruction in 'struct vie' - emulate the instruction in host kernel context for local apic accesses - any other type of mmio access is punted up to user-space (e.g. ioapic) The decoded instruction is passed as collateral to the user-space process that is handling the PAGING exit. The emulation code is fleshed out to include more addressing modes (e.g. SIB) and more types of operands (e.g. imm8). The source code is unified into a single file (vmm_instruction_emul.c) that is compiled into vmm.ko as well as /usr/sbin/bhyve. Reviewed by: grehan Obtained from: NetApp --- sys/amd64/include/vmm.h | 3 + sys/amd64/include/vmm_instruction_emul.h | 113 ++++++++ sys/amd64/vmm/intel/vmcs.h | 1 + sys/amd64/vmm/intel/vmx.c | 45 ++- sys/amd64/vmm/vmm_instruction_emul.c | 481 +++++++++++++++++++++++++++---- sys/amd64/vmm/vmm_instruction_emul.h | 91 ------ sys/amd64/vmm/vmm_lapic.c | 83 ++---- sys/amd64/vmm/vmm_lapic.h | 6 +- 8 files changed, 605 insertions(+), 218 deletions(-) create mode 100644 sys/amd64/include/vmm_instruction_emul.h delete mode 100644 sys/amd64/vmm/vmm_instruction_emul.h (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 8f78b8f..2fb2194 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -150,6 +150,8 @@ void vm_interrupt_hostcpu(struct vm *vm, int vcpu); #endif /* KERNEL */ +#include + #define VM_MAXCPU 8 /* maximum virtual cpus */ /* @@ -268,6 +270,7 @@ struct vm_exit { uint64_t cr3; uint64_t gpa; int rwx; + struct vie vie; } paging; /* * VMX specific payload. Used when there is no "better" diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h new file mode 100644 index 0000000..4cc494b --- /dev/null +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. 
The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1; + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * s + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t mrr, mem_region_write_t mrw, + void *mrarg); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. 
+ */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + uint64_t rip, int inst_length, uint64_t cr3, + struct vie *vie); + +int vmm_decode_instruction(struct vm *vm, int cpuid, + uint64_t gla, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 84532f4..f39eed2 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -67,6 +67,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) #endif /* _KERNEL */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 7a9cfb8..b185c57 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -63,7 +63,6 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "x86.h" #include "vmx_controls.h" -#include "vmm_instruction_emul.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ @@ -1150,23 +1149,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) } static int -vmx_lapic_fault(struct vm *vm, int cpu, - uint64_t gpa, uint64_t rip, int inst_length, - uint64_t cr3, uint64_t ept_qual) +vmx_ept_fault(struct vm *vm, int cpu, + uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, + uint64_t cr3, uint64_t ept_qual, struct vie *vie) { - int read, write, handled; - struct vie vie; - - /* - * For this to be a legitimate access to the local apic: - * - the GPA in the local apic page - * - the GPA must be aligned on a 16 byte boundary - */ - if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) - return (UNHANDLED); - - if ((gpa & 0xF) != 0) - return (UNHANDLED); + int read, write, error; /* EPT violation on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) @@ -1188,15 +1175,22 @@ vmx_lapic_fault(struct vm *vm, int cpu, } /* Fetch, decode and emulate the faulting instruction */ - if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0) + if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) return (UNHANDLED); - if (vmm_decode_instruction(&vie) != 0) + if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) return (UNHANDLED); - handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie); + /* + * Check if this is a local apic access + */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) + return (UNHANDLED); - return (handled); + error = vmm_emulate_instruction(vm, cpu, gpa, vie, + lapic_mmio_read, lapic_mmio_write, 0); + + return (error ? 
UNHANDLED : HANDLED); } static int @@ -1206,7 +1200,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual, gpa, cr3, intr_info; + uint64_t qual, gla, gpa, cr3, intr_info; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1299,11 +1293,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: + gla = vmcs_gla(); gpa = vmcs_gpa(); cr3 = vmcs_guest_cr3(); - handled = vmx_lapic_fault(vmx->vm, vcpu, - gpa, vmexit->rip, vmexit->inst_length, - cr3, qual); + handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, + vmexit->rip, vmexit->inst_length, + cr3, qual, &vmexit->u.paging.vie); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 7ef4dbb..5e5399b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -30,6 +30,7 @@ #include __FBSDID("$FreeBSD$"); +#ifdef _KERNEL #include #include #include @@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$"); #include #include #include +#else /* !_KERNEL */ +#include +#include -#include "vmm_instruction_emul.h" +#include + +#include +#endif /* _KERNEL */ + + + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ + +static const struct vie_op one_byte_opcodes[256] = { + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 -#define GB (1024 * 1024 * 1024) +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, @@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_R15 }; +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_valid_register(enum vm_reg_name reg) +{ +#ifdef _KERNEL + /* + * XXX + * The operand register in which we store the result of the + * read must be a GPR that we can modify even if the vcpu + * is "running". All the GPRs qualify except for %rsp. + * + * This is a limitation of the vm_set_register() API + * and can be fixed if necessary. 
+ */ + if (reg == VM_REG_GUEST_RSP) + return (0); +#endif + return (1); +} + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + if (!vie_valid_register(reg)) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + if (!vie_valid_register(reg)) + return (EINVAL); + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +/* + * The following simplifying assumptions are made during emulation: + * + * - guest is in 64-bit mode + * - default address size is 64-bits + * - default operand size is 32-bits + * + * - operand size override is not supported + * + * - address size override is not supported + */ +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + if (vie->rex_w) + size = 8; + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + if (vie->rex_w) + size = 8; + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xC7: + /* + * MOV from imm32 to mem (ModRM:r/m) + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate; /* already sign-extended */ + + if (vie->rex_w) + size = 8; + + if (size != 8) + val &= size2mask[size]; + + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val1, val2; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. 
+ * + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + if (vie->rex_w) + size = 8; + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + val1 &= val2; + error = vie_update_register(vm, vcpuid, reg, val1, size); + break; + default: + break; + } + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +#ifdef _KERNEL static void vie_init(struct vie *vie) { bzero(vie, sizeof(struct vie)); - vie->op_size = VIE_OP_SIZE_32BIT; - vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; - vie->operand_register = VM_REG_LAST; } static int @@ -129,7 +395,7 @@ error: } int -vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, +vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { int n, err; @@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, static int vie_peek(struct vie *vie, uint8_t *x) { + if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); @@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x) static void vie_advance(struct vie *vie) { - if (vie->num_processed >= vie->num_valid) - panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid); vie->num_processed++; } @@ -213,24 +478,16 @@ decode_opcode(struct vie *vie) { uint8_t x; - static const uint8_t flags[256] = { - [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM, - [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG, - [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM, - }; - if (vie_peek(vie, &x)) return (-1); - vie->opcode_byte = x; - vie->opcode_flags = flags[x]; + vie->op = one_byte_opcodes[x]; - vie_advance(vie); - - if (vie->opcode_flags == 0) + if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); - else - return (0); + + vie_advance(vie); + return (0); } /* @@ -241,9 +498,6 @@ decode_modrm(struct vie *vie) { uint8_t x; - if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0) - return (0); - if (vie_peek(vie, &x)) return (-1); @@ -251,35 +505,40 @@ decode_modrm(struct vie *vie) vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { - /* - * Table 2-5: Special Cases of REX Encodings - * - * mod=0, r/m=5 is used in the compatibility mode to - * indicate a disp32 without a base register. - * - * mod!=3, r/m=4 is used in the compatibility mode to - * indicate that the SIB byte is present. - * - * The 'b' bit in the REX prefix is don't care in - * this case. 
- */ + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); - /* SIB addressing not supported yet */ + /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) - return (-1); + goto done; vie->base_register = gpr_map[vie->rm]; - if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG)) - vie->operand_register = gpr_map[vie->reg]; - switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; @@ -295,12 +554,76 @@ decode_modrm(struct vie *vie) break; } - /* calculate the operand size */ - if (vie->rex_w) - vie->op_size = VIE_OP_SIZE_64BIT; - - if (vie->opcode_flags & VIE_F_FROM_IMM) + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) vie->imm_bytes = 4; + else if (vie->op.op_flags & VIE_OP_F_IMM8) + vie->imm_bytes = 1; + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; vie_advance(vie); @@ -348,13 +671,14 @@ decode_immediate(struct vie *vie) uint8_t x; union { char buf[4]; + int8_t signed8; int32_t signed32; } u; if ((n = vie->imm_bytes) == 0) return (0); - if (n != 4) + if (n != 1 && n != 4) panic("decode_immediate: invalid imm_bytes %d", n); for (i = 0; i < n; i++) { @@ -365,14 +689,62 @@ decode_immediate(struct vie *vie) vie_advance(vie); } - vie->immediate = u.signed32; /* sign-extended */ + if (n == 1) + vie->immediate = u.signed8; /* sign-extended */ + else + vie->immediate = u.signed32; /* sign-extended */ return (0); } +#define VERIFY_GLA +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
+ */ +#ifdef VERIFY_GLA +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx; + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + if (base + vie->scale * idx + vie->displacement != gla) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla); + return (-1); + } + + return (0); +} +#endif /* VERIFY_GLA */ + int -vmm_decode_instruction(struct vie *vie) +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { + if (decode_rex(vie)) return (-1); @@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie) if (decode_modrm(vie)) return (-1); + if (decode_sib(vie)) + return (-1); + if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); +#ifdef VERIFY_GLA + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); +#endif + + vie->decoded = 1; /* success */ + return (0); } +#endif /* _KERNEL */ diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h deleted file mode 100644 index 1fa9e2b..0000000 --- a/sys/amd64/vmm/vmm_instruction_emul.h +++ /dev/null @@ -1,91 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _VMM_INSTRUCTION_EMUL_H_ -#define _VMM_INSTRUCTION_EMUL_H_ - -enum vie_op_size { - VIE_OP_SIZE_32BIT, /* default */ - VIE_OP_SIZE_64BIT, - VIE_OP_SIZE_8BIT -}; - -#define VIE_INST_SIZE 15 -struct vie { - uint8_t inst[VIE_INST_SIZE]; - - uint8_t rex_w:1, - rex_r:1, - rex_x:1, - rex_b:1; - - uint8_t mod:2, - reg:4, - rm:4; - - - uint8_t opcode_byte; - uint16_t opcode_flags; - uint8_t disp_bytes; - uint8_t imm_bytes; - - int num_valid; - int num_processed; - - enum vm_reg_name base_register; - enum vm_reg_name index_register; - enum vm_reg_name operand_register; - - int op_size; - int64_t displacement; - int64_t immediate; -}; - -#define VIE_F_HAS_MODRM (1 << 0) -#define VIE_F_FROM_RM (1 << 1) -#define VIE_F_FROM_REG (1 << 2) -#define VIE_F_TO_RM (1 << 3) -#define VIE_F_TO_REG (1 << 4) -#define VIE_F_FROM_IMM (1 << 5) - -#define VIE_MOD_INDIRECT 0 -#define VIE_MOD_INDIRECT_DISP8 1 -#define VIE_MOD_INDIRECT_DISP32 2 -#define VIE_MOD_DIRECT 3 - -#define VIE_RM_SIB 4 -#define VIE_RM_DISP32 5 - -struct vm; - -int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, - uint64_t cr3, struct vie *vie); - -int vmm_decode_instruction(struct vie *vie); - -#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index bb22122..dabcf06 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -34,12 +34,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vlapic.h" -#include "vmm_instruction_emul.h" static int lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) @@ -177,64 +177,45 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) } int -lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie) +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) { - int handled, error; - uint64_t val; + int error; + uint64_t off; struct vlapic *vlapic; - const int UNHANDLED = 0; + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_write(vlapic, off, DWORD, wval); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; - /* Only 32-bit accesses to local apic */ - if (vie->op_size != VIE_OP_SIZE_32BIT) - return (UNHANDLED); + off = gpa - DEFAULT_APIC_BASE; /* - * XXX - * The operand register in which we store the result of the - * read must be a GPR that we can modify even if the vcpu - * is "running". All the GPRs qualify except for %rsp. - * - * This is a limitation of the vm_set_register() API - * and can be fixed if necessary. + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
*/ - if (vie->operand_register == VM_REG_GUEST_RSP) - return (UNHANDLED); - - if (read) { - if ((vie->opcode_flags & VIE_F_TO_REG) == 0) - return (UNHANDLED); - - if (vie->operand_register >= VM_REG_LAST) - return (UNHANDLED); - - handled = lapic_read(vlapic, offset, &val); - if (handled) { - error = vm_set_register(vm, cpu, vie->operand_register, - val); - if (error) - panic("lapic_mmio: error %d setting gpr %d", - error, vie->operand_register); - } - } else { - if ((vie->opcode_flags & VIE_F_FROM_REG) && - (vie->operand_register < VM_REG_LAST)) { - error = vm_get_register(vm, cpu, vie->operand_register, - &val); - if (error) { - panic("lapic_mmio: error %d getting gpr %d", - error, vie->operand_register); - } - } else if (vie->opcode_flags & VIE_F_FROM_IMM) { - val = vie->immediate; - } else { - return (UNHANDLED); - } - - handled = lapic_write(vlapic, offset, val); - } + if (size != 4 || off & 0xf) + return (EINVAL); - return (handled); + vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_read(vlapic, off, DWORD, rval); + return (error); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 59fc016..a79912e 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -30,13 +30,15 @@ #define _VMM_LAPIC_H_ struct vm; -struct vie; boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); -int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); int lapic_timer_tick(struct vm *vm, int cpu); -- cgit v1.1 From 308122a0f1279a0305554f864d6b01f9c132511a Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 13:10:18 +0000 Subject: Change emulate_rdmsr() and emulate_wrmsr() to return 0 on sucess and errno on failure. The conversion from the return value to HANDLED or UNHANDLED can be done locally in vmx_exit_process(). 
Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 14 +++++++------ sys/amd64/vmm/vmm_lapic.c | 52 +++++++++++++++-------------------------------- sys/amd64/vmm/vmm_msr.c | 36 ++++++++++++-------------------- 3 files changed, 37 insertions(+), 65 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index b185c57..af4a03f 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1214,23 +1214,25 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_RDMSR: ecx = vmxctx->guest_rcx; - handled = emulate_rdmsr(vmx->vm, vcpu, ecx); - if (!handled) { + error = emulate_rdmsr(vmx->vm, vcpu, ecx); + if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; - } + } else + handled = 1; break; case EXIT_REASON_WRMSR: eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; - handled = emulate_wrmsr(vmx->vm, vcpu, ecx, + error = emulate_wrmsr(vmx->vm, vcpu, ecx, (uint64_t)edx << 32 | eax); - if (!handled) { + if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; - } + } else + handled = 1; break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index dabcf06..d024b71 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -41,32 +41,6 @@ __FBSDID("$FreeBSD$"); #include "vmm_lapic.h" #include "vlapic.h" -static int -lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) -{ - int handled; - - if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) - handled = 1; - else - handled = 0; - - return (handled); -} - -static int -lapic_read(struct vlapic *vlapic, u_int offset, uint64_t *rv) -{ - int handled; - - if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) - handled = 1; - else - handled = 0; - - return (handled); -} - int lapic_pending_intr(struct vm *vm, int cpu) { @@ -145,35 +119,41 @@ lapic_msr(u_int msr) int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) { - int handled; + int error; + u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { *rval = vlapic_get_apicbase(vlapic); - handled = 1; - } else - handled = lapic_read(vlapic, x2apic_msr_to_regoff(msr), rval); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_op_mem_read(vlapic, offset, DWORD, rval); + } - return (handled); + return (error); } int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) { - int handled; + int error; + u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { vlapic_set_apicbase(vlapic, val); - handled = 1; - } else - handled = lapic_write(vlapic, x2apic_msr_to_regoff(msr), val); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_op_mem_write(vlapic, offset, DWORD, val); + } - return (handled); + return (error); } int diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index bc67f98..d97c819 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$"); #define VMM_MSR_F_EMULATE 0x01 #define VMM_MSR_F_READONLY 0x02 -#define VMM_MSR_F_INVALID 0x04 +#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */ struct vmm_msr { int num; @@ -137,20 +137,15 @@ msr_num_to_idx(u_int num) int emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) { - int handled, idx; + int 
idx; uint64_t *guest_msrs; - handled = 0; - if (lapic_msr(num)) return (lapic_wrmsr(vm, cpu, num, val)); idx = msr_num_to_idx(num); - if (idx < 0) - goto done; - - if (invalid_msr(idx)) - goto done; + if (idx < 0 || invalid_msr(idx)) + return (EINVAL); if (!readonly_msr(idx)) { guest_msrs = vm_guest_msrs(vm, cpu); @@ -163,31 +158,26 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) wrmsr(vmm_msr[idx].num, val); } - handled = 1; -done: - return (handled); + return (0); } int emulate_rdmsr(struct vm *vm, int cpu, u_int num) { - int error, handled, idx; + int error, idx; uint32_t eax, edx; uint64_t result, *guest_msrs; - handled = 0; - if (lapic_msr(num)) { - handled = lapic_rdmsr(vm, cpu, num, &result); + error = lapic_rdmsr(vm, cpu, num, &result); goto done; } idx = msr_num_to_idx(num); - if (idx < 0) - goto done; - - if (invalid_msr(idx)) + if (idx < 0 || invalid_msr(idx)) { + error = EINVAL; goto done; + } guest_msrs = vm_guest_msrs(vm, cpu); result = guest_msrs[idx]; @@ -202,10 +192,10 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) result, rdmsr(num)); } - handled = 1; + error = 0; done: - if (handled) { + if (error == 0) { eax = result; edx = result >> 32; error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); @@ -215,7 +205,7 @@ done: if (error) panic("vm_set_register(rdx) error %d", error); } - return (handled); + return (error); } void -- cgit v1.1 From da4e87dfd614fffb88e5a93c988e1caec9c9efe7 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 13:34:44 +0000 Subject: Cleanup the user-space paging exit handler now that the unified instruction emulation is in place. Obtained from: NetApp --- sys/amd64/include/vmm.h | 2 -- sys/amd64/vmm/intel/vmx.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 2fb2194..024c30e 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -267,9 +267,7 @@ struct vm_exit { uint32_t eax; /* valid for out */ } inout; struct { - uint64_t cr3; uint64_t gpa; - int rwx; struct vie vie; } paging; /* diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index af4a03f..2b6ef35 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1303,9 +1303,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) cr3, qual, &vmexit->u.paging.vie); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; - vmexit->u.paging.cr3 = cr3; vmexit->u.paging.gpa = gpa; - vmexit->u.paging.rwx = qual & 0x7; } break; default: -- cgit v1.1 From ffd1f089c33d0e59c0cb85b52bc683272f7880dd Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 29 Nov 2012 06:26:42 +0000 Subject: Add support for the 0x81 AND instruction, now generated by clang in the local APIC code. 0x81 is a read-modify-write instruction - the EPT check that only allowed read or write and not both has been relaxed to allow read and write. 
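Because the destination operand is memory, emulating this form means reading the old value from the faulting GPA, applying the immediate, and writing the result back; the shape of it, reusing the callback interface introduced earlier (a sketch, not the literal vmm_instruction_emul.c hunk):

    /* Read-modify-write: "and r/m32, imm32" against guest-physical memory. */
    error = memread(vm, vcpuid, gpa, &val, size, arg);        /* read   */
    if (error == 0) {
            val &= vie->immediate;                             /* modify */
            error = memwrite(vm, vcpuid, gpa, val, size, arg); /* write  */
    }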
Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 9 +++++---- sys/amd64/vmm/vmm_instruction_emul.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2b6ef35..4f267bb 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1159,15 +1159,16 @@ vmx_ept_fault(struct vm *vm, int cpu, if (ept_qual & EPT_VIOLATION_INST_FETCH) return (UNHANDLED); - /* EPT violation must be a read fault or a write fault but not both */ + /* EPT violation must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; - if ((read ^ write) == 0) + if ((read | write) == 0) return (UNHANDLED); /* - * The EPT violation must have been caused by accessing a guest-physical - * address that is a translation of a guest-linear address. + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 5e5399b..0a7286b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -81,6 +81,11 @@ static const struct vie_op one_byte_opcodes[256] = { [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, + }, + [0x81] = { + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_AND, + .op_flags = VIE_OP_F_IMM, } }; @@ -299,6 +304,30 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, val1 &= val2; error = vie_update_register(vm, vcpuid, reg, val1, size); break; + case 0x81: + printf("0x81 AND\n"); + /* + * AND reg (ModRM:reg) with immediate and store the + * result in reg + * + * 81/ and r/m32, imm32 + * REX.W + 81/ and r/m64, imm32 sign-extended to 64 + */ + if (vie->rex_w) + size = 8; + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + val1 &= vie->immediate; + error = memwrite(vm, vcpuid, gpa, val1, size, arg); + break; default: break; } -- cgit v1.1 From f59654890648245951f48676dc390a6d95f03aae Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 29 Nov 2012 15:08:13 +0000 Subject: Remove debug printf. Pointed out by: emaste --- sys/amd64/vmm/vmm_instruction_emul.c | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 0a7286b..1c4abf8 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -305,7 +305,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = vie_update_register(vm, vcpuid, reg, val1, size); break; case 0x81: - printf("0x81 AND\n"); /* * AND reg (ModRM:reg) with immediate and store the * result in reg -- cgit v1.1 From 7f24aaf567c1daf5f2478b28960fa3f98e18e374 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 30 Nov 2012 05:40:24 +0000 Subject: Properly screen for the AND 0x81 instruction from the set of group1 0x81 instructions that use the reg bits as an extended opcode. Still todo: properly update rflags. 
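For the group-1 opcodes (0x80/0x81/0x83) the ModRM reg field is not a register operand but an opcode extension: /0 ADD, /1 OR, /2 ADC, /3 SBB, /4 AND, /5 SUB, /6 XOR, /7 CMP. Screening therefore reduces to a check on that field before emulating, roughly:

    /* ModRM.reg selects the group-1 operation; only 81 /4 (AND) is handled. */
    if ((vie->reg & 7) != 4)
            return (EINVAL);    /* anything else is punted as unhandled */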
Pointed out by: jilles@ --- sys/amd64/vmm/vmm_instruction_emul.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 1c4abf8..e73f6bb 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -83,6 +83,7 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_AND, }, [0x81] = { + /* XXX Group 1 extended opcode - not just AND */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_AND, .op_flags = VIE_OP_F_IMM, @@ -311,7 +312,13 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * * 81/ and r/m32, imm32 * REX.W + 81/ and r/m64, imm32 sign-extended to 64 + * + * Currently, only the AND operation of the 0x81 opcode + * is implemented (ModRM:reg = b100). */ + if ((vie->reg & 7) != 4) + break; + if (vie->rex_w) size = 8; -- cgit v1.1 From 7d7f92fbade54e46285282d2c5f456298084d794 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 16 Dec 2012 00:57:14 +0000 Subject: Prefer x2apic mode when running inside a virtual machine. Provide a tunable 'machdep.x2apic_desired' to let the administrator override the default behavior. Provide a read-only sysctl 'machdep.x2apic' to let the administrator know whether the kernel is using x2apic or legacy mmio to access local apic. Tested with Parallels Desktop 8 and bhyve hypervisors. Also tested running on bare metal Intel Xeon E5-2658. Obtained from: NetApp Discussed with: jhb, attilio, avg, grehan --- sys/amd64/amd64/mp_machdep.c | 2 ++ sys/amd64/include/apicvar.h | 1 + 2 files changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index b4a0be4..f7423be 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -708,6 +708,8 @@ init_secondary(void) wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); + lapic_init_ap(); + /* Disable local APIC just to be sure. */ lapic_disable(); diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index ae2f5b9..dee5900 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -209,6 +209,7 @@ int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); +void lapic_init_ap(void); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); -- cgit v1.1 From bc64633d9d492d8fadbd7972ea2ae81a660233a3 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 16 Dec 2012 01:20:08 +0000 Subject: Modify the default behavior of bhyve such that it no longer forces the use of x2apic mode on the guest. The guest can decide whether or not it wants to use legacy mmio or x2apic access to the APIC by writing to the MSR_APICBASE register. 
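The selection hinges on the x2APIC-enable (EXTD) bit of the IA32_APIC_BASE MSR, which the FreeBSD headers spell APICBASE_X2APIC; a minimal sketch of the check the hypervisor ends up honoring (hypothetical helper name, not the vlapic.c code):

    #include <stdint.h>

    #define X2APIC_ENABLE_BIT   (1ULL << 10)    /* EXTD bit of IA32_APIC_BASE */

    /* Guest opts into x2apic by setting the bit; clearing it keeps MMIO access. */
    static int
    guest_wants_x2apic(uint64_t msr_apicbase)
    {
            return ((msr_apicbase & X2APIC_ENABLE_BIT) != 0);
    }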
Obtained from: NetApp --- sys/amd64/vmm/io/vlapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 911ed64..15fc6c2 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -896,8 +896,6 @@ vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) vlapic = vm_lapic(vm, vcpuid); - if (state == X2APIC_ENABLED) - vlapic->msr_apicbase |= APICBASE_X2APIC; - else + if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; } -- cgit v1.1 From 01173b0b4a9b00c153489a51f2cba1b3d0cfc119 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 02:04:41 +0000 Subject: The "unrestricted guest" capability is a feature of Intel VT-x that allows the guest to execute real or unpaged protected mode code - bhyve relies on this feature to execute the AP bootstrap code. Get rid of the hack that allowed bhyve to support SMP guests on processors that do not have the "unrestricted guest" capability. This hack was entirely FreeBSD-specific and would not work with any other guest OS. Instead, limit the number of vcpus to 1 when executing on processors without "unrestricted guest" capability. Suggested by: grehan Obtained from: NetApp --- sys/amd64/amd64/mp_machdep.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index f7423be..33e0814 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -145,26 +145,6 @@ struct cpu_info { int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; -/* - * Trampoline for hypervisor direct 64-bit jump. - * - * 0 - signature for guest->host verification - * 8 - virtual address of this page - * 16 - instruction virtual address - * 24 - stack pointer virtual address - * 32 - CR3, physical address of kernel page table - * 40 - 24-byte area for null/code/data GDT entries - */ -#define MP_V64T_SIG 0xcafebabecafebabeULL -struct mp_v64tramp { - uint64_t mt_sig; - uint64_t mt_virt; - uint64_t mt_eip; - uint64_t mt_rsp; - uint64_t mt_cr3; - uint64_t mt_gdtr[3]; -}; - /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; @@ -967,29 +947,6 @@ start_all_aps(void) bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; - /* - * If running in a VM that doesn't support the unrestricted - * guest 16-bit mode, forget most of the above and create - * the data block that allows the hypervisor to direct-jump - * into 64-bit mode. Copy this over the top of the 16-bit - * bootstrap. The startup-IPI informs the hypervisor which - * physical page this data block lies in. The hypervisor - * will then use the block to initialise register state of - * the AP in an almost identical fashion to how it builds - * the BSP initial register state. 
- */ - if (testenv("hw.use_bvm_mptramp")) { - struct mp_v64tramp mv; - - bzero(&mv, sizeof(mv)); - mv.mt_sig = MP_V64T_SIG; - mv.mt_virt = (uint64_t) va; - mv.mt_eip = (uint64_t) init_secondary; - mv.mt_rsp = (uint64_t) bootSTK; - mv.mt_cr3 = KPML4phys; - bcopy(&mv, (void *) va, sizeof(mv)); - } - /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ -- cgit v1.1 From fec8c768eb1c50ae42da476ee843414b7f87b8f5 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 02:49:12 +0000 Subject: There is no need for 'start_emulating()' and 'stop_emulating()' to be defined in so remove them from there. Obtained from: NetApp --- sys/amd64/include/cpufunc.h | 17 ----------------- sys/amd64/vmm/vmm.c | 4 ++-- 2 files changed, 2 insertions(+), 19 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index 7243173..881fcd2 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -705,23 +705,6 @@ intr_disable(void) return (rflags); } -#ifndef CR0_TS -/* Defined in */ -#define CR0_TS 0x00000008 -#endif -static __inline void -start_emulating(void) -{ - __asm __volatile("smsw %%ax; orb %0,%%al; lmsw %%ax" - : : "n" (CR0_TS) : "ax"); -} - -static __inline void -stop_emulating(void) -{ - __asm __volatile("clts"); -} - static __inline void intr_restore(register_t rflags) { diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index eae9ccc..a4dea79 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -145,8 +145,8 @@ static struct vmm_ops *ops; #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) -#define fpu_start_emulating() start_emulating() -#define fpu_stop_emulating() stop_emulating() +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ -- cgit v1.1 From 736fc919674c3c284d5611e7e9b572385c4dbc0e Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 03:02:43 +0000 Subject: There is no need for a special 'BHYVE' kernel configuration file anymore - 'GENERIC' works fine. Obtained from: NetApp --- sys/amd64/conf/BHYVE | 345 --------------------------------------------------- 1 file changed, 345 deletions(-) delete mode 100644 sys/amd64/conf/BHYVE (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE deleted file mode 100644 index 89c8ea2..0000000 --- a/sys/amd64/conf/BHYVE +++ /dev/null @@ -1,345 +0,0 @@ -# -# BHYVE -- Kernel configuration file for FreeBSD/amd64 bhyve guest -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. 
-# -# $FreeBSD: projects/bhyve/sys/amd64/conf/GENERIC 221914 2011-05-14 20:35:01Z jhb $ - -cpu HAMMER -ident BHYVE - -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols - -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options SCTP # Stream Control Transmission Protocol -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options UFS_GJOURNAL # Enable gjournal-based UFS journaling -options MD_ROOT # MD is a potential root device -options NFSCL # New Network Filesystem Client -options NFSD # New Network Filesystem Server -options NFSLOCKD # Network Lock Manager -options NFS_ROOT # NFS usable as /, requires NFSCLIENT -options MSDOSFS # MSDOS Filesystem -options CD9660 # ISO 9660 Filesystem -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_GPT # GUID Partition Tables. -options GEOM_LABEL # Provides labelization -options COMPAT_FREEBSD32 # Compatible with i386 binaries -options COMPAT_FREEBSD4 # Compatible with FreeBSD4 -options COMPAT_FREEBSD5 # Compatible with FreeBSD5 -options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -options COMPAT_FREEBSD7 # Compatible with FreeBSD7 -options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI -options KTRACE # ktrace(1) support -options STACK # stack(9) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions -options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -#options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) -options AUDIT # Security event auditing -options MAC # TrustedBSD MAC Framework -#options KDTRACE_FRAME # Ensure frames are compiled in -#options KDTRACE_HOOKS # Kernel DTrace hooks -options INCLUDE_CONFIG_FILE # Include this file in kernel - -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options DEADLKRES # Enable the deadlock resolver -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed -options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones - -# Make an SMP-capable kernel by default -options SMP # Symmetric MultiProcessor Kernel - -# CPU frequency control -#device cpufreq - -# Bus support. -#device acpi -device pci - -# Floppy drives -#device fdc - -# ATA controllers -#device ahci # AHCI-compatible SATA controllers -#device ata # Legacy ATA/SATA controllers -#options ATA_CAM # Handle legacy controllers with CAM -#options ATA_STATIC_ID # Static device numbering -#device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA -#device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA - -# SCSI Controllers -#device ahc # AHA2940 and onboard AIC7xxx devices -#options AHC_REG_PRETTY_PRINT # Print register bitfields in debug - # output. Adds ~128k to driver. 
-#device ahd # AHA39320/29320 and onboard AIC79xx devices -#options AHD_REG_PRETTY_PRINT # Print register bitfields in debug - # output. Adds ~215k to driver. -#device amd # AMD 53C974 (Tekram DC-390(T)) -#device hptiop # Highpoint RocketRaid 3xxx series -#device isp # Qlogic family -#device ispfw # Firmware for QLogic HBAs- normally a module -#device mpt # LSI-Logic MPT-Fusion -#device mps # LSI-Logic MPT-Fusion 2 -#device ncr # NCR/Symbios Logic -#device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') -#device trm # Tekram DC395U/UW/F DC315U adapters - -#device adv # Advansys SCSI adapters -#device adw # Advansys wide SCSI adapters -#device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. -#device bt # Buslogic/Mylex MultiMaster SCSI adapters - -# ATA/SCSI peripherals -#device scbus # SCSI bus (required for ATA/SCSI) -#device ch # SCSI media changers -#device da # Direct Access (disks) -#device sa # Sequential Access (tape etc) -#device cd # CD -#device pass # Passthrough device (direct ATA/SCSI access) -#device ses # SCSI Environmental Services (and SAF-TE) - -# RAID controllers interfaced to the SCSI subsystem -#device amr # AMI MegaRAID -#device arcmsr # Areca SATA II RAID -#XXX it is not 64-bit clean, -scottl -#device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID -#device ciss # Compaq Smart RAID 5* -#device dpt # DPT Smartcache III, IV - See NOTES for options -#device hptmv # Highpoint RocketRAID 182x -#device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx -#device iir # Intel Integrated RAID -#device ips # IBM (Adaptec) ServeRAID -#device mly # Mylex AcceleRAID/eXtremeRAID -#device twa # 3ware 9000 series PATA/SATA RAID - -# RAID controllers -#device aac # Adaptec FSA RAID -#device aacp # SCSI passthrough for aac (requires CAM) -#device ida # Compaq Smart RAID -#device mfi # LSI MegaRAID SAS -#device mlx # Mylex DAC960 family -#XXX pointer/int warnings -#device pst # Promise Supertrak SX6000 -#device twe # 3ware ATA RAID - -# atkbdc0 controls both the keyboard and the PS/2 mouse -#device atkbdc # AT keyboard controller -#device atkbd # AT keyboard -#device psm # PS/2 mouse - -#device kbdmux # keyboard multiplexer - -#device vga # VGA video card driver - -#device splash # Splash screen and screen saver support - -# syscons is the default console driver, resembling an SCO console -#device sc -#options SC_PIXEL_MODE # add support for the raster text mode - -#device agp # support several AGP chipsets - -# PCCARD (PCMCIA) support -# PCMCIA and cardbus bridge support -#device cbb # cardbus (yenta) bridge -#device pccard # PC Card (16-bit) bus -#device cardbus # CardBus (32-bit) bus - -# Serial (COM) ports -device uart # Generic UART driver - -# Parallel port -#device ppc -#device ppbus # Parallel port bus (required) -#device lpt # Printer -#device plip # TCP/IP over parallel -#device ppi # Parallel port interface device -#device vpo # Requires scbus and da - -# If you've got a "dumb" serial or parallel PCI card that is -# supported by the puc(4) glue driver, uncomment the following -# line to enable it (connects to sio, uart and/or ppc drivers): -#device puc - -# PCI Ethernet NICs. 
-#device bxe # Broadcom BCM57710/BCM57711/BCM57711E 10Gb Ethernet -#device de # DEC/Intel DC21x4x (``Tulip'') -#device em # Intel PRO/1000 Gigabit Ethernet Family -#device igb # Intel PRO/1000 PCIE Server Gigabit Family -#device ixgbe # Intel PRO/10GbE PCIE Ethernet Family -#device le # AMD Am7900 LANCE and Am79C9xx PCnet -#device ti # Alteon Networks Tigon I/II gigabit Ethernet -#device txp # 3Com 3cR990 (``Typhoon'') -#device vx # 3Com 3c590, 3c595 (``Vortex'') - -# PCI Ethernet NICs that use the common MII bus controller code. -# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! -#device miibus # MII bus support -#device ae # Attansic/Atheros L2 FastEthernet -#device age # Attansic/Atheros L1 Gigabit Ethernet -#device alc # Atheros AR8131/AR8132 Ethernet -#device ale # Atheros AR8121/AR8113/AR8114 Ethernet -#device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet -#device bfe # Broadcom BCM440x 10/100 Ethernet -#device bge # Broadcom BCM570xx Gigabit Ethernet -#device dc # DEC/Intel 21143 and various workalikes -#device et # Agere ET1310 10/100/Gigabit Ethernet -#device fxp # Intel EtherExpress PRO/100B (82557, 82558) -#device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet -#device lge # Level 1 LXT1001 gigabit Ethernet -#device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet -#device nfe # nVidia nForce MCP on-board Ethernet -#device nge # NatSemi DP83820 gigabit Ethernet -#device nve # nVidia nForce MCP on-board Ethernet Networking -#device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') -#device re # RealTek 8139C+/8169/8169S/8110S -#device rl # RealTek 8129/8139 -#device sf # Adaptec AIC-6915 (``Starfire'') -#device sge # Silicon Integrated Systems SiS190/191 -#device sis # Silicon Integrated Systems SiS 900/SiS 7016 -#device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet -#device ste # Sundance ST201 (D-Link DFE-550TX) -#device stge # Sundance/Tamarack TC9021 gigabit Ethernet -#device tl # Texas Instruments ThunderLAN -#device tx # SMC EtherPower II (83c170 ``EPIC'') -#device vge # VIA VT612x gigabit Ethernet -#device vr # VIA Rhine, Rhine II -#device wb # Winbond W89C840F -#device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') - -# ISA Ethernet NICs. pccard NICs included. -#device cs # Crystal Semiconductor CS89x0 NIC -# 'device ed' requires 'device miibus' -#device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards -#device ex # Intel EtherExpress Pro/10 and Pro/10+ -#device ep # Etherlink III based cards -#device fe # Fujitsu MB8696x based cards -#device sn # SMC's 9000 series of Ethernet chips -#device xe # Xircom pccard Ethernet - -# Wireless NIC cards -#device wlan # 802.11 support -#options IEEE80211_DEBUG # enable debug msgs -#options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's -#options IEEE80211_SUPPORT_MESH # enable 802.11s draft support -#device wlan_wep # 802.11 WEP support -#device wlan_ccmp # 802.11 CCMP support -#device wlan_tkip # 802.11 TKIP support -#device wlan_amrr # AMRR transmit rate control algorithm -#device an # Aironet 4500/4800 802.11 wireless NICs. -#device ath # Atheros NIC's -#device ath_pci # Atheros pci/cardbus glue -#device ath_hal # pci/cardbus chip support -#options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors -#device ath_rate_sample # SampleRate tx rate control for ath -#device bwi # Broadcom BCM430x/BCM431x wireless NICs. -#device bwn # Broadcom BCM43xx wireless NICs. -#device ipw # Intel 2100 wireless NICs. -#device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. 
-#device iwn # Intel 4965/1000/5000/6000 wireless NICs. -#device malo # Marvell Libertas wireless NICs. -#device mwl # Marvell 88W8363 802.11n wireless NICs. -#device ral # Ralink Technology RT2500 wireless NICs. -#device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. -#device wpi # Intel 3945ABG wireless NICs. - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device vlan # 802.1Q VLAN support -device tun # Packet tunnel. -device pty # BSD-style compatibility pseudo ttys -device md # Memory "disks" -device gif # IPv6 and IPv4 tunneling -device faith # IPv6-to-IPv4 relaying (translation) -device firmware # firmware assist module - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter - -# USB support -#options USB_DEBUG # enable debug msgs -#device uhci # UHCI PCI->USB interface -#device ohci # OHCI PCI->USB interface -#device ehci # EHCI PCI->USB interface (USB 2.0) -#device usb # USB Bus (required) -#device udbp # USB Double Bulk Pipe devices (needs netgraph) -#device uhid # "Human Interface Devices" -#device ukbd # Keyboard -#device ulpt # Printer -#device umass # Disks/Mass storage - Requires scbus and da -#device ums # Mouse -#device urio # Diamond Rio 500 MP3 player -# USB Serial devices -#device u3g # USB-based 3G modems (Option, Huawei, Sierra) -#device uark # Technologies ARK3116 based serial adapters -#device ubsa # Belkin F5U103 and compatible serial adapters -#device uftdi # For FTDI usb serial adapters -#device uipaq # Some WinCE based devices -#device uplcom # Prolific PL-2303 serial adapters -#device uslcom # SI Labs CP2101/CP2102 serial adapters -#device uvisor # Visor and Palm devices -#device uvscom # USB serial support for DDI pocket's PHS -# USB Ethernet, requires miibus -#device aue # ADMtek USB Ethernet -#device axe # ASIX Electronics USB Ethernet -#device cdce # Generic USB over Ethernet -#device cue # CATC USB Ethernet -#device kue # Kawasaki LSI USB Ethernet -#device rue # RealTek RTL8150 USB Ethernet -#device udav # Davicom DM9601E USB -# USB Wireless -#device rum # Ralink Technology RT2501USB wireless NICs -#device run # Ralink Technology RT2700/RT2800/RT3000 NICs. -#device uath # Atheros AR5523 wireless NICs -#device upgt # Conexant/Intersil PrismGT wireless NICs. -#device ural # Ralink Technology RT2500USB wireless NICs -#device urtw # Realtek RTL8187B/L wireless NICs -#device zyd # ZyDAS zb1211/zb1211b wireless NICs - -# FireWire support -#device firewire # FireWire bus code -#device sbp # SCSI over FireWire (Requires scbus and da) -#device fwe # Ethernet over FireWire (non-standard!) -#device fwip # IP over FireWire (RFC 2734,3146) -#device dcons # Dumb console driver -#device dcons_crom # Configuration ROM for dcons - -device bvmconsole # brain dead simple bvm console -device bvmdebug # brain dead simple bvm gdb pipe - -device mptable -options NKPT=256 -- cgit v1.1 From eda0d7f2563ff44f103201dbb5a841351f0c024a Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 5 Jan 2013 03:35:30 +0000 Subject: bhyve does not require a custom configuration file anymore so make the GENERIC identical to the one in HEAD. 
Obtained from: NetApp --- sys/amd64/conf/GENERIC | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 9c72500..7aff1e8 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -74,7 +74,7 @@ options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging support. Always need this: options KDB # Enable kernel debugger support. # For minimum debugger support (stable branch) use: -options KDB_TRACE # Print a stack trace for a panic. +#options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use this instead: options DDB # Support DDB. options GDB # Support remote GDB. @@ -326,11 +326,6 @@ device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver device dcons_crom # Configuration ROM for dcons -# bhyve options -device bvmconsole # brain dead simple bvm console -device bvmdebug # brain dead simple bvm gdb pipe -device mptable - # Sound support device sound # Generic sound driver (required) device snd_cmi # CMedia CMI8338/CMI8738 -- cgit v1.1 From d184bb1077cf7d96f98f7b5b1fb24951ff6a80e7 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 6 Jan 2013 05:37:26 +0000 Subject: Revert changes for x2apic support from projects/bhyve. During the early days of bhyve it did not support instruction emulation which necessitated the use of x2apic to access the local apic. This is no longer the case and the dependency on x2apic has gone away. The x2apic patches can be considered independently of bhyve and will be merged into head via projects/x2apic. Discussed with: grehan --- sys/amd64/amd64/apic_vector.S | 55 ++++++++++++++----------------------------- sys/amd64/amd64/mp_machdep.c | 2 -- sys/amd64/include/apicvar.h | 1 - 3 files changed, 18 insertions(+), 40 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 8004153..6465247 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -57,15 +57,8 @@ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ FAKE_MCOUNT(TF_RIP(%rsp)) ; \ movq lapic, %rdx ; /* pointer to local APIC */ \ - testq %rdx, %rdx; \ - jnz 3f; \ - movl $MSR_APIC_ISR ## index, %ecx; \ - rdmsr; \ - jmp 4f; \ -3: ; \ movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ -4: ; \ - bsrl %eax, %eax ; /* index of highset set bit in ISR */ \ + bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ jz 1f ; \ addl $(32 * index),%eax ; \ movq %rsp, %rsi ; \ @@ -136,26 +129,6 @@ IDTVEC(errorint) jmp doreti #ifdef SMP - -/* - * We assume that %rax is being saved/restored outside of this macro - */ -#define DO_EOI \ - movq lapic, %rax; \ - testq %rax, %rax; \ - jz 8f; \ - movl $0, LA_EOI(%rax); \ - jmp 9f; \ -8:; \ - pushq %rcx; \ - pushq %rdx; \ - xorl %edx, %edx; /* eax is already zero */ \ - movl $MSR_APIC_EOI, %ecx; \ - wrmsr; \ - popq %rdx; \ - popq %rcx; \ -9: - /* * Global address space TLB shootdown. 
*/ @@ -180,7 +153,8 @@ IDTVEC(invltlb) movq %cr3, %rax /* invalidate the TLB */ movq %rax, %cr3 - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -212,7 +186,8 @@ IDTVEC(invlpg) movq smp_tlb_addr1, %rax invlpg (%rax) /* invalidate single page */ - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -249,7 +224,8 @@ IDTVEC(invlrng) cmpq %rax, %rdx jb 1b - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -276,7 +252,8 @@ IDTVEC(invlcache) wbinvd - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -292,8 +269,9 @@ IDTVEC(invlcache) IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME - DO_EOI - + movq lapic, %rdx + movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ + FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler @@ -308,7 +286,8 @@ IDTVEC(ipi_intr_bitmap_handler) IDTVEC(cpustop) PUSH_FRAME - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ call cpustop_handler jmp doreti @@ -322,7 +301,8 @@ IDTVEC(cpususpend) PUSH_FRAME call cpususpend_handler - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ jmp doreti /* @@ -340,6 +320,7 @@ IDTVEC(rendezvous) incq (%rax) #endif call smp_rendezvous_action - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 33e0814..d2e4aad 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -688,8 +688,6 @@ init_secondary(void) wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); - lapic_init_ap(); - /* Disable local APIC just to be sure. */ lapic_disable(); diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index dee5900..ae2f5b9 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -209,7 +209,6 @@ int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); -void lapic_init_ap(void); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); -- cgit v1.1 From 4c17637f9cfd75b0ebb0474d3f2f39483a453913 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 9 Jan 2013 03:32:23 +0000 Subject: IFC @ r245205 --- sys/amd64/amd64/vm_machdep.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index a40eaba..9883715 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -574,10 +574,9 @@ cpu_reset_proxy() cpuset_t tcrp; cpu_reset_proxy_active = 1; - while (cpu_reset_proxy_active == 1) { - ia32_pause(); - ; /* Wait for other cpu to see that we've started */ - } + while (cpu_reset_proxy_active == 1) + ia32_pause(); /* Wait for other cpu to see that we've started */ + CPU_SETOF(cpu_reset_proxyid, &tcrp); stop_cpus(tcrp); printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); -- cgit v1.1 From df2fc90f0e744447190cbd4a0d67474ddadfa96c Mon Sep 17 00:00:00 2001 From: bryanv Date: Sun, 13 Jan 2013 07:14:16 +0000 Subject: Add VirtIO to the i386 and amd64 GENERIC kernels This also removes the kludge from r239009 that covered only the network driver. 
Reviewed by: grehan Approved by: grehan (mentor) MFC after: 1 week --- sys/amd64/conf/GENERIC | 8 ++++++++ sys/amd64/conf/NOTES | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 48f41b3..e53f692 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -332,3 +332,11 @@ device snd_via8233 # VIA VT8233x Audio device mmc # MMC/SD bus device mmcsd # MMC/SD memory card device sdhci # Generic PCI SD Host Controller + +# VirtIO support +device virtio # Generic VirtIO bus (required) +device virtio_pci # VirtIO PCI device +device vtnet # VirtIO Ethernet device +device virtio_blk # VirtIO Block device +device virtio_scsi # VirtIO SCSI device +device virtio_balloon # VirtIO Memory Balloon device diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 6562981..a4371f7 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -440,6 +440,15 @@ device safe # SafeNet 1141 options SAFE_DEBUG # enable debugging support: hw.safe.debug options SAFE_RNDTEST # enable rndtest support +# +# VirtIO support +device virtio # Generic VirtIO bus (required) +device virtio_pci # VirtIO PCI Interface +device vtnet # VirtIO Ethernet device +device virtio_blk # VirtIO Block device +device virtio_scsi # VirtIO SCSI device +device virtio_balloon # VirtIO Memory Balloon device + ##################################################################### # -- cgit v1.1 From 4a3c4478d3346235378985c52457d9bd03d7f401 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 17 Jan 2013 21:32:25 +0000 Subject: Don't attempt to use clflush on the local APIC register window. Various CPUs exhibit bad behavior if this is done (Intel Errata AAJ3, hangs on Pentium-M, and trashing of the local APIC registers on a VIA C7). The local APIC is implicitly mapped UC already via MTRRs, so the clflush isn't necessary anyway. MFC after: 2 weeks --- sys/amd64/amd64/pmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 8e06ff9..352cb34 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1150,6 +1150,15 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) eva - sva < PMAP_CLFLUSH_THRESHOLD) { /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC + * range. The local APIC is always uncached, so we + * don't need to flush for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; + + /* * Otherwise, do per-cache line flush. Use the mfence * instruction to insure that previous stores are * included in the write-back. The processor -- cgit v1.1 From fe9918fd5583c14112273668674d7275662c4961 Mon Sep 17 00:00:00 2001 From: jhb Date: Sat, 19 Jan 2013 01:18:22 +0000 Subject: Fix build with SMP disabled.` Reported by: bf --- sys/amd64/amd64/pmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 352cb34..f73e956 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include +#include #include #include #include @@ -133,6 +134,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include #include #include -- cgit v1.1 From a0f44a6f49338c9ec7e274ad4e8a272008cd1f60 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 20 Jan 2013 03:42:49 +0000 Subject: Add svn properties to the recently merged bhyve source files. 
The pre-commit hook will not allow any commits without the svn:keywords property in head. --- sys/amd64/include/vmm.h | 2 +- sys/amd64/include/vmm_dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 024c30e..ec94083 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + * $FreeBSD$ */ #ifndef _VMM_H_ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 79f893d..2311673 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + * $FreeBSD$ */ #ifndef _VMM_DEV_H_ -- cgit v1.1 From 92a8d9884de2b1441cb846d50856e44cfa2b0024 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 21 Jan 2013 01:33:10 +0000 Subject: Postpone vmm module initialization until after SMP is initialized - particularly that 'smp_started != 0'. This is required because the VT-x initialization calls smp_rendezvous() to set the CR4_VMXE bit on all the cpus. With this change we can preload vmm.ko from the loader. Reported by: alfred@, sbruno@ Obtained from: NetApp --- sys/amd64/vmm/vmm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index a4dea79..d0e6427 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -249,11 +249,15 @@ static moduledata_t vmm_kmod = { }; /* - * Execute the module load handler after the pci passthru driver has had - * a chance to claim devices. We need this information at the time we do - * iommu initialization. + * vmm initialization has the following dependencies: + * + * - iommu initialization must happen after the pci passthru driver has had + * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). */ -DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); -- cgit v1.1 From af17a55dfd7a008dea74152e32f5d6c803b46bdd Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 23 Jan 2013 21:44:48 +0000 Subject: Don't assume that all Linux TCP-level socket options are identical to FreeBSD TCP-level socket options (only the first two are). Instead, using a mapping function and fail unsupported options as we do for other socket option levels. 
MFC after: 2 weeks --- sys/amd64/linux32/linux.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index 2c269d3..c18ee22 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -725,6 +725,13 @@ union l_semun { #define LINUX_IP_ADD_MEMBERSHIP 35 #define LINUX_IP_DROP_MEMBERSHIP 36 +#define LINUX_TCP_NODELAY 1 +#define LINUX_TCP_MAXSEG 2 +#define LINUX_TCP_KEEPIDLE 4 +#define LINUX_TCP_KEEPINTVL 5 +#define LINUX_TCP_KEEPCNT 6 +#define LINUX_TCP_MD5SIG 14 + struct l_sockaddr { l_ushort sa_family; char sa_data[14]; -- cgit v1.1 From 94554367a02e21be52d85147b7190cad036ebd31 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 25 Jan 2013 21:38:31 +0000 Subject: Always allow access to the sysenter cs/esp/eip MSRs since they are automatically saved and restored in the VMCS. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 4f267bb..287ac8c 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -696,6 +696,10 @@ vmx_vminit(struct vm *vm) * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * * Guest KGSBASE is saved and restored in the guest MSR save area. * Host KGSBASE is restored before returning to userland from the pcb. * There will be a window of time when we are executing in the host @@ -708,6 +712,9 @@ vmx_vminit(struct vm *vm) */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_KGSBASE) || guest_msr_rw(vmx, MSR_EFER)) panic("vmx_vminit: error setting guest msr access"); -- cgit v1.1 From 2617d9f095bb1dfa934ef021a7237482304fcdb9 Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 29 Jan 2013 18:41:30 +0000 Subject: Reduce duplication between i386/linux/linux.h and amd64/linux32/linux.h by moving bits that are MI out into headers in compat/linux. Reviewed by: Chagin Dmitry dmitry | gmail MFC after: 2 weeks --- sys/amd64/linux32/linux.h | 160 ------------------------------------- sys/amd64/linux32/linux32_sysvec.c | 1 + 2 files changed, 1 insertion(+), 160 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index c18ee22..7b52a64 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -107,11 +107,6 @@ typedef struct { /* * Miscellaneous */ -#define LINUX_NAME_MAX 255 -#define LINUX_MAX_UTSNAME 65 - -#define LINUX_CTL_MAXNAME 10 - #define LINUX_AT_COUNT 16 /* Count of used aux entry types. * Keep this synchronized with * elf_linux_fixup() code. 
@@ -127,11 +122,6 @@ struct l___sysctl_args l_ulong __spare[4]; } __packed; -/* Scheduling policies */ -#define LINUX_SCHED_OTHER 0 -#define LINUX_SCHED_FIFO 1 -#define LINUX_SCHED_RR 2 - /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 @@ -265,15 +255,6 @@ struct l_statfs64 { l_int f_spare[6]; } __packed; -struct l_new_utsname { - char sysname[LINUX_MAX_UTSNAME]; - char nodename[LINUX_MAX_UTSNAME]; - char release[LINUX_MAX_UTSNAME]; - char version[LINUX_MAX_UTSNAME]; - char machine[LINUX_MAX_UTSNAME]; - char domainname[LINUX_MAX_UTSNAME]; -} __packed; - /* * Signalling */ @@ -535,27 +516,9 @@ struct l_rt_sigframe { l_handler_t sf_handler; } __packed; -extern int bsd_to_linux_signal[]; -extern int linux_to_bsd_signal[]; extern struct sysentvec elf_linux_sysvec; /* - * Pluggable ioctl handlers - */ -struct linux_ioctl_args; -struct thread; - -typedef int linux_ioctl_function_t(struct thread *, struct linux_ioctl_args *); - -struct linux_ioctl_handler { - linux_ioctl_function_t *func; - int low, high; -}; - -int linux_ioctl_register_handler(struct linux_ioctl_handler *h); -int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h); - -/* * open/fcntl flags */ #define LINUX_O_RDONLY 00000000 @@ -597,65 +560,6 @@ int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h); #define LINUX_F_WRLCK 1 #define LINUX_F_UNLCK 2 -/* - * posix_fadvise advice - */ -#define LINUX_POSIX_FADV_NORMAL 0 -#define LINUX_POSIX_FADV_RANDOM 1 -#define LINUX_POSIX_FADV_SEQUENTIAL 2 -#define LINUX_POSIX_FADV_WILLNEED 3 -#define LINUX_POSIX_FADV_DONTNEED 4 -#define LINUX_POSIX_FADV_NOREUSE 5 - -/* - * mount flags - */ -#define LINUX_MS_RDONLY 0x0001 -#define LINUX_MS_NOSUID 0x0002 -#define LINUX_MS_NODEV 0x0004 -#define LINUX_MS_NOEXEC 0x0008 -#define LINUX_MS_REMOUNT 0x0020 - -/* - * SystemV IPC defines - */ -#define LINUX_SEMOP 1 -#define LINUX_SEMGET 2 -#define LINUX_SEMCTL 3 -#define LINUX_MSGSND 11 -#define LINUX_MSGRCV 12 -#define LINUX_MSGGET 13 -#define LINUX_MSGCTL 14 -#define LINUX_SHMAT 21 -#define LINUX_SHMDT 22 -#define LINUX_SHMGET 23 -#define LINUX_SHMCTL 24 - -#define LINUX_IPC_RMID 0 -#define LINUX_IPC_SET 1 -#define LINUX_IPC_STAT 2 -#define LINUX_IPC_INFO 3 - -#define LINUX_SHM_LOCK 11 -#define LINUX_SHM_UNLOCK 12 -#define LINUX_SHM_STAT 13 -#define LINUX_SHM_INFO 14 - -#define LINUX_SHM_RDONLY 0x1000 -#define LINUX_SHM_RND 0x2000 -#define LINUX_SHM_REMAP 0x4000 - -/* semctl commands */ -#define LINUX_GETPID 11 -#define LINUX_GETVAL 12 -#define LINUX_GETALL 13 -#define LINUX_GETNCNT 14 -#define LINUX_GETZCNT 15 -#define LINUX_SETVAL 16 -#define LINUX_SETALL 17 -#define LINUX_SEM_STAT 18 -#define LINUX_SEM_INFO 19 - union l_semun { l_int val; l_uintptr_t buf; @@ -667,25 +571,6 @@ union l_semun { /* * Socket defines */ -#define LINUX_SOCKET 1 -#define LINUX_BIND 2 -#define LINUX_CONNECT 3 -#define LINUX_LISTEN 4 -#define LINUX_ACCEPT 5 -#define LINUX_GETSOCKNAME 6 -#define LINUX_GETPEERNAME 7 -#define LINUX_SOCKETPAIR 8 -#define LINUX_SEND 9 -#define LINUX_RECV 10 -#define LINUX_SENDTO 11 -#define LINUX_RECVFROM 12 -#define LINUX_SHUTDOWN 13 -#define LINUX_SETSOCKOPT 14 -#define LINUX_GETSOCKOPT 15 -#define LINUX_SENDMSG 16 -#define LINUX_RECVMSG 17 -#define LINUX_ACCEPT4 18 - #define LINUX_SOL_SOCKET 1 #define LINUX_SOL_IP 0 #define LINUX_SOL_IPX 256 @@ -714,24 +599,6 @@ union l_semun { #define LINUX_SO_TIMESTAMP 29 #define LINUX_SO_ACCEPTCONN 30 -#define LINUX_IP_TOS 1 -#define LINUX_IP_TTL 2 -#define LINUX_IP_HDRINCL 3 -#define LINUX_IP_OPTIONS 4 - 
-#define LINUX_IP_MULTICAST_IF 32 -#define LINUX_IP_MULTICAST_TTL 33 -#define LINUX_IP_MULTICAST_LOOP 34 -#define LINUX_IP_ADD_MEMBERSHIP 35 -#define LINUX_IP_DROP_MEMBERSHIP 36 - -#define LINUX_TCP_NODELAY 1 -#define LINUX_TCP_MAXSEG 2 -#define LINUX_TCP_KEEPIDLE 4 -#define LINUX_TCP_KEEPINTVL 5 -#define LINUX_TCP_KEEPCNT 6 -#define LINUX_TCP_MD5SIG 14 - struct l_sockaddr { l_ushort sa_family; char sa_data[14]; @@ -897,30 +764,6 @@ struct l_user_desc { #define LINUX_GET_USEABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_USEABLE) & 1) -#define LINUX_CLOCK_REALTIME 0 -#define LINUX_CLOCK_MONOTONIC 1 -#define LINUX_CLOCK_PROCESS_CPUTIME_ID 2 -#define LINUX_CLOCK_THREAD_CPUTIME_ID 3 -#define LINUX_CLOCK_REALTIME_HR 4 -#define LINUX_CLOCK_MONOTONIC_HR 5 - -#define LINUX_CLONE_VM 0x00000100 -#define LINUX_CLONE_FS 0x00000200 -#define LINUX_CLONE_FILES 0x00000400 -#define LINUX_CLONE_SIGHAND 0x00000800 -#define LINUX_CLONE_PID 0x00001000 /* No longer exist in Linux */ -#define LINUX_CLONE_VFORK 0x00004000 -#define LINUX_CLONE_PARENT 0x00008000 -#define LINUX_CLONE_THREAD 0x00010000 -#define LINUX_CLONE_SETTLS 0x00080000 -#define LINUX_CLONE_PARENT_SETTID 0x00100000 -#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 -#define LINUX_CLONE_CHILD_SETTID 0x01000000 - -#define LINUX_THREADING_FLAGS \ - (LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES | \ - LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD) - struct iovec; struct l_iovec32 { @@ -942,7 +785,4 @@ struct linux_robust_list_head { l_uintptr_t pending_list; }; -int linux_set_upcall_kse(struct thread *td, register_t stack); -int linux_set_cloned_tls(struct thread *td, void *desc); - #endif /* !_AMD64_LINUX_H_ */ diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 5afc9ce..42500da 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include -- cgit v1.1 From 1ae7af0ed87a074e560452e9f87dc50964a20275 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 30 Jan 2013 04:09:09 +0000 Subject: Add emulation support for instruction "88/r: mov r/m8, r8". This instruction moves a byte from a register to a memory location. 
Tested by: tycho nightingale at pluribusnetworks com --- sys/amd64/include/vmm_instruction_emul.h | 3 +- sys/amd64/vmm/vmm_instruction_emul.c | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index 4cc494b..4c7a346 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -49,7 +49,8 @@ struct vie { uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, - rex_b:1; + rex_b:1, + rex_present:1; uint8_t mod:2, /* ModRM byte */ reg:4, diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index e73f6bb..40748ea 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -65,6 +65,10 @@ enum { #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ static const struct vie_op one_byte_opcodes[256] = { + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, @@ -161,6 +165,46 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) } static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, rshift; + enum vm_reg_name reg; + + rshift = 0; + reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy byte registers. + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + /* + * Obtain the value of %ah by reading %rax and shifting + * right by 8 bits (same for %bh, %ch and %dh). + */ + rshift = 8; + reg = gpr_map[vie->reg & 0x3]; + } + } + + if (!vie_valid_register(reg)) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, reg, &val); + *rval = val >> rshift; + return (error); +} + +static int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { @@ -209,12 +253,24 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, { int error, size; enum vm_reg_name reg; + uint8_t byte; uint64_t val; size = 4; error = EINVAL; switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) @@ -497,6 +553,8 @@ decode_rex(struct vie *vie) return (-1); if (x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; -- cgit v1.1 From c9a45ab898c8adbadbd15cf73d00a9dbf1d4ba52 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 1 Feb 2013 01:16:26 +0000 Subject: Increase the number of passthru devices supported by bhyve. The maximum length of an environment variable puts a limitation on the number of passthru devices that can be specified via a single variable. 
The workaround is to allow user to specify passthru devices via multiple environment variables instead of a single one. Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 2 +- sys/amd64/vmm/vmm.c | 44 ++++++++++++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 17 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fdf136b..d3ec8d1 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -89,7 +89,7 @@ static struct pptdev { void **cookie; struct pptintr_arg *arg; } msix; -} pptdevs[32]; +} pptdevs[64]; static int num_pptdevs; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d0e6427..82d4baa 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -862,30 +862,42 @@ vm_lapic(struct vm *vm, int cpu) boolean_t vmm_is_pptdev(int bus, int slot, int func) { - int found, b, s, f, n; + int found, i, n; + int b, s, f; char *val, *cp, *cp2; /* - * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; - cp = val = getenv("pptdevs"); - while (cp != NULL && *cp != '\0') { - if ((cp2 = strchr(cp, ' ')) != NULL) - *cp2 = '\0'; - - n = sscanf(cp, "%d/%d/%d", &b, &s, &f); - if (n == 3 && bus == b && slot == s && func == f) { - found = 1; - break; - } + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } - if (cp2 != NULL) - *cp2++ = ' '; + if (cp2 != NULL) + *cp2++ = ' '; - cp = cp2; + cp = cp2; + } + freeenv(val); } - freeenv(val); return (found); } -- cgit v1.1 From 81de6f5cc49043ac5e2135ad996dfb05f2bd2a32 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 1 Feb 2013 03:49:09 +0000 Subject: Fix a broken assumption in the passthru implementation that the MSI-X table can only be located at the beginning or the end of the BAR. If the MSI-table is located in the middle of a BAR then we will split the BAR into two and create two mappings - one before the table and one after the table - leaving a hole in place of the table so accesses to it can be trapped and emulated. Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index d3ec8d1..4a05985 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -56,9 +56,18 @@ __FBSDID("$FreeBSD$"); /* XXX locking */ #define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) -#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. 
+ */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ -- cgit v1.1 From 6a1efe1ad9984c8085ef28facb9d7f1cc2f01b6a Mon Sep 17 00:00:00 2001 From: eadler Date: Fri, 1 Feb 2013 20:17:11 +0000 Subject: Remove support for plip from the GENERIC kernel as no systems in the last 10 years require this support. Discussed with: db Discussed with: kib Reviewed by: imp Reviewed by: jhb Reviewed by: -hackers Approved by: cperciva (mentor) --- sys/amd64/conf/GENERIC | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e53f692..5819a0d 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -197,7 +197,6 @@ device uart # Generic UART driver device ppc device ppbus # Parallel port bus (required) device lpt # Printer -device plip # TCP/IP over parallel device ppi # Parallel port interface device #device vpo # Requires scbus and da -- cgit v1.1 From 09a43450b8e300637ed1d8238be2e28d3a727adb Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 2 Feb 2013 12:02:42 +0000 Subject: x86 suspend/resume: suspend pics and pseudo-pics in reverse order - change 'pics' from STAILQ to TAILQ - ensure that Local APIC is always first in 'pics' Reviewed by: jhb Tested by: Sergey V. Dyatko , KAHO Toshikazu MFC after: 12 days --- sys/amd64/include/intr_machdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h index 700e35f..8671605 100644 --- a/sys/amd64/include/intr_machdep.h +++ b/sys/amd64/include/intr_machdep.h @@ -94,7 +94,7 @@ struct pic { int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); - STAILQ_ENTRY(pic) pics; + TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ -- cgit v1.1 From 2e2156704e3464a21d9828a2a25672095f24255d Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 2 Feb 2013 12:04:32 +0000 Subject: cpususpend_handler: mark AP as resumed only after fully setting up lapic Reviewed by: jhb Tested by: Sergey V. Dyatko , KAHO Toshikazu MFC after: 12 days --- sys/amd64/amd64/mp_machdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index d2e4aad..31dbb3f 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1431,11 +1431,11 @@ cpususpend_handler(void) while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); - CPU_CLR_ATOMIC(cpu, &started_cpus); - /* Resume MCA and local APIC */ mca_resume(); lapic_setup(0); + + CPU_CLR_ATOMIC(cpu, &started_cpus); } /* -- cgit v1.1
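The two suspend/resume commits above share one rule: an AP must not advertise itself as resumed until its local APIC has been fully reprogrammed, because clearing the CPU's bit in started_cpus is what marks the AP as resumed. The fragment below is only an illustrative sketch of the corrected AP-side ordering, not a verbatim copy of cpususpend_handler(); the helper name ap_resume_sketch() and the explicit 'cpu' parameter are invented for clarity, while ia32_pause(), mca_resume(), lapic_setup() and the started_cpus cpuset are the existing symbols visible in the diff (all already in scope inside sys/amd64/amd64/mp_machdep.c).

/*
 * Sketch of the AP side of the suspend/resume handshake after the fix.
 * In the real code this runs at the end of cpususpend_handler() with
 * cpu = PCPU_GET(cpuid).
 */
static void
ap_resume_sketch(int cpu)
{

	/* Spin until the BSP sets our bit in started_cpus to release us. */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	/* Restore machine-check state and reprogram the local APIC first. */
	mca_resume();
	lapic_setup(0);

	/*
	 * Only now clear our bit: this is the "I have resumed" signal, so
	 * doing it before lapic_setup() would expose a window in which the
	 * AP looks resumed but still has an unconfigured local APIC.
	 */
	CPU_CLR_ATOMIC(cpu, &started_cpus);
}

The ordering is the entire point of the last change; the companion change that keeps the local APIC at the head of the 'pics' TAILQ and walks that list in reverse on suspend is the suspend-side counterpart of the same idea.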