From 3330f5394fa1d580c13d5ae2f59b89291fd579e1 Mon Sep 17 00:00:00 2001 From: attilio Date: Thu, 13 Aug 2009 17:54:11 +0000 Subject: MFC r196196: * Completely remove the option STOP_NMI from the kernel. This option has proven to have a good effect when entering KDB by using a NMI, but it completely violates all the good rules about interrupts disabled while holding a spinlock in other occasions. This can be the cause of deadlocks on events where a normal IPI_STOP is expected. * Add an new IPI called IPI_STOP_HARD on all the supported architectures. This IPI is responsible for sending a stop message among CPUs using a privileged channel when disponible. In other cases it just does match a normal IPI_STOP. Right now the IPI_STOP_HARD functionality uses a NMI on ia32 and amd64 architectures, while on the other has a normal IPI_STOP effect. It is responsibility of maintainers to eventually implement an hard stop when necessary and possible. * Use the new IPI facility in order to implement a new userend SMP kernel function called stop_cpus_hard(). That is specular to stop_cpu() but it does use the privileged channel for the stopping facility. * Let KDB use the newly introduced function stop_cpus_hard() and leave stop_cpus() for all the other cases * Disable interrupts on CPU0 when starting the process of APs suspension. * Style cleanup and comments adding This patch should fix the reboot/shutdown deadlocks many users are constantly reporting on mailing lists. Please don't forget to update your config file with the STOP_NMI option removal Reviewed by: jhb Tested by: pho, bz, rink Approved by: re (kib) --- sys/amd64/amd64/local_apic.c | 13 +++++- sys/amd64/amd64/mp_machdep.c | 100 ++++++++++++++----------------------------- sys/amd64/amd64/trap.c | 2 - sys/amd64/conf/GENERIC | 1 - sys/amd64/conf/NOTES | 5 --- sys/amd64/conf/XENHVM | 1 - sys/amd64/include/apicvar.h | 6 +-- sys/amd64/include/smp.h | 5 +-- 8 files changed, 44 insertions(+), 89 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 14559f3..cd3073c 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -1238,8 +1238,17 @@ lapic_ipi_vectored(u_int vector, int dest) KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); - icrlo = vector | APIC_DELMODE_FIXED | APIC_DESTMODE_PHY | - APIC_LEVEL_DEASSERT | APIC_TRIGMOD_EDGE; + icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE; + + /* + * IPI_STOP_HARD is just a "fake" vector used to send a NMI. + * Use special rules regard NMI if passed, otherwise specify + * the vector. + */ + if (vector == IPI_STOP_HARD) + icrlo |= APIC_DELMODE_NMI | APIC_LEVEL_ASSERT; + else + icrlo |= vector | APIC_DELMODE_FIXED | APIC_LEVEL_DEASSERT; destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 52c209c..0ef8017 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -114,31 +114,12 @@ volatile int smp_tlb_wait; extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); -#ifdef STOP_NMI -static volatile cpumask_t ipi_nmi_pending; - -static void ipi_nmi_selected(cpumask_t cpus); -#endif - /* * Local data and functions. */ -#ifdef STOP_NMI -/* - * Provide an alternate method of stopping other CPUs. If another CPU has - * disabled interrupts the conventional STOP IPI will be blocked. This - * NMI-based stop should get through in that case. 
- */ -static int stop_cpus_with_nmi = 1; -SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW, - &stop_cpus_with_nmi, 0, ""); -TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi); -#else -#define stop_cpus_with_nmi 0 -#endif - static u_int logical_cpus; +static volatile cpumask_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; @@ -1158,12 +1139,14 @@ ipi_selected(cpumask_t cpus, u_int ipi) ipi = IPI_BITMAP_VECTOR; } -#ifdef STOP_NMI - if (ipi == IPI_STOP && stop_cpus_with_nmi) { - ipi_nmi_selected(cpus); - return; - } -#endif + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + atomic_set_int(&ipi_nmi_pending, cpus); + CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; @@ -1194,64 +1177,43 @@ void ipi_all_but_self(u_int ipi) { - if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { + if (IPI_IS_BITMAPED(ipi)) { ipi_selected(PCPU_GET(other_cpus), ipi); return; } - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); -} -#ifdef STOP_NMI -/* - * send NMI IPI to selected CPUs - */ - -#define BEFORE_SPIN 1000000 - -static void -ipi_nmi_selected(cpumask_t cpus) -{ - int cpu; - register_t icrlo; - - icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT - | APIC_TRIGMOD_EDGE; - - CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus); - - atomic_set_int(&ipi_nmi_pending, cpus); - - while ((cpu = ffs(cpus)) != 0) { - cpu--; - cpus &= ~(1 << cpu); - - KASSERT(cpu_apic_ids[cpu] != -1, - ("IPI NMI to non-existent CPU %d", cpu)); - - /* Wait for an earlier IPI to finish. */ - if (!lapic_ipi_wait(BEFORE_SPIN)) - panic("ipi_nmi_selected: previous IPI has not cleared"); + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); - lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]); - } + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int -ipi_nmi_handler(void) +ipi_nmi_handler() { - int cpumask = PCPU_GET(cpumask); + cpumask_t cpumask; - if (!(ipi_nmi_pending & cpumask)) - return 1; + /* + * As long as there is not a simple way to know about a NMI's + * source, if the bitmask for the current CPU is present in + * the global pending bitword an IPI_STOP_HARD has been issued + * and should be handled. + */ + cpumask = PCPU_GET(cpumask); + if ((ipi_nmi_pending & cpumask) == 0) + return (1); atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); - return 0; + return (0); } -#endif /* STOP_NMI */ - /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index fee3caf..323e8d1 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -239,13 +239,11 @@ trap(struct trapframe *frame) type = frame->tf_trapno; #ifdef SMP -#ifdef STOP_NMI /* Handler for NMI IPIs used for stopping CPUs. 
*/ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } -#endif /* STOP_NMI */ #endif /* SMP */ #ifdef KDB diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 73a4fb6..a49f7bc 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -69,7 +69,6 @@ options P1003_1B_SEMAPHORES # POSIX-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options STOP_NMI # Stop CPUS using NMI instead of IPI options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options MAC # TrustedBSD MAC Framework diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 088a381..27fe068 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -30,11 +30,6 @@ device mptable # Optional MPSPEC mptable support # options MP_WATCHDOG -# -# Debugging options. -# -options STOP_NMI # Stop CPUS using NMI instead of IPI - ##################################################################### diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index 5e108d5..1536e3c 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -68,7 +68,6 @@ options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options STOP_NMI # Stop CPUS using NMI instead of IPI options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing #options KDTRACE_FRAME # Ensure frames are compiled in diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 84ba3b8..73fff6c 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -102,11 +102,6 @@ * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user * at a time) The second group uses a single interrupt and a bitmap to avoid * redundant IPI interrupts. - * - * Right now IPI_STOP used by kdb shares the interrupt priority class with - * the two IPI groups mentioned above. As such IPI_STOP may cause a deadlock. - * Eventually IPI_STOP should use NMI IPIs - this would eliminate this and - * other deadlocks caused by IPI_STOP. */ /* Interrupts for local APIC LVT entries other than the timer. */ @@ -134,6 +129,7 @@ #define IPI_STOP (APIC_IPI_INTS + 7) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 8) /* Suspend CPU until restarted. */ +#define IPI_STOP_HARD (APIC_IPI_INTS + 9) /* Stop CPU with a NMI. 
*/ /* * The spurious interrupt can share the priority class with the IPIs since diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index d295715..1cc21a4 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -52,6 +52,7 @@ void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); void init_secondary(void); +int ipi_nmi_handler(void); void ipi_selected(cpumask_t cpus, u_int ipi); void ipi_all_but_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); @@ -66,10 +67,6 @@ void smp_masked_invlpg_range(cpumask_t mask, vm_offset_t startva, void smp_invltlb(void); void smp_masked_invltlb(cpumask_t mask); -#ifdef STOP_NMI -int ipi_nmi_handler(void); -#endif - #endif /* !LOCORE */ #endif /* SMP */ -- cgit v1.1 From 7042429fac0d3bfb1df101cef080ac39f6400e91 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 14 Aug 2009 20:57:21 +0000 Subject: Adjust the handling of the local APIC PMC interrupt vector: - Provide lapic_disable_pmc(), lapic_enable_pmc(), and lapic_reenable_pmc() routines in the local APIC code that the hwpmc(4) driver can use to manage the local APIC PMC interrupt vector. - Do not enable the local APIC PMC interrupt vector by default when HWPMC_HOOKS is enabled. Instead, the hwpmc(4) driver explicitly enables the interrupt when it is succesfully initialized and disables the interrupt when it is unloaded. This avoids enabling the interrupt on unsupported CPUs which may result in spurious NMIs. Reported by: rnoland Reviewed by: jkoshy Approved by: re (kib) MFC after: 2 weeks --- sys/amd64/amd64/local_apic.c | 86 ++++++++++++++++++++++++++++++++++++++++++-- sys/amd64/include/apicvar.h | 3 ++ sys/amd64/include/pmc_mdep.h | 1 - 3 files changed, 86 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index cd3073c..13bd774 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -123,7 +123,7 @@ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ - { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ + { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ }; @@ -305,11 +305,9 @@ lapic_setup(int boot) lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); -#ifdef HWPMC_HOOKS /* Program the PMC LVT entry if present. */ if (maxlvt >= LVT_PMC) lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); -#endif /* Program timer LVT and setup handler. */ lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer); @@ -332,6 +330,88 @@ lapic_setup(int boot) intr_restore(eflags); } +void +lapic_reenable_pmc(void) +{ +#ifdef HWPMC_HOOKS + uint32_t value; + + value = lapic->lvt_pcint; + value &= ~APIC_LVT_M; + lapic->lvt_pcint = value; +#endif +} + +#ifdef HWPMC_HOOKS +static void +lapic_update_pmc(void *dummy) +{ + struct lapic *la; + + la = &lapics[lapic_id()]; + lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); +} +#endif + +int +lapic_enable_pmc(void) +{ +#ifdef HWPMC_HOOKS + u_int32_t maxlvt; + + /* Fail if the local APIC is not present. */ + if (lapic == NULL) + return (0); + + /* Fail if the PMC LVT is not present. 
*/ + maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + if (maxlvt < LVT_PMC) + return (0); + + lvts[LVT_PMC].lvt_masked = 0; + +#ifdef SMP + /* + * If hwpmc was loaded at boot time then the APs may not be + * started yet. In that case, don't forward the request to + * them as they will program the lvt when they start. + */ + if (smp_started) + smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); + else +#endif + lapic_update_pmc(NULL); + return (1); +#else + return (0); +#endif +} + +void +lapic_disable_pmc(void) +{ +#ifdef HWPMC_HOOKS + u_int32_t maxlvt; + + /* Fail if the local APIC is not present. */ + if (lapic == NULL) + return; + + /* Fail if the PMC LVT is not present. */ + maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + if (maxlvt < LVT_PMC) + return; + + lvts[LVT_PMC].lvt_masked = 1; + +#ifdef SMP + /* The APs should always be started when hwpmc is unloaded. */ + KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); +#endif + smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); +#endif +} + /* * Called by cpu_initclocks() on the BSP to setup the local APIC timer so * that it can drive hardclock, statclock, and profclock. This function diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 73fff6c..9d6d538 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -201,7 +201,9 @@ int ioapic_set_triggermode(void *cookie, u_int pin, int ioapic_set_smi(void *cookie, u_int pin); void lapic_create(u_int apic_id, int boot_cpu); void lapic_disable(void); +void lapic_disable_pmc(void); void lapic_dump(const char *str); +int lapic_enable_pmc(void); void lapic_eoi(void); u_int lapic_error(void); int lapic_id(void); @@ -212,6 +214,7 @@ void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); +void lapic_reenable_pmc(void); void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id); int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked); int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode); diff --git a/sys/amd64/include/pmc_mdep.h b/sys/amd64/include/pmc_mdep.h index f8c26f2..f233a51 100644 --- a/sys/amd64/include/pmc_mdep.h +++ b/sys/amd64/include/pmc_mdep.h @@ -115,7 +115,6 @@ union pmc_md_pmc { */ void start_exceptions(void), end_exceptions(void); -void pmc_x86_lapic_enable_pmc_interrupt(void); struct pmc_mdep *pmc_amd_initialize(void); void pmc_amd_finalize(struct pmc_mdep *_md); -- cgit v1.1 From b8de80198d700a59246c08580b7121e1bff8f268 Mon Sep 17 00:00:00 2001 From: kib Date: Mon, 17 Aug 2009 13:32:56 +0000 Subject: MFC r196318: Correct accounting error when allocating a a page table page to implement a user-space demotion. 
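
For orientation, an illustrative sketch (not part of the change; the helper name is hypothetical) of where pm_stats.resident_count surfaces: it is the figure behind a process's resident set size, which is why a page table page allocated while demoting a user-space 2MB mapping has to be counted as well.

#include <sys/param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

/*
 * Sketch only: report a process's resident pages straight from the pmap
 * statistics that pmap_demote_pde() now keeps consistent for user pmaps.
 */
static long
vmspace_rss_pages(struct vmspace *vm)
{

	return (pmap_resident_count(vmspace_pmap(vm)));
}
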
Approved by: re (rwatson) --- sys/amd64/amd64/pmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 622ed62..b9eee49 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -2261,6 +2261,8 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) " in pmap %p", va, pmap); return (FALSE); } + if (va < VM_MAXUSER_ADDRESS) + pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); -- cgit v1.1 From 594f0177cea5291ff5a34f8ae3e061bab9704c13 Mon Sep 17 00:00:00 2001 From: ed Date: Wed, 19 Aug 2009 20:44:22 +0000 Subject: MFC r196390: Make the MacBookPro3,1 hardware boot again. Tested by: Patrick Lamaiziere Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 8aee975..2c54be2 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -217,6 +217,7 @@ cpu_startup(dummy) strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || + strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " -- cgit v1.1 From 47bc5699d9da2d3c1c892b9f881316374c875607 Mon Sep 17 00:00:00 2001 From: jkim Date: Thu, 20 Aug 2009 23:04:21 +0000 Subject: MFC: r196412 Check whether the SMBIOS reports reasonable amount of memory. If it is less than "avail memory", fall back to Maxmem to avoid user confusion. We use SMBIOS information to display "real memory" since r190599 but some broken SMBIOS implementation reported only half of actual memory. Tested by: bz Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 2c54be2..0bfd7ad 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -236,19 +236,21 @@ cpu_startup(dummy) #ifdef PERFMON perfmon_init(); #endif + realmem = Maxmem; + + /* + * Display physical memory if SMBIOS reports reasonable amount. + */ + memsize = 0; sysenv = getenv("smbios.memory.enabled"); if (sysenv != NULL) { - memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10); + memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); - } else - memsize = 0; - if (memsize > 0) - printf("real memory = %ju (%ju MB)\n", memsize << 10, - memsize >> 10); - else - printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), - ptoa((uintmax_t)Maxmem) / 1048576); - realmem = Maxmem; + } + if (memsize < ptoa((uintmax_t)cnt.v_free_count)) + memsize = ptoa((uintmax_t)Maxmem); + printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); + /* * Display any holes after the first chunk of extended memory. */ -- cgit v1.1 From bef0247c6bb7c48a6b86bdb9a7b23a355d81fd06 Mon Sep 17 00:00:00 2001 From: bz Date: Thu, 27 Aug 2009 17:34:13 +0000 Subject: MFC r196512: Fix handling of .note.ABI-tag section for GNU systems [1]. Handle GNU/Linux according to LSB Core Specification 4.0, Chapter 11. Object Format, 11.8. ABI note tag. Also check the first word of desc, not only name, according to glibc abi-tags specification to distinguish between Linux and kFreeBSD. Add explicit handling for Debian GNU/kFreeBSD, which runs on our kernels as well [2]. 
In {amd64,i386}/trap.c, when checking osrel of the current process, also check the ABI to not change the signal behaviour for Linux binary processes, now that we save an osrel version for all three from the lists above in struct proc [2]. These changes make it possible to run FreeBSD, Debian GNU/kFreeBSD and Linux binaries on the same machine again for at least i386 and amd64, and no longer break kFreeBSD which was detected as GNU(/Linux). PR: kern/135468 Submitted by: dchagin [1] (initial patch) Suggested by: kib [2] Tested by: Petr Salinger (Petr.Salinger seznam.cz) for kFreeBSD Reviewed by: kib Approved by: re (kensmith) --- sys/amd64/amd64/elf_machdep.c | 17 +++++++++++++++++ sys/amd64/amd64/trap.c | 4 +++- sys/amd64/linux32/linux32_sysvec.c | 35 ++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index c5e19cf..ea48b25 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -108,6 +109,22 @@ SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo); +static Elf64_Brandinfo kfreebsd_brand_info = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", + .sysvec = &elf64_freebsd_sysvec, + .interp_newpath = NULL, + .brand_note = &elf64_kfreebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE +}; + +SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, + (sysinit_cfunc_t) elf64_insert_brand_entry, + &kfreebsd_brand_info); + void elf64_dump_thread(struct thread *td __unused, void *dst __unused, diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 323e8d1..65f761e 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -409,7 +409,9 @@ trap(struct trapframe *frame) * This check also covers the images * without the ABI-tag ELF note. */ - if (p->p_osrel >= 700004) { + if (SV_CURPROC_ABI() == + SV_ABI_FREEBSD && + p->p_osrel >= 700004) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 77186a1..54a04ee 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -127,6 +127,7 @@ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings); static void linux32_fixlimit(struct rlimit *rl, int which); +static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static eventhandler_tag linux_exit_tag; static eventhandler_tag linux_schedtail_tag; @@ -1066,14 +1067,38 @@ struct sysentvec elf_linux_sysvec = { .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 }; -static char GNULINUX_ABI_VENDOR[] = "GNU"; +static char GNU_ABI_VENDOR[] = "GNU"; +static int GNULINUX_ABI_DESC = 0; + +static boolean_t +linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) +{ + const Elf32_Word *desc; + uintptr_t p; + + p = (uintptr_t)(note + 1); + p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); + + desc = (const Elf32_Word *)p; + if (desc[0] != GNULINUX_ABI_DESC) + return (FALSE); + + /* + * For linux we encode osrel as follows (see linux_mib.c): + * VVVMMMIII (version, major, minor), see linux_mib.c. 
+ */ + *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; + + return (TRUE); +} static Elf_Brandnote linux32_brandnote = { - .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), - .hdr.n_descsz = 16, + .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), + .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, - .vendor = GNULINUX_ABI_VENDOR, - .flags = 0 + .vendor = GNU_ABI_VENDOR, + .flags = BN_TRANSLATE_OSREL, + .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { -- cgit v1.1 From b29642064f11390c250c27e7c2abfa6de93a6b9b Mon Sep 17 00:00:00 2001 From: rnoland Date: Tue, 1 Sep 2009 16:41:28 +0000 Subject: MFC 196643 Swap the start/end virtual addresses in pmap_invalidate_cache_range(). This fixes the functionality on non SelfSnoop hardware. Found by: rnoland Submitted by: alc Reviewed by: kib Approved by: re (rwatson) --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index b9eee49..f0da536 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -943,8 +943,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) * coherence domain. */ mfence(); - for (; eva < sva; eva += cpu_clflush_line_size) - clflush(eva); + for (; sva < eva; sva += cpu_clflush_line_size) + clflush(sva); mfence(); } else { -- cgit v1.1 From 08cdcfb10a2c831ad0a66910b1abf2228435bfa4 Mon Sep 17 00:00:00 2001 From: bz Date: Wed, 2 Sep 2009 10:39:46 +0000 Subject: MFC r196653: Make sure FreeBSD binaries without .note.ABI-tag section work correctly and do not match a colliding Debian GNU/kFreeBSD brandinfo statements. For this mark the Debian GNU/kFreeBSD brandinfo that it must have an .note.ABI-tag section and ignore the old EI_OSABI brandinfo when comparing a possibly colliding set of options. Due to SYSINIT we add the brandinfo in a non-deterministic order, so native FreeBSD is not always first. We may want to consider to force native FreeBSD to come first as well. The only way a problem could currently be noticed is when running an i386 binary without the .note.ABI-tag on amd64 and the Debian GNU/kFreeBSD brandinfo was matched first, as the fallback to ld-elf32.so.1 does not exist in that case. Reported and tested by: ticso In collaboration with: kib MFC after: 3 days Approved by: re (rwatson) --- sys/amd64/amd64/elf_machdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index ea48b25..d5e7a6e 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -118,7 +118,7 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .sysvec = &elf64_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, -- cgit v1.1 From 9421144e6c2f592d781e0ad5d7866acfbb2be796 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 3 Sep 2009 13:54:58 +0000 Subject: MFC 196705 and 196707: - Improve pmap_change_attr() on i386 so that it is able to demote a large (2/4MB) page into 4KB pages as needed. This should be fairly rare in practice. - Simplify pmap_change_attr() a bit: - Always calculate the cache bits instead of doing it on-demand. - Always set changed to TRUE rather than only doing it if it is false. 
Approved by: re (kib) --- sys/amd64/amd64/pmap.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index f0da536..4e35ef4 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4476,7 +4476,8 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) if (base < DMAP_MIN_ADDRESS) return (EINVAL); - cache_bits_pde = cache_bits_pte = -1; + cache_bits_pde = pmap_cache_bits(mode, 1); + cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* @@ -4493,8 +4494,6 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; @@ -4522,8 +4521,6 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; @@ -4557,12 +4554,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { @@ -4588,12 +4582,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { - if (cache_bits_pde < 0) - cache_bits_pde = pmap_cache_bits(mode, 1); if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { @@ -4616,13 +4607,10 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { - if (cache_bits_pte < 0) - cache_bits_pte = pmap_cache_bits(mode, 0); pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); - if (!changed) - changed = TRUE; + changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS) { if (pa_start == pa_end) { -- cgit v1.1 From 61b83a071cf0407f3e644c4e9e3610216bb637bd Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 8 Sep 2009 21:50:34 +0000 Subject: MFC 196745: Don't attempt to bind the current thread to the CPU an IRQ is bound to when removing an interrupt handler from an IRQ during shutdown. During shutdown we are already bound to CPU 0 and this was triggering a panic. Approved by: re (kib) --- sys/amd64/amd64/local_apic.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 13bd774..87bec91 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -990,18 +990,21 @@ apic_free_vector(u_int apic_id, u_int vector, u_int irq) * we don't lose an interrupt delivery race. 
*/ td = curthread; - thread_lock(td); - if (sched_is_bound(td)) - panic("apic_free_vector: Thread already bound.\n"); - sched_bind(td, apic_cpuid(apic_id)); - thread_unlock(td); + if (!rebooting) { + thread_lock(td); + if (sched_is_bound(td)) + panic("apic_free_vector: Thread already bound.\n"); + sched_bind(td, apic_cpuid(apic_id)); + thread_unlock(td); + } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; mtx_unlock_spin(&icu_lock); - thread_lock(td); - sched_unbind(td); - thread_unlock(td); - + if (!rebooting) { + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ -- cgit v1.1 From 90a09c13dfdde3ffcf990b535aa7fe8eeb253dbf Mon Sep 17 00:00:00 2001 From: kensmith Date: Thu, 10 Sep 2009 14:04:00 +0000 Subject: Remove extra debugging support that is turned on for head but turned off for stable branches: - shift to MALLOC_PRODUCTION - turn off automatic crash dumps - Remove kernel debuggers, INVARIANTS*[1], WITNESS* from GENERIC kernel config files[2] [1] INVARIANTS* left on for ia64 by request marcel [2] sun4v was left as-is Reviewed by: marcel, kib Approved by: re (implicit) --- sys/amd64/conf/GENERIC | 9 --------- 1 file changed, 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index a49f7bc..ddd3035 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -76,15 +76,6 @@ options FLOWTABLE # per-cpu routing cache #options KDTRACE_FRAME # Ensure frames are compiled in #options KDTRACE_HOOKS # Kernel DTrace hooks -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel -- cgit v1.1 From 5ee8918a73551dc2a9235ad05250c3225f12ddd1 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 25 Sep 2009 15:08:26 +0000 Subject: MFC 197410: - Split the logic to parse an SMAP entry out into a separate function on amd64 similar to i386. This fixes a bug on amd64 where overlapping entries would not cause the SMAP parsing to stop. - Change the SMAP parsing code to do a sorted insertion into physmap[] instead of an append to support systems with out-of-order SMAP entries. Approved by: re (kib) --- sys/amd64/amd64/machdep.c | 106 +++++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 33 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 0bfd7ad..95db5d2 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1192,6 +1192,77 @@ isa_irq_pending(void) u_int basemem; +static int +add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) +{ + int i, insert_idx, physmap_idx; + + physmap_idx = *physmap_idxp; + + if (boothowto & RB_VERBOSE) + printf("SMAP type=%02x base=%016lx len=%016lx\n", + smap->type, smap->base, smap->length); + + if (smap->type != SMAP_TYPE_MEMORY) + return (1); + + if (smap->length == 0) + return (0); + + /* + * Find insertion point while checking for overlap. Start off by + * assuming the new entry will be added to the end. 
+ */ + insert_idx = physmap_idx + 2; + for (i = 0; i <= physmap_idx; i += 2) { + if (smap->base < physmap[i + 1]) { + if (smap->base + smap->length <= physmap[i]) { + insert_idx = i; + break; + } + if (boothowto & RB_VERBOSE) + printf( + "Overlapping memory regions, ignoring second region\n"); + return (1); + } + } + + /* See if we can prepend to the next entry. */ + if (insert_idx <= physmap_idx && + smap->base + smap->length == physmap[insert_idx]) { + physmap[insert_idx] = smap->base; + return (1); + } + + /* See if we can append to the previous entry. */ + if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { + physmap[insert_idx - 1] += smap->length; + return (1); + } + + physmap_idx += 2; + *physmap_idxp = physmap_idx; + if (physmap_idx == PHYSMAP_SIZE) { + printf( + "Too many segments in the physical address map, giving up\n"); + return (0); + } + + /* + * Move the last 'N' entries down to make room for the new + * entry if needed. + */ + for (i = physmap_idx; i > insert_idx; i -= 2) { + physmap[i] = physmap[i - 2]; + physmap[i + 1] = physmap[i - 1]; + } + + /* Insert the new entry. */ + physmap[insert_idx] = smap->base; + physmap[insert_idx + 1] = smap->base + smap->length; + return (1); +} + /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and @@ -1235,40 +1306,9 @@ getmemsize(caddr_t kmdp, u_int64_t first) smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); - for (smap = smapbase; smap < smapend; smap++) { - if (boothowto & RB_VERBOSE) - printf("SMAP type=%02x base=%016lx len=%016lx\n", - smap->type, smap->base, smap->length); - - if (smap->type != SMAP_TYPE_MEMORY) - continue; - - if (smap->length == 0) - continue; - - for (i = 0; i <= physmap_idx; i += 2) { - if (smap->base < physmap[i + 1]) { - if (boothowto & RB_VERBOSE) - printf( - "Overlapping or non-monotonic memory region, ignoring second region\n"); - continue; - } - } - - if (smap->base == physmap[physmap_idx + 1]) { - physmap[physmap_idx + 1] += smap->length; - continue; - } - - physmap_idx += 2; - if (physmap_idx == PHYSMAP_SIZE) { - printf( - "Too many segments in the physical address map, giving up\n"); + for (smap = smapbase; smap < smapend; smap++) + if (!add_smap_entry(smap, physmap, &physmap_idx)) break; - } - physmap[physmap_idx] = smap->base; - physmap[physmap_idx + 1] = smap->base + smap->length; - } /* * Find the 'base memory' segment for SMP -- cgit v1.1 From 05c6929c66e6fa7d7d8229d117ccc2cda3df4027 Mon Sep 17 00:00:00 2001 From: rpaulo Date: Thu, 1 Oct 2009 10:06:09 +0000 Subject: MFC r197653: Improve 802.11s comment. 
Approved by: re (kib) --- sys/amd64/conf/GENERIC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index ddd3035..24300bd 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -248,7 +248,7 @@ device xe # Xircom pccard Ethernet device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's -options IEEE80211_SUPPORT_MESH # enable 802.11s D3.0 support +options IEEE80211_SUPPORT_MESH # enable 802.11s draft support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support -- cgit v1.1 From 96a6dd3944a98cac6db65b9808bc9ed92e7d83ce Mon Sep 17 00:00:00 2001 From: alc Date: Fri, 2 Oct 2009 05:11:46 +0000 Subject: MFC r197580 Temporarily disable the use of 1GB page mappings by the direct map. Approved by: re (kib) --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 4e35ef4..97de6b6 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -440,7 +440,7 @@ create_pagetables(vm_paddr_t *firstaddr) if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = allocpages(firstaddr, NDMPML4E); - if ((amd_feature & AMDID_PAGE1GB) == 0) + if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) DMPDphys = allocpages(firstaddr, ndmpdp); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; @@ -474,7 +474,7 @@ create_pagetables(vm_paddr_t *firstaddr) /* Now set up the direct map space using either 2MB or 1GB pages */ /* Preset PG_M and PG_A because demotion expects it */ - if ((amd_feature & AMDID_PAGE1GB) == 0) { + if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) { for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | -- cgit v1.1 From 67dc2f16e41d0734777d6902781f14eb3a4b2042 Mon Sep 17 00:00:00 2001 From: kib Date: Sun, 4 Oct 2009 12:20:59 +0000 Subject: MFC r197663: As a workaround, for Intel CPUs, do not use CLFLUSH in pmap_invalidate_cache_range() when self-snoop is apparently not reported in cpu features. Approved by: re (bz, kensmith) --- sys/amd64/amd64/initcpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index c293c1a..0037d66 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -165,4 +165,10 @@ initializecpu(void) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; + /* + * XXXKIB: (temporary) hack to work around traps generated when + * CLFLUSHing APIC registers window. + */ + if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS)) + cpu_feature &= ~CPUID_CLFSH; } -- cgit v1.1 From 9bce578b0a2ab3f2a08aa77bdb16d4b6ed69bf46 Mon Sep 17 00:00:00 2001 From: attilio Date: Mon, 12 Oct 2009 16:05:31 +0000 Subject: MFC r197803, r197824, r197910: Per their definition, atomic instructions used in conjuction with memory barriers should also ensure that the compiler doesn't reorder paths where they are used. GCC, however, does that aggressively, even in presence of volatile operands. The most reliable way GCC offers for avoid instructions reordering is clobbering "memory". Not all our memory barriers, right now, clobber memory for GCC-like compilers. Fix these cases. 
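
To make the hazard concrete, a minimal sketch (illustrative only, not part of the patch) of the acquire/release pairing these primitives are meant to provide; without the "memory" clobber the compiler could legally move the plain accesses to payload across the flag operations.

#include <sys/types.h>
#include <machine/atomic.h>

static int payload;
static volatile u_int ready;

static void
publish(int value)
{

	payload = value;			/* plain store */
	atomic_store_rel_int(&ready, 1);	/* release: payload must be visible first */
}

static int
consume(void)
{

	while (atomic_load_acq_int(&ready) == 0)
		;				/* spin until published */
	return (payload);			/* acquire: this load cannot move up */
}

With the patch, the acquire/release variants are distinct from the unordered ones precisely because the compiler barrier (the "memory" clobber) becomes part of their contract.
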
Approved by: re (kib) --- sys/amd64/include/atomic.h | 104 ++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h index d2a3846..a2bd930 100644 --- a/sys/amd64/include/atomic.h +++ b/sys/amd64/include/atomic.h @@ -32,9 +32,9 @@ #error this file needs sys/cdefs.h as a prerequisite #endif -#define mb() __asm__ __volatile__ ("mfence;": : :"memory") -#define wmb() __asm__ __volatile__ ("sfence;": : :"memory") -#define rmb() __asm__ __volatile__ ("lfence;": : :"memory") +#define mb() __asm __volatile("mfence;" : : : "memory") +#define wmb() __asm __volatile("sfence;" : : : "memory") +#define rmb() __asm __volatile("lfence;" : : : "memory") /* * Various simple operations on memory, each of which is atomic in the @@ -73,7 +73,8 @@ */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ -void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v) +void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ +void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_int(volatile u_int *dst, u_int exp, u_int src); int atomic_cmpset_long(volatile u_long *dst, u_long exp, u_long src); @@ -97,8 +98,9 @@ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) #endif /* - * The assembly is volatilized to demark potential before-and-after side - * effects if an interrupt or SMP collision were to occur. + * The assembly is volatilized to avoid code chunk removal by the compiler. + * GCC aggressively reorders operations and memory clobbering is necessary + * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ @@ -108,6 +110,15 @@ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ : "=m" (*p) \ : CONS (V), "m" (*p)); \ } \ + \ +static __inline void \ +atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ +{ \ + __asm __volatile(MPLOCKED OP \ + : "=m" (*p) \ + : CONS (V), "m" (*p) \ + : "memory"); \ +} \ struct __hack /* @@ -205,18 +216,23 @@ atomic_fetchadd_long(volatile u_long *p, u_long v) * PentiumPro or higher, reads may pass writes, so for that case we have * to use a serializing instruction (i.e. with LOCK) to do the load in * SMP kernels. For UP kernels, however, the cache of the single processor - * is always consistent, so we don't need any memory barriers. + * is always consistent, so we only need to take care of compiler. */ #define ATOMIC_STORE_LOAD(TYPE, LOP, SOP) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ - return (*p); \ + u_##TYPE tmp; \ + \ + tmp = *p; \ + __asm __volatile ("" : : : "memory"); \ + return (tmp); \ } \ \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ + __asm __volatile ("" : : : "memory"); \ *p = v; \ } \ struct __hack @@ -247,7 +263,8 @@ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ __asm __volatile(SOP \ : "=m" (*p), /* 0 */ \ "+r" (v) /* 1 */ \ - : "m" (*p)); /* 2 */ \ + : "m" (*p) /* 2 */ \ + : "memory"); \ } \ struct __hack @@ -327,44 +344,43 @@ u_long atomic_readandclear_long(volatile u_long *addr); #endif /* __GNUCLIKE_ASM */ -/* Acquire and release variants are identical to the normal ones. 
*/ -#define atomic_set_acq_char atomic_set_char -#define atomic_set_rel_char atomic_set_char -#define atomic_clear_acq_char atomic_clear_char -#define atomic_clear_rel_char atomic_clear_char -#define atomic_add_acq_char atomic_add_char -#define atomic_add_rel_char atomic_add_char -#define atomic_subtract_acq_char atomic_subtract_char -#define atomic_subtract_rel_char atomic_subtract_char - -#define atomic_set_acq_short atomic_set_short -#define atomic_set_rel_short atomic_set_short -#define atomic_clear_acq_short atomic_clear_short -#define atomic_clear_rel_short atomic_clear_short -#define atomic_add_acq_short atomic_add_short -#define atomic_add_rel_short atomic_add_short -#define atomic_subtract_acq_short atomic_subtract_short -#define atomic_subtract_rel_short atomic_subtract_short - -#define atomic_set_acq_int atomic_set_int -#define atomic_set_rel_int atomic_set_int -#define atomic_clear_acq_int atomic_clear_int -#define atomic_clear_rel_int atomic_clear_int -#define atomic_add_acq_int atomic_add_int -#define atomic_add_rel_int atomic_add_int -#define atomic_subtract_acq_int atomic_subtract_int -#define atomic_subtract_rel_int atomic_subtract_int +#define atomic_set_acq_char atomic_set_barr_char +#define atomic_set_rel_char atomic_set_barr_char +#define atomic_clear_acq_char atomic_clear_barr_char +#define atomic_clear_rel_char atomic_clear_barr_char +#define atomic_add_acq_char atomic_add_barr_char +#define atomic_add_rel_char atomic_add_barr_char +#define atomic_subtract_acq_char atomic_subtract_barr_char +#define atomic_subtract_rel_char atomic_subtract_barr_char + +#define atomic_set_acq_short atomic_set_barr_short +#define atomic_set_rel_short atomic_set_barr_short +#define atomic_clear_acq_short atomic_clear_barr_short +#define atomic_clear_rel_short atomic_clear_barr_short +#define atomic_add_acq_short atomic_add_barr_short +#define atomic_add_rel_short atomic_add_barr_short +#define atomic_subtract_acq_short atomic_subtract_barr_short +#define atomic_subtract_rel_short atomic_subtract_barr_short + +#define atomic_set_acq_int atomic_set_barr_int +#define atomic_set_rel_int atomic_set_barr_int +#define atomic_clear_acq_int atomic_clear_barr_int +#define atomic_clear_rel_int atomic_clear_barr_int +#define atomic_add_acq_int atomic_add_barr_int +#define atomic_add_rel_int atomic_add_barr_int +#define atomic_subtract_acq_int atomic_subtract_barr_int +#define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int -#define atomic_set_acq_long atomic_set_long -#define atomic_set_rel_long atomic_set_long -#define atomic_clear_acq_long atomic_clear_long -#define atomic_clear_rel_long atomic_clear_long -#define atomic_add_acq_long atomic_add_long -#define atomic_add_rel_long atomic_add_long -#define atomic_subtract_acq_long atomic_subtract_long -#define atomic_subtract_rel_long atomic_subtract_long +#define atomic_set_acq_long atomic_set_barr_long +#define atomic_set_rel_long atomic_set_barr_long +#define atomic_clear_acq_long atomic_clear_barr_long +#define atomic_clear_rel_long atomic_clear_barr_long +#define atomic_add_acq_long atomic_add_barr_long +#define atomic_add_rel_long atomic_add_barr_long +#define atomic_subtract_acq_long atomic_subtract_barr_long +#define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long -- cgit v1.1 From 0c7713cc54c6b456ec24664e0b523960564cf1f4 Mon Sep 17 00:00:00 2001 
From: kib Date: Tue, 20 Oct 2009 13:32:28 +0000 Subject: MFC r197933: Define architectural load bases for PIE binaries. MFC r198203 (by marius): Change load base for sparc to match default gcc memory layout model. Approved by: re (kensmith) --- sys/amd64/include/elf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/elf.h b/sys/amd64/include/elf.h index e5c95f7..88f4398 100644 --- a/sys/amd64/include/elf.h +++ b/sys/amd64/include/elf.h @@ -106,4 +106,10 @@ __ElfType(Auxinfo); #define ELF_TARG_MACH EM_X86_64 #define ELF_TARG_VER 1 +#if __ELF_WORD_SIZE == 32 +#define ET_DYN_LOAD_ADDR 0x01001000 +#else +#define ET_DYN_LOAD_ADDR 0x01021000 +#endif + #endif /* !_MACHINE_ELF_H_ */ -- cgit v1.1 From d72d5acfe5ce013b74ce3ec8b4b585db39d70756 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 29 Oct 2009 16:00:27 +0000 Subject: MFC 197439: Extract the code to find and map the MADT ACPI table during early kernel startup and genericize it so it can be reused to map other tables as well: - Add a routine to walk a list of ACPI subtables such as those used in the APIC and SRAT tables in the MI acpi(4) driver. - Move the routines for mapping and unmapping an ACPI table as well as mapping the RSDT or XSDT and searching for a table with a given signature out into acpica_machdep.c for both amd64 and i386. --- sys/amd64/acpica/acpi_machdep.c | 244 +++++++++++++++++++++++++++++++++++++ sys/amd64/acpica/madt.c | 233 ++--------------------------------- sys/amd64/include/acpica_machdep.h | 3 + 3 files changed, 257 insertions(+), 223 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c index b902c12..0d866e8 100644 --- a/sys/amd64/acpica/acpi_machdep.c +++ b/sys/amd64/acpica/acpi_machdep.c @@ -32,8 +32,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include +#include +#include #include @@ -100,6 +104,246 @@ acpi_cpu_c1() } /* + * Support for mapping ACPI tables during early boot. Currently this + * uses the crashdump map to map each table. However, the crashdump + * map is created in pmap_bootstrap() right after the direct map, so + * we should be able to just use pmap_mapbios() here instead. + * + * This makes the following assumptions about how we use this KVA: + * pages 0 and 1 are used to map in the header of each table found via + * the RSDT or XSDT and pages 2 to n are used to map in the RSDT or + * XSDT. This has to use 2 pages for the table headers in case a + * header spans a page boundary. + * + * XXX: We don't ensure the table fits in the available address space + * in the crashdump map. + */ + +/* + * Map some memory using the crashdump map. 'offset' is an offset in + * pages into the crashdump map to use for the start of the mapping. + */ +static void * +table_map(vm_paddr_t pa, int offset, vm_offset_t length) +{ + vm_offset_t va, off; + void *data; + + off = pa & PAGE_MASK; + length = roundup(length + off, PAGE_SIZE); + pa = pa & PG_FRAME; + va = (vm_offset_t)pmap_kenter_temporary(pa, offset) + + (offset * PAGE_SIZE); + data = (void *)(va + off); + length -= PAGE_SIZE; + while (length > 0) { + va += PAGE_SIZE; + pa += PAGE_SIZE; + length -= PAGE_SIZE; + pmap_kenter(va, pa); + invlpg(va); + } + return (data); +} + +/* Unmap memory previously mapped with table_map(). 
*/ +static void +table_unmap(void *data, vm_offset_t length) +{ + vm_offset_t va, off; + + va = (vm_offset_t)data; + off = va & PAGE_MASK; + length = roundup(length + off, PAGE_SIZE); + va &= ~PAGE_MASK; + while (length > 0) { + pmap_kremove(va); + invlpg(va); + va += PAGE_SIZE; + length -= PAGE_SIZE; + } +} + +/* + * Map a table at a given offset into the crashdump map. It first + * maps the header to determine the table length and then maps the + * entire table. + */ +static void * +map_table(vm_paddr_t pa, int offset, const char *sig) +{ + ACPI_TABLE_HEADER *header; + vm_offset_t length; + void *table; + + header = table_map(pa, offset, sizeof(ACPI_TABLE_HEADER)); + if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) { + table_unmap(header, sizeof(ACPI_TABLE_HEADER)); + return (NULL); + } + length = header->Length; + table_unmap(header, sizeof(ACPI_TABLE_HEADER)); + table = table_map(pa, offset, length); + if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { + if (bootverbose) + printf("ACPI: Failed checksum for table %s\n", sig); + table_unmap(table, length); + return (NULL); + } + return (table); +} + +/* + * See if a given ACPI table is the requested table. Returns the + * length of the able if it matches or zero on failure. + */ +static int +probe_table(vm_paddr_t address, const char *sig) +{ + ACPI_TABLE_HEADER *table; + + table = table_map(address, 0, sizeof(ACPI_TABLE_HEADER)); + if (table == NULL) { + if (bootverbose) + printf("ACPI: Failed to map table at 0x%jx\n", + (uintmax_t)address); + return (0); + } + if (bootverbose) + printf("Table '%.4s' at 0x%jx\n", table->Signature, + (uintmax_t)address); + + if (strncmp(table->Signature, sig, ACPI_NAME_SIZE) != 0) { + table_unmap(table, sizeof(ACPI_TABLE_HEADER)); + return (0); + } + table_unmap(table, sizeof(ACPI_TABLE_HEADER)); + return (1); +} + +/* + * Try to map a table at a given physical address previously returned + * by acpi_find_table(). + */ +void * +acpi_map_table(vm_paddr_t pa, const char *sig) +{ + + return (map_table(pa, 0, sig)); +} + +/* Unmap a table previously mapped via acpi_map_table(). */ +void +acpi_unmap_table(void *table) +{ + ACPI_TABLE_HEADER *header; + + header = (ACPI_TABLE_HEADER *)table; + table_unmap(table, header->Length); +} + +/* + * Return the physical address of the requested table or zero if one + * is not found. + */ +vm_paddr_t +acpi_find_table(const char *sig) +{ + ACPI_PHYSICAL_ADDRESS rsdp_ptr; + ACPI_TABLE_RSDP *rsdp; + ACPI_TABLE_RSDT *rsdt; + ACPI_TABLE_XSDT *xsdt; + ACPI_TABLE_HEADER *table; + vm_paddr_t addr; + int i, count; + + if (resource_disabled("acpi", 0)) + return (0); + + /* + * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn + * calls pmap_mapbios() to find the RSDP, we assume that we can use + * pmap_mapbios() to map the RSDP. + */ + if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0) + return (0); + rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP)); + if (rsdp == NULL) { + if (bootverbose) + printf("ACPI: Failed to map RSDP\n"); + return (0); + } + + /* + * For ACPI >= 2.0, use the XSDT if it is available. + * Otherwise, use the RSDT. We map the XSDT or RSDT at page 2 + * in the crashdump area. Pages 0 and 1 are used to map in the + * headers of candidate ACPI tables. + */ + addr = 0; + if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) { + /* + * AcpiOsGetRootPointer only verifies the checksum for + * the version 1.0 portion of the RSDP. Version 2.0 has + * an additional checksum that we verify first. 
+ */ + if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) { + if (bootverbose) + printf("ACPI: RSDP failed extended checksum\n"); + return (0); + } + xsdt = map_table(rsdp->XsdtPhysicalAddress, 2, ACPI_SIG_XSDT); + if (xsdt == NULL) { + if (bootverbose) + printf("ACPI: Failed to map XSDT\n"); + return (0); + } + count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / + sizeof(UINT64); + for (i = 0; i < count; i++) + if (probe_table(xsdt->TableOffsetEntry[i], sig)) { + addr = xsdt->TableOffsetEntry[i]; + break; + } + acpi_unmap_table(xsdt); + } else { + rsdt = map_table(rsdp->RsdtPhysicalAddress, 2, ACPI_SIG_RSDT); + if (rsdt == NULL) { + if (bootverbose) + printf("ACPI: Failed to map RSDT\n"); + return (0); + } + count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / + sizeof(UINT32); + for (i = 0; i < count; i++) + if (probe_table(rsdt->TableOffsetEntry[i], sig)) { + addr = rsdt->TableOffsetEntry[i]; + break; + } + acpi_unmap_table(rsdt); + } + pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP)); + if (addr == 0) { + if (bootverbose) + printf("ACPI: No %s table found\n", sig); + return (0); + } + if (bootverbose) + printf("%s: Found table at 0x%jx\n", sig, (uintmax_t)addr); + + /* + * Verify that we can map the full table and that its checksum is + * correct, etc. + */ + table = map_table(addr, 0, sig); + if (table == NULL) + return (0); + acpi_unmap_table(table); + + return (addr); +} + +/* * ACPI nexus(4) driver. */ static int diff --git a/sys/amd64/acpica/madt.c b/sys/amd64/acpica/madt.c index b27f8e4..a409682 100644 --- a/sys/amd64/acpica/madt.c +++ b/sys/amd64/acpica/madt.c @@ -36,27 +36,19 @@ __FBSDID("$FreeBSD$"); #include #include #include - #include -#include #include #include -#include #include #include -#include -#include #include -#include #include #include #include -typedef void madt_entry_handler(ACPI_SUBTABLE_HEADER *entry, void *arg); - /* These two arrays are indexed by APIC IDs. */ struct ioapic_info { void *io_apic; @@ -79,8 +71,6 @@ static enum intr_polarity interrupt_polarity(UINT16 IntiFlags, UINT8 Source); static enum intr_trigger interrupt_trigger(UINT16 IntiFlags, UINT8 Source); static int madt_find_cpu(u_int acpi_id, u_int *apic_id); static int madt_find_interrupt(int intr, void **apic, u_int *pin); -static void *madt_map(vm_paddr_t pa, int offset, vm_offset_t length); -static void *madt_map_table(vm_paddr_t pa, int offset, const char *sig); static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg); static void madt_parse_interrupt_override( ACPI_MADT_INTERRUPT_OVERRIDE *intr); @@ -92,13 +82,10 @@ static int madt_probe(void); static int madt_probe_cpus(void); static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry, void *arg __unused); -static int madt_probe_table(vm_paddr_t address); static void madt_register(void *dummy); static int madt_setup_local(void); static int madt_setup_io(void); -static void madt_unmap(void *data, vm_offset_t length); -static void madt_unmap_table(void *table); -static void madt_walk_table(madt_entry_handler *handler, void *arg); +static void madt_walk_table(acpi_subtable_handler *handler, void *arg); static struct apic_enumerator madt_enumerator = { "MADT", @@ -109,224 +96,30 @@ static struct apic_enumerator madt_enumerator = { }; /* - * Code to abuse the crashdump map to map in the tables for the early - * probe. 
We cheat and make the following assumptions about how we - * use this KVA: pages 0 and 1 are used to map in the header of each - * table found via the RSDT or XSDT and pages 2 to n are used to map - * in the RSDT or XSDT. We have to use 2 pages for the table headers - * in case a header spans a page boundary. The offset is in pages; - * the length is in bytes. - */ -static void * -madt_map(vm_paddr_t pa, int offset, vm_offset_t length) -{ - vm_offset_t va, off; - void *data; - - off = pa & PAGE_MASK; - length = roundup(length + off, PAGE_SIZE); - pa = pa & PG_FRAME; - va = (vm_offset_t)pmap_kenter_temporary(pa, offset) + - (offset * PAGE_SIZE); - data = (void *)(va + off); - length -= PAGE_SIZE; - while (length > 0) { - va += PAGE_SIZE; - pa += PAGE_SIZE; - length -= PAGE_SIZE; - pmap_kenter(va, pa); - invlpg(va); - } - return (data); -} - -static void -madt_unmap(void *data, vm_offset_t length) -{ - vm_offset_t va, off; - - va = (vm_offset_t)data; - off = va & PAGE_MASK; - length = roundup(length + off, PAGE_SIZE); - va &= ~PAGE_MASK; - while (length > 0) { - pmap_kremove(va); - invlpg(va); - va += PAGE_SIZE; - length -= PAGE_SIZE; - } -} - -static void * -madt_map_table(vm_paddr_t pa, int offset, const char *sig) -{ - ACPI_TABLE_HEADER *header; - vm_offset_t length; - void *table; - - header = madt_map(pa, offset, sizeof(ACPI_TABLE_HEADER)); - if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) { - madt_unmap(header, sizeof(ACPI_TABLE_HEADER)); - return (NULL); - } - length = header->Length; - madt_unmap(header, sizeof(ACPI_TABLE_HEADER)); - table = madt_map(pa, offset, length); - if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { - if (bootverbose) - printf("MADT: Failed checksum for table %s\n", sig); - madt_unmap(table, length); - return (NULL); - } - return (table); -} - -static void -madt_unmap_table(void *table) -{ - ACPI_TABLE_HEADER *header; - - header = (ACPI_TABLE_HEADER *)table; - madt_unmap(table, header->Length); -} - -/* * Look for an ACPI Multiple APIC Description Table ("APIC") */ static int madt_probe(void) { - ACPI_PHYSICAL_ADDRESS rsdp_ptr; - ACPI_TABLE_RSDP *rsdp; - ACPI_TABLE_RSDT *rsdt; - ACPI_TABLE_XSDT *xsdt; - int i, count; - if (resource_disabled("acpi", 0)) + madt_physaddr = acpi_find_table(ACPI_SIG_MADT); + if (madt_physaddr == 0) return (ENXIO); - - /* - * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn - * calls pmap_mapbios() to find the RSDP, we assume that we can use - * pmap_mapbios() to map the RSDP. - */ - if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0) - return (ENXIO); - rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP)); - if (rsdp == NULL) { - if (bootverbose) - printf("MADT: Failed to map RSDP\n"); - return (ENXIO); - } - - /* - * For ACPI >= 2.0, use the XSDT if it is available. - * Otherwise, use the RSDT. We map the XSDT or RSDT at page 1 - * in the crashdump area. Page 0 is used to map in the - * headers of candidate ACPI tables. - */ - if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) { - /* - * AcpiOsGetRootPointer only verifies the checksum for - * the version 1.0 portion of the RSDP. Version 2.0 has - * an additional checksum that we verify first. 
- */ - if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) { - if (bootverbose) - printf("MADT: RSDP failed extended checksum\n"); - return (ENXIO); - } - xsdt = madt_map_table(rsdp->XsdtPhysicalAddress, 2, - ACPI_SIG_XSDT); - if (xsdt == NULL) { - if (bootverbose) - printf("MADT: Failed to map XSDT\n"); - return (ENXIO); - } - count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / - sizeof(UINT64); - for (i = 0; i < count; i++) - if (madt_probe_table(xsdt->TableOffsetEntry[i])) - break; - madt_unmap_table(xsdt); - } else { - rsdt = madt_map_table(rsdp->RsdtPhysicalAddress, 2, - ACPI_SIG_RSDT); - if (rsdt == NULL) { - if (bootverbose) - printf("MADT: Failed to map RSDT\n"); - return (ENXIO); - } - count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / - sizeof(UINT32); - for (i = 0; i < count; i++) - if (madt_probe_table(rsdt->TableOffsetEntry[i])) - break; - madt_unmap_table(rsdt); - } - pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP)); - if (madt_physaddr == 0) { - if (bootverbose) - printf("MADT: No MADT table found\n"); - return (ENXIO); - } - if (bootverbose) - printf("MADT: Found table at 0x%jx\n", - (uintmax_t)madt_physaddr); - - /* - * Verify that we can map the full table and that its checksum is - * correct, etc. - */ - madt = madt_map_table(madt_physaddr, 0, ACPI_SIG_MADT); - if (madt == NULL) - return (ENXIO); - madt_unmap_table(madt); - madt = NULL; - return (0); } /* - * See if a given ACPI table is the MADT. - */ -static int -madt_probe_table(vm_paddr_t address) -{ - ACPI_TABLE_HEADER *table; - - table = madt_map(address, 0, sizeof(ACPI_TABLE_HEADER)); - if (table == NULL) { - if (bootverbose) - printf("MADT: Failed to map table at 0x%jx\n", - (uintmax_t)address); - return (0); - } - if (bootverbose) - printf("Table '%.4s' at 0x%jx\n", table->Signature, - (uintmax_t)address); - - if (strncmp(table->Signature, ACPI_SIG_MADT, ACPI_NAME_SIZE) != 0) { - madt_unmap(table, sizeof(ACPI_TABLE_HEADER)); - return (0); - } - madt_physaddr = address; - madt_length = table->Length; - madt_unmap(table, sizeof(ACPI_TABLE_HEADER)); - return (1); -} - -/* * Run through the MP table enumerating CPUs. */ static int madt_probe_cpus(void) { - madt = madt_map_table(madt_physaddr, 0, ACPI_SIG_MADT); + madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT); + madt_length = madt->Header.Length; KASSERT(madt != NULL, ("Unable to re-map MADT")); madt_walk_table(madt_probe_cpus_handler, NULL); - madt_unmap_table(madt); + acpi_unmap_table(madt); madt = NULL; return (0); } @@ -417,17 +210,11 @@ SYSINIT(madt_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, * Call the handler routine for each entry in the MADT table. 
*/ static void -madt_walk_table(madt_entry_handler *handler, void *arg) +madt_walk_table(acpi_subtable_handler *handler, void *arg) { - ACPI_SUBTABLE_HEADER *entry; - u_char *p, *end; - - end = (u_char *)(madt) + madt->Header.Length; - for (p = (u_char *)(madt + 1); p < end; ) { - entry = (ACPI_SUBTABLE_HEADER *)p; - handler(entry, arg); - p += entry->Length; - } + + acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, + handler, arg); } static void diff --git a/sys/amd64/include/acpica_machdep.h b/sys/amd64/include/acpica_machdep.h index 76cc69e..9943af7 100644 --- a/sys/amd64/include/acpica_machdep.h +++ b/sys/amd64/include/acpica_machdep.h @@ -77,5 +77,8 @@ extern int acpi_release_global_lock(uint32_t *lock); void acpi_SetDefaultIntrModel(int model); void acpi_cpu_c1(void); +void *acpi_map_table(vm_paddr_t pa, const char *sig); +void acpi_unmap_table(void *table); +vm_paddr_t acpi_find_table(const char *sig); #endif /* __ACPICA_MACHDEP_H__ */ -- cgit v1.1 From 6248096baf3dba8807416018717b2cf49d9418e3 Mon Sep 17 00:00:00 2001 From: kib Date: Thu, 29 Oct 2009 16:24:39 +0000 Subject: MFC r197389: Do panic regardeless of execution mode at the moment of T_RESERVED trap. --- sys/amd64/amd64/trap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 65f761e..cfccf3c 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -253,6 +253,11 @@ trap(struct trapframe *frame) } #endif + if (type == T_RESERVED) { + trap_fatal(frame, 0); + goto out; + } + #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is -- cgit v1.1 From 23c01e6e9ddfd2d071ce9f2df153c573c89de41b Mon Sep 17 00:00:00 2001 From: alc Date: Sat, 31 Oct 2009 18:54:26 +0000 Subject: MFC r197316 Add a new sysctl for reporting all of the supported page sizes. --- sys/amd64/include/param.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index edcf427..10d3ab3 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -118,6 +118,8 @@ #define NBPML4 (1ul< Date: Sat, 31 Oct 2009 19:02:08 +0000 Subject: MFC r197317 When superpages are enabled, add the 2 or 4MB page size to the array of supported page sizes. --- sys/amd64/amd64/pmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 97de6b6..d3d653d 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -663,6 +663,11 @@ pmap_init(void) * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + if (pg_ps_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("pmap_init: can't assign to pagesizes[1]")); + pagesizes[1] = NBPDR; + } /* * Calculate the size of the pv head table for superpages. 
-- cgit v1.1 From c8f0456bbb423d4a66a049f99b5edaabde6bc7a0 Mon Sep 17 00:00:00 2001 From: avg Date: Sun, 1 Nov 2009 17:45:37 +0000 Subject: MFC 197647: cpufunc.h: unify/correct style of c extension names --- sys/amd64/include/cpufunc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index eb264ae..dee78cb 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -277,7 +277,7 @@ static __inline void mfence(void) { - __asm__ __volatile("mfence" : : : "memory"); + __asm __volatile("mfence" : : : "memory"); } static __inline void @@ -457,14 +457,14 @@ load_es(u_int sel) __asm __volatile("mov %0,%%es" : : "rm" (sel)); } -static inline void +static __inline void cpu_monitor(const void *addr, int extensions, int hints) { __asm __volatile("monitor;" : :"a" (addr), "c" (extensions), "d"(hints)); } -static inline void +static __inline void cpu_mwait(int extensions, int hints) { __asm __volatile("mwait;" : :"a" (hints), "c" (extensions)); -- cgit v1.1 From 6ecbe62b9495ba34cbecad58409cc5c20016dffe Mon Sep 17 00:00:00 2001 From: avg Date: Sun, 1 Nov 2009 18:39:26 +0000 Subject: MFC 197450: number of cleanups in i386 and amd64 pci md code --- sys/amd64/pci/pci_cfgreg.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/pci/pci_cfgreg.c b/sys/amd64/pci/pci_cfgreg.c index be9e404..3e29a58 100644 --- a/sys/amd64/pci/pci_cfgreg.c +++ b/sys/amd64/pci/pci_cfgreg.c @@ -181,9 +181,9 @@ pci_cfgenable(unsigned bus, unsigned slot, unsigned func, int reg, int bytes) { int dataport = 0; - if (bus <= PCI_BUSMAX && slot < 32 && func <= PCI_FUNCMAX && - reg <= PCI_REGMAX && bytes != 3 && (unsigned) bytes <= 4 && - (reg & (bytes - 1)) == 0) { + if (bus <= PCI_BUSMAX && slot <= PCI_SLOTMAX && func <= PCI_FUNCMAX && + (unsigned)reg <= PCI_REGMAX && bytes != 3 && + (unsigned)bytes <= 4 && (reg & (bytes - 1)) == 0) { outl(CONF1_ADDR_PORT, (1 << 31) | (bus << 16) | (slot << 11) | (func << 8) | (reg & ~0x03)); dataport = CONF1_DATA_PORT + (reg & 0x03); @@ -281,7 +281,7 @@ pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus) * fall back to using type 1 config access instead. */ if (pci_cfgregopen() != 0) { - for (slot = 0; slot < 32; slot++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { val1 = pcireg_cfgread(0, slot, 0, 0, 4); if (val1 == 0xffffffff) continue; @@ -309,8 +309,8 @@ pciereg_cfgread(int bus, unsigned slot, unsigned func, unsigned reg, volatile vm_offset_t va; int data = -1; - if (bus < pcie_minbus || bus > pcie_maxbus || slot >= 32 || - func > PCI_FUNCMAX || reg >= 0x1000) + if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX || + func > PCI_FUNCMAX || reg > PCIE_REGMAX) return (-1); va = PCIE_VADDR(pcie_base, reg, bus, slot, func); @@ -336,8 +336,8 @@ pciereg_cfgwrite(int bus, unsigned slot, unsigned func, unsigned reg, int data, { volatile vm_offset_t va; - if (bus < pcie_minbus || bus > pcie_maxbus || slot >= 32 || - func > PCI_FUNCMAX || reg >= 0x1000) + if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX || + func > PCI_FUNCMAX || reg > PCIE_REGMAX) return; va = PCIE_VADDR(pcie_base, reg, bus, slot, func); -- cgit v1.1 From 096a30cf601c35e8da952b7712d8474b7aca22b2 Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 4 Nov 2009 20:49:14 +0000 Subject: MFC 198554: Fix some problems with effective mmap() offsets > 32 bits. This was partially fixed on amd64 earlier. 
Rather than forcing linux_mmap_common() to use a 32-bit offset, have it accept a 64-bit file offset. This offset is then passed to the real mmap() call. Rather than inventing a structure to hold the normal linux_mmap args that has a 64-bit offset, just pass each of the arguments individually to linux_mmap_common() since that more closes matches the existing style of various kern_foo() functions. --- sys/amd64/linux32/linux32_machdep.c | 66 +++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c index 42ea070..46119b6 100644 --- a/sys/amd64/linux32/linux32_machdep.c +++ b/sys/amd64/linux32/linux32_machdep.c @@ -91,6 +91,10 @@ linux_to_bsd_sigaltstack(int lsa) return (bsa); } +static int linux_mmap_common(struct thread *td, l_uintptr_t addr, + l_size_t len, l_int prot, l_int flags, l_int fd, + l_loff_t pos); + int bsd_to_linux_sigaltstack(int bsa) { @@ -759,12 +763,9 @@ linux_clone(struct thread *td, struct linux_clone_args *args) #define STACK_SIZE (2 * 1024 * 1024) #define GUARD_SIZE (4 * PAGE_SIZE) -static int linux_mmap_common(struct thread *, struct l_mmap_argv *); - int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { - struct l_mmap_argv linux_args; #ifdef DEBUG if (ldebug(mmap2)) @@ -773,14 +774,9 @@ linux_mmap2(struct thread *td, struct linux_mmap2_args *args) args->flags, args->fd, args->pgoff); #endif - linux_args.addr = PTROUT(args->addr); - linux_args.len = args->len; - linux_args.prot = args->prot; - linux_args.flags = args->flags; - linux_args.fd = args->fd; - linux_args.pgoff = args->pgoff; - - return (linux_mmap_common(td, &linux_args)); + return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, + args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * + PAGE_SIZE)); } int @@ -799,15 +795,15 @@ linux_mmap(struct thread *td, struct linux_mmap_args *args) linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pgoff); #endif - if ((linux_args.pgoff % PAGE_SIZE) != 0) - return (EINVAL); - linux_args.pgoff /= PAGE_SIZE; - return (linux_mmap_common(td, &linux_args)); + return (linux_mmap_common(td, linux_args.addr, linux_args.len, + linux_args.prot, linux_args.flags, linux_args.fd, + (uint32_t)linux_args.pgoff)); } static int -linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) +linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, + l_int flags, l_int fd, l_loff_t pos) { struct proc *p = td->td_proc; struct mmap_args /* { @@ -830,21 +826,20 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * Linux mmap(2): * You must specify exactly one of MAP_SHARED and MAP_PRIVATE */ - if (! 
((linux_args->flags & LINUX_MAP_SHARED) ^ - (linux_args->flags & LINUX_MAP_PRIVATE))) + if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) return (EINVAL); - if (linux_args->flags & LINUX_MAP_SHARED) + if (flags & LINUX_MAP_SHARED) bsd_args.flags |= MAP_SHARED; - if (linux_args->flags & LINUX_MAP_PRIVATE) + if (flags & LINUX_MAP_PRIVATE) bsd_args.flags |= MAP_PRIVATE; - if (linux_args->flags & LINUX_MAP_FIXED) + if (flags & LINUX_MAP_FIXED) bsd_args.flags |= MAP_FIXED; - if (linux_args->flags & LINUX_MAP_ANON) + if (flags & LINUX_MAP_ANON) bsd_args.flags |= MAP_ANON; else bsd_args.flags |= MAP_NOSYNC; - if (linux_args->flags & LINUX_MAP_GROWSDOWN) + if (flags & LINUX_MAP_GROWSDOWN) bsd_args.flags |= MAP_STACK; /* @@ -852,12 +847,12 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * on Linux/i386. We do this to ensure maximum compatibility. * Linux/ia64 does the same in i386 emulation mode. */ - bsd_args.prot = linux_args->prot; + bsd_args.prot = prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ - bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : linux_args->fd; + bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; if (bsd_args.fd != -1) { /* * Linux follows Solaris mmap(2) description: @@ -882,7 +877,7 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) fdrop(fp, td); } - if (linux_args->flags & LINUX_MAP_GROWSDOWN) { + if (flags & LINUX_MAP_GROWSDOWN) { /* * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option @@ -905,8 +900,7 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * fixed size of (STACK_SIZE - GUARD_SIZE). */ - if ((caddr_t)PTRIN(linux_args->addr) + linux_args->len > - p->p_vmspace->vm_maxsaddr) { + if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { /* * Some Linux apps will attempt to mmap * thread stacks near the top of their @@ -937,19 +931,19 @@ linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) * we map the full stack, since we don't have a way * to autogrow it. */ - if (linux_args->len > STACK_SIZE - GUARD_SIZE) { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); - bsd_args.len = linux_args->len; + if (len > STACK_SIZE - GUARD_SIZE) { + bsd_args.addr = (caddr_t)PTRIN(addr); + bsd_args.len = len; } else { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) - - (STACK_SIZE - GUARD_SIZE - linux_args->len); + bsd_args.addr = (caddr_t)PTRIN(addr) - + (STACK_SIZE - GUARD_SIZE - len); bsd_args.len = STACK_SIZE - GUARD_SIZE; } } else { - bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); - bsd_args.len = linux_args->len; + bsd_args.addr = (caddr_t)PTRIN(addr); + bsd_args.len = len; } - bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE; + bsd_args.pos = pos; #ifdef DEBUG if (ldebug(mmap)) -- cgit v1.1 From 72c2b241e0df95c69568e87e57e73171f83a3097 Mon Sep 17 00:00:00 2001 From: attilio Date: Fri, 6 Nov 2009 10:15:15 +0000 Subject: MFC r198868, r198950: Opteron rev E family of processor expose a bug where acq memory barriers can be broken, resulting in random breakages. Printout a warning message if affecred family and model are found. 
--- sys/amd64/amd64/identcpu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 2c1b804..19ddd96 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -607,6 +607,21 @@ print_AMD_info(void) printf(", %d lines/tag", (regs[2] >> 8) & 0x0f); print_AMD_l2_assoc((regs[2] >> 12) & 0x0f); } + + /* + * Opteron Rev E shows a bug as in very rare occasions a read memory + * barrier is not performed as expected if it is followed by a + * non-atomic read-modify-write instruction. + * As long as that bug pops up very rarely (intensive machine usage + * on other operating systems generally generates one unexplainable + * crash any 2 months) and as long as a model specific fix would be + * impratical at this stage, print out a warning string if the broken + * model and family are identified. + */ + if (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x20 && + CPUID_TO_MODEL(cpu_id) <= 0x3f) + printf("WARNING: This architecture revision has known SMP " + "hardware bugs which may cause random instability\n"); } static void -- cgit v1.1 From 7a40b8619c3f71cce2a269846280bdba81b22164 Mon Sep 17 00:00:00 2001 From: attilio Date: Fri, 6 Nov 2009 15:24:48 +0000 Subject: MFC r197070: Consolidate CPUID to CPU family/model macros for amd64 and i386 to reduce unnecessary #ifdef's for shared code between them. This MFC should unbreak the kernel build breakage introduced by r198977. Reported by: kib Pointy hat to: me --- sys/amd64/amd64/identcpu.c | 14 +++++++------- sys/amd64/amd64/initcpu.c | 4 ++-- sys/amd64/amd64/msi.c | 4 ++-- sys/amd64/include/specialreg.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 19ddd96..420dd03 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -371,21 +371,21 @@ printcpuinfo(void) switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if ((amd_pminfo & AMDPM_TSC_INVARIANT) || - AMD64_CPU_FAMILY(cpu_id) >= 0x10 || + CPUID_TO_FAMILY(cpu_id) >= 0x10 || cpu_id == 0x60fb2) tsc_is_invariant = 1; break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) || - (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xe) || - (AMD64_CPU_FAMILY(cpu_id) == 0xf && - AMD64_CPU_MODEL(cpu_id) >= 0x3)) + (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xe) || + (CPUID_TO_FAMILY(cpu_id) == 0xf && + CPUID_TO_MODEL(cpu_id) >= 0x3)) tsc_is_invariant = 1; break; case CPU_VENDOR_CENTAUR: - if (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf && + if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; break; diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 0037d66..7aaff82 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -154,8 +154,8 @@ initializecpu(void) pg_nx = PG_NX; } if (cpu_vendor_id == CPU_VENDOR_CENTAUR && - AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf) + CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf) init_via(); /* diff --git a/sys/amd64/amd64/msi.c b/sys/amd64/amd64/msi.c index 736b692..91a8cbb 100644 --- a/sys/amd64/amd64/msi.c +++ b/sys/amd64/amd64/msi.c @@ -275,8 +275,8 @@ msi_init(void) case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: - if (AMD64_CPU_FAMILY(cpu_id) == 0x6 && - AMD64_CPU_MODEL(cpu_id) >= 0xf) 
+ if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 88ff734..d1f0c89 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -168,10 +168,10 @@ #define CPUID_FAMILY 0x00000f00 #define CPUID_EXT_MODEL 0x000f0000 #define CPUID_EXT_FAMILY 0x0ff00000 -#define AMD64_CPU_MODEL(id) \ +#define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ (((id) & CPUID_EXT_MODEL) >> 12)) -#define AMD64_CPU_FAMILY(id) \ +#define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ (((id) & CPUID_EXT_FAMILY) >> 20)) -- cgit v1.1 From c60a1c40ea3922c4c5f89319d756bf2fa348cd3f Mon Sep 17 00:00:00 2001 From: kensmith Date: Mon, 9 Nov 2009 21:39:42 +0000 Subject: Comment out the sbp(4) entry for GENERIC config files that contain it. There are known issues with this driver that are beyond what can be fixed for 8.0-RELEASE and the bugs can cause boot failure on some systems. It's not clear if it impacts all systems and there is interest in getting the problem fixed so for now just comment it out instead of remove it. Commit straight to stable/8, this is an 8.0-RELEASE issue. Head was left alone so work on it can continue there. Reviewed by: Primary misc. architecture maintainers (marcel, marius) --- sys/amd64/conf/GENERIC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 24300bd..d0f24e2 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -313,7 +313,7 @@ device udav # Davicom DM9601E USB # FireWire support device firewire # FireWire bus code -device sbp # SCSI over FireWire (Requires scbus and da) +#device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver -- cgit v1.1 From 890346338074885650a925acecc26c6214f9254d Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 17 Nov 2009 15:56:45 +0000 Subject: MFC 198043: Move the USB wireless drivers down into their own section next to the USB ethernet drivers. 
--- sys/amd64/conf/GENERIC | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index d0f24e2..f205106 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -288,10 +288,6 @@ device ukbd # Keyboard device ulpt # Printer device umass # Disks/Mass storage - Requires scbus and da device ums # Mouse -device rum # Ralink Technology RT2501USB wireless NICs -device uath # Atheros AR5523 wireless NICs -device ural # Ralink Technology RT2500USB wireless NICs -device zyd # ZyDAS zb1211/zb1211b wireless NICs device urio # Diamond Rio 500 MP3 player # USB Serial devices device uark # Technologies ARK3116 based serial adapters @@ -310,6 +306,11 @@ device cue # CATC USB Ethernet device kue # Kawasaki LSI USB Ethernet device rue # RealTek RTL8150 USB Ethernet device udav # Davicom DM9601E USB +# USB Wireless +device rum # Ralink Technology RT2501USB wireless NICs +device uath # Atheros AR5523 wireless NICs +device ural # Ralink Technology RT2500USB wireless NICs +device zyd # ZyDAS zb1211/zb1211b wireless NICs # FireWire support device firewire # FireWire bus code -- cgit v1.1 From 629ad8710b200ef57d28598657647a1306cf2b16 Mon Sep 17 00:00:00 2001 From: kuriyama Date: Sun, 22 Nov 2009 14:32:32 +0000 Subject: - MFC r199067,199215,199253 - Add hw.clflush_disable loader tunable to avoid panic (trap 9) at map_invalidate_cache_range() even if CPU is not Intel. - This tunable can be set to -1 (default), 0 and 1. -1 is same as current behavior, which automatically disable CLFLUSH on Intel CPUs without CPUID_SS (should be occured on Xen only). You can specify 1 when this panic happened on non-Intel CPUs (such as AMD's). Because disabling CLFLUSH may reduce performance, you can try with setting 0 on Intel CPUs without SS to use CLFLUSH feature. - Amd64 init_secondary() calls initializecpu() while curthread is still not properly set up. r199067 added the call to TUNABLE_INT_FETCH() to initializecpu() that results in hang because AP are started when kernel environment is already dynamic and thus needs to acquire mutex, that is too early in AP start sequence to work. Extract the code that should be executed only once, because it sets up global variables, from initializecpu() to initializecpucache(), and call the later only from hammer_time() executed on BSP. Now, TUNABLE_INT_FETCH() is done only once at BSP at the early boot stage. --- sys/amd64/amd64/initcpu.c | 22 +++++++++++++++++++++- sys/amd64/amd64/machdep.c | 1 + sys/amd64/include/md_var.h | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 7aaff82..c97ad3d 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -47,6 +47,12 @@ __FBSDID("$FreeBSD$"); static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); +/* + * -1: automatic (default) + * 0: keep enable CLFLUSH + * 1: force disable CLFLUSH + */ +static int hw_clflush_disable = -1; int cpu; /* Are we 386, 386sx, 486, etc? */ u_int cpu_feature; /* Feature flags */ @@ -157,6 +163,11 @@ initializecpu(void) CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) init_via(); +} + +void +initializecpucache() +{ /* * CPUID with %eax = 1, %ebx returns @@ -169,6 +180,15 @@ initializecpu(void) * XXXKIB: (temporary) hack to work around traps generated when * CLFLUSHing APIC registers window. 
*/ - if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS)) + TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); + if (cpu_vendor_id == CPU_VENDOR_INTEL && !(cpu_feature & CPUID_SS) && + hw_clflush_disable == -1) + cpu_feature &= ~CPUID_CLFSH; + /* + * Allow to disable CLFLUSH feature manually by + * hw.clflush_disable tunable. This may help Xen guest on some AMD + * CPUs. + */ + if (hw_clflush_disable == 1) cpu_feature &= ~CPUID_CLFSH; } diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 95db5d2..e4c51a3 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1667,6 +1667,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ + initializecpucache(); /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = thread0.td_kstack + \ diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index c66fc9f..15df851 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -89,6 +89,7 @@ void gs_load_fault(void) __asm(__STRING(gs_load_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void initializecpu(void); +void initializecpucache(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); void fpstate_drop(struct thread *td); int is_physical_memory(vm_paddr_t addr); -- cgit v1.1 From c75ccf4f6c42e9755a06177deed1a3b2aa9025ed Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 5 Dec 2009 20:37:46 +0000 Subject: MFC r197518: lindev(4) [1] is supposed to be a collection of linux-specific pseudo devices that we also support, just not by default (thus only LINT or module builds by default). While currently there is only "/dev/full" [2], we are planning to see more in the future. We may decide to change the module/dependency logic in the future should the list grow too long. This is not part of linux.ko as also non-linux binaries like kFreeBSD userland or ports can make use of this as well. Suggested by: rwatson [1] (name) Submitted by: ed [2] Discussed with: markm, ed, rwatson, kib (weeks ago) Reviewed by: rwatson, brueffer (prev. version) PR: kern/68961 --- sys/amd64/conf/NOTES | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 27fe068..5361224 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -503,3 +503,6 @@ options VM_KMEM_SIZE_SCALE # Enable NDIS binary driver support options NDISAPI device ndis + +# Linux-specific pseudo devices support +device lindev -- cgit v1.1 From 4b8cc441d499645fb9384830f45f987eda50ae9f Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 5 Dec 2009 20:43:15 +0000 Subject: MFC r197729: Make sure that the primary native brandinfo always gets added first and the native ia32 compat as middle (before other things). o(ld)brandinfo as well as third party like linux, kfreebsd, etc. stays on SI_ORDER_ANY coming last. The reason for this is only to make sure that even in case we would overflow the MAX_BRANDS sized array, the native FreeBSD brandinfo would still be there and the system would be operational. 
Reviewed by: kib --- sys/amd64/amd64/elf_machdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index d5e7a6e..dc7c8b9 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -89,7 +89,7 @@ static Elf64_Brandinfo freebsd_brand_info = { .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; -SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_info); -- cgit v1.1 From 4f817226e165dbc93626909eb579a189bd4b6b67 Mon Sep 17 00:00:00 2001 From: avg Date: Tue, 8 Dec 2009 15:21:39 +0000 Subject: MFC r199184: reflect that pg_ps_enabled is a tunable --- sys/amd64/amd64/pmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d3d653d..70fc041 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -181,7 +181,7 @@ pt_entry_t pg_nx; SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; -SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, "Are large page mappings enabled?"); static u_int64_t KPTphys; /* phys addr of kernel level 1 */ -- cgit v1.1 From 8dde51c9b8bb821ae73fe850b7e6fd6d5d2bf998 Mon Sep 17 00:00:00 2001 From: avg Date: Tue, 8 Dec 2009 15:27:06 +0000 Subject: MFC r199968: x86 cpu features: add MOVBE reporting and flag --- sys/amd64/amd64/identcpu.c | 2 +- sys/amd64/include/specialreg.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 420dd03..3cd2f5e 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -259,7 +259,7 @@ printcpuinfo(void) "\024SSE4.1" "\025SSE4.2" "\026x2APIC" /* xAPIC Extensions */ - "\027" + "\027MOVBE" "\030POPCNT" "\031" "\032" diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index d1f0c89..8cadbcd 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -129,6 +129,7 @@ #define CPUID2_SSE41 0x00080000 #define CPUID2_SSE42 0x00100000 #define CPUID2_X2APIC 0x00200000 +#define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 /* -- cgit v1.1 From e317625370af5eb44b8855a15df69c7848fc1b45 Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 12 Dec 2009 20:06:25 +0000 Subject: MFC r199135: Extract the code that records syscall results in the frame into MD function cpu_set_syscall_retval(). --- sys/amd64/amd64/trap.c | 34 +--------------------------------- sys/amd64/amd64/vm_machdep.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 33 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index cfccf3c..5583c82 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -1004,39 +1004,7 @@ syscall(struct trapframe *frame) #endif } - switch (error) { - case 0: - frame->tf_rax = td->td_retval[0]; - frame->tf_rdx = td->td_retval[1]; - frame->tf_rflags &= ~PSL_C; - break; - - case ERESTART: - /* - * Reconstruct pc, we know that 'syscall' is 2 bytes. - * We have to do a full context restore so that %r10 - * (which was holding the value of %rcx) is restored for - * the next iteration. 
- */ - frame->tf_rip -= frame->tf_err; - frame->tf_r10 = frame->tf_rcx; - td->td_pcb->pcb_flags |= PCB_FULLCTX; - break; - - case EJUSTRETURN: - break; - - default: - if (p->p_sysent->sv_errsize) { - if (error >= p->p_sysent->sv_errsize) - error = -1; /* XXX */ - else - error = p->p_sysent->sv_errtbl[error]; - } - frame->tf_rax = error; - frame->tf_rflags |= PSL_C; - break; - } + cpu_set_syscall_retval(td, error); /* * Traced syscall. diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 51d1d62..6e56740 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -317,6 +317,45 @@ cpu_thread_free(struct thread *td) cpu_thread_clean(td); } +void +cpu_set_syscall_retval(struct thread *td, int error) +{ + + switch (error) { + case 0: + td->td_frame->tf_rax = td->td_retval[0]; + td->td_frame->tf_rdx = td->td_retval[1]; + td->td_frame->tf_rflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, we know that 'syscall' is 2 bytes. + * We have to do a full context restore so that %r10 + * (which was holding the value of %rcx) is restored + * for the next iteration. + */ + td->td_frame->tf_rip -= td->td_frame->tf_err; + td->td_frame->tf_r10 = td->td_frame->tf_rcx; + td->td_pcb->pcb_flags |= PCB_FULLCTX; + break; + + case EJUSTRETURN: + break; + + default: + if (td->td_proc->p_sysent->sv_errsize) { + if (error >= td->td_proc->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = td->td_proc->p_sysent->sv_errtbl[error]; + } + td->td_frame->tf_rax = error; + td->td_frame->tf_rflags |= PSL_C; + break; + } +} + /* * Initialize machine state (pcb and trap frame) for a new thread about to * upcall. Put enough state in the new thread's PCB to get it to go back -- cgit v1.1 From 62403394c0144ba68b98a015c199ceabd258a12b Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 19 Dec 2009 10:28:24 +0000 Subject: MFC r200444: For ia32 syscall(), call cpu_set_syscall_retval(). --- sys/amd64/amd64/vm_machdep.c | 6 +++++- sys/amd64/ia32/ia32_syscall.c | 30 +----------------------------- 2 files changed, 6 insertions(+), 30 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 6e56740..a99fdaa 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -330,10 +330,14 @@ cpu_set_syscall_retval(struct thread *td, int error) case ERESTART: /* - * Reconstruct pc, we know that 'syscall' is 2 bytes. + * Reconstruct pc, we know that 'syscall' is 2 bytes, + * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. + * We saved this in tf_err. * We have to do a full context restore so that %r10 * (which was holding the value of %rcx) is restored * for the next iteration. + * r10 restore is only required for freebsd/amd64 processes, + * but shall be innocent for any ia32 ABI. */ td->td_frame->tf_rip -= td->td_frame->tf_err; td->td_frame->tf_r10 = td->td_frame->tf_rcx; diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 4807248..5e20876 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -183,35 +183,7 @@ ia32_syscall(struct trapframe *frame) AUDIT_SYSCALL_EXIT(error, td); } - switch (error) { - case 0: - frame->tf_rax = td->td_retval[0]; - frame->tf_rdx = td->td_retval[1]; - frame->tf_rflags &= ~PSL_C; - break; - - case ERESTART: - /* - * Reconstruct pc, assuming lcall $X,y is 7 bytes, - * int 0x80 is 2 bytes. We saved this in tf_err. 
- */ - frame->tf_rip -= frame->tf_err; - break; - - case EJUSTRETURN: - break; - - default: - if (p->p_sysent->sv_errsize) { - if (error >= p->p_sysent->sv_errsize) - error = -1; /* XXX */ - else - error = p->p_sysent->sv_errtbl[error]; - } - frame->tf_rax = error; - frame->tf_rflags |= PSL_C; - break; - } + cpu_set_syscall_retval(td, error); /* * Traced syscall. -- cgit v1.1 From 6e07528a2ecb9c4786663c06930b2f0d59a50694 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 19 Dec 2009 10:38:28 +0000 Subject: MFC r200033: mca: improve status checking, recording and reporting --- sys/amd64/amd64/mca.c | 111 +++++++++++++++++++++++++++--------------------- sys/amd64/include/mca.h | 1 + 2 files changed, 63 insertions(+), 49 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index d291d00..7014f75 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -117,48 +117,6 @@ sysctl_mca_records(SYSCTL_HANDLER_ARGS) return (SYSCTL_OUT(req, &record, sizeof(record))); } -static struct mca_record * -mca_record_entry(int bank) -{ - struct mca_internal *rec; - uint64_t status; - u_int p[4]; - - status = rdmsr(MSR_MC_STATUS(bank)); - if (!(status & MC_STATUS_VAL)) - return (NULL); - - rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT | M_ZERO); - if (rec == NULL) { - printf("MCA: Unable to allocate space for an event.\n"); - return (NULL); - } - - /* Save exception information. */ - rec->rec.mr_status = status; - if (status & MC_STATUS_ADDRV) - rec->rec.mr_addr = rdmsr(MSR_MC_ADDR(bank)); - if (status & MC_STATUS_MISCV) - rec->rec.mr_misc = rdmsr(MSR_MC_MISC(bank)); - rec->rec.mr_tsc = rdtsc(); - rec->rec.mr_apic_id = PCPU_GET(apic_id); - - /* - * Clear machine check. Don't do this for uncorrectable - * errors so that the BIOS can see them. - */ - if (!(rec->rec.mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) { - wrmsr(MSR_MC_STATUS(bank), 0); - do_cpuid(0, p); - } - - mtx_lock_spin(&mca_lock); - STAILQ_INSERT_TAIL(&mca_records, rec, link); - mca_count++; - mtx_unlock_spin(&mca_lock); - return (&rec->rec); -} - static const char * mca_error_ttype(uint16_t mca_error) { @@ -219,11 +177,13 @@ mca_error_request(uint16_t mca_error) } /* Dump details about a single machine check. */ -static void -mca_log(struct mca_record *rec) +static void __nonnull(1) +mca_log(const struct mca_record *rec) { uint16_t mca_error; + printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + (long long)rec->mr_status); printf("MCA: CPU %d ", rec->mr_apic_id); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); @@ -329,6 +289,59 @@ mca_log(struct mca_record *rec) printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr); } +static int __nonnull(2) +mca_check_status(int bank, struct mca_record *rec) +{ + uint64_t status; + u_int p[4]; + + status = rdmsr(MSR_MC_STATUS(bank)); + if (!(status & MC_STATUS_VAL)) + return (0); + + /* Save exception information. */ + rec->mr_status = status; + rec->mr_bank = bank; + rec->mr_addr = 0; + if (status & MC_STATUS_ADDRV) + rec->mr_addr = rdmsr(MSR_MC_ADDR(bank)); + rec->mr_misc = 0; + if (status & MC_STATUS_MISCV) + rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); + rec->mr_tsc = rdtsc(); + rec->mr_apic_id = PCPU_GET(apic_id); + + /* + * Clear machine check. Don't do this for uncorrectable + * errors so that the BIOS can see them. 
+ */ + if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) { + wrmsr(MSR_MC_STATUS(bank), 0); + do_cpuid(0, p); + } + return (1); +} + +static void __nonnull(1) +mca_record_entry(const struct mca_record *record) +{ + struct mca_internal *rec; + + rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT); + if (rec == NULL) { + printf("MCA: Unable to allocate space for an event.\n"); + mca_log(record); + return; + } + + rec->rec = *record; + rec->logged = 0; + mtx_lock_spin(&mca_lock); + STAILQ_INSERT_TAIL(&mca_records, rec, link); + mca_count++; + mtx_unlock_spin(&mca_lock); +} + /* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are @@ -341,7 +354,7 @@ mca_log(struct mca_record *rec) static int mca_scan(int mcip) { - struct mca_record *rec; + struct mca_record rec; uint64_t mcg_cap, ucmask; int count, i, recoverable; @@ -354,13 +367,13 @@ mca_scan(int mcip) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - rec = mca_record_entry(i); - if (rec != NULL) { + if (mca_check_status(i, &rec)) { count++; - if (rec->mr_status & ucmask) { + if (rec.mr_status & ucmask) { recoverable = 0; - mca_log(rec); + mca_log(&rec); } + mca_record_entry(&rec); } } return (mcip ? recoverable : count); diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index c43d989..ddc3aeb 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -36,6 +36,7 @@ struct mca_record { uint64_t mr_misc; uint64_t mr_tsc; int mr_apic_id; + int mr_bank; }; #ifdef _KERNEL -- cgit v1.1 From 2d6460c70b9c9e42fa2165c305d435aabe264008 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 19 Dec 2009 10:44:26 +0000 Subject: MFC r200064: mca: small enhancements related to cpu quirks --- sys/amd64/amd64/mca.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 7014f75..0403de4 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -478,6 +479,8 @@ void mca_init(void) { uint64_t mcg_cap; + uint64_t ctl; + int skip; int i; /* MCE is required. */ @@ -495,15 +498,26 @@ mca_init(void) wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - /* - * Enable logging of all errors. For P6 - * processors, MC0_CTL is always enabled. - * - * XXX: Better CPU test needed here? - */ - if (!(i == 0 && (cpu_id & 0xf00) == 0x600)) - wrmsr(MSR_MC_CTL(i), 0xffffffffffffffffUL); + /* By default enable logging of all errors. */ + ctl = 0xffffffffffffffffUL; + skip = 0; + + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + /* + * For P6 models before Nehalem MC0_CTL is + * always enabled and reserved. + */ + if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6 + && CPUID_TO_MODEL(cpu_id) < 0x1a) + skip = 1; + } else if (cpu_vendor_id == CPU_VENDOR_AMD) { + /* BKDG for Family 10h: unset GartTblWkEn. */ + if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf) + ctl &= ~(1UL << 10); + } + if (!skip) + wrmsr(MSR_MC_CTL(i), ctl); /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } -- cgit v1.1 From d1f389d774670bd276da5e368fad88de21804cbb Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 19 Dec 2009 11:31:28 +0000 Subject: MFC r198507: Use kern_sigprocmask() instead of direct manipulation of td_sigmask to reschedule newly blocked signals. 
MFC r198590: Trapsignal() calls kern_sigprocmask() when delivering catched signal with proc lock held. MFC r198670: For trapsignal() and postsig(), kern_sigprocmask() is called with both process lock and curproc->p_sigacts->ps_mtx locked. Prevent lock recursion on ps_mtx in reschedule_signals(). --- sys/amd64/amd64/machdep.c | 8 ++------ sys/amd64/ia32/ia32_signal.c | 30 ++++++++---------------------- sys/amd64/linux32/linux32_sysvec.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 40 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index e4c51a3..c4130a4 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -415,7 +415,7 @@ sigreturn(td, uap) ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; - const ucontext_t *ucp; + ucontext_t *ucp; long rflags; int cs, error, ret; ksiginfo_t ksi; @@ -478,7 +478,6 @@ sigreturn(td, uap) td->td_pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; td->td_pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; - PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; @@ -486,10 +485,7 @@ sigreturn(td, uap) td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_flags |= PCB_FULLCTX; td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index d7c1dd5..10ec641 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -244,10 +244,8 @@ freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap) if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { - SIG_CANTMASK(uc.uc_sigmask); - PROC_LOCK(td->td_proc); - td->td_sigmask = uc.uc_sigmask; - PROC_UNLOCK(td->td_proc); + kern_sigprocmask(td, SIG_SETMASK, + &uc.uc_sigmask, NULL, 0); } } } @@ -273,10 +271,8 @@ freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap) if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { - SIG_CANTMASK(uc.uc_sigmask); - PROC_LOCK(td->td_proc); - td->td_sigmask = uc.uc_sigmask; - PROC_UNLOCK(td->td_proc); + kern_sigprocmask(td, SIG_SETMASK, + &uc.uc_sigmask, NULL, 0); } } } @@ -544,9 +540,8 @@ freebsd4_freebsd32_sigreturn(td, uap) } */ *uap; { struct ia32_ucontext4 uc; - struct proc *p = td->td_proc; struct trapframe *regs; - const struct ia32_ucontext4 *ucp; + struct ia32_ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; @@ -610,11 +605,7 @@ freebsd4_freebsd32_sigreturn(td, uap) regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; - PROC_LOCK(p); - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } @@ -631,9 +622,8 @@ freebsd32_sigreturn(td, uap) } */ *uap; { struct ia32_ucontext uc; - struct proc *p = td->td_proc; struct trapframe *regs; - const struct ia32_ucontext *ucp; + struct ia32_ucontext *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; @@ -702,11 +692,7 @@ freebsd32_sigreturn(td, uap) regs->tf_gs = ucp->uc_mcontext.mc_gs; regs->tf_flags = TF_HASSEGS; - PROC_LOCK(p); - td->td_sigmask = ucp->uc_sigmask; - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + kern_sigprocmask(td, 
SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 54a04ee..6e3e326 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -565,9 +565,9 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { - struct proc *p = td->td_proc; struct l_sigframe frame; struct trapframe *regs; + sigset_t bmask; l_sigset_t lmask; int eflags, i; ksiginfo_t ksi; @@ -623,11 +623,8 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; - PROC_LOCK(p); - linux_to_bsd_sigset(&lmask, &td->td_sigmask); - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + linux_to_bsd_sigset(&lmask, &bmask); + kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. @@ -666,9 +663,9 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { - struct proc *p = td->td_proc; struct l_ucontext uc; struct l_sigcontext *context; + sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; @@ -725,11 +722,8 @@ linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) return(EINVAL); } - PROC_LOCK(p); - linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); - SIG_CANTMASK(td->td_sigmask); - signotify(td); - PROC_UNLOCK(p); + linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); + kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context -- cgit v1.1 From dca5db95ead46cf3b3e992a4b6ce1b96273e1df5 Mon Sep 17 00:00:00 2001 From: avg Date: Mon, 21 Dec 2009 05:58:55 +0000 Subject: MFC r199969: amdsbwd: new driver for AMD SB600/SB7xx watchdog timer --- sys/amd64/conf/NOTES | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 5361224..a231d33 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -385,8 +385,10 @@ device asmc # Hardware watchdog timers: # # ichwd: Intel ICH watchdog timer +# amdsbwd: AMD SB7xx watchdog timer # device ichwd +device amdsbwd # # Temperature sensors: -- cgit v1.1 From 0b09bc897f9960c759f1437056dd1dae0a7f27b0 Mon Sep 17 00:00:00 2001 From: dougb Date: Tue, 29 Dec 2009 05:35:25 +0000 Subject: MFC r200594: Add INCLUDE_CONFIG_FILE, and a note in comments about how to also include the comments with CONFIGARGS --- sys/amd64/conf/DEFAULTS | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/DEFAULTS b/sys/amd64/conf/DEFAULTS index 78952ee..d6d39b6 100644 --- a/sys/amd64/conf/DEFAULTS +++ b/sys/amd64/conf/DEFAULTS @@ -21,3 +21,9 @@ options GEOM_PART_EBR options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR +# Store the plain version of the configuration file in the kernel itself. +# To store the entire file, including comments, put this in /etc/src.conf: +# CONFIGARGS= -C +# See config(8) for more details. +# +options INCLUDE_CONFIG_FILE # Include this file in kernel -- cgit v1.1 From cf0d4c606046680632b3e1b5711ae921a707be70 Mon Sep 17 00:00:00 2001 From: imp Date: Mon, 4 Jan 2010 21:33:10 +0000 Subject: Revert 201158. 
DEFAULTS isn't for this kind of thing.a --- sys/amd64/conf/DEFAULTS | 7 ------- 1 file changed, 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/DEFAULTS b/sys/amd64/conf/DEFAULTS index d6d39b6..1fb52b3 100644 --- a/sys/amd64/conf/DEFAULTS +++ b/sys/amd64/conf/DEFAULTS @@ -20,10 +20,3 @@ options GEOM_PART_BSD options GEOM_PART_EBR options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR - -# Store the plain version of the configuration file in the kernel itself. -# To store the entire file, including comments, put this in /etc/src.conf: -# CONFIGARGS= -C -# See config(8) for more details. -# -options INCLUDE_CONFIG_FILE # Include this file in kernel -- cgit v1.1 From e906e61885fb51fb5334cec9007bd7cd17507e99 Mon Sep 17 00:00:00 2001 From: brooks Date: Tue, 12 Jan 2010 06:00:56 +0000 Subject: MFC r201443: Add vlan(4) to all GENERIC kernels. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index f205106..6dc5c2c 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -265,6 +265,7 @@ device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. device loop # Network loopback device random # Entropy device device ether # Ethernet support +device vlan # 802.1Q VLAN support device tun # Packet tunnel. device pty # BSD-style compatibility pseudo ttys device md # Memory "disks" -- cgit v1.1 From aa96e9a5bce21018931dd47a420764621121186f Mon Sep 17 00:00:00 2001 From: kib Date: Fri, 15 Jan 2010 22:19:51 +0000 Subject: MFC r201890: Set md_ldt after md_ldt_sd is populated. --- sys/amd64/amd64/sys_machdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index 1cba8a2..bb81664 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -420,13 +420,14 @@ user_ldt_alloc(struct proc *p, int force) return (pldt); } - mdp->md_ldt = new_ldt; if (pldt != NULL) { bcopy(pldt->ldt_base, new_ldt->ldt_base, max_ldt_segment * sizeof(struct user_segment_descriptor)); user_ldt_derefl(pldt); } ssdtosyssd(&sldt, &p->p_md.md_ldt_sd); + atomic_store_rel_ptr((volatile uintptr_t *)&mdp->md_ldt, + (uintptr_t)new_ldt); if (p == curproc) set_user_ldt(mdp); -- cgit v1.1 From 258a09a63f93b76b1ccd2c3d907309d3556f50b7 Mon Sep 17 00:00:00 2001 From: imp Date: Mon, 18 Jan 2010 00:53:21 +0000 Subject: MFC r202019: Add INCLUDE_CONFIG_FILE in GENERIC on all non-embedded platforms. # This is the resolution of removing it from DEFAULTS... --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 6dc5c2c..e5a6955 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -75,6 +75,7 @@ options MAC # TrustedBSD MAC Framework options FLOWTABLE # per-cpu routing cache #options KDTRACE_FRAME # Ensure frames are compiled in #options KDTRACE_HOOKS # Kernel DTrace hooks +options INCLUDE_CONFIG_FILE # Include this file in kernel # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel -- cgit v1.1 From f870e8630e0e421cce286c928d950a148a741bd3 Mon Sep 17 00:00:00 2001 From: alc Date: Mon, 18 Jan 2010 21:17:03 +0000 Subject: MFC r202085 Simplify pmap_init(). Additionally, correct a harmless misbehavior on i386. 
--- sys/amd64/amd64/pmap.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 70fc041..b26cc68 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -626,7 +626,6 @@ pmap_page_init(vm_page_t m) void pmap_init(void) { - pd_entry_t *pd; vm_page_t mpte; vm_size_t s; int i, pv_npg; @@ -635,18 +634,13 @@ pmap_init(void) * Initialize the vm page array entries for the kernel pmap's * page table pages. */ - pd = pmap_pde(kernel_pmap, KERNBASE); for (i = 0; i < NKPT; i++) { - if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V)) - continue; - KASSERT((pd[i] & PG_V) != 0, - ("pmap_init: page table page is missing")); - mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME); + mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; - mpte->phys_addr = pd[i] & PG_FRAME; + mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* -- cgit v1.1 From ed3f8b6cbec7668d7c3684c50f6db189f85ee4d7 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 21 Jan 2010 15:10:20 +0000 Subject: MFC 202286: Update the ident for the XENHVM kernel config to match the filename. --- sys/amd64/conf/XENHVM | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index 1536e3c..f875f5a 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -19,7 +19,7 @@ # $FreeBSD$ cpu HAMMER -ident GENERIC +ident XENHVM # To statically compile in device wiring instead of /boot/device.hints #hints "GENERIC.hints" # Default places to look for devices. -- cgit v1.1 From 41fd8cafd516ff7948cdf8e99dbdb1e760f8df59 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 21 Jan 2010 17:54:29 +0000 Subject: MFC 198134,198149,198170,198171,198391,200948: Add a facility for associating optional descriptions with active interrupt handlers. This is primarily intended as a way to allow devices that use multiple interrupts (e.g. MSI) to meaningfully distinguish the various interrupt handlers. - Add a new BUS_DESCRIBE_INTR() method to the bus interface to associate a description with an active interrupt handler setup by BUS_SETUP_INTR. It has a default method (bus_generic_describe_intr()) which simply passes the request up to the parent device. - Add a bus_describe_intr() wrapper around BUS_DESCRIBE_INTR() that supports printf(9) style formatting using var args. - Reserve MAXCOMLEN bytes in the intr_handler structure to hold the name of an interrupt handler and copy the name passed to intr_event_add_handler() into that buffer instead of just saving the pointer to the name. - Add a new intr_event_describe_handler() which appends a description string to an interrupt handler's name. - Implement support for interrupt descriptions on amd64, i386, and sparc64 by having the nexus(4) driver supply a custom bus_describe_intr method that invokes a new intr_describe() MD routine which in turn looks up the associated interrupt event and invokes intr_event_describe_handler(). 
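
The bus_describe_intr() wrapper introduced by the commit above is meant to be called by a driver after bus_setup_intr() has returned its cookie. The fragment below is only an illustrative sketch, not part of the patch: the driver softc fields, the "tx_intr" handler, and the "txq%d" naming are invented for the example. It shows how a multi-vector (e.g. MSI-X) driver could tag each active handler so per-handler interrupt statistics carry distinguishable names.

	/*
	 * Hypothetical driver fragment; sc->irq_res[], sc->intr_tag[],
	 * sc->num_queues and tx_intr() are assumptions for illustration.
	 */
	for (i = 0; i < sc->num_queues; i++) {
		error = bus_setup_intr(dev, sc->irq_res[i],
		    INTR_TYPE_NET | INTR_MPSAFE, NULL, tx_intr,
		    &sc->queue[i], &sc->intr_tag[i]);
		if (error)
			return (error);
		/* printf(9)-style description appended to the handler name. */
		bus_describe_intr(dev, sc->irq_res[i], sc->intr_tag[i],
		    "txq%d", i);
	}
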
--- sys/amd64/amd64/intr_machdep.c | 17 +++++++++++++++++ sys/amd64/amd64/nexus.c | 12 ++++++++++++ sys/amd64/include/intr_machdep.h | 1 + 3 files changed, 30 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/intr_machdep.c b/sys/amd64/amd64/intr_machdep.c index 212ac0d..6ab80df 100644 --- a/sys/amd64/amd64/intr_machdep.c +++ b/sys/amd64/amd64/intr_machdep.c @@ -400,6 +400,23 @@ atpic_reset(void) } #endif +/* Add a description to an active interrupt handler. */ +int +intr_describe(u_int vector, void *ih, const char *descr) +{ + struct intsrc *isrc; + int error; + + isrc = intr_lookup_source(vector); + if (isrc == NULL) + return (EINVAL); + error = intr_event_describe_handler(isrc->is_event, ih, descr); + if (error) + return (error); + intrcnt_updatename(isrc); + return (0); +} + #ifdef DDB /* * Dump data about interrupt handlers diff --git a/sys/amd64/amd64/nexus.c b/sys/amd64/amd64/nexus.c index 5eafd3b..61cb587 100644 --- a/sys/amd64/amd64/nexus.c +++ b/sys/amd64/amd64/nexus.c @@ -92,6 +92,9 @@ static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif static int nexus_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); +static int nexus_describe_intr(device_t dev, device_t child, + struct resource *irq, void *cookie, + const char *descr); static int nexus_activate_resource(device_t, device_t, int, int, struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, @@ -135,6 +138,7 @@ static device_method_t nexus_methods[] = { DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), + DEVMETHOD(bus_describe_intr, nexus_describe_intr), DEVMETHOD(bus_get_resource_list, nexus_get_reslist), DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), @@ -479,6 +483,14 @@ nexus_config_intr(device_t dev, int irq, enum intr_trigger trig, return (intr_config_intr(irq, trig, pol)); } +static int +nexus_describe_intr(device_t dev, device_t child, struct resource *irq, + void *cookie, const char *descr) +{ + + return (intr_describe(rman_get_start(irq), cookie, descr)); +} + static struct resource_list * nexus_get_reslist(device_t dev, device_t child) { diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h index 634db19..6cd4eee 100644 --- a/sys/amd64/include/intr_machdep.h +++ b/sys/amd64/include/intr_machdep.h @@ -151,6 +151,7 @@ int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); +int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); u_int intr_next_cpu(void); struct intsrc *intr_lookup_source(int vector); -- cgit v1.1 From 8be646a544a7d87a9a3ceff771365482bb960a53 Mon Sep 17 00:00:00 2001 From: marcel Date: Fri, 22 Jan 2010 03:50:43 +0000 Subject: MFC rev. 202097: Use io(4) for I/O port access on ia64, rather than through sysarch(2). 
--- sys/amd64/amd64/io.c | 9 +++++++++ sys/amd64/include/iodev.h | 1 + 2 files changed, 10 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/io.c b/sys/amd64/amd64/io.c index 02d9c8d..09d6e89 100644 --- a/sys/amd64/amd64/io.c +++ b/sys/amd64/amd64/io.c @@ -76,3 +76,12 @@ ioclose(struct cdev *dev __unused, int flags __unused, int fmt __unused, return (0); } + +/* ARGSUSED */ +int +ioioctl(struct cdev *dev __unused, u_long cmd __unused, caddr_t data __unused, + int fflag __unused, struct thread *td __unused) +{ + + return (ENXIO); +} diff --git a/sys/amd64/include/iodev.h b/sys/amd64/include/iodev.h index 4b35d8b..1a0a17a 100644 --- a/sys/amd64/include/iodev.h +++ b/sys/amd64/include/iodev.h @@ -28,3 +28,4 @@ d_open_t ioopen; d_close_t ioclose; +d_ioctl_t ioioctl; -- cgit v1.1 From 250c6042c1a01d96dc2843c5fb8641674678684a Mon Sep 17 00:00:00 2001 From: gavin Date: Fri, 5 Feb 2010 08:52:51 +0000 Subject: Merge r202161 from head: Spell "Hz" correctly wherever it is user-visible. PR: bin/142566 Submitted by: N.J. Mann njm njm.me.uk --- sys/amd64/amd64/local_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 87bec91..98ed4df 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -448,7 +448,7 @@ lapic_setup_clock(void) panic("lapic: Divisor too big"); value /= 2; if (bootverbose) - printf("lapic: Divisor %lu, Frequency %lu hz\n", + printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, value); /* -- cgit v1.1 From 105ceef6e77714e5419b12c57bcc1fa98d485e01 Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 6 Feb 2010 12:17:20 +0000 Subject: MFC r203160: add static qualifier to definition of a static function --- sys/amd64/amd64/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/msi.c b/sys/amd64/amd64/msi.c index 91a8cbb..6745ce2 100644 --- a/sys/amd64/amd64/msi.c +++ b/sys/amd64/amd64/msi.c @@ -288,7 +288,7 @@ msi_init(void) mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } -void +static void msi_create_source(void) { struct msi_intsrc *msi; -- cgit v1.1 From e896a698a55dac3d00b1d933e56575d08d06bf37 Mon Sep 17 00:00:00 2001 From: kib Date: Sun, 7 Feb 2010 11:37:38 +0000 Subject: MFC r202882: For i386, amd64 and ia32 on amd64 MD syscall(), reread syscall number and arguments after ptracestop(), if debugger modified anything in the process environment. --- sys/amd64/amd64/trap.c | 176 +++++++++++++++++++++++++----------------- sys/amd64/ia32/ia32_syscall.c | 149 +++++++++++++++++++++-------------- 2 files changed, 200 insertions(+), 125 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 5583c82..41ca758 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -884,95 +884,131 @@ dblfault_handler(struct trapframe *frame) panic("double fault"); } -/* - * syscall - system call request C handler - * - * A system call is essentially treated as a trap. 
- */ -void -syscall(struct trapframe *frame) -{ - caddr_t params; +struct syscall_args { + u_int code; struct sysent *callp; - struct thread *td = curthread; - struct proc *p = td->td_proc; - register_t orig_tf_rflags; - int error; - int narg; register_t args[8]; register_t *argp; - u_int code; - int reg, regcnt; - ksiginfo_t ksi; - - PCPU_INC(cnt.v_syscall); + int narg; +}; -#ifdef DIAGNOSTIC - if (ISPL(frame->tf_cs) != SEL_UPL) { - panic("syscall"); - /* NOT REACHED */ - } -#endif +static int +fetch_syscall_args(struct thread *td, struct syscall_args *sa) +{ + struct proc *p; + struct trapframe *frame; + caddr_t params; + int reg, regcnt, error; + p = td->td_proc; + frame = td->td_frame; reg = 0; regcnt = 6; - td->td_pticks = 0; - td->td_frame = frame; - if (td->td_ucred != p->p_ucred) - cred_update_thread(td); + params = (caddr_t)frame->tf_rsp + sizeof(register_t); - code = frame->tf_rax; - orig_tf_rflags = frame->tf_rflags; + sa->code = frame->tf_rax; if (p->p_sysent->sv_prepsyscall) { - (*p->p_sysent->sv_prepsyscall)(frame, (int *)args, &code, ¶ms); + (*p->p_sysent->sv_prepsyscall)(frame, (int *)sa->args, + &sa->code, ¶ms); } else { - if (code == SYS_syscall || code == SYS___syscall) { - code = frame->tf_rdi; + if (sa->code == SYS_syscall || sa->code == SYS___syscall) { + sa->code = frame->tf_rdi; reg++; regcnt--; } } - if (p->p_sysent->sv_mask) - code &= p->p_sysent->sv_mask; + sa->code &= p->p_sysent->sv_mask; - if (code >= p->p_sysent->sv_size) - callp = &p->p_sysent->sv_table[0]; + if (sa->code >= p->p_sysent->sv_size) + sa->callp = &p->p_sysent->sv_table[0]; else - callp = &p->p_sysent->sv_table[code]; + sa->callp = &p->p_sysent->sv_table[sa->code]; - narg = callp->sy_narg; - KASSERT(narg <= sizeof(args) / sizeof(args[0]), + sa->narg = sa->callp->sy_narg; + KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; - argp = &frame->tf_rdi; - argp += reg; - bcopy(argp, args, sizeof(args[0]) * regcnt); - if (narg > regcnt) { + sa->argp = &frame->tf_rdi; + sa->argp += reg; + bcopy(sa->argp, sa->args, sizeof(sa->args[0]) * regcnt); + if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); - error = copyin(params, &args[regcnt], - (narg - regcnt) * sizeof(args[0])); + error = copyin(params, &sa->args[regcnt], + (sa->narg - regcnt) * sizeof(sa->args[0])); } - argp = &args[0]; + sa->argp = &sa->args[0]; + /* + * This may result in two records if debugger modified + * registers or memory during sleep at stop/ptrace point. + */ #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) - ktrsyscall(code, narg, argp); + ktrsyscall(sa->code, sa->narg, sa->argp); #endif + return (error); +} - CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_name, code); +/* + * syscall - system call request C handler + * + * A system call is essentially treated as a trap. 
+ */ +void +syscall(struct trapframe *frame) +{ + struct thread *td; + struct proc *p; + struct syscall_args sa; + register_t orig_tf_rflags; + int error; + ksiginfo_t ksi; + PCPU_INC(cnt.v_syscall); + td = curthread; + p = td->td_proc; td->td_syscalls++; +#ifdef DIAGNOSTIC + if (ISPL(frame->tf_cs) != SEL_UPL) { + panic("syscall"); + /* NOT REACHED */ + } +#endif + + td->td_pticks = 0; + td->td_frame = frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + orig_tf_rflags = frame->tf_rflags; + if (p->p_flag & P_TRACED) { + PROC_LOCK(p); + td->td_dbgflags &= ~TDB_USERWR; + PROC_UNLOCK(p); + } + error = fetch_syscall_args(td, &sa); + + CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, + td->td_proc->p_pid, td->td_name, sa.code); + if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; - STOPEVENT(p, S_SCE, narg); - + STOPEVENT(p, S_SCE, sa.narg); PTRACESTOP_SC(p, td, S_PT_SCE); + if (td->td_dbgflags & TDB_USERWR) { + /* + * Reread syscall number and arguments if + * debugger modified registers or memory. + */ + error = fetch_syscall_args(td, &sa); + if (error != 0) + goto retval; + td->td_retval[1] = frame->tf_rdx; + } #ifdef KDTRACE_HOOKS /* @@ -980,13 +1016,13 @@ syscall(struct trapframe *frame) * callback and if there is a probe active for the * syscall 'entry', process the probe. */ - if (systrace_probe_func != NULL && callp->sy_entry != 0) - (*systrace_probe_func)(callp->sy_entry, code, callp, - args); + if (systrace_probe_func != NULL && sa.callp->sy_entry != 0) + (*systrace_probe_func)(sa.callp->sy_entry, sa.code, + sa.callp, sa.args); #endif - AUDIT_SYSCALL_ENTER(code, td); - error = (*callp->sy_call)(td, argp); + AUDIT_SYSCALL_ENTER(sa.code, td); + error = (*sa.callp->sy_call)(td, sa.argp); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ @@ -998,12 +1034,12 @@ syscall(struct trapframe *frame) * callback and if there is a probe active for the * syscall 'return', process the probe. */ - if (systrace_probe_func != NULL && callp->sy_return != 0) - (*systrace_probe_func)(callp->sy_return, code, callp, - args); + if (systrace_probe_func != NULL && sa.callp->sy_return != 0) + (*systrace_probe_func)(sa.callp->sy_return, sa.code, + sa.callp, sa.args); #endif } - + retval: cpu_set_syscall_retval(td, error); /* @@ -1022,14 +1058,16 @@ syscall(struct trapframe *frame) * Check for misbehavior. */ WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + syscallnames[sa.code] : "???"); KASSERT(td->td_critnest == 0, ("System call %s returning in a critical section", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???")); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + syscallnames[sa.code] : "???")); KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", - (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???", - td->td_locks)); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? 
+ syscallnames[sa.code] : "???", td->td_locks)); /* * Handle reschedule and other end-of-syscall issues @@ -1037,11 +1075,11 @@ syscall(struct trapframe *frame) userret(td, frame); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_name, code); + td->td_proc->p_pid, td->td_name, sa.code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) - ktrsysret(code, error, td->td_retval[0]); + ktrsysret(sa.code, error, td->td_retval[0]); #endif /* @@ -1049,7 +1087,7 @@ syscall(struct trapframe *frame) * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ - STOPEVENT(p, S_SCX, code); + STOPEVENT(p, S_SCX, sa.code); PTRACESTOP_SC(p, td, S_PT_SCX); } diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 5e20876..aa1ae6c 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -88,101 +88,136 @@ extern const char *freebsd32_syscallnames[]; void ia32_syscall(struct trapframe *frame); /* Called from asm code */ -void -ia32_syscall(struct trapframe *frame) -{ +struct ia32_syscall_args { + u_int code; caddr_t params; - int i; struct sysent *callp; - struct thread *td = curthread; - struct proc *p = td->td_proc; - register_t orig_tf_rflags; - int error; + u_int64_t args64[8]; int narg; +}; + +static int +fetch_ia32_syscall_args(struct thread *td, struct ia32_syscall_args *sa) +{ + struct proc *p; + struct trapframe *frame; u_int32_t args[8]; - u_int64_t args64[8]; - u_int code; - ksiginfo_t ksi; + int error, i; - PCPU_INC(cnt.v_syscall); - td->td_pticks = 0; - td->td_frame = frame; - if (td->td_ucred != p->p_ucred) - cred_update_thread(td); - params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); - code = frame->tf_rax; - orig_tf_rflags = frame->tf_rflags; + p = td->td_proc; + frame = td->td_frame; + + sa->params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); + sa->code = frame->tf_rax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ - (*p->p_sysent->sv_prepsyscall)(frame, args, &code, ¶ms); + (*p->p_sysent->sv_prepsyscall)(frame, args, &sa->code, + &sa->params); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ - if (code == SYS_syscall) { + if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ - code = fuword32(params); - params += sizeof(int); - } else if (code == SYS___syscall) { + sa->code = fuword32(sa->params); + sa->params += sizeof(int); + } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. 
*/ - code = fuword32(params); - params += sizeof(quad_t); + sa->code = fuword32(sa->params); + sa->params += sizeof(quad_t); } } - if (p->p_sysent->sv_mask) - code &= p->p_sysent->sv_mask; - - if (code >= p->p_sysent->sv_size) - callp = &p->p_sysent->sv_table[0]; + sa->code &= p->p_sysent->sv_mask; + if (sa->code >= p->p_sysent->sv_size) + sa->callp = &p->p_sysent->sv_table[0]; else - callp = &p->p_sysent->sv_table[code]; - - narg = callp->sy_narg; + sa->callp = &p->p_sysent->sv_table[sa->code]; + sa->narg = sa->callp->sy_narg; - /* - * copyin and the ktrsyscall()/ktrsysret() code is MP-aware - */ - if (params != NULL && narg != 0) - error = copyin(params, (caddr_t)args, - (u_int)(narg * sizeof(int))); + if (sa->params != NULL && sa->narg != 0) + error = copyin(sa->params, (caddr_t)args, + (u_int)(sa->narg * sizeof(int))); else error = 0; - for (i = 0; i < narg; i++) - args64[i] = args[i]; + for (i = 0; i < sa->narg; i++) + sa->args64[i] = args[i]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) - ktrsyscall(code, narg, args64); + ktrsyscall(sa->code, sa->narg, sa->args64); #endif + + return (error); +} + +void +ia32_syscall(struct trapframe *frame) +{ + struct thread *td; + struct proc *p; + struct ia32_syscall_args sa; + register_t orig_tf_rflags; + int error; + ksiginfo_t ksi; + + PCPU_INC(cnt.v_syscall); + td = curthread; + p = td->td_proc; + td->td_syscalls++; + + td->td_pticks = 0; + td->td_frame = frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + orig_tf_rflags = frame->tf_rflags; + if (p->p_flag & P_TRACED) { + PROC_LOCK(p); + td->td_dbgflags &= ~TDB_USERWR; + PROC_UNLOCK(p); + } + error = fetch_ia32_syscall_args(td, &sa); + CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_proc->p_comm, code); + td->td_proc->p_pid, td->td_name, sa.code); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; - STOPEVENT(p, S_SCE, narg); - + STOPEVENT(p, S_SCE, sa.narg); PTRACESTOP_SC(p, td, S_PT_SCE); + if (td->td_dbgflags & TDB_USERWR) { + /* + * Reread syscall number and arguments if + * debugger modified registers or memory. + */ + error = fetch_ia32_syscall_args(td, &sa); + if (error != 0) + goto retval; + td->td_retval[1] = frame->tf_rdx; + } - AUDIT_SYSCALL_ENTER(code, td); - error = (*callp->sy_call)(td, args64); + AUDIT_SYSCALL_ENTER(sa.code, td); + error = (*sa.callp->sy_call)(td, sa.args64); AUDIT_SYSCALL_EXIT(error, td); - } + /* Save the latest error return value. */ + td->td_errno = error; + } + retval: cpu_set_syscall_retval(td, error); /* @@ -201,14 +236,16 @@ ia32_syscall(struct trapframe *frame) * Check for misbehavior. */ WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???"); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + freebsd32_syscallnames[sa.code] : "???"); KASSERT(td->td_critnest == 0, ("System call %s returning in a critical section", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???")); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? + freebsd32_syscallnames[sa.code] : "???")); KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", - (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???", - td->td_locks)); + (sa.code >= 0 && sa.code < SYS_MAXSYSCALL) ? 
+ freebsd32_syscallnames[sa.code] : "???", td->td_locks)); /* * Handle reschedule and other end-of-syscall issues @@ -216,10 +253,10 @@ ia32_syscall(struct trapframe *frame) userret(td, frame); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, - td->td_proc->p_pid, td->td_proc->p_comm, code); + td->td_proc->p_pid, td->td_proc->p_comm, sa.code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) - ktrsysret(code, error, td->td_retval[0]); + ktrsysret(sa.code, error, td->td_retval[0]); #endif /* @@ -227,7 +264,7 @@ ia32_syscall(struct trapframe *frame) * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ - STOPEVENT(p, S_SCX, code); + STOPEVENT(p, S_SCX, sa.code); PTRACESTOP_SC(p, td, S_PT_SCX); } -- cgit v1.1 From 05b666175c3574ab196b72aa01279fbccde0db29 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 2 Mar 2010 01:56:55 +0000 Subject: MFC x86emu/x86bios emulator and make previously i386 only dpms and vesa framebuffer driver, etc. work on FreeBSD/amd64. A significant amount of improvements were done by jkim@ during the recent months to make vesa(4) work better, over the initial code import. This work is based on OpenBSD's x86emu implementation and contributed by paradox and swell.k at gmail com. Hopefully I have stolen all their work to 8-STABLE :) All bugs in this commit are mine, as usual. --- sys/amd64/conf/NOTES | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index a231d33..159f12e 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -154,6 +154,17 @@ options AGP_DEBUG ##################################################################### # HARDWARE DEVICE CONFIGURATION +# To include support for VGA VESA video modes +options VESA + +# Turn on extra debugging checks and output for VESA support. +options VESA_DEBUG + +device dpms # DPMS suspend & resume via VESA BIOS + +# x86 real mode BIOS emulator, required by atkbdc/dpms/vesa +options X86BIOS + # # Optional devices: # @@ -213,6 +224,9 @@ options VGA_WIDTH90 # support 90 column modes # Debugging. options VGA_DEBUG +# Linear framebuffer driver for S3 VESA 1.2 cards. Works on top of VESA. +device s3pci + # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as -- cgit v1.1 From 39a08e2d4d047871df11c95a4989509bcede329c Mon Sep 17 00:00:00 2001 From: alc Date: Tue, 2 Mar 2010 16:29:08 +0000 Subject: MFC r204420 When running as a guest operating system, the FreeBSD kernel must assume that the virtual machine monitor has enabled machine check exceptions. Unfortunately, on AMD Family 10h processors the machine check hardware has a bug (Erratum 383) that can result in a false machine check exception when a superpage promotion occurs. Thus, I am disabling superpage promotion when the FreeBSD kernel is running as a guest operating system on an AMD Family 10h processor. 
--- sys/amd64/amd64/pmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index b26cc68..7bb81cc 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -654,6 +654,15 @@ pmap_init(void) pv_entry_high_water = 9 * (pv_entry_max / 10); /* + * Disable large page mappings by default if the kernel is running in + * a virtual machine on an AMD Family 10h processor. This is a work- + * around for Erratum 383. + */ + if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10) + pg_ps_enabled = 0; + + /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); -- cgit v1.1 From 9bc472d6f744e6faa934776114dd11efb219ced6 Mon Sep 17 00:00:00 2001 From: jhb Date: Mon, 8 Mar 2010 21:36:20 +0000 Subject: MFC 204518: Print the contents of the miscellaneous (MISC) register to the console if it is valid along with the other register values when a machine check is encountered. --- sys/amd64/amd64/mca.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 0403de4..b0e842a 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -288,6 +288,8 @@ mca_log(const struct mca_record *rec) printf("\n"); if (rec->mr_status & MC_STATUS_ADDRV) printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr); + if (rec->mr_status & MC_STATUS_MISCV) + printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); } static int __nonnull(2) -- cgit v1.1 From 5906cbf86b9bd4b4a3d60c8276db5df0a4742d72 Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 24 Mar 2010 09:45:17 +0000 Subject: MFC r204957: Fall back to wbinvd when region for CLFLUSH is >= 2MB. MFC r205334 (by avg): Fix a typo in a comment. --- sys/amd64/amd64/pmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 7bb81cc..0935506 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -941,7 +941,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) if (cpu_feature & CPUID_SS) ; /* If "Self Snoop" is supported, do nothing. */ - else if (cpu_feature & CPUID_CLFSH) { + else if ((cpu_feature & CPUID_CLFSH) != 0 && + eva - sva < 2 * 1024 * 1024) { /* * Otherwise, do per-cache line flush. Use the mfence @@ -958,7 +959,8 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) /* * No targeted cache flush methods are supported by CPU, - * globally invalidate cache as a last resort. + * or the supplied range is bigger than 2MB. + * Globally invalidate cache. */ pmap_invalidate_cache(); } -- cgit v1.1 From 52188d4fac4c52b1c94084a738e21504945c2a0b Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 25 Mar 2010 15:48:23 +0000 Subject: MFC 205013: Print out the family and model from the cpu_id. This is especially useful given the advent of the extended family and extended model fields. The values are printed in hex to match their common usage in documentation. 
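As a worked example of why the extended fields matter (the cpu_id value here is illustrative, not from the commit): an AMD Family 10h processor might report cpu_id = 0x100f42. The base family nibble is 0xf, so the extended family (0x01) is added, giving Family = 0x10; the extended model field is 0, so Model = 0x4; the low nibble is the Stepping, 2. The boot message therefore changes from just "Stepping = 2" to roughly:

	Family = 10  Model = 4  Stepping = 2

which matches the hex convention ("Family 10h") used in vendor documentation.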
--- sys/amd64/amd64/identcpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 3cd2f5e..b0da729 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -187,7 +187,9 @@ printcpuinfo(void) if (cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_CENTAUR) { - printf(" Stepping = %u", cpu_id & 0xf); + printf(" Family = %x", CPUID_TO_FAMILY(cpu_id)); + printf(" Model = %x", CPUID_TO_MODEL(cpu_id)); + printf(" Stepping = %u", cpu_id & CPUID_STEPPING); if (cpu_high > 0) { /* -- cgit v1.1 From e3fe54954e01dab6b1f05a15f0d43b54ce512210 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 13:01:30 +0000 Subject: MFC 205210,205448: Remove unneeded type specifiers from 64-bit constants. The compiler infers their natural type from the constants' values. --- sys/amd64/include/specialreg.h | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 8cadbcd..733f4d7 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -320,16 +320,16 @@ #define MTRR_N64K 8 /* numbers of fixed-size entries */ #define MTRR_N16K 16 #define MTRR_N4K 64 -#define MTRR_CAP_WC 0x0000000000000400UL -#define MTRR_CAP_FIXED 0x0000000000000100UL -#define MTRR_CAP_VCNT 0x00000000000000ffUL -#define MTRR_DEF_ENABLE 0x0000000000000800UL -#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400UL -#define MTRR_DEF_TYPE 0x00000000000000ffUL -#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000UL -#define MTRR_PHYSBASE_TYPE 0x00000000000000ffUL -#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000UL -#define MTRR_PHYSMASK_VALID 0x0000000000000800UL +#define MTRR_CAP_WC 0x0000000000000400 +#define MTRR_CAP_FIXED 0x0000000000000100 +#define MTRR_CAP_VCNT 0x00000000000000ff +#define MTRR_DEF_ENABLE 0x0000000000000800 +#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 +#define MTRR_DEF_TYPE 0x00000000000000ff +#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 +#define MTRR_PHYSBASE_TYPE 0x00000000000000ff +#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 +#define MTRR_PHYSMASK_VALID 0x0000000000000800 /* Performance Control Register (5x86 only). 
*/ #define PCR0 0x20 @@ -357,22 +357,22 @@ #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 -#define MCG_CTL_ENABLE 0xffffffffffffffffUL -#define MCG_CTL_DISABLE 0x0000000000000000UL +#define MCG_CTL_ENABLE 0xffffffffffffffff +#define MCG_CTL_DISABLE 0x0000000000000000 #define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4) #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) -#define MC_STATUS_MCA_ERROR 0x000000000000ffffUL -#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000UL -#define MC_STATUS_OTHER_INFO 0x01ffffff00000000UL -#define MC_STATUS_PCC 0x0200000000000000UL -#define MC_STATUS_ADDRV 0x0400000000000000UL -#define MC_STATUS_MISCV 0x0800000000000000UL -#define MC_STATUS_EN 0x1000000000000000UL -#define MC_STATUS_UC 0x2000000000000000UL -#define MC_STATUS_OVER 0x4000000000000000UL -#define MC_STATUS_VAL 0x8000000000000000UL +#define MC_STATUS_MCA_ERROR 0x000000000000ffff +#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 +#define MC_STATUS_OTHER_INFO 0x01ffffff00000000 +#define MC_STATUS_PCC 0x0200000000000000 +#define MC_STATUS_ADDRV 0x0400000000000000 +#define MC_STATUS_MISCV 0x0800000000000000 +#define MC_STATUS_EN 0x1000000000000000 +#define MC_STATUS_UC 0x2000000000000000 +#define MC_STATUS_OVER 0x4000000000000000 +#define MC_STATUS_VAL 0x8000000000000000 /* * The following four 3-byte registers control the non-cacheable regions. -- cgit v1.1 From 34dd3613f570b31797d837896d2c5c2c0bc61360 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 13:49:46 +0000 Subject: MFC 205214: - Extend the machine check record structure to include several fields useful for parsing model-specific and other fields in machine check events including the global machine check capabilities and status registers, CPU identification, and the FreeBSD CPU ID. - Report these added fields in the console log of a machine check so that a record structure can be reconstituted from the console messages. - Parse new architectural errors including memory controller errors. --- sys/amd64/amd64/mca.c | 52 +++++++++++++++++++++++++++++++++++++++--- sys/amd64/include/mca.h | 5 ++++ sys/amd64/include/specialreg.h | 12 ++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index b0e842a..76bee77 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -177,19 +177,46 @@ mca_error_request(uint16_t mca_error) return ("???"); } +static const char * +mca_error_mmtype(uint16_t mca_error) +{ + + switch ((mca_error & 0x70) >> 4) { + case 0x0: + return ("GEN"); + case 0x1: + return ("RD"); + case 0x2: + return ("WR"); + case 0x3: + return ("AC"); + case 0x4: + return ("MS"); + } + return ("???"); +} + /* Dump details about a single machine check. 
*/ static void __nonnull(1) mca_log(const struct mca_record *rec) { uint16_t mca_error; - printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: CPU %d ", rec->mr_apic_id); + printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); + printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, + rec->mr_cpu_id, rec->mr_apic_id); + printf("MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); - else + else { printf("COR "); + if (rec->mr_mcg_cap & MCG_CAP_TES_P) + printf("(%lld) ", ((long long)rec->mr_status & + MC_STATUS_COR_COUNT) >> 38); + } if (rec->mr_status & MC_STATUS_PCC) printf("PCC "); if (rec->mr_status & MC_STATUS_OVER) @@ -212,6 +239,9 @@ mca_log(const struct mca_record *rec) case 0x0004: printf("FRC error"); break; + case 0x0005: + printf("internal parity error"); + break; case 0x0400: printf("internal timer error"); break; @@ -236,6 +266,17 @@ mca_log(const struct mca_record *rec) break; } + /* Memory controller error. */ + if ((mca_error & 0xef80) == 0x0080) { + printf("%s channel ", mca_error_mmtype(mca_error)); + if ((mca_error & 0x000f) != 0x000f) + printf("%d", mca_error & 0x000f); + else + printf("??"); + printf(" memory error"); + break; + } + /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", @@ -313,6 +354,11 @@ mca_check_status(int bank, struct mca_record *rec) rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); + rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); + rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); + rec->mr_cpu_id = cpu_id; + rec->mr_cpu_vendor_id = cpu_vendor_id; + rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. 
Don't do this for uncorrectable diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index ddc3aeb..bc09480 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -37,6 +37,11 @@ struct mca_record { uint64_t mr_tsc; int mr_apic_id; int mr_bank; + uint64_t mr_mcg_cap; + uint64_t mr_mcg_status; + int mr_cpu_id; + int mr_cpu_vendor_id; + int mr_cpu; }; #ifdef _KERNEL diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 733f4d7..9253462 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -267,6 +267,7 @@ #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 @@ -352,8 +353,10 @@ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 +#define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 +#define MCG_CAP_SER_P 0x01000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 @@ -363,9 +366,14 @@ #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) +#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffff #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 #define MC_STATUS_OTHER_INFO 0x01ffffff00000000 +#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */ +#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_CMCI_P */ #define MC_STATUS_PCC 0x0200000000000000 #define MC_STATUS_ADDRV 0x0400000000000000 #define MC_STATUS_MISCV 0x0800000000000000 @@ -373,6 +381,10 @@ #define MC_STATUS_UC 0x2000000000000000 #define MC_STATUS_OVER 0x4000000000000000 #define MC_STATUS_VAL 0x8000000000000000 +#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ +#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ +#define MC_CTL2_THRESHOLD 0x0000000000003fff +#define MC_CTL2_CMCI_EN 0x0000000040000000 /* * The following four 3-byte registers control the non-cacheable regions. -- cgit v1.1 From bfa70a9aba72beceb303770646fb483f9e024647 Mon Sep 17 00:00:00 2001 From: jhb Date: Fri, 26 Mar 2010 18:58:22 +0000 Subject: MFC 205332: Use the same policy for rejecting / not-reject ACPI tables with incorrect checksums as the base acpi(4) driver. This fixes a problem where the MADT parser would reject the MADT table during early boot causing the MP Table to be, but then the acpi(4) driver would attach and use non-SMP interrupt routing. 
--- sys/amd64/acpica/acpi_machdep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c index 0d866e8..ad5f854 100644 --- a/sys/amd64/acpica/acpi_machdep.c +++ b/sys/amd64/acpica/acpi_machdep.c @@ -187,8 +187,10 @@ map_table(vm_paddr_t pa, int offset, const char *sig) if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { if (bootverbose) printf("ACPI: Failed checksum for table %s\n", sig); +#if (ACPI_CHECKSUM_ABORT) table_unmap(table, length); return (NULL); +#endif } return (table); } -- cgit v1.1 From c78b160bc7863413615e23b4e9e811b8c7703023 Mon Sep 17 00:00:00 2001 From: trasz Date: Sat, 27 Mar 2010 14:58:28 +0000 Subject: MFC r202919: Fix array overflow. This routine is only called from procfs, which is not mounted by default, and I've been unable to trigger a panic without this fix applied anyway. Reviewed by: kib, cperciva --- sys/amd64/ia32/ia32_reg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/ia32/ia32_reg.c b/sys/amd64/ia32/ia32_reg.c index 83f6783..da5190f 100644 --- a/sys/amd64/ia32/ia32_reg.c +++ b/sys/amd64/ia32/ia32_reg.c @@ -213,8 +213,6 @@ fill_dbregs32(struct thread *td, struct dbreg32 *regs) err = fill_dbregs(td, &dr); for (i = 0; i < 8; i++) regs->dr[i] = dr.dr[i]; - for (i = 8; i < 16; i++) - regs->dr[i] = 0; return (err); } -- cgit v1.1 From 1d3f35048e647a7839adde4669ccc806e42b47c9 Mon Sep 17 00:00:00 2001 From: bz Date: Sat, 27 Mar 2010 17:14:55 +0000 Subject: MFC r201813: In sys//conf/Makefile set TARGET to . That allows sys/conf/makeLINT.mk to only do certain things for certain architectures. Note that neither arm nor mips have the Makefile there, thus essentially not (yet) supporting LINT. This would enable them do add special treatment to sys/conf/makeLINT.mk as well chosing one of the many configurations as LINT. --- sys/amd64/conf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/Makefile b/sys/amd64/conf/Makefile index 2c006e9..1d2513f 100644 --- a/sys/amd64/conf/Makefile +++ b/sys/amd64/conf/Makefile @@ -1,3 +1,5 @@ # $FreeBSD$ +TARGET=amd64 + .include "${.CURDIR}/../../conf/makeLINT.mk" -- cgit v1.1 From 78bdfe798ddad0bb64dbd05f0eeb4620cd19028b Mon Sep 17 00:00:00 2001 From: attilio Date: Mon, 29 Mar 2010 15:39:17 +0000 Subject: MFC r199852, r202387, r202441, r202534: Handling all the three clocks with the LAPIC may lead to aliasing for softclock and profclock. Revert the change when the LAPIC started taking charge of all three of them. Sponsored by: Sandvine Incorporated --- sys/amd64/amd64/local_apic.c | 71 +++++++++++++++++++++++++++----------------- sys/amd64/include/apicvar.h | 8 ++++- sys/amd64/isa/clock.c | 15 +++++----- 3 files changed, 58 insertions(+), 36 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 98ed4df..0d04bbd 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -160,6 +160,9 @@ static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); struct pic lapic_pic = { .pic_resume = lapic_resume }; +static int lapic_allclocks; +TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); + static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { @@ -415,10 +418,11 @@ lapic_disable_pmc(void) /* * Called by cpu_initclocks() on the BSP to setup the local APIC timer so * that it can drive hardclock, statclock, and profclock. 
This function - * returns true if it is able to use the local APIC timer to drive the - * clocks and false if it is not able. + * returns a positive integer if it is convenient to use the local APIC + * for all the clocks, a negative integer if it is convenient to use the + * local APIC only for the hardclock and 0 if none of them can be handled. */ -int +enum lapic_clock lapic_setup_clock(void) { u_long value; @@ -426,10 +430,10 @@ lapic_setup_clock(void) /* Can't drive the timer without a local APIC. */ if (lapic == NULL) - return (0); + return (LAPIC_CLOCK_NONE); if (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0) - return (0); + return (LAPIC_CLOCK_NONE); /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; @@ -461,19 +465,27 @@ lapic_setup_clock(void) * (and profhz) run at hz. If 'hz' is below 1500 but above * 750, then we let the lapic timer run at 2 * 'hz'. If 'hz' * is below 750 then we let the lapic timer run at 4 * 'hz'. + * + * Please note that stathz and profhz are set only if all the + * clocks are handled through the local APIC. */ - if (hz >= 1500) + if (lapic_allclocks != 0) { + if (hz >= 1500) + lapic_timer_hz = hz; + else if (hz >= 750) + lapic_timer_hz = hz * 2; + else + lapic_timer_hz = hz * 4; + } else lapic_timer_hz = hz; - else if (hz >= 750) - lapic_timer_hz = hz * 2; - else - lapic_timer_hz = hz * 4; - if (lapic_timer_hz < 128) - stathz = lapic_timer_hz; - else - stathz = lapic_timer_hz / (lapic_timer_hz / 128); - profhz = lapic_timer_hz; lapic_timer_period = value / lapic_timer_hz; + if (lapic_allclocks != 0) { + if (lapic_timer_hz < 128) + stathz = lapic_timer_hz; + else + stathz = lapic_timer_hz / (lapic_timer_hz / 128); + profhz = lapic_timer_hz; + } /* * Start up the timer on the BSP. The APs will kick off their @@ -481,7 +493,7 @@ lapic_setup_clock(void) */ lapic_timer_periodic(lapic_timer_period); lapic_timer_enable_intr(); - return (1); + return (lapic_allclocks == 0 ? LAPIC_CLOCK_HARDCLOCK : LAPIC_CLOCK_ALL); } void @@ -784,20 +796,23 @@ lapic_handle_timer(struct trapframe *frame) else hardclock_cpu(TRAPF_USERMODE(frame)); } + if (lapic_allclocks != 0) { - /* Fire statclock at stathz. */ - la->la_stat_ticks += stathz; - if (la->la_stat_ticks >= lapic_timer_hz) { - la->la_stat_ticks -= lapic_timer_hz; - statclock(TRAPF_USERMODE(frame)); - } + /* Fire statclock at stathz. */ + la->la_stat_ticks += stathz; + if (la->la_stat_ticks >= lapic_timer_hz) { + la->la_stat_ticks -= lapic_timer_hz; + statclock(TRAPF_USERMODE(frame)); + } - /* Fire profclock at profhz, but only when needed. */ - la->la_prof_ticks += profhz; - if (la->la_prof_ticks >= lapic_timer_hz) { - la->la_prof_ticks -= lapic_timer_hz; - if (profprocs != 0) - profclock(TRAPF_USERMODE(frame), TRAPF_PC(frame)); + /* Fire profclock at profhz, but only when needed. */ + la->la_prof_ticks += profhz; + if (la->la_prof_ticks >= lapic_timer_hz) { + la->la_prof_ticks -= lapic_timer_hz; + if (profprocs != 0) + profclock(TRAPF_USERMODE(frame), + TRAPF_PC(frame)); + } } critical_exit(); } diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 9d6d538..8f15d84 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -157,6 +157,12 @@ #define APIC_BUS_PCI 2 #define APIC_BUS_MAX APIC_BUS_PCI +enum lapic_clock { + LAPIC_CLOCK_NONE, + LAPIC_CLOCK_HARDCLOCK, + LAPIC_CLOCK_ALL +}; + /* * An APIC enumerator is a psuedo bus driver that enumerates APIC's including * CPU's and I/O APIC's. 
@@ -224,7 +230,7 @@ int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); void lapic_set_tpr(u_int vector); void lapic_setup(int boot); -int lapic_setup_clock(void); +enum lapic_clock lapic_setup_clock(void); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index adc1743..bf379f3 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -91,7 +91,7 @@ static u_int32_t i8254_offset; static int (*i8254_pending)(struct intsrc *); static int i8254_ticked; static int using_atrtc_timer; -static int using_lapic_timer; +static enum lapic_clock using_lapic_timer = LAPIC_CLOCK_NONE; /* Values for timerX_state: */ #define RELEASED 0 @@ -160,7 +160,8 @@ clkintr(struct trapframe *frame) clkintr_pending = 0; mtx_unlock_spin(&clock_lock); } - KASSERT(!using_lapic_timer, ("clk interrupt enabled with lapic timer")); + KASSERT(using_lapic_timer == LAPIC_CLOCK_NONE, + ("clk interrupt enabled with lapic timer")); if (using_atrtc_timer) { #ifdef SMP @@ -422,7 +423,7 @@ set_i8254_freq(u_int freq, int intr_freq) i8254_timecounter.tc_frequency = freq; mtx_lock_spin(&clock_lock); i8254_freq = freq; - if (using_lapic_timer) + if (using_lapic_timer != LAPIC_CLOCK_NONE) new_i8254_real_max_count = 0x10000; else new_i8254_real_max_count = TIMER_DIV(intr_freq); @@ -485,7 +486,7 @@ cpu_initclocks() * that it can drive hardclock(). Otherwise, change the 8254 * timecounter to user a simpler algorithm. */ - if (!using_lapic_timer) { + if (using_lapic_timer == LAPIC_CLOCK_NONE) { intr_add_handler("clk", 0, (driver_filter_t *)clkintr, NULL, NULL, INTR_TYPE_CLK, NULL); i8254_intsrc = intr_lookup_source(0); @@ -508,7 +509,7 @@ cpu_initclocks() * kernel clocks, then setup the RTC to periodically interrupt to * drive statclock() and profclock(). */ - if (!using_lapic_timer) { + if (using_lapic_timer != LAPIC_CLOCK_ALL) { using_atrtc_timer = atrtc_setup_clock(); if (using_atrtc_timer) { /* Enable periodic interrupts from the RTC. */ @@ -532,7 +533,7 @@ void cpu_startprofclock(void) { - if (using_lapic_timer || !using_atrtc_timer) + if (using_lapic_timer == LAPIC_CLOCK_ALL || !using_atrtc_timer) return; atrtc_rate(RTCSA_PROF); psdiv = pscnt = psratio; @@ -542,7 +543,7 @@ void cpu_stopprofclock(void) { - if (using_lapic_timer || !using_atrtc_timer) + if (using_lapic_timer == LAPIC_CLOCK_ALL || !using_atrtc_timer) return; atrtc_rate(RTCSA_NOPROF); psdiv = pscnt = 1; -- cgit v1.1 From 5fd4298a64631d6515310c8b761c909817c24411 Mon Sep 17 00:00:00 2001 From: attilio Date: Tue, 30 Mar 2010 11:19:29 +0000 Subject: MFC r204641, r204753: Improving the clocks auto-tunning by firstly checking if the atrtc may be correctly initialized and just then assign to softclock/profclock. 
Sponsored by: Sandvine Incorporated --- sys/amd64/amd64/local_apic.c | 28 +++++++++++++++------------- sys/amd64/include/apicvar.h | 2 +- sys/amd64/isa/clock.c | 24 +++++++++++++++++++----- 3 files changed, 35 insertions(+), 19 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 0d04bbd..c274631 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -149,6 +149,7 @@ extern inthand_t IDTVEC(rsvd); volatile lapic_t *lapic; vm_paddr_t lapic_paddr; static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz; +static enum lapic_clock clockcoverage; static void lapic_enable(void); static void lapic_resume(struct pic *pic); @@ -160,9 +161,6 @@ static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); struct pic lapic_pic = { .pic_resume = lapic_resume }; -static int lapic_allclocks; -TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); - static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { @@ -423,17 +421,20 @@ lapic_disable_pmc(void) * local APIC only for the hardclock and 0 if none of them can be handled. */ enum lapic_clock -lapic_setup_clock(void) +lapic_setup_clock(enum lapic_clock srcsdes) { u_long value; int i; - /* Can't drive the timer without a local APIC. */ - if (lapic == NULL) - return (LAPIC_CLOCK_NONE); + /* lapic_setup_clock() should not be called with LAPIC_CLOCK_NONE. */ + MPASS(srcsdes != LAPIC_CLOCK_NONE); - if (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0) - return (LAPIC_CLOCK_NONE); + /* Can't drive the timer without a local APIC. */ + if (lapic == NULL || + (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0)) { + clockcoverage = LAPIC_CLOCK_NONE; + return (clockcoverage); + } /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; @@ -469,7 +470,7 @@ lapic_setup_clock(void) * Please note that stathz and profhz are set only if all the * clocks are handled through the local APIC. */ - if (lapic_allclocks != 0) { + if (srcsdes == LAPIC_CLOCK_ALL) { if (hz >= 1500) lapic_timer_hz = hz; else if (hz >= 750) @@ -479,7 +480,7 @@ lapic_setup_clock(void) } else lapic_timer_hz = hz; lapic_timer_period = value / lapic_timer_hz; - if (lapic_allclocks != 0) { + if (srcsdes == LAPIC_CLOCK_ALL) { if (lapic_timer_hz < 128) stathz = lapic_timer_hz; else @@ -493,7 +494,8 @@ lapic_setup_clock(void) */ lapic_timer_periodic(lapic_timer_period); lapic_timer_enable_intr(); - return (lapic_allclocks == 0 ? LAPIC_CLOCK_HARDCLOCK : LAPIC_CLOCK_ALL); + clockcoverage = srcsdes; + return (srcsdes); } void @@ -796,7 +798,7 @@ lapic_handle_timer(struct trapframe *frame) else hardclock_cpu(TRAPF_USERMODE(frame)); } - if (lapic_allclocks != 0) { + if (clockcoverage == LAPIC_CLOCK_ALL) { /* Fire statclock at stathz. 
*/ la->la_stat_ticks += stathz; diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 8f15d84..110ce81 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -230,7 +230,7 @@ int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); void lapic_set_tpr(u_int vector); void lapic_setup(int boot); -enum lapic_clock lapic_setup_clock(void); +enum lapic_clock lapic_setup_clock(enum lapic_clock srcsdes); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index bf379f3..e5c27d1 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -84,6 +84,9 @@ TUNABLE_INT("hw.i8254.freq", &i8254_freq); int i8254_max_count; static int i8254_real_max_count; +static int lapic_allclocks; +TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); + struct mtx clock_lock; static struct intsrc *i8254_intsrc; static u_int32_t i8254_lastcount; @@ -478,8 +481,22 @@ startrtclock() void cpu_initclocks() { + enum lapic_clock tlsca; + int tasc; + + /* Initialize RTC. */ + atrtc_start(); + tasc = atrtc_setup_clock(); + + /* + * If the atrtc successfully initialized and the users didn't force + * otherwise use the LAPIC in order to cater hardclock only, otherwise + * take in charge all the clock sources. + */ + tlsca = (lapic_allclocks == 0 && tasc != 0) ? LAPIC_CLOCK_HARDCLOCK : + LAPIC_CLOCK_ALL; + using_lapic_timer = lapic_setup_clock(tlsca); - using_lapic_timer = lapic_setup_clock(); /* * If we aren't using the local APIC timer to drive the kernel * clocks, setup the interrupt handler for the 8254 timer 0 so @@ -500,9 +517,6 @@ cpu_initclocks() set_i8254_freq(i8254_freq, hz); } - /* Initialize RTC. */ - atrtc_start(); - /* * If the separate statistics clock hasn't been explicility disabled * and we aren't already using the local APIC timer to drive the @@ -510,7 +524,7 @@ cpu_initclocks() * drive statclock() and profclock(). */ if (using_lapic_timer != LAPIC_CLOCK_ALL) { - using_atrtc_timer = atrtc_setup_clock(); + using_atrtc_timer = tasc; if (using_atrtc_timer) { /* Enable periodic interrupts from the RTC. */ intr_add_handler("rtc", 8, -- cgit v1.1 From 06b4c1f24a99a989afc61994d978942e183fe5e0 Mon Sep 17 00:00:00 2001 From: marcel Date: Wed, 31 Mar 2010 02:43:58 +0000 Subject: MFC rev 198341 and 198342: o Introduce vm_sync_icache() for making the I-cache coherent with the memory or D-cache, depending on the semantics of the platform. vm_sync_icache() is basically a wrapper around pmap_sync_icache(), that translates the vm_map_t argumument to pmap_t. o Introduce pmap_sync_icache() to all PMAP implementation. For powerpc it replaces the pmap_page_executable() function, added to solve the I-cache problem in uiomove_fromphys(). o In proc_rwmem() call vm_sync_icache() when writing to a page that has execute permissions. This assures that when breakpoints are written, the I-cache will be coherent and the process will actually hit the breakpoint. o This also fixes the Book-E PMAP implementation that was missing necessary locking while trying to deal with the I-cache coherency in pmap_enter() (read: mmu_booke_enter_locked). 
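To make the MI/MD split concrete, the debugger write path now ends with something like the following (a sketch of the caller, not the committed proc_rwmem() code; on amd64 the pmap_sync_icache() added below is deliberately an empty stub, since x86 hardware keeps the instruction cache coherent):

	/*
	 * After writing a breakpoint into the traced process, make the
	 * target's instruction cache coherent with the new bytes.
	 * vm_sync_icache() resolves the vm_map to its pmap and hands the
	 * range to the MD pmap_sync_icache().
	 */
	if (writing && error == 0)
		vm_sync_icache(&p->p_vmspace->vm_map, va, len);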
--- sys/amd64/amd64/pmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 0935506..5ff527f 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4756,6 +4756,11 @@ if (oldpmap) /* XXX FIXME */ critical_exit(); } +void +pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ +} + /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. -- cgit v1.1 From bc85e840fa31ff95bb72c293fbb52e6765cfcbb3 Mon Sep 17 00:00:00 2001 From: rnoland Date: Sun, 4 Apr 2010 15:42:52 +0000 Subject: MFC r203289,r203367 Enable MTRR on all VIA CPUs that claim support --- sys/amd64/amd64/amd64_mem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/amd64_mem.c b/sys/amd64/amd64/amd64_mem.c index d7959fd..e50d3e7 100644 --- a/sys/amd64/amd64/amd64_mem.c +++ b/sys/amd64/amd64/amd64_mem.c @@ -707,11 +707,8 @@ amd64_mem_drvinit(void *unused) switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: - break; case CPU_VENDOR_CENTAUR: - if (cpu_exthigh >= 0x80000008) - break; - /* FALLTHROUGH */ + break; default: return; } -- cgit v1.1 From bcba5d5ad835df6f565c7a9d63e8ebe99bd6db23 Mon Sep 17 00:00:00 2001 From: alc Date: Mon, 5 Apr 2010 16:11:42 +0000 Subject: MFC r204907, r204913, r205402, r205573, r205573 Implement AMD's recommended workaround for Erratum 383 on Family 10h processors. Enable machine check exceptions by default. --- sys/amd64/amd64/mca.c | 35 ++++++++++- sys/amd64/amd64/pmap.c | 134 ++++++++++++++++++++++++++++++++++++++--- sys/amd64/include/md_var.h | 1 + sys/amd64/include/specialreg.h | 1 + 4 files changed, 162 insertions(+), 9 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 76bee77..ccbab17 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -60,11 +60,20 @@ static int mca_count; /* Number of records stored. */ SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture"); -static int mca_enabled = 0; +static int mca_enabled = 1; TUNABLE_INT("hw.mca.enabled", &mca_enabled); SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, "Administrative toggle for machine check support"); +static int amd10h_L1TP = 1; +TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP); +SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, + "Administrative toggle for logging of level one TLB parity (L1TP) errors"); + +int workaround_erratum383; +SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0, + "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); + static STAILQ_HEAD(, mca_internal) mca_records; static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. */ @@ -527,7 +536,7 @@ void mca_init(void) { uint64_t mcg_cap; - uint64_t ctl; + uint64_t ctl, mask; int skip; int i; @@ -535,6 +544,15 @@ mca_init(void) if (!mca_enabled || !(cpu_feature & CPUID_MCE)) return; + /* + * On AMD Family 10h processors, unless logging of level one TLB + * parity (L1TP) errors is disabled, enable the recommended workaround + * for Erratum 383. + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP) + workaround_erratum383 = 1; + if (cpu_feature & CPUID_MCA) { if (PCPU_GET(cpuid) == 0) mca_setup(); @@ -545,6 +563,19 @@ mca_init(void) /* Enable MCA features. 
*/ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); + /* + * Disable logging of level one TLB parity (L1TP) errors by + * the data cache as an alternative workaround for AMD Family + * 10h Erratum 383. Unlike the recommended workaround, there + * is no performance penalty to this workaround. However, + * L1TP errors will go unreported. + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) { + mask = rdmsr(MSR_MC0_CTL_MASK); + if ((mask & (1UL << 5)) == 0) + wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5)); + } for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { /* By default enable logging of all errors. */ ctl = 0xffffffffffffffffUL; diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 5ff527f..516048d 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -7,7 +7,7 @@ * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. - * Copyright (c) 2005-2008 Alan L. Cox + * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -252,6 +252,9 @@ static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); +static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, + pd_entry_t newpde); +static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); @@ -654,13 +657,13 @@ pmap_init(void) pv_entry_high_water = 9 * (pv_entry_max / 10); /* - * Disable large page mappings by default if the kernel is running in - * a virtual machine on an AMD Family 10h processor. This is a work- - * around for Erratum 383. + * If the kernel is running in a virtual machine on an AMD Family 10h + * processor, then it must assume that MCA is enabled by the virtual + * machine monitor. */ if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x10) - pg_ps_enabled = 0; + workaround_erratum383 = 1; /* * Are large page mappings enabled? @@ -795,6 +798,45 @@ pmap_cache_bits(int mode, boolean_t is_pde) cache_bits |= PG_NC_PWT; return (cache_bits); } + +/* + * After changing the page size for the specified virtual address in the page + * table, flush the corresponding entries from the processor's TLB. Only the + * calling processor's TLB is affected. + * + * The calling thread must be pinned to a processor. + */ +static void +pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) +{ + u_long cr4; + + if ((newpde & PG_PS) == 0) + /* Demotion: flush a specific 2MB page mapping. */ + invlpg(va); + else if ((newpde & PG_G) == 0) + /* + * Promotion: flush every 4KB page mapping from the TLB + * because there are too many to flush individually. + */ + invltlb(); + else { + /* + * Promotion: flush every 4KB page mapping from the TLB, + * including any global (PG_G) mappings. + */ + cr4 = rcr4(); + load_cr4(cr4 & ~CR4_PGE); + /* + * Although preemption at this point could be detrimental to + * performance, it would not lead to an error. PG_G is simply + * ignored if CR4.PGE is clear. Moreover, in case this block + * is re-entered, the load_cr4() either above or below will + * modify CR4.PGE flushing the TLB. 
+ */ + load_cr4(cr4 | CR4_PGE); + } +} #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. @@ -891,6 +933,69 @@ pmap_invalidate_cache(void) smp_cache_flush(); sched_unpin(); } + +struct pde_action { + cpumask_t store; /* processor that updates the PDE */ + cpumask_t invalidate; /* processors that invalidate their TLB */ + vm_offset_t va; + pd_entry_t *pde; + pd_entry_t newpde; +}; + +static void +pmap_update_pde_action(void *arg) +{ + struct pde_action *act = arg; + + if (act->store == PCPU_GET(cpumask)) + pde_store(act->pde, act->newpde); +} + +static void +pmap_update_pde_teardown(void *arg) +{ + struct pde_action *act = arg; + + if ((act->invalidate & PCPU_GET(cpumask)) != 0) + pmap_update_pde_invalidate(act->va, act->newpde); +} + +/* + * Change the page size for the specified virtual address in a way that + * prevents any possibility of the TLB ever having two entries that map the + * same virtual address using different page sizes. This is the recommended + * workaround for Erratum 383 on AMD Family 10h processors. It prevents a + * machine check exception for a TLB state that is improperly diagnosed as a + * hardware error. + */ +static void +pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) +{ + struct pde_action act; + cpumask_t active, cpumask; + + sched_pin(); + cpumask = PCPU_GET(cpumask); + if (pmap == kernel_pmap) + active = all_cpus; + else + active = pmap->pm_active; + if ((active & PCPU_GET(other_cpus)) != 0) { + act.store = cpumask; + act.invalidate = active; + act.va = va; + act.pde = pde; + act.newpde = newpde; + smp_rendezvous_cpus(cpumask | active, + smp_no_rendevous_barrier, pmap_update_pde_action, + pmap_update_pde_teardown, &act); + } else { + pde_store(pde, newpde); + if ((active & cpumask) != 0) + pmap_update_pde_invalidate(va, newpde); + } + sched_unpin(); +} #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. @@ -928,6 +1033,15 @@ pmap_invalidate_cache(void) wbinvd(); } + +static void +pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) +{ + + pde_store(pde, newpde); + if (pmap == kernel_pmap || pmap->pm_active) + pmap_update_pde_invalidate(va, newpde); +} #endif /* !SMP */ static void @@ -2310,7 +2424,10 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ - pde_store(pde, newpde); + if (workaround_erratum383) + pmap_update_pde(pmap, va, pde, newpde); + else + pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. @@ -2926,7 +3043,10 @@ setpte: /* * Map the superpage. 
*/ - pde_store(pde, PG_PS | newpde); + if (workaround_erratum383) + pmap_update_pde(pmap, va, pde, PG_PS | newpde); + else + pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 15df851..2b43b37 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -61,6 +61,7 @@ extern char sigcode[]; extern int szsigcode; extern uint64_t *vm_page_dump; extern int vm_page_dump_size; +extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 9253462..86a08ce 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -506,6 +506,7 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 /* VIA ACE crypto featureset: for via_feature_rng */ #define VIA_HAS_RNG 1 /* cpu has RNG */ -- cgit v1.1 From c7d735a07c6450e83b752cd349171b48e34c7989 Mon Sep 17 00:00:00 2001 From: nwhitehorn Date: Wed, 7 Apr 2010 02:24:41 +0000 Subject: MFC r205014,205015: Provide groundwork for 32-bit binary compatibility on non-x86 platforms, for upcoming 64-bit PowerPC and MIPS support. This renames the COMPAT_IA32 option to COMPAT_FREEBSD32, removes some IA32-specific code from MI parts of the kernel and enhances the freebsd32 compatibility code to support big-endian platforms. This MFC is required for MFCs of later changes to the freebsd32 compatibility from HEAD. Requested by: kib --- sys/amd64/amd64/db_trace.c | 2 +- sys/amd64/amd64/exception.S | 2 +- sys/amd64/amd64/vm_machdep.c | 4 ++-- sys/amd64/conf/GENERIC | 2 +- sys/amd64/conf/NOTES | 4 ++-- sys/amd64/conf/XENHVM | 2 +- sys/amd64/include/elf.h | 1 + sys/amd64/include/reg.h | 9 +++++++++ sys/amd64/linux32/linux32_sysvec.c | 4 ++-- 9 files changed, 20 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c index 73ffac5..cba90f2 100644 --- a/sys/amd64/amd64/db_trace.c +++ b/sys/amd64/amd64/db_trace.c @@ -319,7 +319,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td) frame_type = INTERRUPT; else if (strcmp(name, "Xfast_syscall") == 0) frame_type = SYSCALL; -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 else if (strcmp(name, "Xint0x80_syscall") == 0) frame_type = SYSCALL; #endif diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 3d1a20e..1799b74 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -572,7 +572,7 @@ ENTRY(fork_trampoline) * included. 
*/ -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 .data .p2align 4 .text diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index a99fdaa..d6906ac 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -439,7 +439,7 @@ cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg, */ cpu_thread_clean(td); -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 if (td->td_proc->p_sysent->sv_flags & SV_ILP32) { /* * Set the trap frame to point at the beginning of the uts @@ -490,7 +490,7 @@ cpu_set_user_tls(struct thread *td, void *tls_base) if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS) return (EINVAL); -#ifdef COMPAT_IA32 +#ifdef COMPAT_FREEBSD32 if (td->td_proc->p_sysent->sv_flags & SV_ILP32) { td->td_pcb->pcb_gsbase = (register_t)tls_base; return (0); diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e5a6955..e9f3c17 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,7 +54,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 159f12e..4b6debb 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -445,7 +445,7 @@ options PMAP_SHPGPERPROC=201 #XXX these 32 bit binaries is added. # Enable 32-bit runtime support for FreeBSD/i386 binaries. -options COMPAT_IA32 +options COMPAT_FREEBSD32 # Enable iBCS2 runtime support for SCO and ISC binaries #XXX#options IBCS2 @@ -456,7 +456,7 @@ options COMPAT_IA32 # Enable Linux ABI emulation #XXX#options COMPAT_LINUX -# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_IA32) +# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_FREEBSD32) options COMPAT_LINUX32 # Enable the linux-like proc filesystem support (requires COMPAT_LINUX32 diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM index f875f5a..377276e 100644 --- a/sys/amd64/conf/XENHVM +++ b/sys/amd64/conf/XENHVM @@ -55,7 +55,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 diff --git a/sys/amd64/include/elf.h b/sys/amd64/include/elf.h index 88f4398..678f5d3 100644 --- a/sys/amd64/include/elf.h +++ b/sys/amd64/include/elf.h @@ -42,6 +42,7 @@ #include #define ELF_ARCH EM_X86_64 +#define ELF_ARCH32 EM_386 #define ELF_MACHINE_OK(x) ((x) == EM_X86_64) diff --git a/sys/amd64/include/reg.h b/sys/amd64/include/reg.h index 89211a3..4a83918 100644 --- a/sys/amd64/include/reg.h +++ b/sys/amd64/include/reg.h @@ -37,6 +37,10 @@ #ifndef _MACHINE_REG_H_ #define _MACHINE_REG_H_ +#if defined(_KERNEL) && !defined(_STANDALONE) +#include "opt_compat.h" +#endif + /* * Register set accessible via /proc/$pid/regs and PT_{SET,GET}REGS. 
*/ @@ -116,6 +120,11 @@ struct dbreg { #define DBREG_DRX(d,x) ((d)->dr[(x)]) /* reference dr0 - dr15 by register number */ +#ifdef COMPAT_FREEBSD32 +#include +#include +#endif + #ifdef _KERNEL /* * XXX these interfaces are MI, so they should be declared in a MI place. diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 6e3e326..d967ad7 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -34,8 +34,8 @@ __FBSDID("$FreeBSD$"); #include "opt_compat.h" -#ifndef COMPAT_IA32 -#error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!" +#ifndef COMPAT_FREEBSD32 +#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 -- cgit v1.1 From 9b7228a41e71c09c126a7ed8b5812a56e8d7029e Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 13 Apr 2010 10:23:03 +0000 Subject: MFC r206459: Handle a case when non-canonical address is loaded into the fsbase or gsbase MSR. --- sys/amd64/amd64/exception.S | 30 ++++++++++++++++++++++++++++-- sys/amd64/amd64/trap.c | 8 ++++++++ sys/amd64/include/md_var.h | 4 ++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 1799b74..69288f3 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -668,7 +668,8 @@ ld_fs: movw %ax,%fs movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx - wrmsr + .globl ld_fsbase +ld_fsbase: wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si @@ -685,7 +686,8 @@ ld_gs: movw %si,%gs movl $MSR_KGSBASE,%ecx movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx - wrmsr + .globl ld_gsbase +ld_gsbase: wrmsr 1: .globl ld_es ld_es: movw TF_ES(%rsp),%es .globl ld_ds @@ -798,6 +800,30 @@ gs_load_fault: call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti + + ALIGN_TEXT + .globl fsbase_load_fault +fsbase_load_fault: + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movq %rsp, %rdi + call trap + movq PCPU(CURTHREAD),%r8 + movq TD_PCB(%r8),%r8 + movq $0,PCB_FSBASE(%r8) + jmp doreti + + ALIGN_TEXT + .globl gsbase_load_fault +gsbase_load_fault: + popfq + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movq %rsp, %rdi + call trap + movq PCPU(CURTHREAD),%r8 + movq TD_PCB(%r8),%r8 + movq $0,PCB_GSBASE(%r8) + jmp doreti + #ifdef HWPMC_HOOKS ENTRY(end_exceptions) #endif diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 41ca758..4b5d8c7 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -563,6 +563,14 @@ trap(struct trapframe *frame) frame->tf_gs = _ugssel; goto out; } + if (frame->tf_rip == (long)ld_gsbase) { + frame->tf_rip = (long)gsbase_load_fault; + goto out; + } + if (frame->tf_rip == (long)ld_fsbase) { + frame->tf_rip = (long)fsbase_load_fault; + goto out; + } if (PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 2b43b37..88f3e1d 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -83,10 +83,14 @@ void ld_ds(void) __asm(__STRING(ld_ds)); void ld_es(void) __asm(__STRING(ld_es)); void ld_fs(void) __asm(__STRING(ld_fs)); void ld_gs(void) __asm(__STRING(ld_gs)); +void ld_fsbase(void) __asm(__STRING(ld_fsbase)); +void ld_gsbase(void) __asm(__STRING(ld_gsbase)); void ds_load_fault(void) __asm(__STRING(ds_load_fault)); void es_load_fault(void) __asm(__STRING(es_load_fault)); void fs_load_fault(void) __asm(__STRING(fs_load_fault)); 
void gs_load_fault(void) __asm(__STRING(gs_load_fault)); +void fsbase_load_fault(void) __asm(__STRING(fsbase_load_fault)); +void gsbase_load_fault(void) __asm(__STRING(gsbase_load_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void initializecpu(void); -- cgit v1.1 From 624d652fbb0bd336ef0fbf2a373b999101823a58 Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 14 Apr 2010 15:00:46 +0000 Subject: MFC 205851: Add a handler for the local APIC error interrupt. For now it just prints out the current value of the local APIC error register when the interrupt fires. --- sys/amd64/amd64/apic_vector.S | 12 +++++++++++ sys/amd64/amd64/local_apic.c | 47 ++++++++++++++++++++++++++----------------- sys/amd64/include/apicvar.h | 5 +++-- 3 files changed, 44 insertions(+), 20 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index cebafc8..df94a47 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -104,6 +104,18 @@ IDTVEC(timerint) MEXITCOUNT jmp doreti +/* + * Local APIC error interrupt handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(errorint) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + call lapic_handle_error + MEXITCOUNT + jmp doreti + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index c274631..8edc971 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -115,14 +115,12 @@ struct lapic { int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static lapics[MAX_APIC_ID + 1]; -/* XXX: should thermal be an NMI? */ - /* Global defaults for local APIC LVT entries. */ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ - { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ + { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ }; @@ -225,7 +223,10 @@ lapic_init(vm_paddr_t addr) /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_SYSIGT, SEL_KPL, 0); - /* XXX: error/thermal interrupts */ + /* Local APIC error interrupt. */ + setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_SYSIGT, SEL_KPL, 0); + + /* XXX: Thermal interrupt */ } /* @@ -278,7 +279,7 @@ lapic_dump(const char* str) lapic->id, lapic->version, lapic->ldr, lapic->dfr); printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); - printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pcm: 0x%08x\n", + printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); } @@ -326,7 +327,11 @@ lapic_setup(int boot) lapic_timer_enable_intr(); } - /* XXX: Error and thermal LVTs */ + /* Program error LVT and clear any existing errors. */ + lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error); + lapic->esr = 0; + + /* XXX: Thermal LVT */ intr_restore(eflags); } @@ -725,18 +730,6 @@ lapic_eoi(void) lapic->eoi = 0; } -/* - * Read the contents of the error status register. We have to write - * to the register first before reading from it. 
- */ -u_int -lapic_error(void) -{ - - lapic->esr = 0; - return (lapic->esr); -} - void lapic_handle_intr(int vector, struct trapframe *frame) { @@ -863,6 +856,24 @@ lapic_timer_enable_intr(void) lapic->lvt_timer = value; } +void +lapic_handle_error(void) +{ + u_int32_t esr; + + /* + * Read the contents of the error status register. Write to + * the register first before reading from it to force the APIC + * to update its value to indicate any errors that have + * occurred since the previous write to the register. + */ + lapic->esr = 0; + esr = lapic->esr; + + printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); + lapic_eoi(); +} + u_int apic_cpuid(u_int apic_id) { diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 110ce81..91bba99 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -179,7 +179,8 @@ struct apic_enumerator { inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), - IDTVEC(apic_isr7), IDTVEC(spuriousint), IDTVEC(timerint); + IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint), + IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; @@ -211,13 +212,13 @@ void lapic_disable_pmc(void); void lapic_dump(const char *str); int lapic_enable_pmc(void); void lapic_eoi(void); -u_int lapic_error(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); +void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); void lapic_reenable_pmc(void); -- cgit v1.1 From 1afdc2f1bbd14b72664ff8d709ab7fbc19afcc1f Mon Sep 17 00:00:00 2001 From: gibbs Date: Wed, 14 Apr 2010 17:01:29 +0000 Subject: MFC 204214: Enforce stronger bus-dma alignment semantics so bus-dma operates correctly with Xen's blkfront driver. 
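For illustration, a minimal standalone sketch (kernel context assumed; the helper name next_seg_size is made up for this note and is not part of the change) of the segment sizing rule the busdma change below enforces: when an address is going to be bounced, the natural page-bounded segment length is rounded up to the tag's alignment before being clipped to the remaining buffer length and the tag's maximum segment size, so bounce pages never produce segments that violate the alignment constraint.

    /*
     * Sketch of how the next DMA segment is sized after this change.
     * 'bouncing' is true when the physical address failed the tag's
     * filter and will be routed through a bounce page.
     */
    static bus_size_t
    next_seg_size(vm_offset_t curaddr, bus_size_t buflen,
        bus_size_t alignment, bus_size_t maxsegsz, int bouncing)
    {
        bus_size_t max_sgsize, sgsize;

        max_sgsize = MIN(buflen, maxsegsz);       /* MIN() from sys/param.h */
        sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
        if (bouncing)
            sgsize = roundup2(sgsize, alignment); /* roundup2() from sys/param.h */
        return (MIN(sgsize, max_sgsize));
    }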
--- sys/amd64/amd64/busdma_machdep.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/busdma_machdep.c b/sys/amd64/amd64/busdma_machdep.c index 3197d15..fae6ef3 100644 --- a/sys/amd64/amd64/busdma_machdep.c +++ b/sys/amd64/amd64/busdma_machdep.c @@ -239,8 +239,7 @@ bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment, newtag->alignment = alignment; newtag->boundary = boundary; newtag->lowaddr = trunc_page((vm_paddr_t)lowaddr) + (PAGE_SIZE - 1); - newtag->highaddr = trunc_page((vm_paddr_t)highaddr) + - (PAGE_SIZE - 1); + newtag->highaddr = trunc_page((vm_paddr_t)highaddr) + (PAGE_SIZE - 1); newtag->filter = filter; newtag->filterarg = filterarg; newtag->maxsize = maxsize; @@ -605,13 +604,18 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, vendaddr = (vm_offset_t)buf + buflen; while (vaddr < vendaddr) { + bus_size_t sg_len; + + sg_len = PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK); if (pmap) paddr = pmap_extract(pmap, vaddr); else paddr = pmap_kextract(vaddr); - if (run_filter(dmat, paddr) != 0) + if (run_filter(dmat, paddr) != 0) { + sg_len = roundup2(sg_len, dmat->alignment); map->pagesneeded++; - vaddr += (PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK)); + } + vaddr += sg_len; } CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded); } @@ -644,6 +648,8 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, bmask = ~(dmat->boundary - 1); for (seg = *segp; buflen > 0 ; ) { + bus_size_t max_sgsize; + /* * Get the physical address for this segment. */ @@ -655,11 +661,15 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, /* * Compute the segment size, and adjust counts. */ - sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK); - if (sgsize > dmat->maxsegsz) - sgsize = dmat->maxsegsz; - if (buflen < sgsize) - sgsize = buflen; + max_sgsize = MIN(buflen, dmat->maxsegsz); + sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK); + if (map->pagesneeded != 0 && run_filter(dmat, curaddr)) { + sgsize = roundup2(sgsize, dmat->alignment); + sgsize = MIN(sgsize, max_sgsize); + curaddr = add_bounce_page(dmat, map, vaddr, sgsize); + } else { + sgsize = MIN(sgsize, max_sgsize); + } /* * Make sure we don't cross any boundaries. @@ -670,9 +680,6 @@ _bus_dmamap_load_buffer(bus_dma_tag_t dmat, sgsize = (baddr - curaddr); } - if (map->pagesneeded != 0 && run_filter(dmat, curaddr)) - curaddr = add_bounce_page(dmat, map, vaddr, sgsize); - /* * Insert chunk into a segment, coalescing with * previous segment if possible. -- cgit v1.1 From ab69bb0ca5ba497377d209f7d6c92fbd4e4972fb Mon Sep 17 00:00:00 2001 From: fabient Date: Fri, 16 Apr 2010 15:43:24 +0000 Subject: MFC r206089, r206684: - Support for uncore counting events: one fixed PMC with the uncore domain clock, 8 programmable PMC. - Westmere based CPU (Xeon 5600, Corei7 980X) support. - New man pages with events list for core and uncore. - Updated Corei7 events with Intel 253669-033US December 2009 doc. There is some removed events in the documentation, they have been kept in the code but documented in the man page as obsolete. - Offcore response events can be setup with rsp token. 
Sponsored by: NETASQ --- sys/amd64/include/pmc_mdep.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/pmc_mdep.h b/sys/amd64/include/pmc_mdep.h index f233a51..4f16485 100644 --- a/sys/amd64/include/pmc_mdep.h +++ b/sys/amd64/include/pmc_mdep.h @@ -43,17 +43,20 @@ struct pmc_mdep; #include #include #include +#include /* * Intel processors implementing V2 and later of the Intel performance * measurement architecture have PMCs of the following classes: TSC, - * IAF and IAP. + * IAF, IAP, UCF and UCP. */ #define PMC_MDEP_CLASS_INDEX_TSC 0 #define PMC_MDEP_CLASS_INDEX_K8 1 #define PMC_MDEP_CLASS_INDEX_P4 1 #define PMC_MDEP_CLASS_INDEX_IAP 1 #define PMC_MDEP_CLASS_INDEX_IAF 2 +#define PMC_MDEP_CLASS_INDEX_UCP 3 +#define PMC_MDEP_CLASS_INDEX_UCF 4 /* * On the amd64 platform we support the following PMCs. @@ -63,12 +66,16 @@ struct pmc_mdep; * PIV Intel P4/HTT and P4/EMT64 * IAP Intel Core/Core2/Atom CPUs in 64 bits mode. * IAF Intel fixed-function PMCs in Core2 and later CPUs. + * UCP Intel Uncore programmable PMCs. + * UCF Intel Uncore fixed-function PMCs. */ union pmc_md_op_pmcallocate { struct pmc_md_amd_op_pmcallocate pm_amd; struct pmc_md_iaf_op_pmcallocate pm_iaf; struct pmc_md_iap_op_pmcallocate pm_iap; + struct pmc_md_ucf_op_pmcallocate pm_ucf; + struct pmc_md_ucp_op_pmcallocate pm_ucp; struct pmc_md_p4_op_pmcallocate pm_p4; uint64_t __pad[4]; }; @@ -83,6 +90,8 @@ union pmc_md_pmc { struct pmc_md_amd_pmc pm_amd; struct pmc_md_iaf_pmc pm_iaf; struct pmc_md_iap_pmc pm_iap; + struct pmc_md_ucf_pmc pm_ucf; + struct pmc_md_ucp_pmc pm_ucp; struct pmc_md_p4_pmc pm_p4; }; -- cgit v1.1 From a291c9ff146b6f7f473498e1c7de00afd8709c24 Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 17 Apr 2010 09:37:08 +0000 Subject: MFC r206623: ld_gs_base is executing with stack containing only the frame, temporary pushed %rflags has been popped already. --- sys/amd64/amd64/exception.S | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 69288f3..65c6452 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -815,7 +815,6 @@ fsbase_load_fault: ALIGN_TEXT .globl gsbase_load_fault gsbase_load_fault: - popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) movq %rsp, %rdi call trap -- cgit v1.1 From c61b9f564a268507195acc02310e9abd9d8431d8 Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 20 Apr 2010 08:19:43 +0000 Subject: MFC r206553: Change printf() calls to uprintf() for sigreturn() and trap() complaints about inacessible or wrong mcontext, and for dreaded "kernel trap with interrupts disabled" situation. The later is changed when trap is generated from user mode (shall never be ?). Normalize the messages to include both pid and thread name. 
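As an illustrative sketch of the resulting style (the variables are the ones already in scope in sigreturn(); this is not a new interface): the complaints are now delivered to the offending process' controlling terminal through uprintf() instead of the system console, and every message names both the pid and the thread, e.g.:

    /* Report a rejected user-supplied context to the process itself. */
    uprintf("pid %d (%s): sigreturn mc_flags %x\n",
        p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags);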
--- sys/amd64/amd64/machdep.c | 17 ++++++++++------- sys/amd64/amd64/trap.c | 2 +- sys/amd64/ia32/ia32_signal.c | 12 ++++++++---- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index c4130a4..7671376 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -422,13 +422,14 @@ sigreturn(td, uap) error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { - printf("sigreturn (pid %d): copyin failed\n", p->p_pid); + uprintf("pid %d (%s): sigreturn copyin failed\n", + p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { - printf("sigreturn (pid %d): mc_flags %x\n", p->p_pid, - ucp->uc_mcontext.mc_flags); + uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, + td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; @@ -447,8 +448,8 @@ sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("sigreturn (pid %d): rflags = 0x%lx\n", p->p_pid, - rflags); + uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, + td->td_name, rflags); return (EINVAL); } @@ -459,7 +460,8 @@ sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("sigreturn (pid %d): cs = 0x%x\n", p->p_pid, cs); + uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, + td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; @@ -471,7 +473,8 @@ sigreturn(td, uap) ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) { - printf("sigreturn (pid %d): set_fpcontext\n", p->p_pid); + uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", + p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 4b5d8c7..8492f4e 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -303,7 +303,7 @@ trap(struct trapframe *frame) * enabled later. */ if (ISPL(frame->tf_cs) == SEL_UPL) - printf( + uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 10ec641..2416988 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -565,7 +565,8 @@ freebsd4_freebsd32_sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("freebsd4_freebsd32_sigreturn: eflags = 0x%x\n", eflags); + uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n", + td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } @@ -576,7 +577,8 @@ freebsd4_freebsd32_sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("freebsd4_sigreturn: cs = 0x%x\n", cs); + uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", + td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; @@ -647,7 +649,8 @@ freebsd32_sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. 
*/ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("freebsd32_sigreturn: eflags = 0x%x\n", eflags); + uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n", + td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } @@ -658,7 +661,8 @@ freebsd32_sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("sigreturn: cs = 0x%x\n", cs); + uprintf("pid %d (%s): sigreturn cs = 0x%x\n", + td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; -- cgit v1.1 From 2647f6f35366535b7a3ede7e9b6d9f6ba5677711 Mon Sep 17 00:00:00 2001 From: attilio Date: Sat, 24 Apr 2010 00:49:19 +0000 Subject: MFC r206421: Default the machdep.lapic_allclocks to be enabled in order to cope with broken atrtc. Now if you want more correct stats on profhz and stathz it may be disabled by setting to 0. Sponsored by: Sandvine Incorporated --- sys/amd64/isa/clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index e5c27d1..6e5da8f 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -84,7 +84,7 @@ TUNABLE_INT("hw.i8254.freq", &i8254_freq); int i8254_max_count; static int i8254_real_max_count; -static int lapic_allclocks; +static int lapic_allclocks = 1; TUNABLE_INT("machdep.lapic_allclocks", &lapic_allclocks); struct mtx clock_lock; -- cgit v1.1 From 6834582d3b1e4ad8f5d6518a45a1f9911cb89dfe Mon Sep 17 00:00:00 2001 From: yongari Date: Mon, 26 Apr 2010 17:03:56 +0000 Subject: MFC r206625: Add driver for Silicon Integrated Systems SiS190/191 Fast/Gigabit Ethernet. This driver was written by Alexander Pohoyda and greatly enhanced by Nikolay Denev. I don't have these hardwares but this driver was tested by Nikolay Denev and xclin. Because SiS didn't release data sheet for this controller, programming information came from Linux driver and OpenSolaris. Unlike other open source driver for SiS190/191, sge(4) takes full advantage of TX/RX checksum offloading and does not require additional copy operation in RX handler. The controller seems to have advanced offloading features like VLAN hardware tag insertion/stripping, TCP segmentation offload(TSO) as well as jumbo frame support but these features are not available yet. Special thanks to xclin cs dot nctu dot edu dot tw> who sent fix for receiving VLAN oversized frames. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e9f3c17..a0edfca 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -224,6 +224,7 @@ device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') +device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) -- cgit v1.1 From 7550515288b109b9d22639e1ebd6ee88eb569f7a Mon Sep 17 00:00:00 2001 From: kib Date: Tue, 27 Apr 2010 10:50:09 +0000 Subject: MFC r206992: As was done in r155238 for i386 and in r155239 for amd64, clear the carry flag for ia32 binary executed on amd64 host in get_mcontext(). 
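For context, a short sketch of why the flag matters (mirroring the native amd64 get_mcontext(); only the comment text here is new): the FreeBSD syscall return convention reports an error to userland by setting the carry flag, so a context captured with GET_MC_CLEAR_RET, which is meant to look like a successful syscall return, must clear PSL_C in the saved eflags in addition to zeroing the return registers:

    mcp->mc_eflags = tp->tf_rflags;
    if (flags & GET_MC_CLEAR_RET) {
        /* Make the interrupted syscall appear to have returned 0. */
        mcp->mc_eax = 0;
        mcp->mc_edx = 0;
        mcp->mc_eflags &= ~PSL_C;   /* carry clear == no error */
    }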
--- sys/amd64/ia32/ia32_signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 2416988..15ba947 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -141,9 +141,11 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) mcp->mc_esi = tp->tf_rsi; mcp->mc_ebp = tp->tf_rbp; mcp->mc_isp = tp->tf_rsp; + mcp->mc_eflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; + mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_rax; mcp->mc_edx = tp->tf_rdx; @@ -152,7 +154,6 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) mcp->mc_ecx = tp->tf_rcx; mcp->mc_eip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; - mcp->mc_eflags = tp->tf_rflags; mcp->mc_esp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); -- cgit v1.1 From ef43ecae04b397691ada6c71a499614122172876 Mon Sep 17 00:00:00 2001 From: thompsa Date: Thu, 29 Apr 2010 22:44:04 +0000 Subject: MFC r207077 Change USB_DEBUG to #ifdef and allow it to be turned off. Previously this had the illusion of a tunable setting but was always turned on regardless. --- sys/amd64/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index a0edfca..999ccb7 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -281,6 +281,7 @@ device firmware # firmware assist module device bpf # Berkeley packet filter # USB support +options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) -- cgit v1.1 From 01384213a705f4a7e4d5f874b8868b4e8330508f Mon Sep 17 00:00:00 2001 From: imp Date: Sun, 2 May 2010 06:20:42 +0000 Subject: Move to the new way of specifying compat options. The backs out the FOO = BAR form, in favor of listing the mapping in a separate file for more compatibility with older versions of config. --- sys/amd64/conf/GENERIC | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 999ccb7..7288fef 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,7 +54,8 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -options COMPAT_FREEBSD32 # Compatible with i386 binaries +#options COMPAT_FREEBSD32 # Compatible with i386 binaries +options COMPAT_IA32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -- cgit v1.1 From 6231a81f242496fceb9b9cf0b3c45863b96efc78 Mon Sep 17 00:00:00 2001 From: imp Date: Sun, 2 May 2010 06:24:17 +0000 Subject: Revert 207494: it was only for testing purposes. --- sys/amd64/conf/GENERIC | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 7288fef..999ccb7 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -54,8 +54,7 @@ options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. 
options GEOM_LABEL # Provides labelization options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) -#options COMPAT_FREEBSD32 # Compatible with i386 binaries -options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -- cgit v1.1 From 3e52c1f894aa8fc2b8031ef275b1fb830a25bb6f Mon Sep 17 00:00:00 2001 From: kib Date: Thu, 6 May 2010 04:57:10 +0000 Subject: MFC r207570: Style and comment adjustements. --- sys/amd64/amd64/exception.S | 79 ++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 65c6452..41bf173 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -50,14 +50,14 @@ .bss .globl dtrace_invop_jump_addr .align 8 - .type dtrace_invop_jump_addr, @object - .size dtrace_invop_jump_addr, 8 + .type dtrace_invop_jump_addr,@object + .size dtrace_invop_jump_addr,8 dtrace_invop_jump_addr: .zero 8 .globl dtrace_invop_calltrap_addr .align 8 - .type dtrace_invop_calltrap_addr, @object - .size dtrace_invop_calltrap_addr, 8 + .type dtrace_invop_calltrap_addr,@object + .size dtrace_invop_calltrap_addr,8 dtrace_invop_calltrap_addr: .zero 8 #endif @@ -157,7 +157,6 @@ IDTVEC(align) * kernel from userland. Reenable interrupts if they were enabled * before the trap. This approximates SDT_SYS386TGT on the i386 port. */ - SUPERALIGN_TEXT .globl alltraps .type alltraps,@function @@ -211,16 +210,16 @@ alltraps_pushregs_no_rdi: * Set our jump address for the jump back in the event that * the breakpoint wasn't caused by DTrace at all. */ - movq $calltrap, dtrace_invop_calltrap_addr(%rip) + movq $calltrap,dtrace_invop_calltrap_addr(%rip) /* Jump to the code hooked in by DTrace. */ - movq dtrace_invop_jump_addr, %rax + movq dtrace_invop_jump_addr,%rax jmpq *dtrace_invop_jump_addr #endif .globl calltrap .type calltrap,@function calltrap: - movq %rsp, %rdi + movq %rsp,%rdi call trap MEXITCOUNT jmp doreti /* Handle any pending ASTs */ @@ -274,9 +273,11 @@ IDTVEC(dblfault) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? 
*/ jz 1f /* already running with kernel GS.base */ swapgs -1: movq %rsp, %rdi +1: + movq %rsp,%rdi call dblfault_handler -2: hlt +2: + hlt jmp 2b IDTVEC(page) @@ -369,7 +370,7 @@ IDTVEC(fast_syscall) movq %r15,TF_R15(%rsp) /* C preserved */ movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) - movq %rsp, %rdi + movq %rsp,%rdi call syscall movq PCPU(CURPCB),%rax andq $~PCB_FULLCTX,PCB_FLAGS(%rax) @@ -456,7 +457,7 @@ nmi_fromuserspace: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) - movq %rsp, %rdi + movq %rsp,%rdi call trap MEXITCOUNT #ifdef HWPMC_HOOKS @@ -555,9 +556,9 @@ nmi_restoreregs: iretq ENTRY(fork_trampoline) - movq %r12, %rdi /* function */ - movq %rbx, %rsi /* arg1 */ - movq %rsp, %rdx /* trapframe pointer */ + movq %r12,%rdi /* function */ + movq %rbx,%rsi /* arg1 */ + movq %rsp,%rdx /* trapframe pointer */ call fork_exit MEXITCOUNT jmp doreti /* Handle any ASTs */ @@ -628,7 +629,7 @@ doreti_ast: testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) je doreti_exit sti - movq %rsp, %rdi /* pass a pointer to the trapframe */ + movq %rsp,%rdi /* pass a pointer to the trapframe */ call ast jmp doreti_ast @@ -648,8 +649,8 @@ doreti_exit: * Do not reload segment registers for kernel. * Since we do not reload segments registers with sane * values on kernel entry, descriptors referenced by - * segments registers may be not valid. This is fatal - * for the usermode, but is innocent for the kernel. + * segments registers might be not valid. This is fatal + * for user mode, but is not a problem for the kernel. */ testb $SEL_RPL_MASK,TF_CS(%rsp) jz ld_regs @@ -662,14 +663,16 @@ do_segs: /* Restore %fs and fsbase */ movw TF_FS(%rsp),%ax .globl ld_fs -ld_fs: movw %ax,%fs +ld_fs: + movw %ax,%fs cmpw $KUF32SEL,%ax jne 1f movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx .globl ld_fsbase -ld_fsbase: wrmsr +ld_fsbase: + wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si @@ -678,7 +681,8 @@ ld_fsbase: wrmsr movl $MSR_GSBASE,%ecx rdmsr .globl ld_gs -ld_gs: movw %si,%gs +ld_gs: + movw %si,%gs wrmsr popfq cmpw $KUG32SEL,%si @@ -687,12 +691,17 @@ ld_gs: movw %si,%gs movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx .globl ld_gsbase -ld_gsbase: wrmsr -1: .globl ld_es -ld_es: movw TF_ES(%rsp),%es +ld_gsbase: + wrmsr +1: + .globl ld_es +ld_es: + movw TF_ES(%rsp),%es .globl ld_ds -ld_ds: movw TF_DS(%rsp),%ds -ld_regs:movq TF_RDI(%rsp),%rdi +ld_ds: + movw TF_DS(%rsp),%ds +ld_regs: + movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx @@ -711,7 +720,8 @@ ld_regs:movq TF_RDI(%rsp),%rdi jz 1f /* keep running with kernel GS.base */ cli swapgs -1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ +1: + addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ .globl doreti_iret doreti_iret: iretq @@ -738,7 +748,8 @@ doreti_iret_fault: testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti -1: movw %fs,TF_FS(%rsp) +1: + movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) @@ -768,7 +779,7 @@ doreti_iret_fault: .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUDSEL,TF_DS(%rsp) jmp doreti @@ -777,7 +788,7 @@ ds_load_fault: .globl es_load_fault es_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUDSEL,TF_ES(%rsp) jmp doreti @@ -786,7 +797,7 @@ es_load_fault: .globl fs_load_fault fs_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi 
call trap movw $KUF32SEL,TF_FS(%rsp) jmp doreti @@ -796,7 +807,7 @@ fs_load_fault: gs_load_fault: popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti @@ -805,7 +816,7 @@ gs_load_fault: .globl fsbase_load_fault fsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 @@ -816,7 +827,7 @@ fsbase_load_fault: .globl gsbase_load_fault gsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rsp, %rdi + movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 -- cgit v1.1 From b0947a989bb83db0b51032c99bcf668b5a86817f Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 8 May 2010 12:40:38 +0000 Subject: MFC r207463: Remove debugging code that was not used once since commit. --- sys/amd64/amd64/trap.c | 86 +------------------------------------------------- 1 file changed, 1 insertion(+), 85 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 8492f4e..1fa3d32 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -172,52 +172,6 @@ SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, extern char *syscallnames[]; -/* #define DEBUG 1 */ -#ifdef DEBUG -static void -report_seg_fault(const char *segn, struct trapframe *frame) -{ - struct proc_ldt *pldt; - struct trapframe *pf; - - pldt = curproc->p_md.md_ldt; - printf("%d: %s load fault %lx %p %d\n", - curproc->p_pid, segn, frame->tf_err, - pldt != NULL ? pldt->ldt_base : NULL, - pldt != NULL ? pldt->ldt_refcnt : 0); - kdb_backtrace(); - pf = (struct trapframe *)frame->tf_rsp; - printf("rdi %lx\n", pf->tf_rdi); - printf("rsi %lx\n", pf->tf_rsi); - printf("rdx %lx\n", pf->tf_rdx); - printf("rcx %lx\n", pf->tf_rcx); - printf("r8 %lx\n", pf->tf_r8); - printf("r9 %lx\n", pf->tf_r9); - printf("rax %lx\n", pf->tf_rax); - printf("rbx %lx\n", pf->tf_rbx); - printf("rbp %lx\n", pf->tf_rbp); - printf("r10 %lx\n", pf->tf_r10); - printf("r11 %lx\n", pf->tf_r11); - printf("r12 %lx\n", pf->tf_r12); - printf("r13 %lx\n", pf->tf_r13); - printf("r14 %lx\n", pf->tf_r14); - printf("r15 %lx\n", pf->tf_r15); - printf("fs %x\n", pf->tf_fs); - printf("gs %x\n", pf->tf_gs); - printf("es %x\n", pf->tf_es); - printf("ds %x\n", pf->tf_ds); - printf("tno %x\n", pf->tf_trapno); - printf("adr %lx\n", pf->tf_addr); - printf("flg %x\n", pf->tf_flags); - printf("err %lx\n", pf->tf_err); - printf("rip %lx\n", pf->tf_rip); - printf("cs %lx\n", pf->tf_cs); - printf("rfl %lx\n", pf->tf_rflags); - printf("rsp %lx\n", pf->tf_rsp); - printf("ss %lx\n", pf->tf_ss); -} -#endif - /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry @@ -314,9 +268,7 @@ trap(struct trapframe *frame) */ printf("kernel trap %d with interrupts disabled\n", type); -#ifdef DEBUG - report_seg_fault("hlt", frame); -#endif + /* * We shouldn't enable interrupts while holding a * spin lock or servicing an NMI. 
@@ -532,33 +484,21 @@ trap(struct trapframe *frame) goto out; } if (frame->tf_rip == (long)ld_ds) { -#ifdef DEBUG - report_seg_fault("ds", frame); -#endif frame->tf_rip = (long)ds_load_fault; frame->tf_ds = _udatasel; goto out; } if (frame->tf_rip == (long)ld_es) { -#ifdef DEBUG - report_seg_fault("es", frame); -#endif frame->tf_rip = (long)es_load_fault; frame->tf_es = _udatasel; goto out; } if (frame->tf_rip == (long)ld_fs) { -#ifdef DEBUG - report_seg_fault("fs", frame); -#endif frame->tf_rip = (long)fs_load_fault; frame->tf_fs = _ufssel; goto out; } if (frame->tf_rip == (long)ld_gs) { -#ifdef DEBUG - report_seg_fault("gs", frame); -#endif frame->tf_rip = (long)gs_load_fault; frame->tf_gs = _ugssel; goto out; @@ -664,30 +604,6 @@ trap(struct trapframe *frame) ksi.ksi_addr = (void *)addr; trapsignal(td, &ksi); -#ifdef DEBUG -{ - register_t rg,rgk, rf; - - if (type <= MAX_TRAP_MSG) { - uprintf("fatal process exception: %s", - trap_msg[type]); - if ((type == T_PAGEFLT) || (type == T_PROTFLT)) - uprintf(", fault VA = 0x%lx", frame->tf_addr); - uprintf("\n"); - } - rf = rdmsr(0xc0000100); - rg = rdmsr(0xc0000101); - rgk = rdmsr(0xc0000102); - uprintf("pid %d TRAP %d rip %lx err %lx addr %lx cs %lx ss %lx ds %x " - "es %x fs %x fsbase %lx %lx gs %x gsbase %lx %lx %lx\n", - curproc->p_pid, type, frame->tf_rip, frame->tf_err, - frame->tf_addr, - frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, - frame->tf_fs, td->td_pcb->pcb_fsbase, rf, - frame->tf_gs, td->td_pcb->pcb_gsbase, rg, rgk); -} -#endif - user: userret(td, frame); mtx_assert(&Giant, MA_NOTOWNED); -- cgit v1.1 From 21d551ae0224e62964401c9da958f37d5a6cd5bb Mon Sep 17 00:00:00 2001 From: kib Date: Sat, 8 May 2010 18:54:47 +0000 Subject: MFC r204051 (by imp): n64 has a different size for KINFO_PROC_SIZE. Approved by: imp MFC r207152: Move the constants specifying the size of struct kinfo_proc into machine-specific header files. Add KINFO_PROC32_SIZE for struct kinfo_proc32 for architectures providing COMPAT_FREEBSD32. Add CTASSERT for the size of struct kinfo_proc32. MFC r207269: Style: use #define instead of #define. --- sys/amd64/include/proc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 33d5181..acea4c0 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -53,6 +53,9 @@ struct mdproc { struct system_segment_descriptor md_ldt_sd; }; +#define KINFO_PROC_SIZE 1088 +#define KINFO_PROC32_SIZE 768 + #ifdef _KERNEL /* Get the current kernel thread stack usage. */ -- cgit v1.1 From ff311c2c9afe9a565c19f3e16e22cbff786b27cd Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 12 May 2010 09:34:10 +0000 Subject: MFC r207676: Add definitions for Intel AESNI CPUID bits and print the capabilities on boot. 
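As a usage sketch (the function name have_aesni is illustrative only; the macro and the cpu_feature2 word it tests are the ones handled in the change below), kernel code can check for the instruction set at runtime once CPU identification has run:

    #include <machine/specialreg.h>

    extern u_int cpu_feature2;      /* CPUID leaf 1 %ecx, filled in by identcpu.c */

    /* Sketch: true if the CPU advertises the AES-NI instructions. */
    static int
    have_aesni(void)
    {

        return ((cpu_feature2 & CPUID2_AESNI) != 0);
    }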
--- sys/amd64/amd64/identcpu.c | 4 ++-- sys/amd64/include/specialreg.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index b0da729..287c9c2 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -240,7 +240,7 @@ printcpuinfo(void) printf("\n Features2=0x%b", cpu_feature2, "\020" "\001SSE3" /* SSE3 */ - "\002" + "\002PCLMULQDQ" /* Carry-Less Mul Quadword */ "\003DTES64" /* 64-bit Debug Trace */ "\004MON" /* MONITOR/MWAIT Instructions */ "\005DS_CPL" /* CPL Qualified Debug Store */ @@ -264,7 +264,7 @@ printcpuinfo(void) "\027MOVBE" "\030POPCNT" "\031" - "\032" + "\032AESNI" /* AES Crypto*/ "\033XSAVE" "\034OSXSAVE" "\035" diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 86a08ce..895619c 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -113,6 +113,7 @@ #define CPUID_PBE 0x80000000 #define CPUID2_SSE3 0x00000001 +#define CPUID2_PCLMULQDQ 0x00000002 #define CPUID2_DTES64 0x00000004 #define CPUID2_MON 0x00000008 #define CPUID2_DS_CPL 0x00000010 @@ -131,6 +132,7 @@ #define CPUID2_X2APIC 0x00200000 #define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 +#define CPUID2_AESNI 0x02000000 /* * Important bits in the AMD extended cpuid flags -- cgit v1.1 From 539772c2021ceec4eb869d973b2537e528dc9bb5 Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 19 May 2010 09:30:41 +0000 Subject: MFC r207957: Remove unneeded overrides of the segment registers. --- sys/amd64/amd64/trap.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 1fa3d32..4891e9d 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -485,22 +485,18 @@ trap(struct trapframe *frame) } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; - frame->tf_ds = _udatasel; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; - frame->tf_es = _udatasel; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; - frame->tf_fs = _ufssel; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; - frame->tf_gs = _ugssel; goto out; } if (frame->tf_rip == (long)ld_gsbase) { -- cgit v1.1 From 857402565bb4044d8ad7fca73b6aa5b8195eda3b Mon Sep 17 00:00:00 2001 From: kib Date: Wed, 19 May 2010 09:32:59 +0000 Subject: MFC r207958: Route all returns from the interrupts and faults through the doreti_iret labeled iretq instruction. MFC r208026: Do not use .extern. --- sys/amd64/amd64/apic_vector.S | 12 ++++++------ sys/amd64/amd64/exception.S | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index df94a47..4cfc18b 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -81,7 +81,7 @@ IDTVEC(spuriousint) /* No EOI cycle used here */ - iretq + jmp doreti_iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) @@ -135,7 +135,7 @@ IDTVEC(invltlb) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Single page TLB shootdown @@ -155,7 +155,7 @@ IDTVEC(invlpg) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Page range TLB shootdown. @@ -181,7 +181,7 @@ IDTVEC(invlrng) popq %rdx popq %rax - iretq + jmp doreti_iret /* * Invalidate cache. 
@@ -200,7 +200,7 @@ IDTVEC(invlcache) incl smp_tlb_wait popq %rax - iretq + jmp doreti_iret /* * Handler for IPIs sent via the per-cpu IPI bitmap. @@ -247,7 +247,7 @@ IDTVEC(cpususpend) call cpususpend_handler POP_FRAME - iretq + jmp doreti_iret /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 41bf173..0de197b 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -553,7 +553,7 @@ nmi_restoreregs: movq TF_R14(%rsp),%r14 movq TF_R15(%rsp),%r15 addq $TF_RIP,%rsp - iretq + jmp doreti_iret ENTRY(fork_trampoline) movq %r12,%rdi /* function */ -- cgit v1.1 From 827618ec09d24ec70e60ed16c4e245b5fc3b7f76 Mon Sep 17 00:00:00 2001 From: attilio Date: Tue, 1 Jun 2010 21:19:58 +0000 Subject: MFC r207329, r208716: - Extract the IODEV_PIO interface from ia64 and make it MI. - On i386 and amd64 the old behaviour is kept but multithreaded processes must use the new interface in order to work well. - Support for the other architectures is greatly improved. Sponsored by: Sandvine Incorporated Approved by: re (kib, bz) --- sys/amd64/amd64/io.c | 40 ++++++---------------------------------- sys/amd64/include/iodev.h | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 37 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/io.c b/sys/amd64/amd64/io.c index 09d6e89..c2d0d51 100644 --- a/sys/amd64/amd64/io.c +++ b/sys/amd64/amd64/io.c @@ -28,60 +28,32 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include - -#include -#include - #include +#include -/* ARGSUSED */ int -ioopen(struct cdev *dev __unused, int flags __unused, int fmt __unused, - struct thread *td) +iodev_open(struct thread *td) { - int error; - - error = priv_check(td, PRIV_IO); - if (error != 0) - return (error); - error = securelevel_gt(td->td_ucred, 0); - if (error != 0) - return (error); td->td_frame->tf_rflags |= PSL_IOPL; - return (0); } -/* ARGSUSED */ int -ioclose(struct cdev *dev __unused, int flags __unused, int fmt __unused, - struct thread *td) +iodev_close(struct thread *td) { - td->td_frame->tf_rflags &= ~PSL_IOPL; + td->td_frame->tf_rflags &= ~PSL_IOPL; return (0); } /* ARGSUSED */ int -ioioctl(struct cdev *dev __unused, u_long cmd __unused, caddr_t data __unused, - int fflag __unused, struct thread *td __unused) +iodev_ioctl(u_long cmd __unused, caddr_t data __unused) { - return (ENXIO); + return (ENOIOCTL); } diff --git a/sys/amd64/include/iodev.h b/sys/amd64/include/iodev.h index 1a0a17a..9f53cac 100644 --- a/sys/amd64/include/iodev.h +++ b/sys/amd64/include/iodev.h @@ -25,7 +25,22 @@ * * $FreeBSD$ */ +#ifndef _MACHINE_IODEV_H_ +#define _MACHINE_IODEV_H_ -d_open_t ioopen; -d_close_t ioclose; -d_ioctl_t ioioctl; +#ifdef _KERNEL +#include + +#define iodev_read_1 inb +#define iodev_read_2 inw +#define iodev_read_4 inl +#define iodev_write_1 outb +#define iodev_write_2 outw +#define iodev_write_4 outl + +int iodev_open(struct thread *td); +int iodev_close(struct thread *td); +int iodev_ioctl(u_long cmd, caddr_t data); + +#endif /* _KERNEL */ +#endif /* _MACHINE_IODEV_H_ */ -- cgit v1.1 From 8001a4e77e6e6359e73988a3515e468ade187b04 Mon Sep 17 00:00:00 2001 From: ken Date: Fri, 11 Jun 2010 19:17:36 +0000 Subject: MFC 199549, 199997, 204158, 207673, and 208901. 
Bring in a number of netfront changes: r199549 | jhb Remove commented out reference to if_watchdog and an assignment of zero to if_timer. Reviewed by: scottl r199997 | gibbs Add media ioctl support and link notifications so that devd will attempt to run dhclient on a netfront (xn) device that is setup for DHCP in /etc/rc.conf. PR: kern/136251 (fixed differently than the submitted patch) r204158 | kmacy - make printf conditional - fix witness warnings by making configuration lock a mutex r207673 | joel Switch to our preferred 2-clause BSD license. Approved by: kmacy r208901 | ken A number of netfront fixes and stability improvements: - Re-enable TSO. This was broken previously due to CSUM_TSO clearing the CSUM_TCP flag, so our checksum flags were incorrectly set going to the netback driver. That was fixed in r206844 in tcp_output.c, so we can turn TSO back on here. - Fix the way transmit slots are calculated, so that we can't overfill the ring. - Avoid sending packets with more fragments/segments than netback can handle. The Linux netback code can only handle packets of MAX_SKB_FRAGS, which turns out to be 18 on machines with 4K pages. We can easily generate packets with 32 or so fragments with TSO turned on. Right now the solution is just to drop the packets (since netback doesn't seem to handle it gracefully), but we should come up with a way to allow a driver to tell the TCP stack the maximum number of fragments it can handle in a single packet. - Fix the way the consumer is tracked in the receive path. It could get out of sync fairly easily. - Use standard Xen ring macros to make it clearer how netfront is using the rings. - Get rid of Linux-ish negative errno return values. - Added more documentation to the driver. - Refactored code to make it easier to read. - Some other minor fixes. Reviewed by: gibbs Sponsored by: Spectra Logic Approved by: re (bz) --- sys/amd64/include/xen/xenfunc.h | 31 +++++++++++++++---------------- sys/amd64/include/xen/xenvar.h | 26 ++++++++++++-------------- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h index b3a6672..d03d4f6 100644 --- a/sys/amd64/include/xen/xenfunc.h +++ b/sys/amd64/include/xen/xenfunc.h @@ -1,6 +1,5 @@ -/* - * - * Copyright (c) 2004,2005 Kip Macy +/*- + * Copyright (c) 2004, 2005 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -11,22 +10,22 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ */ - #ifndef _XEN_XENFUNC_H_ #define _XEN_XENFUNC_H_ diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h index 1433b76..d9dbc5d 100644 --- a/sys/amd64/include/xen/xenvar.h +++ b/sys/amd64/include/xen/xenvar.h @@ -1,29 +1,27 @@ -/* +/*- * Copyright (c) 2008 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ */ -- cgit v1.1 From d45b7f14ae6fa78882fa9ec3be976733ca4767b4 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 13 May 2011 04:54:01 +0000 Subject: Import of bhyve hypervisor and utilities, part 1. vmm.ko - kernel module for VT-x, VT-d and hypervisor control bhyve - user-space sequencer and i/o emulation vmmctl - dump of hypervisor register state libvmm - front-end to vmm.ko chardev interface bhyve was designed and implemented by Neel Natu. Thanks to the following folk from NetApp who helped to make this available: Joe CaraDonna Peter Snyder Jeff Heller Sandeep Mann Steve Miller Brian Pawlowski --- sys/amd64/include/specialreg.h | 1 + sys/amd64/include/vmm.h | 268 ++++++ sys/amd64/include/vmm_dev.h | 191 ++++ sys/amd64/vmm/amd/amdv.c | 247 ++++++ sys/amd64/vmm/intel/ept.c | 312 +++++++ sys/amd64/vmm/intel/ept.h | 42 + sys/amd64/vmm/intel/vmcs.c | 451 ++++++++++ sys/amd64/vmm/intel/vmcs.h | 324 +++++++ sys/amd64/vmm/intel/vmx.c | 1673 ++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/intel/vmx.h | 115 +++ sys/amd64/vmm/intel/vmx_controls.h | 92 ++ sys/amd64/vmm/intel/vmx_cpufunc.h | 199 +++++ sys/amd64/vmm/intel/vmx_genassym.c | 81 ++ sys/amd64/vmm/intel/vmx_msr.c | 172 ++++ sys/amd64/vmm/intel/vmx_msr.h | 78 ++ sys/amd64/vmm/intel/vmx_support.S | 204 +++++ sys/amd64/vmm/intel/vtd.c | 637 ++++++++++++++ sys/amd64/vmm/io/iommu.c | 230 +++++ sys/amd64/vmm/io/iommu.h | 67 ++ sys/amd64/vmm/io/ppt.c | 449 ++++++++++ sys/amd64/vmm/io/ppt.h | 40 + sys/amd64/vmm/io/vdev.c | 270 ++++++ sys/amd64/vmm/io/vdev.h | 84 ++ sys/amd64/vmm/io/vlapic.c | 812 +++++++++++++++++ sys/amd64/vmm/io/vlapic.h | 105 +++ sys/amd64/vmm/vmm.c | 737 ++++++++++++++++ sys/amd64/vmm/vmm_dev.c | 468 ++++++++++ sys/amd64/vmm/vmm_ipi.c | 103 +++ sys/amd64/vmm/vmm_ipi.h | 38 + sys/amd64/vmm/vmm_ktr.h | 51 ++ sys/amd64/vmm/vmm_lapic.c | 121 +++ sys/amd64/vmm/vmm_lapic.h | 64 ++ sys/amd64/vmm/vmm_mem.c | 413 +++++++++ sys/amd64/vmm/vmm_mem.h | 38 + sys/amd64/vmm/vmm_msr.c | 264 ++++++ sys/amd64/vmm/vmm_msr.h | 42 + sys/amd64/vmm/vmm_stat.c | 103 +++ sys/amd64/vmm/vmm_stat.h | 71 ++ sys/amd64/vmm/vmm_support.S | 42 + sys/amd64/vmm/vmm_util.c | 111 +++ sys/amd64/vmm/vmm_util.h | 40 + sys/amd64/vmm/x86.c | 113 +++ sys/amd64/vmm/x86.h | 62 ++ 43 files changed, 10025 insertions(+) create mode 100644 sys/amd64/include/vmm.h create mode 100644 sys/amd64/include/vmm_dev.h create mode 100644 sys/amd64/vmm/amd/amdv.c create mode 100644 sys/amd64/vmm/intel/ept.c create mode 100644 sys/amd64/vmm/intel/ept.h create mode 100644 sys/amd64/vmm/intel/vmcs.c create mode 100644 sys/amd64/vmm/intel/vmcs.h create mode 100644 sys/amd64/vmm/intel/vmx.c create mode 100644 sys/amd64/vmm/intel/vmx.h create mode 100644 sys/amd64/vmm/intel/vmx_controls.h create mode 100644 sys/amd64/vmm/intel/vmx_cpufunc.h create mode 100644 sys/amd64/vmm/intel/vmx_genassym.c create mode 100644 sys/amd64/vmm/intel/vmx_msr.c create mode 100644 sys/amd64/vmm/intel/vmx_msr.h create mode 100644 sys/amd64/vmm/intel/vmx_support.S create mode 
100644 sys/amd64/vmm/intel/vtd.c create mode 100644 sys/amd64/vmm/io/iommu.c create mode 100644 sys/amd64/vmm/io/iommu.h create mode 100644 sys/amd64/vmm/io/ppt.c create mode 100644 sys/amd64/vmm/io/ppt.h create mode 100644 sys/amd64/vmm/io/vdev.c create mode 100644 sys/amd64/vmm/io/vdev.h create mode 100644 sys/amd64/vmm/io/vlapic.c create mode 100644 sys/amd64/vmm/io/vlapic.h create mode 100644 sys/amd64/vmm/vmm.c create mode 100644 sys/amd64/vmm/vmm_dev.c create mode 100644 sys/amd64/vmm/vmm_ipi.c create mode 100644 sys/amd64/vmm/vmm_ipi.h create mode 100644 sys/amd64/vmm/vmm_ktr.h create mode 100644 sys/amd64/vmm/vmm_lapic.c create mode 100644 sys/amd64/vmm/vmm_lapic.h create mode 100644 sys/amd64/vmm/vmm_mem.c create mode 100644 sys/amd64/vmm/vmm_mem.h create mode 100644 sys/amd64/vmm/vmm_msr.c create mode 100644 sys/amd64/vmm/vmm_msr.h create mode 100644 sys/amd64/vmm/vmm_stat.c create mode 100644 sys/amd64/vmm/vmm_stat.h create mode 100644 sys/amd64/vmm/vmm_support.S create mode 100644 sys/amd64/vmm/vmm_util.c create mode 100644 sys/amd64/vmm/vmm_util.h create mode 100644 sys/amd64/vmm/x86.c create mode 100644 sys/amd64/vmm/x86.h (limited to 'sys/amd64') diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619c..c95fee0 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -297,6 +297,7 @@ */ #define APICBASE_RESERVED 0x000006ff #define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h new file mode 100644 index 0000000..0f4c356 --- /dev/null +++ b/sys/amd64/include/vmm.h @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vlapic; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct vm_exit *vmexit); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, + size_t length, vm_memattr_t attr, + int prot, boolean_t superpages_ok); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, + int type, int vector, + uint32_t code, int code_valid); +typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_func_t vmmmap; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_inject_event_t vminject; + vmi_inject_nmi_t vmnmi; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +struct vm *vm_create(const char *name); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid); +int vm_set_pinning(struct vm *vm, int vcpu, int cpuid); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_event(struct vm *vm, int vcpu, int type, + int vector, uint32_t error_code, int error_code_valid); +int vm_inject_nmi(struct vm *vm, int vcpu); +uint64_t *vm_guest_msrs(struct vm *vm, int cpu); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +void vm_activate_cpu(struct vm *vm, int vcpu); +cpumask_t vm_active_cpus(struct vm *vm); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
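The vmm_ops table declared above is the seam between the hardware-independent vmm code and a vendor backend: vmm_ops_intel and vmm_ops_amd are the two instances, and per-VM work is reached through the function pointers rather than by direct calls. A rough sketch of that dispatch follows; the selection of the table and the helper name are assumptions for illustration, not code from this import (the real indirection presumably lives in vmm.c, added later in this patch).

static struct vmm_ops *ops;	/* &vmm_ops_intel or &vmm_ops_amd, chosen at init */

static int
run_one_vcpu(void *vmi, int vcpu, register_t rip, struct vm_exit *vmexit)
{
	/* 'vmi' is the per-VM cookie returned earlier by ops->vminit(vm). */
	if (ops == NULL || ops->vmrun == NULL)
		return (ENXIO);
	return ((*ops->vmrun)(vmi, vcpu, rip, vmexit));
}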
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +#define VCPU_STOPPED 0 +#define VCPU_RUNNING 1 +void vm_set_run_state(struct vm *vm, int vcpu, int running); +int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); + +void *vcpu_stats(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +static cpumask_t __inline +vcpu_mask(int vcpuid) +{ + return ((cpumask_t)1 << vcpuid); +} + +#endif /* KERNEL */ + +#define VM_MAXCPU 8 /* maximum virtual cpus */ + +/* + * Identifiers for events that can be injected into the VM + */ +enum vm_event_type { + VM_EVENT_NONE, + VM_HW_INTR, + VM_NMI, + VM_HW_EXCEPTION, + VM_SW_INTR, + VM_PRIV_SW_EXCEPTION, + VM_SW_EXCEPTION, + VM_EVENT_MAX +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_LAST +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_MAX, +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; /* out is 0, in is 1 */ + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ + } inout; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int error; /* vmx inst error */ + uint32_t exit_reason; + uint64_t exit_qualification; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h new file mode 100644 index 0000000..1b143b5 --- /dev/null +++ b/sys/amd64/include/vmm_dev.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +void vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t hpa; /* out */ + vm_paddr_t gpa; /* in */ + size_t len; /* in */ +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_pin { + int vm_cpuid; + int host_cpuid; /* -1 to unpin */ +}; + +struct vm_run { + int cpuid; + uint64_t rip; /* start running here */ + struct vm_exit vm_exit; +}; + +struct vm_event { + int cpuid; + enum vm_event_type type; + int vector; + uint32_t error_code; + int error_code_valid; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + int vector; + int destcpu; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +enum { + IOCNUM_RUN, + IOCNUM_SET_PINNING, + IOCNUM_GET_PINNING, + IOCNUM_MAP_MEMORY, + IOCNUM_GET_MEMORY_SEG, + IOCNUM_SET_REGISTER, + IOCNUM_GET_REGISTER, + IOCNUM_SET_SEGMENT_DESCRIPTOR, + IOCNUM_GET_SEGMENT_DESCRIPTOR, + IOCNUM_INJECT_EVENT, + IOCNUM_LAPIC_IRQ, + IOCNUM_SET_CAPABILITY, + IOCNUM_GET_CAPABILITY, + IOCNUM_BIND_PPTDEV, + IOCNUM_UNBIND_PPTDEV, + IOCNUM_MAP_PPTDEV_MMIO, + IOCNUM_PPTDEV_MSI, + IOCNUM_INJECT_NMI, + IOCNUM_VM_STATS, + IOCNUM_VM_STAT_DESC, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SET_PINNING \ + _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) +#define VM_GET_PINNING \ + _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', 
IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EVENT \ + _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#endif diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 0000000..41e937a --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
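These ioctls are the raw chardev interface that, per the commit message, libvmm fronts for bhyve. A minimal user-space caller might look like the sketch below; the device path argument, the header install paths and the error handling are assumptions made for the example, while struct vm_run and VM_RUN come from the header above.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>	/* assumed install paths for the two new headers */
#include <machine/vmm_dev.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
run_vcpu0(const char *devpath, uint64_t entry_rip)
{
	struct vm_run vmrun;
	int fd;

	fd = open(devpath, O_RDWR);	/* device node created by vmm.ko */
	if (fd < 0)
		return (-1);

	vmrun.cpuid = 0;
	vmrun.rip = entry_rip;		/* start running here */
	if (ioctl(fd, VM_RUN, &vmrun) < 0) {
		close(fd);
		return (-1);
	}

	/* vm_exit tells the user-space sequencer why the guest stopped. */
	printf("exitcode %d, guest rip 0x%lx\n",
	    (int)vmrun.vm_exit.exitcode, (unsigned long)vmrun.vm_exit.rip);
	close(fd);
	return (0);
}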
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include "io/iommu.h" + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_inject_event(void *vmi, int vcpu, int type, int vector, + uint32_t error_code, int error_code_valid) +{ + + printf("amdv_inject_event: not implemented\n"); + return (EINVAL); +} + +static int +amdv_nmi(void *arg, int vcpu) +{ + + printf("amdv_nmi: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_inject_event, + amdv_nmi, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + 
amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, +}; diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 0000000..c9fca9d --- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include "vmx_cpufunc.h" +#include "vmx_msr.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for 
each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * XXX should we enforce this memory type by setting the ignore PAT + * bit to 1. 
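To make the size-selection loop above concrete, take a hypothetical call (values invented for illustration) with spok true, 2MB PDE superpages reported by ept_init(), no 1GB support, gpa = hpa = 0x200000 and length = 4MB:

    spshift starts at 12 + 3*9 = 39 (512GB): no such bit in page_sizes_mask, step down
    spshift = 30 (1GB): bit not set either, step down
    spshift = 21 (2MB): bit set, gpa and hpa are 2MB-aligned, length >= 2MB, stop

The page-table walk then stops at the PDE level (nlevels == 1, ptpshift == 21), EPT_PG_SUPERPAGE is set on that entry, and the function returns 1UL << 21; ept_vmmmap() subtracts 2MB from 'len' and loops once more for the second half of the region.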
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 0000000..013c330 --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 0000000..80d45cc --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include "vmcs.h" +#include "vmx_cpufunc.h" +#include "ept.h" +#include "vmx.h" + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. 
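The sign-bit escape hatch described above is what the VMCS_IDENT() macro later in vmcs.h packages up: OR-ing 0x80000000 into a raw field encoding makes the ident negative, and vmcs_getreg()/vmcs_setreg() simply strip the bit again. A small usage sketch (the helper name and the choice of field are only an example):

static int
read_guest_interruptibility(struct vmcs *vmcs, uint64_t *val)
{
	/*
	 * VMCS_IDENT(0x00004824) is 0x80004824, which is negative as an
	 * int, so the 'ident < 0' path below is taken and the raw
	 * encoding is recovered with (ident & 0x7fffffff) instead of
	 * going through vmcs_field_encoding().
	 */
	return (vmcs_getreg(vmcs, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY),
	    val));
}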
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + VMPTRLD(vmcs); + error = vmread(encoding, retval); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + VMPTRLD(vmcs); + error = vmwrite(encoding, val); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat; + uint32_t exc_bitmap; + + codesel = GSEL(GCODE_SEL, SEL_KPL); + datasel = GSEL(GDATA_SEL, SEL_KPL); + tsssel = GSEL(GPROC0_SEL, SEL_KPL); + + /* + * Make sure we have a "current" VMCS to work with. 
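One more note on the MSR save area registered by vmcs_set_msr_save() above: it is simply an array of the struct msr_entry records declared in vmcs.h, and because the VM-exit MSR-store pointer and the VM-entry MSR-load pointer aim at the same buffer, whatever the guest last had in those MSRs is exactly what is reloaded on the next entry. A sketch of wiring one up follows; the static allocation and the vtophys() call are assumptions made for the example, not code from this file.

static int
guest_msr_area_setup(struct vmcs *vmcs)
{
	static struct msr_entry g_area[] = {
		{ MSR_KGSBASE, 0, 0 },	/* index, reserved, initial value */
	};

	/* The MSR-store/load area pointers take a physical address. */
	return (vmcs_set_msr_save(vmcs, vtophys(g_area),
	    sizeof(g_area) / sizeof(g_area[0])));
}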
+ */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = rdmsr(MSR_PAT); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = rdmsr(MSR_EFER); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + cr0 = rcr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = rcr4(); + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + if (error != 0) + panic("vmcs_read(%u) error %d", encoding, error); + + return (val); +} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 0000000..c633a59 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval); +int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +uint64_t vmcs_read(uint32_t encoding); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) + +#endif /* _KERNEL */ + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_EPTP 0x0000201A + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 +#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define 
VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define 
EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC 44 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * VMCS interrupt information fields + */ +#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) +#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 0000000..ec181c4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,1673 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_msr.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vmx_msr.h" +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "x86.h" +#include "vmx_controls.h" + +#define CR4_VMXE (1UL << 13) + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_PAT | \ + VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define HANDLED 1 +#define UNHANDLED 0 + +MALLOC_DEFINE(M_VMX, "vmx", "vmx"); + +extern struct pcpu __pcpu[]; + +static int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static volatile u_int nextvpid; + +/* + * Virtual NMI blocking conditions. + * + * Some processor implementations also require NMI to be blocked if + * the STI_BLOCKING bit is set. It is possible to detect this at runtime + * based on the (exit_reason,exit_qual) tuple being set to + * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). + * + * We take the easy way out and also include STI_BLOCKING as one of the + * gating items for vNMI injection. 
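Putting the blocking conditions above to use: before a virtual NMI may be injected, the guest-interruptibility field has to be clear of every bit in the nmi_blocking_bits mask declared just below. The following is only a sketch of that gate; the helper name, the error values and the final injection write are illustrative rather than lifted from this file.

static int
nmi_injection_gate(void)
{
	uint64_t interruptibility;
	uint32_t info;

	if (vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility) != 0)
		return (EINVAL);

	/* Hold off while MOVSS, NMI or STI blocking is in effect. */
	if ((interruptibility & nmi_blocking_bits) != 0)
		return (EAGAIN);

	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= 2;			/* NMIs are delivered through vector 2 */
	return (vmwrite(VMCS_ENTRY_INTR_INFO, info));
}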
+ */ +static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | + VMCS_INTERRUPTIBILITY_NMI_BLOCKING | + VMCS_INTERRUPTIBILITY_STI_BLOCKING; + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC: + return "apic"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), 
"setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ + int error; + unsigned int regs[4]; + uint64_t fixed0, fixed1; + uint32_t tmp; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + do_cpuid(1, regs); + if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + 
printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + return (0); +} + +/* + * If this processor does not support VPIDs then simply return 0. + * + * Otherwise generate the next value of VPID to use. Any value is alright + * as long as it is non-zero. + * + * We always execute in VMX non-root context with EPT enabled. Thus all + * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This + * in turn means that multiple VMs can share the same VPID as long as + * they have distinct EPT page tables. + * + * XXX + * We should optimize this so that it returns VPIDs that are not in + * use. Then we will not unnecessarily invalidate mappings in + * vmx_set_pcpu_defaults() just because two or more vcpus happen to + * use the same 'vpid'. + */ +static uint16_t +vmx_vpid(void) +{ + uint16_t vpid = 0; + + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { + do { + vpid = atomic_fetchadd_int(&nextvpid, 1); + } while (vpid == 0); + } + + return (vpid); +} + +static int +vmx_setup_cr0_shadow(struct vmcs *vmcs) +{ + int error; + uint64_t mask, shadow; + + mask = cr0_ones_mask | cr0_zeros_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + if (error) + return (error); + + shadow = cr0_ones_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + if (error) + return (error); + + return (0); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid; + int i, error, guest_msr_count; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * Guest KGSBASE is saved and restored in the guest MSR save area. + * Host KGSBASE is restored before returning to userland from the pcb. + * There will be a window of time when we are executing in the host + * kernel context with a value of KGSBASE from the guest. This is ok + * because the value of KGSBASE is inconsequential in kernel context. 
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_KGSBASE) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest msr access"); + + for (i = 0; i < VM_MAXCPU; i++) { + vmx->vmcs[i].identifier = vmx_revision(); + error = vmclear(&vmx->vmcs[i]); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vpid = vmx_vpid(); + + error = vmcs_set_defaults(&vmx->vmcs[i], + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].request_nmi = 0; + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid; + + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(&vmx->vmcs[i], + vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); +#if 0 + printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", + __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, + vmxctx->guest_rcx, vmxctx->guest_rdx, handled); +#endif + + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled, int astpending) +{ +#ifdef KTR + VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); + + if (astpending) + VMM_CTR0(vmx->vm, vcpu, "astpending"); +#endif +} + +static int +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + int error, lastcpu; + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; + + vmxstate = &vmx->state[vcpu]; + lastcpu = vmxstate->lastcpu; + vmxstate->lastcpu = curcpu; + + if (lastcpu == curcpu) { + error = 0; + goto done; + } + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + if (error != 0) + goto done; + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +done: + return (error); +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); +} + +static int +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + int error; + uint64_t info, interruptibility; + + /* Bail out if no NMI requested */ + if (vmx->state[vcpu].request_nmi == 0) + return (0); + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_nmi: vmread(interruptibility) %d", + error); + } + if (interruptibility & nmi_blocking_bits) + goto nmiblocked; + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info |= IDT_NMI; + + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); + + VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vmx->state[vcpu].request_nmi = 0; + return (1); + +nmiblocked: + /* + * Set the NMI Window Exiting execution control so we can inject + * the virtual NMI as soon as blocking condition goes away. 
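+ *
+ * request_nmi is deliberately left set here; the EXIT_REASON_NMI_WINDOW
+ * handler only clears the window-exiting control, and the injection is
+ * retried on the next VM entry.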
+ */ + vmx_set_nmi_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + return (1); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu) +{ + int error, vector; + uint64_t info, rflags, interruptibility; + + const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; + +#if 1 + /* + * XXX + * If an event is being injected from userland then just return. + * For e.g. we may inject a breakpoint exception to cause the + * guest to enter the debugger so we can inspect its state. + */ + error = vmread(VMCS_ENTRY_INTR_INFO, &info); + if (error) + panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); + if (info & VMCS_INTERRUPTION_INFO_VALID) + return; +#endif + /* + * NMI injection has priority so deal with those first + */ + if (vmx_inject_nmi(vmx, vcpu)) + return; + + /* Ask the local apic for a vector to inject */ + vector = lapic_pending_intr(vmx->vm, vcpu); + if (vector < 0) + return; + + if (vector < 32 || vector > 255) + panic("vmx_inject_interrupts: invalid vector %d\n", vector); + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + error = vmread(VMCS_GUEST_RFLAGS, &rflags); + if (error) + panic("vmx_inject_interrupts: vmread(rflags) %d", error); + + if ((rflags & PSL_I) == 0) + goto cantinject; + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_interrupts: vmread(interruptibility) %d", + error); + } + if (interruptibility & HWINTR_BLOCKED) + goto cantinject; + + /* Inject the interrupt */ + info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info |= vector; + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); + + /* Update the Local APIC ISR */ + lapic_intr_accepted(vmx->vm, vcpu, vector); + + VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); +} + +static int +vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + int error; + uint64_t regval; + const struct vmxctx *vmxctx; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xff) != 0x00) + return (UNHANDLED); + + vmxctx = &vmx->ctx[vcpu]; + + /* + * We must use vmwrite() directly here because vmcs_setreg() will + * call vmclear(vmcs) as a side-effect which we certainly don't want. 
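+ *
+ * Bits 11:8 of the exit qualification encode the source register of
+ * the "mov to %cr0" (e.g. a value of 3 selects %rbx); the switch
+ * below maps that encoding onto the registers saved in the vmxctx.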
+ */ + switch ((exitqual >> 8) & 0xf) { + case 0: + regval = vmxctx->guest_rax; + break; + case 1: + regval = vmxctx->guest_rcx; + break; + case 2: + regval = vmxctx->guest_rdx; + break; + case 3: + regval = vmxctx->guest_rbx; + break; + case 4: + error = vmread(VMCS_GUEST_RSP, ®val); + if (error) { + panic("vmx_emulate_cr_access: " + "error %d reading guest rsp", error); + } + break; + case 5: + regval = vmxctx->guest_rbp; + break; + case 6: + regval = vmxctx->guest_rsi; + break; + case 7: + regval = vmxctx->guest_rdi; + break; + case 8: + regval = vmxctx->guest_r8; + break; + case 9: + regval = vmxctx->guest_r9; + break; + case 10: + regval = vmxctx->guest_r10; + break; + case 11: + regval = vmxctx->guest_r11; + break; + case 12: + regval = vmxctx->guest_r12; + break; + case 13: + regval = vmxctx->guest_r13; + break; + case 14: + regval = vmxctx->guest_r14; + break; + case 15: + regval = vmxctx->guest_r15; + break; + } + + regval |= cr0_ones_mask; + regval &= ~cr0_zeros_mask; + error = vmwrite(VMCS_GUEST_CR0, regval); + if (error) + panic("vmx_emulate_cr_access: error %d writing cr0", error); + + return (HANDLED); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int handled; + struct vmcs *vmcs; + struct vmxctx *vmxctx; + uint32_t eax, ecx, edx; + uint64_t qual; + + handled = 0; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + switch (vmexit->u.vmx.exit_reason) { + case EXIT_REASON_CR_ACCESS: + handled = vmx_emulate_cr_access(vmx, vcpu, qual); + break; + case EXIT_REASON_RDMSR: + ecx = vmxctx->guest_rcx; + handled = emulate_rdmsr(vmx->vm, vcpu, ecx); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } + break; + case EXIT_REASON_WRMSR: + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + handled = emulate_wrmsr(vmx->vm, vcpu, ecx, + (uint64_t)edx << 32 | eax); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } + break; + case EXIT_REASON_HLT: + vmexit->exitcode = VM_EXITCODE_HLT; + break; + case EXIT_REASON_MTF: + vmexit->exitcode = VM_EXITCODE_MTRAP; + break; + case EXIT_REASON_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmx_clear_int_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); + /* FALLTHRU */ + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + vmx_clear_nmi_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); + return (1); + case EXIT_REASON_INOUT: + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + break; + case EXIT_REASON_CPUID: + handled = vmx_handle_cpuid(vmxctx); + break; + default: + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel (for e.g. 'astpending' is set in the run loop). + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + int error, vie, rc, handled, astpending, loopstart; + uint32_t exit_reason; + struct vmx *vmx; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + + vmx = arg; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + loopstart = 1; + + /* + * XXX Can we avoid doing this every time we do a vm run? + */ + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) + panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); + + if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); + + if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) + panic("vmx_run: error %d setting up pcpu defaults", error); + + do { + lapic_timer_tick(vmx->vm, vcpu); + vmx_inject_interrupts(vmx, vcpu); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (loopstart) { + loopstart = 0; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* + * XXX locking? + * See comments in exception.S about checking for ASTs + * atomically while interrupts are disabled. But it is + * not clear that they apply in our case. 
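+ *
+ * The snapshot taken below is used to break out of the run loop
+ * ("while (handled && !astpending)"), so a pending AST simply forces
+ * a return to userland at the next VM exit.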
+ */ + astpending = curthread->td_flags & TDF_ASTPENDING; + + /* enable interrupts */ + enable_intr(); + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + handled = vmx_exit_process(vmx, vcpu, vmexit); + + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, + astpending); + } while (handled && !astpending); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); + + /* + * XXX + * We need to do this to ensure that any VMCS state cached by the + * processor is flushed to memory. We need to do this in case the + * VM moves to a different cpu the next time it runs. + * + * Can we avoid doing this? + */ + VMCLEAR(vmcs); + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.error = vie; + VMCLEAR(vmcs); + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int error; + struct vmx *vmx = arg; + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + struct vmx *vmx = arg; + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. 
+ * + * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error; + uint64_t ctls; + struct vmx *vmx = arg; + + /* + * XXX Allow caller to set contents of the guest registers saved in + * the 'vmxctx' even though the vcpu might be running. We need this + * specifically to support the rdmsr emulation that will set the + * %eax and %edx registers during vm exit processing. + */ + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. + * + * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, + int code_valid) +{ + int error; + uint32_t info; + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + static uint32_t type_map[VM_EVENT_MAX] = { + 0x1, /* VM_EVENT_NONE */ + 0x0, /* VM_HW_INTR */ + 0x2, /* VM_NMI */ + 0x3, /* VM_HW_EXCEPTION */ + 0x4, /* VM_SW_INTR */ + 0x5, /* VM_PRIV_SW_EXCEPTION */ + 0x6, /* VM_SW_EXCEPTION */ + }; + + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); + info |= VMCS_INTERRUPTION_INFO_VALID; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); + if (error != 0) + return (error); + + if (code_valid) { + error = vmcs_setreg(vmcs, + VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), + code); + } + return (error); +} + +static int +vmx_nmi(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + atomic_set_int(&vmx->state[vcpu].request_nmi, 1); + + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_inject, + vmx_nmi, + vmx_getcap, + vmx_setcap +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 0000000..69697f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; +}; + +struct vmxstate { + int request_nmi; + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + char msr_bitmap[PAGE_SIZE]; + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..31f29f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1 << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000..e9f6c6d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,199 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE(varname) \ + do { \ + __asm __volatile(" jnc 1f;" \ + " mov $1, %0;" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %0;" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %0;" \ + "3: nop" \ + :"=r" (varname)); \ + } while (0) + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline void +vmxoff(void) +{ + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t 
_res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 0000000..c4b1efc --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include "vmx.h" +#include "vmx_cpufunc.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); +ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); + +ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); + +ASSYM(VM_SUCCESS, VM_SUCCESS); +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); + +ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); +ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); +ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); +ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 0000000..1e9a837 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr >= 0x00000000 && msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 
0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..e6379a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_EPT_VPID_CAP 0x48C + +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E + +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D + +#define MSR_VMX_PROCBASED_CTLS2 0x48B + +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f + +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 + +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. 
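+ *
+ * Bitmap layout (see msr_bitmap_change_access()): read-intercept bits
+ * for MSRs 0x0-0x1fff start at offset 0, those for 0xc0000000-0xc0001fff
+ * at offset 1024, with the matching write-intercept bits located 2048
+ * bytes further in; a clear bit means the access does not cause a VM exit.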
+ */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 0000000..4d1bf1d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include "vmx_assym.s" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx' + */ +#define VMX_GUEST_RESTORE \ + /* \ + * Make sure that interrupts are disabled before restoring CR2. \ + * Otherwise there could be a page fault during the interrupt \ + * handler execution that would end up trashing CR2. \ + */ \ + cli; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
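/*
 * Illustrative userland analogue (not from this patch) of the control flow
 * described above: vmx_setjmp() returns 0 when called directly and a
 * non-zero code when control comes back from guest context via
 * vmx_longjmp()/vmx_resume()/vmx_launch().  Standard setjmp/longjmp stand
 * in for the hand-rolled register save/restore; the return codes below are
 * invented for the sketch.
 */
#include <setjmp.h>
#include <stdio.h>

#define RETURN_DIRECT	0	/* fell straight through the setjmp */
#define RETURN_LONGJMP	1	/* came back after a simulated vm exit */

static jmp_buf vcpu_ctx;

static void
fake_vmexit(void)
{
	/* Models the guest exiting: jump back into the host context. */
	longjmp(vcpu_ctx, RETURN_LONGJMP);
}

int
main(void)
{
	switch (setjmp(vcpu_ctx)) {
	case RETURN_DIRECT:
		printf("direct return: about to enter the guest\n");
		fake_vmexit();		/* never returns */
		break;
	case RETURN_LONGJMP:
		printf("back in the host after a vm exit\n");
		break;
	}
	return (0);
}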
+ */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + callq vmx_return +END(vmx_launch) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 0000000..24495a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1 << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1 << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static 
struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +/* + * Config space register definitions from the "Intel 5520 and 5500" datasheet. + */ +static int +tylersburg_vtd_ident(void) +{ + int units, nlbus; + uint16_t did, vid; + uint32_t miscsts, vtbar; + + const int bus = 0; + const int slot = 20; + const int func = 0; + + units = 0; + + vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2); + did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2); + if (vid != 0x8086 || did != 0x342E) + goto done; + + /* + * Check if this is a dual IOH configuration. + */ + miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4); + if (miscsts & (1 << 25)) + nlbus = pci_cfgregread(bus, slot, func, 0x160, 1); + else + nlbus = -1; + + vtbar = pci_cfgregread(bus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in legacy IOH is disabled!\n"); + + if (nlbus != -1) { + vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in non-legacy IOH is disabled!\n"); + } +done: + return (units); +} + +static drhd_ident_func_t drhd_ident_funcs[] = { + tylersburg_vtd_ident, + NULL +}; + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, 
units; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + + for (i = 0; drhd_ident_funcs[i] != NULL; i++) { + units = (*drhd_ident_funcs[i])(); + if (units > 0) + break; + } + + if (units <= 0) + return (ENXIO); + + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, int bus, int slot, int func) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + vtdmap = vtdmaps[0]; + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = (slot << 3 | func) * 2; + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %d/%d/%d is already owned by " + "domain %d", bus, slot, func, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, int bus, int slot, int func) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + ctxp = ctx_tables[bus]; + idx = (slot << 3 | func) * 2; + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. 
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Create a 'gpa' -> 'hpa' mapping + */ + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. 
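/*
 * Illustrative stand-alone sketch (not from this patch) of the AGAW/SAGAW
 * selection described above: derive the guest address width from the
 * highest address to be mapped, round it up to the 12 + 9*n form, then pick
 * the smallest hardware-supported width that covers it.  'sagaw_bits'
 * stands for the 5-bit SAGAW field of the VT-d capability register
 * (bit i => 30 + 9*i bits, 2 + i paging levels); the values in main() are
 * invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static int
pick_pt_levels(uint64_t maxaddr, int sagaw_bits)
{
	uint64_t addr;
	int gaw, agaw, sagaw, i;

	/* Smallest guest address width that covers 'maxaddr'. */
	addr = 0;
	for (gaw = 0; addr < maxaddr; gaw++)
		addr = 1ULL << gaw;

	/* Round up to an adjusted width of the form 12 + 9*n, capped at 64. */
	agaw = gaw;
	if ((gaw - 12) % 9 != 0)
		agaw = gaw + 9 - (gaw - 12) % 9;
	if (agaw > 64)
		agaw = 64;

	/* Smallest supported adjusted width that is >= agaw. */
	sagaw = 30;
	for (i = 0; i < 5; i++) {
		if ((sagaw_bits & (1 << i)) != 0 && sagaw >= agaw)
			return (2 + i);		/* page-table levels */
		sagaw += 9;
		if (sagaw > 64)
			sagaw = 64;
	}
	return (-1);				/* no supported width fits */
}

int
main(void)
{
	/* 4GB guest, hardware advertising 39-bit and 48-bit widths. */
	printf("levels: %d\n", pick_pt_levels(4ULL << 30, 0x6));
	return (0);
}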
+ */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_add_device, + vtd_remove_device, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 0000000..baf2447 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_util.h" +#include "iommu.h" + +static boolean_t iommu_avail; +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = TRUE; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = ptoa(Maxmem); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything else belongs to the host domain */ + iommu_add_device(host_domain, bus, slot, func); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_add_device(void *dom, int bus, int slot, int func) +{ + + IOMMU_ADD_DEVICE(dom, bus, slot, func); +} + +void +iommu_remove_device(void *dom, int bus, int slot, int func) +{ + + 
IOMMU_REMOVE_DEVICE(dom, bus, slot, func); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 0000000..e4f7229 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); +typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_add_device(void *dom, int bus, int slot, int func); +void iommu_remove_device(void *dom, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 0000000..dc2f326 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) +#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) +#define MAX_MSIMSGS 32 + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + int msg; +}; + +static struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + int vector; + int vcpu; + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; +} pptdevs[32]; + +static int num_pptdevs; + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if (vmm_is_pptdev(bus, slot, func) && + (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + return (0); + else + return (ENXIO); +} + +static int +ppt_attach(device_t dev) +{ + int n; + + if (num_pptdevs >= MAX_PPTDEVS) { + printf("ppt_attach: maximum number of pci passthrough devices " + "exceeded\n"); + return (ENXIO); + } + + n = num_pptdevs++; + pptdevs[n].dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + /* + * XXX check whether there are any pci passthrough devices assigned + * to guests before we allow this driver to detach. 
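/*
 * Illustrative stand-alone sketch (not from this patch) of the check the
 * XXX note above asks for: refuse to detach while any passthrough device
 * is still assigned to a guest.  The tiny table here is only a stand-in
 * for the driver's real pptdevs[] array.
 */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct fake_pptdev {
	void	*vm;		/* NULL when not assigned to a guest */
};

static struct fake_pptdev fake_pptdevs[4];
static int fake_num_pptdevs = 4;

static int
fake_ppt_detach(void)
{
	int i;

	for (i = 0; i < fake_num_pptdevs; i++) {
		if (fake_pptdevs[i].vm != NULL)
			return (EBUSY);	/* still owned by a guest */
	}
	return (0);
}

int
main(void)
{
	int dummy_vm;

	printf("detach, nothing assigned: %d\n", fake_ppt_detach());
	fake_pptdevs[2].vm = &dummy_vm;
	printf("detach, one device assigned: %d\n", fake_ppt_detach());
	return (0);
}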
+ */ + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + int i, b, s, f; + + for (i = 0; i < num_pptdevs; i++) { + dev = pptdevs[i].dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (&pptdevs[i]); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), bus, slot, func); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. 
+ */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + int i, bus, slot, func; + device_t dev; + + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) { + dev = pptdevs[i].dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + ppt_unassign_device(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + seg->hpa = hpa; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + int vec; + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + vec = ppt->msi.vector + pptarg->msg; + + if (ppt->vm != NULL) + (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +/* + * XXX + * When we try to free the MSI resource the kernel will bind the thread to + * the host cpu was originally handling the MSI. The function freeing the + * MSI vector (apic_free_vector()) will panic the kernel if the thread + * is already bound to a cpu. + * + * So, we temporarily unbind the vcpu thread before freeing the MSI resource. + */ +static void +PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) +{ + int pincpu = -1; + + vm_get_pinning(vm, vcpu, &pincpu); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, -1); + + ppt_teardown_msi(ppt); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, pincpu); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if ((destcpu >= VM_MAXCPU || destcpu < 0) || + (vector < 0 || vector > 255) || + (numvec < 0 || numvec > MAX_MSIMSGS)) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. 
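/*
 * Illustrative stand-alone sketch (not from this patch) of the negotiation
 * the allocation check below performs: MSI grants can come back with fewer
 * messages than requested (message counts are powers of two), so the
 * caller asks for 'numvec', inspects what it actually got, and returns the
 * allocation if it is not enough.  'fake_alloc_msi' is only a stand-in for
 * pci_alloc_msi(9).
 */
#include <errno.h>
#include <stdio.h>

static int
fake_alloc_msi(int *count)
{
	/* Pretend the device only supports 4 MSI messages. */
	if (*count > 4)
		*count = 4;
	return (0);
}

static int
setup_msi(int numvec)
{
	int tmp = numvec;

	if (fake_alloc_msi(&tmp) != 0)
		return (ENXIO);
	if (tmp != numvec) {
		/* The real code would call pci_release_msi() before bailing. */
		return (ENOSPC);
	}
	return (0);
}

int
main(void)
{
	printf("request 4 vectors: %d\n", setup_msi(4));	/* 0 */
	printf("request 8 vectors: %d\n", setup_msi(8));	/* ENOSPC */
	return (0);
}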
+ */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.vector = vector; + ppt->msi.vcpu = destcpu; + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. + */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].msg = i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + return (ENXIO); + } + + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 0000000..95f3ad0 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec); + +#endif diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c new file mode 100644 index 0000000..cd6c5d1 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct vdev_region *region, *found; + uint64_t region_base; + uint64_t 
region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h new file mode 100644 index 0000000..6feeba8 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 0000000..a21addf --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vdev.h" +#include "vlapic.h" + +#define VLAPIC_CTR0(vlapic, format) \ + VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic.irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic.isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) +#define VLAPIC_MAXLVT_ENTRIES (5) + +struct vlapic { + struct vm *vm; + int vcpuid; + + struct io_region *mmio; + struct vdev_ops *ops; + struct LAPIC apic; + + int esr_update; + + int divisor; + int ccr_ticks; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. 
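/*
 * Illustrative stand-alone sketch (not from this patch) of the Processor
 * Priority computation that the in-service stack feeds: PPR is the TPR or
 * the priority class of the highest in-service vector, whichever is higher
 * (Intel SDM Vol 3a, "Interrupt, Task and Processor Priority").
 */
#include <stdint.h>
#include <stdio.h>

#define PRIO(x)	((x) >> 4)	/* priority class = upper nibble */

static uint8_t
compute_ppr(uint8_t tpr, uint8_t isrv)
{
	if (PRIO(tpr) >= PRIO(isrv))
		return (tpr);
	return (isrv & 0xf0);
}

int
main(void)
{
	/* TPR 0x20 masks class 2 and below; vector 0x45 is class 4. */
	printf("ppr = 0x%02x\n", compute_ppr(0x20, 0x45));	/* 0x40 */
	printf("ppr = 0x%02x\n", compute_ppr(0x80, 0x45));	/* 0x80 */
	return (0);
}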
+ */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; +}; + +static void +vlapic_mask_lvts(uint32_t *lvts, int num_lvt) +{ + int i; + for (i = 0; i < num_lvt; i++) { + *lvts |= APIC_LVT_M; + lvts += 4; + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint64_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + return lapic->ccr_timer; +} + +static void +vlapic_update_errors(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->esr = 0; // XXX +} + +static void +vlapic_init_ipi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1); +} + +static int +vlapic_op_reset(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + + memset(lapic, 0, sizeof(*lapic)); + lapic->id = vlapic->vcpuid << 24; + lapic->apr = vlapic->vcpuid; + vlapic_init_ipi(vlapic); + + return 0; + +} + +static int +vlapic_op_init(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_register_region(vlapic->ops, vlapic, vlapic->mmio); + return vlapic_op_reset(dev); +} + +static int +vlapic_op_halt(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_unregister_region(vlapic, vlapic->mmio); + return 0; + +} + +void +vlapic_set_intr_ready(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr; + int idx; + + if (vector < 0 || vector >= 256) + panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + + idx = (vector / 32) * 4; + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); +} + +#define VLAPIC_BUS_FREQ tsc_freq +#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +static void +vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) +{ + uint32_t icr_timer; + + icr_timer = vlapic->apic.icr_timer; + + vlapic->ccr_ticks = ticks; + if (elapsed < icr_timer) + vlapic->apic.ccr_timer = icr_timer - elapsed; + else { + /* + * This can happen when the guest is trying to run its local + * apic timer higher that the setting of 'hz' in the host. + * + * We deal with this by running the guest local apic timer + * at the rate of the host's 'hz' setting. 
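/*
 * Minimal stand-alone sketch (not from this patch) of the clamping
 * described above: if more host ticks have elapsed than the guest
 * programmed into the initial count, the current count simply reads back
 * as zero instead of wrapping.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
ccr_after(uint32_t icr_timer, uint32_t elapsed)
{
	return (elapsed < icr_timer ? icr_timer - elapsed : 0);
}

int
main(void)
{
	printf("%u\n", ccr_after(1000, 250));	/* 750 ticks left */
	printf("%u\n", ccr_after(1000, 5000));	/* guest timer too fast: 0 */
	return (0);
}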
+ */ + vlapic->apic.ccr_timer = 0; + } +} + +static __inline uint32_t * +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = &vlapic->apic; + int i; + + if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) { + panic("vlapic_get_lvt: invalid LVT\n"); + } + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic.isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic.tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic.isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic.ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *isrptr; + int i, idx, bitpos; + + isrptr = &lapic->isr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. 
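+ * Those vectors all live in isr0, so the scan below runs from isr7
+ * down to isr1 and deliberately skips isr0.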
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << (bitpos - 1)); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +{ + return (*lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + int vector; + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vlapic_set_intr_ready(vlapic, vector); + } +} + +static int +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +{ + int i; + cpumask_t dmask, thiscpumask; + uint32_t dest, vec, mode; + + thiscpumask = vcpu_mask(vlapic->vcpuid); + + dmask = 0; + dest = icrval >> 32; + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + dmask = vcpu_mask(dest); + break; + case APIC_DEST_SELF: + dmask = thiscpumask; + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask; + break; + } + + for (i = 0; i < VM_MAXCPU; i++) { + if (dmask & vcpu_mask(i)) { + if (mode == APIC_DELMODE_FIXED) + lapic_set_intr(vlapic->vm, i, vec); + else + vm_inject_nmi(vlapic->vm, i); + } + } + + return (0); /* handled completely in the kernel */ + } + + /* + * XXX this assumes that the startup IPI always succeeds + */ + if (mode == APIC_DELMODE_STARTUP) + vm_activate_cpu(vlapic->vm, dest); + + /* + * This will cause a return to userland. + */ + return (1); +} + +int +vlapic_pending_intr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + return (vector); + } else + break; + } + } + VLAPIC_CTR0(vlapic, "no pending intr"); + return (-1); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. 
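+ *
+ * For example, accepting vector 0x41 clears bit 1 of irr2 and sets
+ * bit 1 of isr2: idx = (0x41 / 32) * 4 = 8 because consecutive
+ * IRR/ISR registers are 16 bytes (four uint32_t) apart.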
+ */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +int +vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int i; + + if (offset > sizeof(*lapic)) { + *data = 0; + return 0; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = lapic->tpr; + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + *data = *(reg); + break; + case APIC_OFFSET_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } + return 0; +} + +int +vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int retval; + + if (offset > sizeof(*lapic)) { + return 0; + } + + retval = 0; + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + break; + case APIC_OFFSET_TPR: + lapic->tpr = data & 0xff; + vlapic_update_ppr(vlapic); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + break; + case APIC_OFFSET_DFR: + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + break; + case APIC_OFFSET_ICR_LOW: + retval = lapic_process_icr(vlapic, data); + break; + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + data |= APIC_LVT_M; + } + *reg = data; + // vlapic_dump_lvt(offset, reg); + break; + case APIC_OFFSET_ICR: + lapic->icr_timer = data; + vlapic_start_timer(vlapic, 0); + break; + + case APIC_OFFSET_DCR: + lapic->dcr_timer = data; + vlapic->divisor = vlapic_timer_divisor(data); + break; + + case APIC_OFFSET_ESR: + vlapic_update_errors(vlapic); + break; + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +void +vlapic_timer_tick(struct vlapic *vlapic) +{ + int curticks, delta, periodic; + uint32_t ccr; + uint32_t decrement, remainder; + + curticks = ticks; + + /* Common case */ + delta = curticks - vlapic->ccr_ticks; + if (delta == 0) + return; + + /* Local APIC timer is disabled */ + if (vlapic->apic.icr_timer == 0) + return; + + /* One-shot mode and timer has already counted down to zero */ + periodic = vlapic_periodic_timer(vlapic); + if (!periodic && vlapic->apic.ccr_timer == 0) + return; + /* + * The 'curticks' and 'ccr_ticks' are out of sync by more than + * 2^31 ticks. We deal with this by restarting the timer. + */ + if (delta < 0) { + vlapic_start_timer(vlapic, 0); + return; + } + + ccr = vlapic->apic.ccr_timer; + decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + while (delta-- > 0) { + if (ccr <= decrement) { + remainder = decrement - ccr; + vlapic_fire_timer(vlapic); + if (periodic) { + vlapic_start_timer(vlapic, remainder); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + break; + } + } else + ccr -= decrement; + } + + vlapic->ccr_ticks = curticks; + vlapic->apic.ccr_timer = ccr; +} + +struct vdev_ops vlapic_dev_ops = { + .name = "vlapic", + .init = vlapic_op_init, + .reset = vlapic_op_reset, + .halt = vlapic_op_halt, + .memread = vlapic_op_mem_read, + .memwrite = vlapic_op_mem_write, +}; +static struct io_region vlapic_mmio[VM_MAXCPU]; + +struct vlapic * +vlapic_init(struct vm *vm, int vcpuid) +{ + struct vlapic *vlapic; + + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vm; + vlapic->vcpuid = vcpuid; + vlapic->ops = &vlapic_dev_ops; + + vlapic->mmio = vlapic_mmio + vcpuid; + vlapic->mmio->base = DEFAULT_APIC_BASE; + vlapic->mmio->len = PAGE_SIZE; + vlapic->mmio->attr = MMIO_READ|MMIO_WRITE; + vlapic->mmio->vcpu = vcpuid; + + vdev_register(&vlapic_dev_ops, vlapic); + + vlapic_op_init(vlapic); + + return (vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + vdev_unregister(vlapic); + free(vlapic, M_VLAPIC); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 0000000..861ea8c --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +#include "vdev.h" + +struct vm; + +/* + * Map of APIC Registers: Offset Description Access + */ +#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W +#define APIC_OFFSET_VER 0x30 // Local APIC Version R +#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W +#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R +#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R +#define APIC_OFFSET_EOI 0xB0 // EOI Register W +#define APIC_OFFSET_RRR 0xC0 // Remote read R +#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W +#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W +#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W +#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R +#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R +#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R +#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R +#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R +#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R +#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R +#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R +#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R +#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R +#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R +#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R +#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R +#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R +#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R +#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R +#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R +#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R +#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R +#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R +#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R +#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R +#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R +#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R +#define APIC_OFFSET_ESR 0x280 // Error Status Register R +#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W +#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. 
(32-63) R/W +#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W +#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) +#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) +#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W +#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W +#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W +#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W +#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R +#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +struct vlapic *vlapic_init(struct vm *vm, int vcpuid); +void vlapic_cleanup(struct vlapic *vlapic); + +int vlapic_op_mem_write(void* dev, uint64_t gpa, + opsize_t size, uint64_t data); + +int vlapic_op_mem_read(void* dev, uint64_t gpa, + opsize_t size, uint64_t *data); + +int vlapic_pending_intr(struct vlapic *vlapic); +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); +void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); +void vlapic_timer_tick(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 0000000..c93c31e --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,737 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include "vmm_mem.h" +#include "vmm_util.h" +#include +#include "vlapic.h" +#include "vmm_msr.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +struct vcpu { + int flags; + int pincpu; /* host cpuid this vcpu is bound to */ + int hostcpu; /* host cpuid this vcpu last ran on */ + uint64_t guest_msrs[VMM_MSR_NUM]; + struct vlapic *vlapic; + int vcpuid; + struct savefpu savefpu; /* guest fpu state */ + void *stats; +}; +#define VCPU_F_PINNED 0x0001 +#define VCPU_F_RUNNING 0x0002 + +#define VCPU_PINCPU(vm, vcpuid) \ + ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) + +#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) + +#define VCPU_PIN(vm, vcpuid, host_cpuid) \ +do { \ + vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ + vm->vcpu[vcpuid].pincpu = host_cpuid; \ +} while(0) + +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Mask of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpumask_t active_cpus; +}; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip, vmexit) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ + (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) +#define VMNMI(vmi, vcpu) \ + (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define fpu_stop_emulating() __asm("clts") + +static MALLOC_DEFINE(M_VM, "vm", "vm"); +CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static void +vcpu_cleanup(struct vcpu *vcpu) +{ + vlapic_cleanup(vcpu->vlapic); + vmm_stat_free(vcpu->stats); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu->hostcpu = -1; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = vlapic_init(vm, vcpu_id); + fpugetregs(curthread, &vcpu->savefpu); + vcpu->stats = vmm_stat_alloc(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_ipi_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_msr_init(); + + return (VMM_INIT()); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + iommu_init(); + error = vmm_init(); + break; + case MOD_UNLOAD: + vmmdev_cleanup(); + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * Execute the module load handler after the pci passthru driver has had + * a chance to claim devices. We need this information at the time we do + * iommu initialization. 
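+ * Hence the SI_SUB_CONFIGURE + 1 subsystem id passed to the
+ * DECLARE_MODULE() below, which orders vmm after device configuration.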
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +struct vm * +vm_create(const char *name) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (NULL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + guest_msrs_init(vm, i); + } + + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); + vm_activate_cpu(vm, BSP); + + return (vm); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + + ppt_unassign_all(vm); + + for (i = 0; i < vm->num_mem_segs; i++) + vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(&vm->vcpu[i]); + + iommu_destroy_domain(vm->iommu); + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_NONE, spok)); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +{ + int error; + vm_paddr_t hpa; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + /* + * find the hpa if already it was already vm_malloc'd. 
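+ * vm_gpa2hpa() returns (vm_paddr_t)-1 if no existing segment covers
+ * the range, in which case a new segment is allocated and mapped below.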
+ */ + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa != ((vm_paddr_t)-1)) + goto out; + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + hpa = vmm_mem_alloc(len); + if (hpa == 0) + return (ENOMEM); + + error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); + if (error) { + vmm_mem_free(hpa, len); + return (error); + } + + iommu_create_mapping(vm->iommu, gpa, hpa, len); + + vm->mem_segs[vm->num_mem_segs].gpa = gpa; + vm->mem_segs[vm->num_mem_segs].hpa = hpa; + vm->mem_segs[vm->num_mem_segs].len = len; + vm->num_mem_segs++; +out: + *ret_hpa = hpa; + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int i; + vm_paddr_t gpabase, gpalimit, hpabase; + + for (i = 0; i < vm->num_mem_segs; i++) { + hpabase = vm->mem_segs[i].hpa; + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa + len <= gpalimit) + return ((gpa - gpabase) + hpabase); + } + return ((vm_paddr_t)-1); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMSETREG(vm->cookie, vcpu, reg, val)); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *cpuid = VCPU_PINCPU(vm, vcpuid); + + return (0); +} + +int +vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) +{ + struct thread *td; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + td = curthread; /* XXXSMP only safe when muxing vcpus */ + + /* unpin */ + if (host_cpuid < 0) { + VCPU_UNPIN(vm, vcpuid); + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + return (0); + } + + if (CPU_ABSENT(host_cpuid)) + return (EINVAL); + + /* + * XXX we should check that 'host_cpuid' has not already been pinned + * by another vm. 
+ */ + thread_lock(td); + sched_bind(td, host_cpuid); + thread_unlock(td); + VCPU_PIN(vm, vcpuid, host_cpuid); + + return (0); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxrstor(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxsave(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + critical_enter(); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + pcb->pcb_full_iret = 1; + + vcpu->hostcpu = curcpu; + + fpuexit(curthread); + restore_guest_msrs(vm, vcpuid); + restore_guest_fpustate(vcpu); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + save_guest_fpustate(vcpu); + restore_host_msrs(vm, vcpuid); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + return (error); +} + +int +vm_inject_event(struct vm *vm, int vcpuid, int type, + int vector, uint32_t code, int code_valid) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) + return (EINVAL); + + if (vector < 0 || vector > 255) + return (EINVAL); + + return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); +} + +int +vm_inject_nmi(struct vm *vm, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = VMNMI(vm->cookie, vcpu); + vm_interrupt_hostcpu(vm, vcpu); + return (error); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +uint64_t * +vm_guest_msrs(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].guest_msrs); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, b, s, f, n; + char *val, *cp, *cp2; + + /* + * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + */ + found = 0; + cp = val = getenv("pptdevs"); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +void +vm_set_run_state(struct vm *vm, int vcpuid, int state) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (state == VCPU_RUNNING) { + if (vcpu->flags & VCPU_F_RUNNING) { + panic("vm_set_run_state: %s[%d] is already running", + vm_name(vm), vcpuid); + } + vcpu->flags |= VCPU_F_RUNNING; + } else { + if ((vcpu->flags & 
VCPU_F_RUNNING) == 0) { + panic("vm_set_run_state: %s[%d] is already stopped", + vm_name(vm), vcpuid); + } + vcpu->flags &= ~VCPU_F_RUNNING; + } +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +{ + int retval, hostcpu; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + if (vcpu->flags & VCPU_F_RUNNING) { + retval = VCPU_RUNNING; + hostcpu = vcpu->hostcpu; + } else { + retval = VCPU_STOPPED; + hostcpu = -1; + } + + if (cpuptr) + *cpuptr = hostcpu; + + return (retval); +} + +void +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid >= 0 && vcpuid < VM_MAXCPU) + vm->active_cpus |= vcpu_mask(vcpuid); +} + +cpumask_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c new file mode 100644 index 0000000..cf443fc --- /dev/null +++ b/sys/amd64/vmm/vmm_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "io/ppt.h" +#include + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; +}; +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (sc->cdev == cdev) + break; + } + + return (sc); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c; + vm_paddr_t hpa, gpa; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa2hpa(sc->vm, gpa, c); + if (hpa == (vm_paddr_t)-1) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else + error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } + + mtx_unlock(&vmmdev_mtx); + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc* vmsegdesc; + struct vm_pin *vmpin; + struct vm_run *vmrun; + struct vm_event *vmevent; + struct vm_lapic_irq *vmirq; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (ENXIO); + } + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_SET_PINNING: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EVENT: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. 
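+ * For example, the VM_RUN handler reads vmrun->cpuid, so 'cpuid' must
+ * remain the leading field of 'struct vm_run' for this to work.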
+ */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + if (vcpu_is_running(sc->vm, vcpu, NULL)) { + error = EBUSY; + goto done; + } + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); + mtx_unlock(&vmmdev_mtx); + + error = vm_run(sc->vm, vmrun); + + mtx_lock(&vmmdev_mtx); + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); + break; + case VM_STAT_DESC: { + const char *desc; + statdesc = (struct vm_stat_desc *)data; + desc = vmm_stat_desc(statdesc->index); + if (desc != NULL) { + error = 0; + strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); + } else + error = EINVAL; + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->destcpu, pptmsi->vector, + pptmsi->numvec); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EVENT: + vmevent = (struct vm_event *)data; + error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, + vmevent->vector, + vmevent->error_code, + vmevent->error_code_valid); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_SET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, + vmpin->host_cpuid); + break; + case VM_GET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, + &vmpin->host_cpuid); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->hpa = seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + 
break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + default: + error = ENOTTY; + break; + } +done: + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) { + *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(struct vmmdev_softc *sc) +{ + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + /* + * XXX must stop virtual machine instances that may be still + * running and cleanup their state. + */ + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + destroy_dev(sc->cdev); + vm_destroy(sc->vm); + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + vmmdev_destroy(sc); + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct vmmdev_softc *sc; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup(buf); + if (sc != NULL) { + mtx_unlock(&vmmdev_mtx); + return (EEXIST); + } + + vm = vm_create(buf); + if (vm == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "vmm/%s", buf); + sc->cdev->si_drv1 = sc; + SLIST_INSERT_HEAD(&head, sc, link); + + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +void +vmmdev_cleanup(void) +{ + struct vmmdev_softc *sc, *sc2; + + mtx_lock(&vmmdev_mtx); + + SLIST_FOREACH_SAFE(sc, &head, link, sc2) + vmmdev_destroy(sc); + + mtx_unlock(&vmmdev_mtx); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c new file mode 100644 index 0000000..c8e795b --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "vmm_ipi.h" + +extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); + +/* + * The default is to use the IPI_AST to interrupt a vcpu. + */ +static int ipinum = IPI_AST; + +CTASSERT(APIC_SPURIOUS_INT == 255); + +void +vmm_ipi_init(void) +{ + int idx; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as our IPI vector. We install the 'justreturn' handler at that + * vector and use it to interrupt the vcpus. + * + * We do this because the IPI_AST is heavyweight and saves all + * registers in the trapframe. This is overkill for our use case + * which is simply to EOI the interrupt and return. + */ + idx = APIC_SPURIOUS_INT; + while (--idx >= APIC_IPI_INTS) { + ip = &idt[idx]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { + ipinum = idx; + setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + SEL_KPL, 0); + break; + } + } + + if (ipinum != IPI_AST && bootverbose) { + printf("vmm_ipi_init: installing ipi handler to interrupt " + "vcpus at vector %d\n", ipinum); + } +} + +void +vmm_ipi_cleanup(void) +{ + if (ipinum != IPI_AST) + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) + ipi_selected((cpumask_t)1 << hostcpu, ipinum); +} diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h new file mode 100644 index 0000000..7ab94bf --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +struct vm; + +void vmm_ipi_init(void); +void vmm_ipi_cleanup(void); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + +#endif diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h new file mode 100644 index 0000000..e691c61 --- /dev/null +++ b/sys/amd64/vmm/vmm_ktr.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#define KTR_VMM KTR_GEN + +#define VMM_CTR0(vm, vcpuid, format) \ +CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu) + +#define VMM_CTR1(vm, vcpuid, format, p1) \ +CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1)) + +#define VMM_CTR2(vm, vcpuid, format, p1, p2) \ +CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2)) + +#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2), (p3)) +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 0000000..8704fcf --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +int +lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_pending_intr(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + return (vlapic_pending_intr(vlapic)); +} + +void +lapic_intr_accepted(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_intr_accepted(vlapic, vector); +} + +int +lapic_set_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + vlapic_set_intr_ready(vlapic, vector); + + vm_interrupt_hostcpu(vm, cpu); + + return (0); +} + +void +lapic_timer_tick(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_timer_tick(vlapic); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 0000000..815b2f7 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); +int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +void lapic_timer_tick(struct vm *vm, int cpu); + +/* + * Returns a vector between 32 and 255 if an interrupt is pending in the + * IRR that can be delivered based on the current state of ISR and TPR. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + * + * Returns -1 if there is no eligible vector that can be delivered to the + * guest at this time. + */ +int lapic_pending_intr(struct vm *vm, int cpu); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'lapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void lapic_intr_accepted(struct vm *vm, int cpu, int vector); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c new file mode 100644 index 0000000..9ce1e80 --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "vmm_util.h" +#include "vmm_mem.h" + +static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); + +#define MB (1024 * 1024) +#define GB (1024 * MB) + +#define VMM_MEM_MAXSEGS 64 + +/* protected by vmm_mem_mtx */ +static struct { + vm_paddr_t base; + vm_size_t length; +} vmm_mem_avail[VMM_MEM_MAXSEGS]; + +static int vmm_mem_nsegs; + +static vm_paddr_t maxaddr; + +static struct mtx vmm_mem_mtx; + +/* + * Steal any memory that was deliberately hidden from FreeBSD either by + * the use of MAXMEM kernel config option or the hw.physmem loader tunable. + */ +static int +vmm_mem_steal_memory(void) +{ + int nsegs; + caddr_t kmdp; + uint32_t smapsize; + uint64_t base, length; + struct bios_smap *smapbase, *smap, *smapend; + + /* + * Borrowed from hammer_time() and getmemsize() in machdep.c + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); + if (smapbase == NULL) + panic("No BIOS smap info from loader!"); + + smapsize = *((uint32_t *)smapbase - 1); + smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + + nsegs = 0; + for (smap = smapbase; smap < smapend; smap++) { + /* + * XXX + * Assuming non-overlapping, monotonically increasing + * memory segments. + */ + if (smap->type != SMAP_TYPE_MEMORY) + continue; + if (smap->length == 0) + break; + + base = roundup(smap->base, NBPDR); + length = rounddown(smap->length, NBPDR); + + /* Skip this segment if FreeBSD is using all of it. */ + if (base + length <= ptoa(Maxmem)) + continue; + + /* + * If FreeBSD is using part of this segment then adjust + * 'base' and 'length' accordingly. + */ + if (base < ptoa(Maxmem)) { + uint64_t used; + used = roundup(ptoa(Maxmem), NBPDR) - base; + base += used; + length -= used; + } + + if (length == 0) + continue; + + vmm_mem_avail[nsegs].base = base; + vmm_mem_avail[nsegs].length = length; + + if (base + length > maxaddr) + maxaddr = base + length; + + if (0 && bootverbose) { + printf("vmm_mem_populate: index %d, base 0x%0lx, " + "length %ld\n", + nsegs, vmm_mem_avail[nsegs].base, + vmm_mem_avail[nsegs].length); + } + + nsegs++; + if (nsegs >= VMM_MEM_MAXSEGS) { + printf("vmm_mem_populate: maximum number of vmm memory " + "segments reached!\n"); + return (ENOSPC); + } + } + + vmm_mem_nsegs = nsegs; + + return (0); +} + +static void +vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +{ + vm_paddr_t addr, remaining; + int pdpi, pdi, superpage_size; + pml4_entry_t *pml4p; + pdp_entry_t *pdp; + pd_entry_t *pd; + uint64_t page_attr_bits; + + if (end >= NBPML4) + panic("Cannot map memory beyond %ldGB", NBPML4 / GB); + + /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ + if (0 && vmm_supports_1G_pages()) + superpage_size = NBPDP; + else + superpage_size = NBPDR; + + /* + * Get the page directory pointer page that contains the direct + * map address mappings. 
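As a concrete illustration of the rounding in vmm_mem_steal_memory() above, assume the loader capped FreeBSD at 3GB (ptoa(Maxmem) == 3GB) and the SMAP reports a usable segment covering [2GB, 6GB). The numbers are invented for the example; the macros are the ones used in the function.

    /* NBPDR is 2MB on amd64; GB and MB are the constants defined above. */
    uint64_t base   = roundup(2UL * GB, NBPDR);     /* 2GB, already aligned */
    uint64_t length = rounddown(4UL * GB, NBPDR);   /* 4GB */
    uint64_t used;

    /* FreeBSD already owns [2GB, 3GB), so trim that part off. */
    used    = roundup(3UL * GB, NBPDR) - base;      /* 1GB */
    base   += used;                                 /* stolen range now starts at 3GB */
    length -= used;                                 /* 3GB recorded in vmm_mem_avail[] */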
+ */ + pml4p = kernel_pmap->pm_pml4; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); + + page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; + addr = start; + while (addr < end) { + remaining = end - addr; + pdpi = addr / NBPDP; + if (superpage_size == NBPDP && + remaining >= NBPDP && + addr % NBPDP == 0) { + /* + * If there isn't a mapping for this address then + * create one but if there is one already make sure + * it matches what we expect it to be. + */ + if (pdp[pdpi] == 0) { + pdp[pdpi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 1GB page at " + "pdpi %d\n", addr, pdpi); + } + } else { + pdp_entry_t pdpe = pdp[pdpi]; + if ((pdpe & ~PAGE_MASK) != addr || + (pdpe & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pdpe, addr); + } + } + addr += NBPDP; + } else { + if (remaining < NBPDR) { + panic("vmm_mem_populate: remaining (%ld) must " + "be greater than NBPDR (%d)\n", + remaining, NBPDR); + } + if (pdp[pdpi] == 0) { + /* + * XXX we lose this memory forever because + * we do not keep track of the virtual address + * that would be required to free this page. + */ + pd = malloc(PAGE_SIZE, M_VMM_MEM, + M_WAITOK | M_ZERO); + if ((uintptr_t)pd & PAGE_MASK) { + panic("vmm_mem_populate: page directory" + "page not aligned on %d " + "boundary\n", PAGE_SIZE); + } + pdp[pdpi] = vtophys(pd); + pdp[pdpi] |= PG_RW | PG_V | PG_U; + if (0 && bootverbose) { + printf("Creating page directory " + "at pdp index %d for 0x%016lx\n", + pdpi, addr); + } + } + pdi = (addr % NBPDP) / NBPDR; + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); + + /* + * Create a new mapping if one doesn't already exist + * or validate it if it does. + */ + if (pd[pdi] == 0) { + pd[pdi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 2MB page at " + "pdpi %d, pdi %d\n", + addr, pdpi, pdi); + } + } else { + pd_entry_t pde = pd[pdi]; + if ((pde & ~PAGE_MASK) != addr || + (pde & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pde, addr); + } + } + addr += NBPDR; + } + } +} + +static int +vmm_mem_populate(void) +{ + int seg, error; + vm_paddr_t start, end; + + /* populate the vmm_mem_avail[] array */ + error = vmm_mem_steal_memory(); + if (error) + return (error); + + /* + * Now map the memory that was hidden from FreeBSD in + * the direct map VA space. 
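For clarity, the index arithmetic used by vmm_mem_direct_map() above, worked through with a made-up address (on amd64, NBPDP is 1GB and NBPDR is 2MB):

    /*
     * Illustrative numbers only: addr = 5GB + 6MB.
     * With 1GB superpages the address must also be NBPDP-aligned and at
     * least NBPDP bytes must remain, otherwise the 2MB path is taken.
     */
    vm_paddr_t addr = 5UL * GB + 6 * MB;
    int pdpi = addr / NBPDP;                /* 5: sixth PDP entry in the DMAP slot */
    int pdi  = (addr % NBPDP) / NBPDR;      /* 3: fourth 2MB PDE inside that 1GB    */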
+ */ + for (seg = 0; seg < vmm_mem_nsegs; seg++) { + start = vmm_mem_avail[seg].base; + end = start + vmm_mem_avail[seg].length; + if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { + panic("start (0x%016lx) and end (0x%016lx) must be " + "aligned on a %dMB boundary\n", + start, end, NBPDR / MB); + } + vmm_mem_direct_map(start, end); + } + + return (0); +} + +int +vmm_mem_init(void) +{ + int error; + + mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); + + error = vmm_mem_populate(); + if (error) + return (error); + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + int i; + vm_paddr_t addr; + + if ((size & PDRMASK) != 0) { + panic("vmm_mem_alloc: size 0x%0lx must be " + "aligned on a 0x%0x boundary\n", size, NBPDR); + } + + addr = 0; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].length >= size) { + addr = vmm_mem_avail[i].base; + vmm_mem_avail[i].base += size; + vmm_mem_avail[i].length -= size; + /* remove a zero length segment */ + if (vmm_mem_avail[i].length == 0) { + memmove(&vmm_mem_avail[i], + &vmm_mem_avail[i + 1], + (vmm_mem_nsegs - (i + 1)) * + sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + } + break; + } + } + mtx_unlock(&vmm_mem_mtx); + + return (addr); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + int i; + + if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " + "aligned on a 0x%0x boundary\n", base, length, NBPDR); + } + + mtx_lock(&vmm_mem_mtx); + + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].base > base) + break; + } + + if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) + panic("vmm_mem_free: cannot free any more segments"); + + /* Create a new segment at index 'i' */ + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], + (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); + + vmm_mem_avail[i].base = base; + vmm_mem_avail[i].length = length; + + vmm_mem_nsegs++; + +coalesce_some_more: + for (i = 0; i < vmm_mem_nsegs - 1; i++) { + if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == + vmm_mem_avail[i + 1].base) { + vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], + (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + goto coalesce_some_more; + } + } + + mtx_unlock(&vmm_mem_mtx); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (maxaddr); +} + +void +vmm_mem_dump(void) +{ + int i; + vm_paddr_t base; + vm_size_t length; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + base = vmm_mem_avail[i].base; + length = vmm_mem_avail[i].length; + printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); + } + mtx_unlock(&vmm_mem_mtx); +} diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 0000000..ef1bf1a --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
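A short usage sketch of the allocator just shown. The caller and the requested size are hypothetical; the NBPDR (2MB) alignment requirement, the panic on misaligned sizes, and the 0 return on failure come from the code above. Handing the range back with vmm_mem_free(hpa, len) re-inserts and coalesces it with neighbouring segments.

    static int
    guest_segment_alloc(size_t len, vm_paddr_t *hpap)
    {
            vm_paddr_t hpa;

            /* 'len' must be a multiple of NBPDR or vmm_mem_alloc() panics. */
            hpa = vmm_mem_alloc(len);
            if (hpa == 0)
                    return (ENOMEM);        /* no free segment was large enough */
            *hpap = hpa;
            return (0);
    }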
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); +void vmm_mem_dump(void); + +#endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c new file mode 100644 index 0000000..152aa7b --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include +#include "vmm_lapic.h" +#include "vmm_msr.h" + +#define VMM_MSR_F_EMULATE 0x01 +#define VMM_MSR_F_READONLY 0x02 + +struct vmm_msr { + int num; + int flags; + uint64_t hostval; +}; + +static struct vmm_msr vmm_msr[] = { + { MSR_LSTAR, 0 }, + { MSR_CSTAR, 0 }, + { MSR_STAR, 0 }, + { MSR_SF_MASK, 0 }, + { MSR_APICBASE, VMM_MSR_F_EMULATE }, + { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, + { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, +}; + +#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0])) +CTASSERT(VMM_MSR_NUM >= vmm_msr_num); + +#define readonly_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0) + +#define emulated_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) + +void +vmm_msr_init(void) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + /* + * XXX this assumes that the value of the host msr does not + * change after we have cached it. + */ + vmm_msr[i].hostval = rdmsr(vmm_msr[i].num); + } +} + +void +guest_msrs_init(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + switch (vmm_msr[i].num) { + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + case MSR_SF_MASK: + case MSR_BIOS_SIGN: + case MSR_MCG_CAP: + guest_msrs[i] = 0; + break; + case MSR_APICBASE: + guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | + APICBASE_X2APIC; + if (cpu == 0) + guest_msrs[i] |= APICBASE_BSP; + break; + default: + panic("guest_msrs_init: missing initialization for msr " + "0x%0x", vmm_msr[i].num); + } + } +} + +static boolean_t +x2apic_msr(u_int num) +{ + + if (num >= 0x800 && num <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +static boolean_t +x2apic_msr_id(u_int num) +{ + return (num == 0x802); +} + +static int +msr_num_to_idx(u_int num) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) + if (vmm_msr[i].num == num) + return (i); + + return (-1); +} + +int +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +{ + int handled, idx; + uint64_t *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) + return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + if (!readonly_msr(idx)) { + guest_msrs = vm_guest_msrs(vm, cpu); + + /* Stash the value */ + guest_msrs[idx] = val; + + /* Update processor state for non-emulated MSRs */ + if (!emulated_msr(idx)) + wrmsr(vmm_msr[idx].num, val); + } + + handled = 1; +done: + return (handled); +} + +int +emulate_rdmsr(struct vm *vm, int cpu, u_int num) +{ + int error, handled, idx; + uint32_t eax, edx; + uint64_t result, *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) { + handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), + &result); + /* + * The version ID needs to be massaged + */ + if (x2apic_msr_id(num)) { + result = result >> 24; + } + goto done; + } + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + guest_msrs = vm_guest_msrs(vm, cpu); + result = guest_msrs[idx]; + + /* + * If this is not an emulated msr register make sure that the processor + * state matches our cached state. 
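The x2APIC MSR block maps one MSR to each 16-byte APIC register, which is why the offset conversion above is a simple shift. A few worked values, using only the helper defined in this file:

    /*
     * (msr - 0x800) << 4:
     *   0x802 (ID)  -> offset 0x020
     *   0x808 (TPR) -> offset 0x080
     *   0x80b (EOI) -> offset 0x0b0
     */
    u_int regoff = x2apic_msr_to_regoff(0x808);     /* 0x80, the TPR register */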
+ */ + if (!emulated_msr(idx) && (rdmsr(num) != result)) { + panic("emulate_rdmsr: msr 0x%0x has inconsistent cached " + "(0x%016lx) and actual (0x%016lx) values", num, + result, rdmsr(num)); + } + + handled = 1; + +done: + if (handled) { + eax = result; + edx = result >> 32; + error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); + if (error) + panic("vm_set_register(rax) error %d", error); + error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx); + if (error) + panic("vm_set_register(rdx) error %d", error); + } + return (handled); +} + +void +restore_guest_msrs(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, guest_msrs[i]); + } +} + +void +restore_host_msrs(struct vm *vm, int cpu) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h new file mode 100644 index 0000000..1e15787 --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MSR_H_ +#define _VMM_MSR_H_ + +#define VMM_MSR_NUM 16 +struct vm; + +void vmm_msr_init(void); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +void guest_msrs_init(struct vm *vm, int cpu); +void restore_host_msrs(struct vm *vm, int cpu); +void restore_guest_msrs(struct vm *vm, int cpu); + +#endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c new file mode 100644 index 0000000..e6f5c48 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include "vmm_stat.h" + +static int vstnum; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +void +vmm_stat_init(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vstnum >= MAX_VMM_STAT_TYPES) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vstnum; + vsttab[vstnum++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + int i; + uint64_t *stats; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vstnum; i++) + buf[i] = stats[i]; + *num_stats = vstnum; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + u_long size; + + size = vstnum * sizeof(uint64_t); + + return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK)); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +const char * +vmm_stat_desc(int index) +{ + + if (index >= 0 && index < vstnum) + return (vsttab[index]->desc); + else + return (NULL); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 0000000..7c075a6 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_TYPES 64 /* arbitrary */ + +struct vmm_stat_type { + const char *desc; /* description of statistic */ + int index; /* position in the stats buffer */ +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, desc) \ + struct vmm_stat_type type[1] = { \ + { desc, -1 } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +const char *vmm_stat_desc(int index); + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats = vcpu_stats(vm, vcpu); + if (vst->index >= 0) + stats[vst->index] += x; +#endif +} + +#endif diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S new file mode 100644 index 0000000..2afc608 --- /dev/null +++ b/sys/amd64/vmm/vmm_support.S @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define LOCORE + +#include + +#define LA_EOI 0xB0 + + .text + SUPERALIGN_TEXT +IDTVEC(justreturn) + pushq %rax + movq lapic, %rax + movl $0, LA_EOI(%rax) + popq %rax + iretq diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 0000000..f245f92 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
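A hypothetical use of the statistics machinery above: define a counter and bump it from an exit path. Only VMM_STAT_DEFINE() and vmm_stat_incr() come from the header; the counter name and call site are invented, and the increment compiles away unless VMM_KEEP_STATS is defined.

    VMM_STAT_DEFINE(VMEXIT_EXAMPLE, "hypothetical example exit counter");

    static void
    record_example_exit(struct vm *vm, int vcpu)
    {
            vmm_stat_incr(vm, vcpu, VMEXIT_EXAMPLE, 1);
    }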
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include +#include +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 0000000..7f82332 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
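The DUMP_REG()/DUMP_SEG() macros above rely on stringizing and token pasting; one expansion of each, written out for clarity (tf is the trapframe pointer passed to dump_trapframe()):

    /* DUMP_REG(rip) expands, roughly, to: */
    printf("rip" "\t\t0x%016lx\n", (long)(tf->tf_rip));

    /* DUMP_SEG(cs) expands, roughly, to: */
    printf("cs" "\t\t0x%04x\n", (unsigned)(tf->tf_cs));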
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 0000000..45c4c53 --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include + +#include "x86.h" + +int +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + unsigned int func, regs[4]; + + func = *eax; + + cpuid_count(*eax, *ecx, regs); + + switch(func) { + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_0000_0004: + case CPUID_0000_000A: + break; + + case CPUID_8000_0000: + case CPUID_8000_0001: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0007: + case CPUID_8000_0008: + + break; + + case CPUID_0000_0001: + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_0000_0001_APICID_MASK); + /* + * XXX fixme for MP case, set apicid properly for cpu. + */ + regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX capability. + * Advertise x2APIC capability. 
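A sketch of how a CPUID intercept might funnel guest register state through x86_emulate_cpuid(). The handler shape and the zero-fill policy for unknown leaves are assumptions; only the function's in/out parameter contract and its 0/1 return value are taken from the code.

    static void
    handle_cpuid_exit(uint64_t *grax, uint64_t *grbx, uint64_t *grcx,
        uint64_t *grdx)
    {
            uint32_t eax, ebx, ecx, edx;

            eax = *grax;            /* leaf */
            ecx = *grcx;            /* sub-leaf */
            ebx = edx = 0;

            if (x86_emulate_cpuid(&eax, &ebx, &ecx, &edx) == 0) {
                    /* leaf not in the allow list; present it as all zeroes */
                    eax = ebx = ecx = edx = 0;
            }
            *grax = eax;
            *grbx = ebx;
            *grcx = ecx;
            *grdx = edx;
    }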
+ */ + regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] |= CPUID2_X2APIC; + + /* + * Machine check handling is done in the host. + * Hide MTRR capability. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + break; + + case CPUID_0000_000B: + /* + * XXXSMP fixme + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = 0; + break; + + default: + return (0); + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + return (1); +} + diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 0000000..bc4f8a4 --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx); + +#endif -- cgit v1.1 From f15f5629368bc3b043d8dfbe6b60d8261e97874d Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 14 May 2011 18:37:24 +0000 Subject: bhyve import part 2 of 2, guest kernel changes. This branch is now considered frozen: future bhyve development will take place in a branch off -CURRENT. sys/dev/bvm/bvm_console.c sys/dev/bvm/bvm_dbg.c - simple console driver/gdb debug port used for bringup. 
supported by user-space bhyve executable sys/conf/options.amd64 sys/amd64/amd64/minidump_machdep.c - allow NKPT to be set in the kernel config file sys/amd64/conf/GENERIC - mptable config options; bhyve user-space executable creates an mptable with number of CPUs, and optional vendor extension - add bvm console/debug - set NKPT to 512 to allow loading of large RAM disks from the loader - include kdb/gdb sys/amd64/amd64/local_apic.c sys/amd64/amd64/apic_vector.S sys/amd64/include/specialreg.h - if x2apic mode available, use MSRs to access the local APIC, otherwise fall back to 'classic' MMIO mode sys/amd64/amd64/mp_machdep.c - support AP spinup on CPU models that don't have real-mode support by overwriting the real-mode page with a message that supplies the bhyve user-space executable with enough information to start the AP directly in 64-bit mode. sys/amd64/amd64/vm_machdep.c - insert pause statements into cpu shutdown busy-wait loops sys/dev/blackhole/blackhole.c sys/modules/blackhole/Makefile - boot-time loadable module that claims all PCI bus/slot/funcs specified in an env var that are to be used for PCI passthrough sys/amd64/amd64/intr_machdep.c - allow round-robin assignment of device interrupts to CPUs to be disabled from the loader sys/amd64/include/bus.h - convert string ins/outs instructions to loops of individual in/out since bhyve doesn't support these yet sys/kern/subr_bus.c - if the device was no created with a fixed devclass, then remove it's association with the devclass it was associated with during probe. Otherwise, new drivers do not get a chance to probe/attach since the device will stay married to the first driver that it probed successfully but failed to attach. Sponsored by: NetApp, Inc. --- sys/amd64/amd64/apic_vector.S | 53 ++-- sys/amd64/amd64/intr_machdep.c | 6 + sys/amd64/amd64/local_apic.c | 494 ++++++++++++++++++++++++++++++++----- sys/amd64/amd64/minidump_machdep.c | 1 + sys/amd64/amd64/mp_machdep.c | 43 ++++ sys/amd64/amd64/vm_machdep.c | 11 +- sys/amd64/conf/GENERIC | 11 + sys/amd64/include/bus.h | 60 +++-- sys/amd64/include/specialreg.h | 33 ++- 9 files changed, 610 insertions(+), 102 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 4cfc18b..6e9aa79 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -55,7 +55,14 @@ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ FAKE_MCOUNT(TF_RIP(%rsp)) ; \ movq lapic, %rdx ; /* pointer to local APIC */ \ + testq %rdx, %rdx; \ + jnz 3f; \ + movl $MSR_APIC_ISR ## index, %ecx; \ + rdmsr; \ + jmp 4f; \ +3: ; \ movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ +4: ; \ bsrl %eax, %eax ; /* index of highset set bit in ISR */ \ jz 2f ; \ addl $(32 * index),%eax ; \ @@ -117,6 +124,26 @@ IDTVEC(errorint) jmp doreti #ifdef SMP + +/* + * We assume that %rax is being saved/restored outside of this macro + */ +#define DO_EOI \ + movq lapic, %rax; \ + testq %rax, %rax; \ + jz 8f; \ + movl $0, LA_EOI(%rax); \ + jmp 9f; \ +8:; \ + pushq %rcx; \ + pushq %rdx; \ + xorl %edx, %edx; /* eax is already zero */ \ + movl $MSR_APIC_EOI, %ecx; \ + wrmsr; \ + popq %rdx; \ + popq %rcx; \ +9: + /* * Global address space TLB shootdown. 
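The DO_EOI assembly macro above mirrors the C-level pattern this commit applies throughout local_apic.c (the lapic_eoi() hunk later in this patch is the same logic); roughly:

    void
    lapic_eoi(void)
    {
            if (x2apic)                     /* x2APIC mode detected at lapic_init() */
                    wrmsr(MSR_APIC_EOI, 0);
            else
                    lapic->eoi = 0;         /* classic memory-mapped register window */
    }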
*/ @@ -128,8 +155,7 @@ IDTVEC(invltlb) movq %cr3, %rax /* invalidate the TLB */ movq %rax, %cr3 - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -148,8 +174,7 @@ IDTVEC(invlpg) movq smp_tlb_addr1, %rax invlpg (%rax) /* invalidate single page */ - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -173,8 +198,7 @@ IDTVEC(invlrng) cmpq %rax, %rdx jb 1b - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -193,8 +217,7 @@ IDTVEC(invlcache) wbinvd - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI lock incl smp_tlb_wait @@ -210,9 +233,8 @@ IDTVEC(invlcache) IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME - movq lapic, %rdx - movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ - + DO_EOI + FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler @@ -227,8 +249,7 @@ IDTVEC(ipi_intr_bitmap_handler) IDTVEC(cpustop) PUSH_FRAME - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI call cpustop_handler jmp doreti @@ -241,8 +262,7 @@ IDTVEC(cpustop) IDTVEC(cpususpend) PUSH_FRAME - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI call cpususpend_handler @@ -259,7 +279,6 @@ IDTVEC(cpususpend) IDTVEC(rendezvous) PUSH_FRAME call smp_rendezvous_action - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + DO_EOI jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/intr_machdep.c b/sys/amd64/amd64/intr_machdep.c index 6ab80df..941cecf 100644 --- a/sys/amd64/amd64/intr_machdep.c +++ b/sys/amd64/amd64/intr_machdep.c @@ -78,6 +78,8 @@ static STAILQ_HEAD(, pic) pics; #ifdef SMP static int assign_cpu; +static int round_robin_interrupts = 1; +TUNABLE_INT("round_robin_interrupts", &round_robin_interrupts); #endif static int intr_assign_cpu(void *arg, u_char cpu); @@ -460,6 +462,10 @@ intr_next_cpu(void) if (!assign_cpu) return (cpu_apic_ids[0]); + /* All interrupts go to the BSP if not allowed to round robin */ + if (!round_robin_interrupts) + return (cpu_apic_ids[0]); + mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu]; do { diff --git a/sys/amd64/amd64/local_apic.c b/sys/amd64/amd64/local_apic.c index 8edc971..f5c2938 100644 --- a/sys/amd64/amd64/local_apic.c +++ b/sys/amd64/amd64/local_apic.c @@ -148,6 +148,7 @@ volatile lapic_t *lapic; vm_paddr_t lapic_paddr; static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz; static enum lapic_clock clockcoverage; +static int x2apic; static void lapic_enable(void); static void lapic_resume(struct pic *pic); @@ -156,6 +157,36 @@ static void lapic_timer_oneshot(u_int count); static void lapic_timer_periodic(u_int count); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); +static uint32_t lapic_version(void); +static uint32_t lapic_ldr(void); +static uint32_t lapic_dfr(void); +static uint32_t lapic_lvt_lint0(void); +static void lapic_set_lvt_lint0(uint32_t value); +static uint32_t lapic_lvt_lint1(void); +static void lapic_set_lvt_lint1(uint32_t value); +static uint32_t lapic_tpr(void); +static uint32_t lapic_svr(void); +static void lapic_set_svr(uint32_t value); +static uint32_t lapic_lvt_timer(void); +static void lapic_set_lvt_timer(uint32_t value); +static uint32_t lapic_lvt_thermal(void); +static uint32_t lapic_lvt_error(void); +static void lapic_set_lvt_error(uint32_t value); +static uint32_t 
lapic_lvt_pcint(void); +static void lapic_set_lvt_pcint(uint32_t value); +static uint32_t lapic_esr(void); +static void lapic_set_esr(uint32_t value); +static uint32_t lapic_ccr_timer(void); +static void lapic_set_dcr_timer(uint32_t value); +static void lapic_set_icr_timer(uint32_t value); +uint32_t lapic_irr(int num); +uint32_t lapic_tmr(int num); +uint32_t lapic_isr(int num); +static uint32_t lapic_icr_lo(void); +static void lapic_set_icr_lo(uint32_t value); +static uint32_t lapic_icr_hi(void); +static void lapic_set_icr_hi(uint32_t value); +static boolean_t lapic_missing(void); struct pic lapic_pic = { .pic_resume = lapic_resume }; @@ -206,12 +237,20 @@ lvt_mode(struct lapic *la, u_int pin, uint32_t value) void lapic_init(vm_paddr_t addr) { - - /* Map the local APIC and setup the spurious interrupt handler. */ - KASSERT(trunc_page(addr) == addr, - ("local APIC not aligned on a page boundary")); - lapic = pmap_mapdev(addr, sizeof(lapic_t)); - lapic_paddr = addr; + if ((cpu_feature2 & CPUID2_X2APIC) != 0 && + (rdmsr(MSR_APICBASE) & APICBASE_X2APIC) != 0) { + x2apic = 1; + if (bootverbose) + printf("Local APIC access using x2APIC MSRs\n"); + } else { + /* + * Map the local APIC and setup the spurious interrupt handler. + */ + KASSERT(trunc_page(addr) == addr, + ("local APIC not aligned on a page boundary")); + lapic = pmap_mapdev(addr, sizeof(lapic_t)); + lapic_paddr = addr; + } setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); /* Perform basic initialization of the BSP's local APIC. */ @@ -276,12 +315,12 @@ lapic_dump(const char* str) printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n", - lapic->id, lapic->version, lapic->ldr, lapic->dfr); + lapic_id(), lapic_version(), lapic_ldr(), lapic_dfr()); printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", - lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); + lapic_lvt_lint0(), lapic_lvt_lint1(), lapic_tpr(), lapic_svr()); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", - lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, - lapic->lvt_pcint); + lapic_lvt_timer(), lapic_lvt_thermal(), lapic_lvt_error(), + lapic_lvt_pcint()); } void @@ -295,7 +334,7 @@ lapic_setup(int boot) la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); eflags = intr_disable(); - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); @@ -304,15 +343,15 @@ lapic_setup(int boot) lapic_enable(); /* Program LINT[01] LVT entries. */ - lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); - lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); + lapic_set_lvt_lint0(lvt_mode(la, LVT_LINT0, lapic_lvt_lint0())); + lapic_set_lvt_lint1(lvt_mode(la, LVT_LINT1, lapic_lvt_lint1())); /* Program the PMC LVT entry if present. */ if (maxlvt >= LVT_PMC) - lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); + lapic_set_lvt_pcint(lvt_mode(la, LVT_PMC, lapic_lvt_pcint())); /* Program timer LVT and setup handler. */ - lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer); + lapic_set_lvt_timer(lvt_mode(la, LVT_TIMER, lapic_lvt_timer())); if (boot) { snprintf(buf, sizeof(buf), "cpu%d: timer", PCPU_GET(cpuid)); intrcnt_add(buf, &la->la_timer_count); @@ -328,8 +367,8 @@ lapic_setup(int boot) } /* Program error LVT and clear any existing errors. 
*/ - lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error); - lapic->esr = 0; + lapic_set_lvt_error(lvt_mode(la, LVT_ERROR, lapic_lvt_error())); + lapic_set_esr(0); /* XXX: Thermal LVT */ @@ -342,9 +381,9 @@ lapic_reenable_pmc(void) #ifdef HWPMC_HOOKS uint32_t value; - value = lapic->lvt_pcint; + value = lapic_lvt_pcint(); value &= ~APIC_LVT_M; - lapic->lvt_pcint = value; + lapic_set_lvt_pcint(value); #endif } @@ -355,7 +394,7 @@ lapic_update_pmc(void *dummy) struct lapic *la; la = &lapics[lapic_id()]; - lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); + lapic_set_lvt_pcint(lvt_mode(la, LVT_PMC, lapic_lvt_pcint())); } #endif @@ -366,11 +405,11 @@ lapic_enable_pmc(void) u_int32_t maxlvt; /* Fail if the local APIC is not present. */ - if (lapic == NULL) + if (lapic_missing()) return (0); /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return (0); @@ -400,11 +439,11 @@ lapic_disable_pmc(void) u_int32_t maxlvt; /* Fail if the local APIC is not present. */ - if (lapic == NULL) + if (lapic_missing()) return; /* Fail if the PMC LVT is not present. */ - maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + maxlvt = (lapic_version() & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return; @@ -435,7 +474,7 @@ lapic_setup_clock(enum lapic_clock srcsdes) MPASS(srcsdes != LAPIC_CLOCK_NONE); /* Can't drive the timer without a local APIC. */ - if (lapic == NULL || + if (lapic_missing() || (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0)) { clockcoverage = LAPIC_CLOCK_NONE; return (clockcoverage); @@ -449,7 +488,7 @@ lapic_setup_clock(enum lapic_clock srcsdes) lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(APIC_TIMER_MAX_COUNT); DELAY(2000000); - value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer; + value = APIC_TIMER_MAX_COUNT - lapic_ccr_timer(); if (value != APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; @@ -509,9 +548,9 @@ lapic_disable(void) uint32_t value; /* Software disable the local APIC. */ - value = lapic->svr; + value = lapic_svr(); value &= ~APIC_SVR_SWEN; - lapic->svr = value; + lapic_set_svr(value); } static void @@ -520,10 +559,10 @@ lapic_enable(void) u_int32_t value; /* Program the spurious vector to enable the local APIC. */ - value = lapic->svr; + value = lapic_svr(); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); - lapic->svr = value; + lapic_set_svr(value); } /* Reset the local APIC on the BSP during resume. 
*/ @@ -534,19 +573,342 @@ lapic_resume(struct pic *pic) lapic_setup(0); } +static uint32_t +lapic_version(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_VERSION)); + else + return (lapic->version); +} + +static uint32_t +lapic_ldr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LDR)); + else + return (lapic->ldr); +} + +static uint32_t +lapic_dfr(void) +{ + + if (x2apic) + return (0xffffffff); /* DFR not available in x2APIC mode */ + else + return (lapic->dfr); +} + +static uint32_t +lapic_lvt_lint0(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_LINT0)); + else + return (lapic->lvt_lint0); +} + +static void +lapic_set_lvt_lint0(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_LINT0, value); + else + lapic->lvt_lint0 = value; +} + +static uint32_t +lapic_lvt_lint1(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_LINT1)); + else + return (lapic->lvt_lint1); +} + +static void +lapic_set_lvt_lint1(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_LINT1, value); + else + lapic->lvt_lint1 = value; +} + +static uint32_t +lapic_tpr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_TPR)); + else + return (lapic->tpr); +} + +static uint32_t +lapic_svr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_SVR)); + else + return (lapic->svr); +} + +static void +lapic_set_svr(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_SVR, value); + else + lapic->svr = value; +} + +static uint32_t +lapic_lvt_timer(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_TIMER)); + else + return (lapic->lvt_timer); +} + +static void +lapic_set_lvt_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_TIMER, value); + else + lapic->lvt_timer = value; +} + +static uint32_t +lapic_lvt_thermal(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_THERMAL)); + else + return (lapic->lvt_thermal); +} + +static uint32_t +lapic_lvt_error(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_ERROR)); + else + return (lapic->lvt_error); +} + +static void +lapic_set_lvt_error(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_ERROR, value); + else + lapic->lvt_error = value; +} + +static uint32_t +lapic_lvt_pcint(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_LVT_PCINT)); + else + return (lapic->lvt_pcint); +} + +static void +lapic_set_lvt_pcint(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_LVT_PCINT, value); + else + lapic->lvt_pcint = value; +} + +static uint32_t +lapic_esr(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_ESR)); + else + return (lapic->esr); +} + +static void +lapic_set_esr(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_ESR, value); + else + lapic->esr = value; +} + +static uint32_t +lapic_ccr_timer(void) +{ + + if (x2apic) + return (rdmsr(MSR_APIC_CCR_TIMER)); + else + return (lapic->ccr_timer); +} + +static void +lapic_set_dcr_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_DCR_TIMER, value); + else + lapic->dcr_timer = value; +} + +static void +lapic_set_icr_timer(uint32_t value) +{ + + if (x2apic) + wrmsr(MSR_APIC_ICR_TIMER, value); + else + lapic->icr_timer = value; +} + +uint32_t +lapic_tmr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_tmr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_TMR0 + num; + return (rdmsr(msr)); + } else { + regptr = &lapic->tmr0; + return (regptr[num * 4]); + } +} + +uint32_t +lapic_irr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_irr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_IRR0 + num; + 
return (rdmsr(msr)); + } else { + regptr = &lapic->irr0; + return (regptr[num * 4]); + } +} + +uint32_t +lapic_isr(int num) +{ + int msr; + volatile uint32_t *regptr; + + KASSERT(num >= 0 && num < 8, ("lapic_isr: invalid num %d", num)); + + if (x2apic) { + msr = MSR_APIC_ISR0 + num; + return (rdmsr(msr)); + } else { + regptr = &lapic->isr0; + return (regptr[num * 4]); + } +} + +static uint32_t icr_hi_stashed[MAXCPU]; + +static uint32_t +lapic_icr_lo(void) +{ + + if (x2apic) + return (0); + else + return (lapic->icr_lo); +} + +static void +lapic_set_icr_lo(uint32_t value) +{ + + if (x2apic) { + wrmsr(MSR_APIC_ICR, + (uint64_t)icr_hi_stashed[curcpu] << 32 | value); + } else + lapic->icr_lo = value; +} + +static uint32_t +lapic_icr_hi(void) +{ + + if (x2apic) + return (0); + else + return (lapic->icr_hi); +} + +static void +lapic_set_icr_hi(uint32_t value) +{ + if (x2apic) + icr_hi_stashed[curcpu] = value >> APIC_ID_SHIFT; /* XXX */ + else + lapic->icr_hi = value; +} + +static boolean_t +lapic_missing(void) +{ + + if (x2apic == 0 && lapic == NULL) + return (TRUE); + else + return (FALSE); +} + int lapic_id(void) { - KASSERT(lapic != NULL, ("local APIC is not mapped")); - return (lapic->id >> APIC_ID_SHIFT); + if (x2apic) + return (rdmsr(MSR_APIC_ID)); + else + return (lapic->id >> APIC_ID_SHIFT); } int lapic_intr_pending(u_int vector) { - volatile u_int32_t *irr; - /* * The IRR registers are an array of 128-bit registers each of * which only describes 32 interrupts in the low 32 bits.. Thus, @@ -556,8 +918,7 @@ lapic_intr_pending(u_int vector) * modulus the vector by 32 to determine the individual bit to * test. */ - irr = &lapic->irr0; - return (irr[(vector / 32) * 4] & 1 << (vector % 32)); + return (lapic_irr(vector / 32) & 1 << (vector % 32)); } void @@ -713,13 +1074,19 @@ void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR - lapic->tpr = vector; + if (x2apic) + wrmsr(MSR_APIC_TPR, vector); + else + lapic->tpr = vector; #else u_int32_t tpr; - tpr = lapic->tpr & ~APIC_TPR_PRIO; + tpr = lapic_tpr() & ~APIC_TPR_PRIO; tpr |= vector; - lapic->tpr = tpr; + if (x2apic) + wrmsr(MSR_APIC_TPR, tpr); + else + lapic->tpr = tpr; #endif } @@ -727,7 +1094,10 @@ void lapic_eoi(void) { - lapic->eoi = 0; + if (x2apic) + wrmsr(MSR_APIC_EOI, 0); + else + lapic->eoi = 0; } void @@ -819,7 +1189,7 @@ lapic_timer_set_divisor(u_int divisor) KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) / sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor)); - lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1]; + lapic_set_dcr_timer(lapic_timer_divisors[ffs(divisor) - 1]); } static void @@ -827,11 +1197,11 @@ lapic_timer_oneshot(u_int count) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT; - lapic->lvt_timer = value; - lapic->icr_timer = count; + lapic_set_lvt_timer(value); + lapic_set_icr_timer(count); } static void @@ -839,11 +1209,11 @@ lapic_timer_periodic(u_int count) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_PERIODIC; - lapic->lvt_timer = value; - lapic->icr_timer = count; + lapic_set_lvt_timer(value); + lapic_set_icr_timer(count); } static void @@ -851,9 +1221,9 @@ lapic_timer_enable_intr(void) { u_int32_t value; - value = lapic->lvt_timer; + value = lapic_lvt_timer(); value &= ~APIC_LVT_M; - lapic->lvt_timer = value; + lapic_set_lvt_timer(value); } void @@ -867,8 +1237,8 @@ 
lapic_handle_error(void) * to update its value to indicate any errors that have * occurred since the previous write to the register. */ - lapic->esr = 0; - esr = lapic->esr; + lapic_set_esr(0); + esr = lapic_esr(); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); @@ -1115,17 +1485,17 @@ DB_SHOW_COMMAND(lapic, db_show_lapic) uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); - v = lapic->version; + v = lapic_version(); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); - v = lapic->svr; + v = lapic_svr(); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); - db_printf("TPR = %02x\n", lapic->tpr); + db_printf("TPR = %02x\n", lapic_tpr()); #define dump_field(prefix, index) \ - dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index, \ + dump_mask(__XSTRING(prefix ## index), lapic_ ## prefix(index), \ index * 32) db_printf("In-service Interrupts:\n"); @@ -1300,7 +1670,7 @@ lapic_ipi_wait(int delay) } else incr = 1; for (x = 0; x < delay; x += incr) { - if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) + if ((lapic_icr_lo() & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } @@ -1313,7 +1683,7 @@ lapic_ipi_raw(register_t icrlo, u_int dest) register_t value, eflags; /* XXX: Need more sanity checking of icrlo? */ - KASSERT(lapic != NULL, ("%s called too early", __func__)); + KASSERT(!lapic_missing(), ("%s called too early", __func__)); KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, @@ -1322,17 +1692,17 @@ lapic_ipi_raw(register_t icrlo, u_int dest) /* Set destination in ICR HI register if it is being used. */ eflags = intr_disable(); if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { - value = lapic->icr_hi; + value = lapic_icr_hi(); value &= ~APIC_ID_MASK; value |= dest << APIC_ID_SHIFT; - lapic->icr_hi = value; + lapic_set_icr_hi(value); } /* Program the contents of the IPI and dispatch it. */ - value = lapic->icr_lo; + value = lapic_icr_lo(); value &= APIC_ICRLO_RESV_MASK; value |= icrlo; - lapic->icr_lo = value; + lapic_set_icr_lo(value); intr_restore(eflags); } @@ -1409,7 +1779,7 @@ lapic_ipi_vectored(u_int vector, int dest) printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until mesage is sent without a timeout. */ - while (lapic->icr_lo & APIC_DELSTAT_PEND) + while (lapic_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c index a9af809..4377c81 100644 --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -27,6 +27,7 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_pmap.h" #include #include #include diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 0ef8017..8f8825d 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,26 @@ struct cpu_info { int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; +/* + * Trampoline for hypervisor direct 64-bit jump. 
+ * + * 0 - signature for guest->host verification + * 8 - virtual address of this page + * 16 - instruction virtual address + * 24 - stack pointer virtual address + * 32 - CR3, physical address of kernel page table + * 40 - 24-byte area for null/code/data GDT entries + */ +#define MP_V64T_SIG 0xcafebabecafebabeULL +struct mp_v64tramp { + uint64_t mt_sig; + uint64_t mt_virt; + uint64_t mt_eip; + uint64_t mt_rsp; + uint64_t mt_cr3; + uint64_t mt_gdtr[3]; +}; + /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; @@ -873,6 +893,29 @@ start_all_aps(void) bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; + /* + * If running in a VM that doesn't support the unrestricted + * guest 16-bit mode, forget most of the above and create + * the data block that allows the hypervisor to direct-jump + * into 64-bit mode. Copy this over the top of the 16-bit + * bootstrap. The startup-IPI informs the hypervisor which + * physical page this data block lies in. The hypervisor + * will then use the block to initialise register state of + * the AP in an almost identical fashion to how it builds + * the BSP initial register state. + */ + if (testenv("hw.use_bvm_mptramp")) { + struct mp_v64tramp mv; + + bzero(&mv, sizeof(mv)); + mv.mt_sig = MP_V64T_SIG; + mv.mt_virt = (uint64_t) va; + mv.mt_eip = (uint64_t) init_secondary; + mv.mt_rsp = (uint64_t) bootSTK; + mv.mt_cr3 = KPML4phys; + bcopy(&mv, (void *) va, sizeof(mv)); + } + /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index d6906ac..fe2e256 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -507,8 +507,10 @@ cpu_reset_proxy() { cpu_reset_proxy_active = 1; - while (cpu_reset_proxy_active == 1) + while (cpu_reset_proxy_active == 1) { + ia32_pause(); ; /* Wait for other cpu to see that we've started */ + } stop_cpus((1< 0) { + *addr = inb(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -290,9 +294,13 @@ bus_space_read_multi_2(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, u_int16_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - insw(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + *addr = inw(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -311,9 +319,13 @@ bus_space_read_multi_4(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, u_int32_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - insl(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + *addr = inl(bsh + offset); + count--; + addr++; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -533,9 +545,13 @@ bus_space_write_multi_1(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int8_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsb(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outb(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -554,9 +570,13 @@ bus_space_write_multi_2(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int16_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsw(bsh + offset, addr, count); - else { + if 
(tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outw(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ @@ -575,9 +595,13 @@ bus_space_write_multi_4(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, const u_int32_t *addr, size_t count) { - if (tag == AMD64_BUS_SPACE_IO) - outsl(bsh + offset, addr, count); - else { + if (tag == AMD64_BUS_SPACE_IO) { + while (count > 0) { + outl(bsh + offset, *addr); + addr++; + count--; + } + } else { #ifdef __GNUCLIKE_ASM __asm __volatile(" \n\ cld \n\ diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index c95fee0..42653cc 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -292,12 +292,41 @@ #define MSR_MC4_ADDR 0x412 #define MSR_MC4_MISC 0x413 +/* X2APIC MSRs */ +#define MSR_APIC_ID 0x802 +#define MSR_APIC_VERSION 0x803 +#define MSR_APIC_TPR 0x808 +#define MSR_APIC_EOI 0x80b +#define MSR_APIC_LDR 0x80d +#define MSR_APIC_SVR 0x80f +#define MSR_APIC_ISR0 0x810 +#define MSR_APIC_ISR1 0x811 +#define MSR_APIC_ISR2 0x812 +#define MSR_APIC_ISR3 0x813 +#define MSR_APIC_ISR4 0x814 +#define MSR_APIC_ISR5 0x815 +#define MSR_APIC_ISR6 0x816 +#define MSR_APIC_ISR7 0x817 +#define MSR_APIC_TMR0 0x818 +#define MSR_APIC_IRR0 0x820 +#define MSR_APIC_ESR 0x828 +#define MSR_APIC_ICR 0x830 +#define MSR_APIC_LVT_TIMER 0x832 +#define MSR_APIC_LVT_THERMAL 0x833 +#define MSR_APIC_LVT_PCINT 0x834 +#define MSR_APIC_LVT_LINT0 0x835 +#define MSR_APIC_LVT_LINT1 0x836 +#define MSR_APIC_LVT_ERROR 0x837 +#define MSR_APIC_ICR_TIMER 0x838 +#define MSR_APIC_CCR_TIMER 0x839 +#define MSR_APIC_DCR_TIMER 0x83e + /* * Constants related to MSR's. */ -#define APICBASE_RESERVED 0x000006ff +#define APICBASE_RESERVED 0x000002ff #define APICBASE_BSP 0x00000100 -#define APICBASE_X2APIC 0x00000400 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 -- cgit v1.1 From 6d6dbef9d4de7a3f8785d8db43c17cf992065785 Mon Sep 17 00:00:00 2001 From: jhb Date: Sun, 15 May 2011 02:09:12 +0000 Subject: Enable handling of 1GB pages in the direct map since HEAD supports those. Submitted by: neel --- sys/amd64/vmm/vmm_mem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 9ce1e80..764a6e9 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -168,8 +168,7 @@ vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) if (end >= NBPML4) panic("Cannot map memory beyond %ldGB", NBPML4 / GB); - /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ - if (0 && vmm_supports_1G_pages()) + if (vmm_supports_1G_pages()) superpage_size = NBPDP; else superpage_size = NBPDR; -- cgit v1.1 From 5a44aef8a38d88c0d5b265a72c5d90ef2d73cf95 Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 19 May 2011 21:53:25 +0000 Subject: Changes to allow the GENERIC+bhye kernel built from this branch to run as a 1/2 CPU guest on an 8.1 bhyve host. bhyve/inout.c inout.h fbsdrun.c - Rather than exiting on accesses to unhandled i/o ports, emulate hardware by returning -1 on reads and ignoring writes to unhandled ports. Support the previous mode by allowing a 'strict' parameter to be set from the command line. The 8.1 guest kernel was vastly cut down from GENERIC and had no ISA devices. Booting GENERIC exposes a massive amount of random touching of i/o ports (hello syscons/vga/atkbdc). 
bhyve/consport.c dev/bvm/bvm_console.c - implement a simplistic signature for the bvm console by returning 'bv' for an inw on the port. Also, set the priority of the console to CN_REMOTE if the signature was returned. This works better in an environment where multiple consoles are in the kernel (hello syscons) bhyve/rtc.c - return 0 for the access to RTC_EQUIPMENT (yes, you syscons) amd64/vmm/x86.c x86.h - hide a bunch more CPUID leaf 1 bits from the guest to prevent cpufreq drivers from probing. The next step will be to move CPUID handling completely into user-space. This will allow the full spectrum of changes from presenting a lowest-common-denominator CPU type/feature set, to exposing (almost) everything that the host can support. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/x86.c | 21 +++++++++++++++++++-- sys/amd64/vmm/x86.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 45c4c53..f6b38e0 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -75,13 +75,19 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); /* - * Don't expose VMX capability. + * Don't expose VMX, SpeedStep or TME capability. * Advertise x2APIC capability. */ - regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] &= ~(CPUID_0000_0001_FEAT0_VMX | CPUID2_EST | + CPUID2_TM2); regs[2] |= CPUID2_X2APIC; /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* * Machine check handling is done in the host. * Hide MTRR capability. */ @@ -89,6 +95,17 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) break; + case CPUID_0000_0006: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + case CPUID_0000_000B: /* * XXXSMP fixme diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index bc4f8a4..b437d61 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -34,6 +34,7 @@ #define CPUID_0000_0002 (0x2) #define CPUID_0000_0003 (0x3) #define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) #define CPUID_0000_000A (0xA) #define CPUID_0000_000B (0xB) #define CPUID_8000_0000 (0x80000000) -- cgit v1.1 From e24f5ed9f2b1a7f169a57fcc109ed55bd5ef8954 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 20 May 2011 02:08:05 +0000 Subject: Avoid unnecessary sign extension when promoted to a 64-bit integer. This was benign because the interruption info field is a 32-bit quantity and the hardware guarantees that the upper 32-bits are all zeros. But it did make reading the objdump output very confusing. --- sys/amd64/vmm/intel/vmcs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index c633a59..853c9c6 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -304,7 +304,7 @@ uint64_t vmcs_read(uint32_t encoding); /* * VMCS interrupt information fields */ -#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_VALID (1U << 31) #define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) #define VMCS_INTERRUPTION_INFO_NMI (2 << 8) -- cgit v1.1 From 3930d0afcfdc805e3c97fcc29a2c1ffbd8b31c7c Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 20 May 2011 03:23:09 +0000 Subject: Fix a long standing bug in VMXCTX_GUEST_RESTORE(). 
There was an assumption by the "callers" of this macro that on "return" the %rsp will be pointing to the 'vmxctx'. The macro was not doing this and thus when trying to restore host state on an error from "vmlaunch" or "vmresume" we were treating the memory locations on the host stack as 'struct vmxctx'. This led to all sorts of weird bugs like double faults or invalid instruction faults. This bug is exposed by the -O2 option used to compile the kernel module. With the -O2 flag the compiler will optimize the following piece of code: int loopstart = 1; ... if (loopstart) { loopstart = 0; vmx_launch(); } else vmx_resume(); into this: vmx_launch(); Since vmx_launch() and vmx_resume() are declared to be __dead2 functions the compiler is free to do this. The compiler has no way to know that the functions return indirectly through vmx_setjmp(). This optimization in turn leads us to trigger the bug in VMXCTX_GUEST_RESTORE(). With this change we can boot a 8.1 guest on a 9.0 host. Reported by: jhb@ --- sys/amd64/vmm/intel/vmx.c | 8 ++++---- sys/amd64/vmm/intel/vmx.h | 4 ++++ sys/amd64/vmm/intel/vmx_genassym.c | 1 + sys/amd64/vmm/intel/vmx_support.S | 22 ++++++++++++++++++---- 4 files changed, 27 insertions(+), 8 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ec181c4..44eae67 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1189,7 +1189,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) static int vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) { - int error, vie, rc, handled, astpending, loopstart; + int error, vie, rc, handled, astpending; uint32_t exit_reason; struct vmx *vmx; struct vmxctx *vmxctx; @@ -1198,7 +1198,7 @@ vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) vmx = arg; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; - loopstart = 1; + vmxctx->launched = 0; /* * XXX Can we avoid doing this every time we do a vm run? 
@@ -1232,8 +1232,8 @@ vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) #endif switch (rc) { case VMX_RETURN_DIRECT: - if (loopstart) { - loopstart = 0; + if (vmxctx->launched == 0) { + vmxctx->launched = 1; vmx_launch(vmxctx); } else vmx_resume(vmxctx); diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 69697f8..61d72a8 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -34,6 +34,9 @@ #define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ struct vmxctx { + register_t tmpstk[32]; /* vmx_return() stack */ + register_t tmpstktop; + register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; @@ -63,6 +66,7 @@ struct vmxctx { * XXX todo debug registers and fpu state */ + int launched; /* vmcs launch state */ int launch_error; }; diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c index c4b1efc..c5b5bf9 100644 --- a/sys/amd64/vmm/intel/vmx_genassym.c +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "vmx_cpufunc.h" +ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop)); ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index 4d1bf1d..8bdba86 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -31,15 +31,23 @@ #include "vmx_assym.s" /* - * Assumes that %rdi holds a pointer to the 'vmxctx' + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ /* \ - * Make sure that interrupts are disabled before restoring CR2. \ - * Otherwise there could be a page fault during the interrupt \ - * handler execution that would end up trashing CR2. \ + * Disable interrupts before updating %rsp. The location that \ + * %rsp points to is a 'vmxctx' and not a real stack so we \ + * don't want an interrupt handler to trash it. \ */ \ cli; \ + movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ @@ -148,6 +156,8 @@ ENTRY(vmx_longjmp) movq %rsp,%rdi movq $VMX_RETURN_LONGJMP,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_longjmp) @@ -174,6 +184,8 @@ ENTRY(vmx_resume) /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ movq %rsp,%rdi movq $VMX_RETURN_VMRESUME,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_resume) @@ -200,5 +212,7 @@ ENTRY(vmx_launch) /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ movq %rsp,%rdi movq $VMX_RETURN_VMLAUNCH,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp callq vmx_return END(vmx_launch) -- cgit v1.1 From 63af589b2ca587eebab8b8b1c05cc910d72a6a56 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 2 Jun 2011 13:49:19 +0000 Subject: Add a 'show vmcs' DDB command to dump state about the current CPU's current VMCS. 
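
For readers unfamiliar with the DDB plumbing used below: kernel debugger "show" verbs are declared with the DB_SHOW_COMMAND() macro and invoked from the db> prompt (here, "show vmcs"). The following is a minimal sketch of that registration pattern, with an illustrative command name only; the real handler is the db_show_vmcs() added in the diff below.

    #include <sys/param.h>
    #include <ddb/ddb.h>

    /* Illustrative only: declares a "show example" verb in DDB. */
    DB_SHOW_COMMAND(example, db_show_example)
    {
            /* 'have_addr'/'addr' carry the optional argument from db>. */
            if (have_addr)
                    db_printf("arg: %#lx\n", (long)addr);
            else
                    db_printf("no argument\n");
    }
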
--- sys/amd64/vmm/intel/vmcs.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/intel/vmcs.h | 2 + sys/amd64/vmm/intel/vmx.c | 2 +- 3 files changed, 99 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 80d45cc..8c53465 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -26,6 +26,8 @@ * $FreeBSD$ */ +#include "opt_ddb.h" + #include __FBSDID("$FreeBSD$"); @@ -45,6 +47,10 @@ __FBSDID("$FreeBSD$"); #include "ept.h" #include "vmx.h" +#ifdef DDB +#include +#endif + static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { @@ -449,3 +455,93 @@ vmcs_read(uint32_t encoding) return (val); } + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 853c9c6..be2f29c 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding); #endif /* _KERNEL */ +#define VMCS_INITIAL 0xffffffffffffffff + #define VMCS_IDENT(encoding) ((encoding) | 0x80000000) /* * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
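
The exit interruption-information word that db_show_vmcs() decodes above is a packed field: vector in bits 7:0, event type in bits 10:8, an error-code-valid bit, and a valid bit in bit 31. A stand-alone decoder along the same lines might look like the sketch below; it is for illustration and is not code from this commit.

    /* Sketch: decode a VM-exit interruption information word. */
    static void
    decode_intr_info(uint32_t info)
    {
            if ((info & VMCS_INTERRUPTION_INFO_VALID) == 0)
                    return;                 /* no event was being delivered */
            printf("vector %u, type %u%s\n",
                info & 0xff,                /* bits 7:0  - vector */
                (info >> 8) & 0x7,          /* bits 10:8 - 0 ext intr, 2 NMI,
                                               3 HW exception, 4 SW exception */
                (info & 0x800) ? ", error code valid" : "");
    }
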
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 44eae67..805d035 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -110,7 +110,7 @@ MALLOC_DEFINE(M_VMX, "vmx", "vmx"); extern struct pcpu __pcpu[]; -static int vmxon_enabled[MAXCPU]; +int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; -- cgit v1.1 From acc044270835dd36a1405fca8d586d5008514a39 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 2 Jun 2011 14:04:07 +0000 Subject: Some tweaks to the CPUID support: - Don't always pass the cpuid request to the current CPU as some nodes we will emulate purely in software. - Pass in the APIC ID of the virtual CPU so we can return the proper APIC ID. - Always report a completely flat topology with no SMT or multicore. - Report the CPUID2_HV feature and implement support for the 0x40000000 CPUID level. - Use existing constants from when possible and use cpu_feature2 when checking for VMX support. --- sys/amd64/vmm/intel/vmx.c | 11 ++++--- sys/amd64/vmm/x86.c | 76 ++++++++++++++++++++++++++++++++++++----------- sys/amd64/vmm/x86.h | 2 +- 3 files changed, 65 insertions(+), 24 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 805d035..73d60c2 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -418,13 +419,11 @@ static int vmx_init(void) { int error; - unsigned int regs[4]; uint64_t fixed0, fixed1; uint32_t tmp; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ - do_cpuid(1, regs); - if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } @@ -705,7 +704,7 @@ vmx_vminit(struct vm *vm) } static int -vmx_handle_cpuid(struct vmxctx *vmxctx) +vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx) { int handled, func; @@ -713,7 +712,7 @@ vmx_handle_cpuid(struct vmxctx *vmxctx) handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), - (uint32_t*)(&vmxctx->guest_rdx)); + (uint32_t*)(&vmxctx->guest_rdx), vcpu); #if 0 printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, @@ -1148,7 +1147,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); break; case EXIT_REASON_CPUID: - handled = vmx_handle_cpuid(vmxctx); + handled = vmx_handle_cpuid(vcpu, vmxctx); break; default: break; diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index f6b38e0..93c21d7 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -30,27 +30,51 @@ __FBSDID("$FreeBSD$"); #include +#include #include +#include #include #include "x86.h" +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "BHyVE BHyVE "; + int -x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, + uint32_t vcpu_id) { unsigned int func, regs[4]; func = *eax; - cpuid_count(*eax, *ecx, regs); + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. 
+ */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } - switch(func) { + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { case CPUID_0000_0000: case CPUID_0000_0002: case CPUID_0000_0003: - case CPUID_0000_0004: case CPUID_0000_000A: + cpuid_count(*eax, *ecx, regs); break; case CPUID_8000_0000: @@ -61,26 +85,24 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) case CPUID_8000_0006: case CPUID_8000_0007: case CPUID_8000_0008: - + cpuid_count(*eax, *ecx, regs); break; case CPUID_0000_0001: + do_cpuid(1, regs); + /* * Override the APIC ID only in ebx */ - regs[1] &= ~(CPUID_0000_0001_APICID_MASK); - /* - * XXX fixme for MP case, set apicid properly for cpu. - */ - regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* * Don't expose VMX, SpeedStep or TME capability. - * Advertise x2APIC capability. + * Advertise x2APIC capability and Hypervisor guest. */ - regs[2] &= ~(CPUID_0000_0001_FEAT0_VMX | CPUID2_EST | - CPUID2_TM2); - regs[2] |= CPUID2_X2APIC; + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] |= CPUID2_X2APIC | CPUID2_HV; /* * Hide thermal monitoring @@ -93,6 +115,21 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) */ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + /* + * Disable multi-core. + */ + regs[1] &= ~CPUID_HTT_CORES; + regs[3] &= ~CPUID_HTT; + break; + + case CPUID_0000_0004: + do_cpuid(4, regs); + + /* + * Do not expose topology. + */ + regs[0] &= 0xffff8000; + regs[0] |= 0x04008000; break; case CPUID_0000_0006: @@ -108,16 +145,22 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) case CPUID_0000_000B: /* - * XXXSMP fixme * Processor topology enumeration */ regs[0] = 0; regs[1] = 0; regs[2] = *ecx & 0xff; - regs[3] = 0; + regs[3] = vcpu_id; break; + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id, ®s[2], 4); + bcopy(bhyve_id, ®s[3], 4); + break; default: + /* XXX: Leaf 5? */ return (0); } @@ -127,4 +170,3 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) *edx = regs[3]; return (1); } - diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index b437d61..d672831 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -58,6 +58,6 @@ #define CPUID_0000_0001_FEAT0_VMX (1<<5) int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx); + uint32_t *edx, uint32_t vcpu_id); #endif -- cgit v1.1 From 8c8399924a8001cc92bcb6bf2fb8f959ed5bc722 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 6 Jul 2011 21:40:48 +0000 Subject: Get rid of redundant initialization of 'dmask'. It was being re-initialized shortly afterwards. 
--- sys/amd64/vmm/io/vlapic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 0a14127..cf7cb0d 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -443,8 +443,6 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) cpuset_t dmask; uint32_t dest, vec, mode; - CPU_ZERO(&dmask); - dest = icrval >> 32; vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; -- cgit v1.1 From 6e4718b6d1bec5c2ecfc16c6fb78a1ceead6a735 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 26 Sep 2011 07:05:40 +0000 Subject: Kernel configuration for a bhyve guest. --- sys/amd64/conf/BHYVE | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 sys/amd64/conf/BHYVE (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE new file mode 100644 index 0000000..de36445 --- /dev/null +++ b/sys/amd64/conf/BHYVE @@ -0,0 +1,345 @@ +# +# BHYVE -- Kernel configuration file for FreeBSD/amd64 bhyve guest +# +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD: projects/bhyve/sys/amd64/conf/GENERIC 221914 2011-05-14 20:35:01Z jhb $ + +cpu HAMMER +ident BHYVE + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_ULE # ULE scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Control Transmission Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MD_ROOT # MD is a potential root device +options NFSCL # New Network Filesystem Client +options NFSD # New Network Filesystem Server +options NFSLOCKD # Network Lock Manager +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_PART_GPT # GUID Partition Tables. 
+options GEOM_LABEL # Provides labelization +options COMPAT_FREEBSD32 # Compatible with i386 binaries +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options COMPAT_FREEBSD7 # Compatible with FreeBSD7 +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options STACK # stack(9) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +#options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options AUDIT # Security event auditing +options MAC # TrustedBSD MAC Framework +#options KDTRACE_FRAME # Ensure frames are compiled in +#options KDTRACE_HOOKS # Kernel DTrace hooks +options INCLUDE_CONFIG_FILE # Include this file in kernel + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. +options DEADLKRES # Enable the deadlock resolver +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones + +# Make an SMP-capable kernel by default +options SMP # Symmetric MultiProcessor Kernel + +# CPU frequency control +#device cpufreq + +# Bus support. +#device acpi +device pci + +# Floppy drives +#device fdc + +# ATA controllers +#device ahci # AHCI-compatible SATA controllers +#device ata # Legacy ATA/SATA controllers +#options ATA_CAM # Handle legacy controllers with CAM +#options ATA_STATIC_ID # Static device numbering +#device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA +#device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA + +# SCSI Controllers +#device ahc # AHA2940 and onboard AIC7xxx devices +#options AHC_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~128k to driver. +#device ahd # AHA39320/29320 and onboard AIC79xx devices +#options AHD_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~215k to driver. +#device amd # AMD 53C974 (Tekram DC-390(T)) +#device hptiop # Highpoint RocketRaid 3xxx series +#device isp # Qlogic family +#device ispfw # Firmware for QLogic HBAs- normally a module +#device mpt # LSI-Logic MPT-Fusion +#device mps # LSI-Logic MPT-Fusion 2 +#device ncr # NCR/Symbios Logic +#device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') +#device trm # Tekram DC395U/UW/F DC315U adapters + +#device adv # Advansys SCSI adapters +#device adw # Advansys wide SCSI adapters +#device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. 
+#device bt # Buslogic/Mylex MultiMaster SCSI adapters + +# ATA/SCSI peripherals +#device scbus # SCSI bus (required for ATA/SCSI) +#device ch # SCSI media changers +#device da # Direct Access (disks) +#device sa # Sequential Access (tape etc) +#device cd # CD +#device pass # Passthrough device (direct ATA/SCSI access) +#device ses # SCSI Environmental Services (and SAF-TE) + +# RAID controllers interfaced to the SCSI subsystem +#device amr # AMI MegaRAID +#device arcmsr # Areca SATA II RAID +#XXX it is not 64-bit clean, -scottl +#device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID +#device ciss # Compaq Smart RAID 5* +#device dpt # DPT Smartcache III, IV - See NOTES for options +#device hptmv # Highpoint RocketRAID 182x +#device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx +#device iir # Intel Integrated RAID +#device ips # IBM (Adaptec) ServeRAID +#device mly # Mylex AcceleRAID/eXtremeRAID +#device twa # 3ware 9000 series PATA/SATA RAID + +# RAID controllers +#device aac # Adaptec FSA RAID +#device aacp # SCSI passthrough for aac (requires CAM) +#device ida # Compaq Smart RAID +#device mfi # LSI MegaRAID SAS +#device mlx # Mylex DAC960 family +#XXX pointer/int warnings +#device pst # Promise Supertrak SX6000 +#device twe # 3ware ATA RAID + +# atkbdc0 controls both the keyboard and the PS/2 mouse +#device atkbdc # AT keyboard controller +#device atkbd # AT keyboard +#device psm # PS/2 mouse + +#device kbdmux # keyboard multiplexer + +#device vga # VGA video card driver + +#device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console +#device sc +#options SC_PIXEL_MODE # add support for the raster text mode + +#device agp # support several AGP chipsets + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +#device cbb # cardbus (yenta) bridge +#device pccard # PC Card (16-bit) bus +#device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +#device uart # Generic UART driver + +# Parallel port +#device ppc +#device ppbus # Parallel port bus (required) +#device lpt # Printer +#device plip # TCP/IP over parallel +#device ppi # Parallel port interface device +#device vpo # Requires scbus and da + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to sio, uart and/or ppc drivers): +#device puc + +# PCI Ethernet NICs. +#device bxe # Broadcom BCM57710/BCM57711/BCM57711E 10Gb Ethernet +#device de # DEC/Intel DC21x4x (``Tulip'') +#device em # Intel PRO/1000 Gigabit Ethernet Family +#device igb # Intel PRO/1000 PCIE Server Gigabit Family +#device ixgbe # Intel PRO/10GbE PCIE Ethernet Family +#device le # AMD Am7900 LANCE and Am79C9xx PCnet +#device ti # Alteon Networks Tigon I/II gigabit Ethernet +#device txp # 3Com 3cR990 (``Typhoon'') +#device vx # 3Com 3c590, 3c595 (``Vortex'') + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
+#device miibus # MII bus support +#device ae # Attansic/Atheros L2 FastEthernet +#device age # Attansic/Atheros L1 Gigabit Ethernet +#device alc # Atheros AR8131/AR8132 Ethernet +#device ale # Atheros AR8121/AR8113/AR8114 Ethernet +#device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet +#device bfe # Broadcom BCM440x 10/100 Ethernet +#device bge # Broadcom BCM570xx Gigabit Ethernet +#device dc # DEC/Intel 21143 and various workalikes +#device et # Agere ET1310 10/100/Gigabit Ethernet +#device fxp # Intel EtherExpress PRO/100B (82557, 82558) +#device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet +#device lge # Level 1 LXT1001 gigabit Ethernet +#device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet +#device nfe # nVidia nForce MCP on-board Ethernet +#device nge # NatSemi DP83820 gigabit Ethernet +#device nve # nVidia nForce MCP on-board Ethernet Networking +#device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') +#device re # RealTek 8139C+/8169/8169S/8110S +#device rl # RealTek 8129/8139 +#device sf # Adaptec AIC-6915 (``Starfire'') +#device sge # Silicon Integrated Systems SiS190/191 +#device sis # Silicon Integrated Systems SiS 900/SiS 7016 +#device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet +#device ste # Sundance ST201 (D-Link DFE-550TX) +#device stge # Sundance/Tamarack TC9021 gigabit Ethernet +#device tl # Texas Instruments ThunderLAN +#device tx # SMC EtherPower II (83c170 ``EPIC'') +#device vge # VIA VT612x gigabit Ethernet +#device vr # VIA Rhine, Rhine II +#device wb # Winbond W89C840F +#device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') + +# ISA Ethernet NICs. pccard NICs included. +#device cs # Crystal Semiconductor CS89x0 NIC +# 'device ed' requires 'device miibus' +#device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards +#device ex # Intel EtherExpress Pro/10 and Pro/10+ +#device ep # Etherlink III based cards +#device fe # Fujitsu MB8696x based cards +#device sn # SMC's 9000 series of Ethernet chips +#device xe # Xircom pccard Ethernet + +# Wireless NIC cards +#device wlan # 802.11 support +#options IEEE80211_DEBUG # enable debug msgs +#options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's +#options IEEE80211_SUPPORT_MESH # enable 802.11s draft support +#device wlan_wep # 802.11 WEP support +#device wlan_ccmp # 802.11 CCMP support +#device wlan_tkip # 802.11 TKIP support +#device wlan_amrr # AMRR transmit rate control algorithm +#device an # Aironet 4500/4800 802.11 wireless NICs. +#device ath # Atheros NIC's +#device ath_pci # Atheros pci/cardbus glue +#device ath_hal # pci/cardbus chip support +#options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors +#device ath_rate_sample # SampleRate tx rate control for ath +#device bwi # Broadcom BCM430x/BCM431x wireless NICs. +#device bwn # Broadcom BCM43xx wireless NICs. +#device ipw # Intel 2100 wireless NICs. +#device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. +#device iwn # Intel 4965/1000/5000/6000 wireless NICs. +#device malo # Marvell Libertas wireless NICs. +#device mwl # Marvell 88W8363 802.11n wireless NICs. +#device ral # Ralink Technology RT2500 wireless NICs. +#device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. +#device wpi # Intel 3945ABG wireless NICs. + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device vlan # 802.1Q VLAN support +device tun # Packet tunnel. 
+device pty # BSD-style compatibility pseudo ttys +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + +# USB support +#options USB_DEBUG # enable debug msgs +#device uhci # UHCI PCI->USB interface +#device ohci # OHCI PCI->USB interface +#device ehci # EHCI PCI->USB interface (USB 2.0) +#device usb # USB Bus (required) +#device udbp # USB Double Bulk Pipe devices (needs netgraph) +#device uhid # "Human Interface Devices" +#device ukbd # Keyboard +#device ulpt # Printer +#device umass # Disks/Mass storage - Requires scbus and da +#device ums # Mouse +#device urio # Diamond Rio 500 MP3 player +# USB Serial devices +#device u3g # USB-based 3G modems (Option, Huawei, Sierra) +#device uark # Technologies ARK3116 based serial adapters +#device ubsa # Belkin F5U103 and compatible serial adapters +#device uftdi # For FTDI usb serial adapters +#device uipaq # Some WinCE based devices +#device uplcom # Prolific PL-2303 serial adapters +#device uslcom # SI Labs CP2101/CP2102 serial adapters +#device uvisor # Visor and Palm devices +#device uvscom # USB serial support for DDI pocket's PHS +# USB Ethernet, requires miibus +#device aue # ADMtek USB Ethernet +#device axe # ASIX Electronics USB Ethernet +#device cdce # Generic USB over Ethernet +#device cue # CATC USB Ethernet +#device kue # Kawasaki LSI USB Ethernet +#device rue # RealTek RTL8150 USB Ethernet +#device udav # Davicom DM9601E USB +# USB Wireless +#device rum # Ralink Technology RT2501USB wireless NICs +#device run # Ralink Technology RT2700/RT2800/RT3000 NICs. +#device uath # Atheros AR5523 wireless NICs +#device upgt # Conexant/Intersil PrismGT wireless NICs. +#device ural # Ralink Technology RT2500USB wireless NICs +#device urtw # Realtek RTL8187B/L wireless NICs +#device zyd # ZyDAS zb1211/zb1211b wireless NICs + +# FireWire support +#device firewire # FireWire bus code +#device sbp # SCSI over FireWire (Requires scbus and da) +#device fwe # Ethernet over FireWire (non-standard!) +#device fwip # IP over FireWire (RFC 2734,3146) +#device dcons # Dumb console driver +#device dcons_crom # Configuration ROM for dcons + +device bvmconsole # brain dead simple bvm console +device bvmdebug # brain dead simple bvm gdb pipe + +device mptable +options NKPT=256 -- cgit v1.1 From d08191b4175ebda3e5ac2fabbe62e2bdf139a201 Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 24 Dec 2011 19:39:02 +0000 Subject: Add support for running as a nested hypervisor under VMWare Fusion, on systems with VT-x/EPT (e.g. Sandybridge Macbooks). This will most likely work on VMWare Workstation8/Player4 as well. See the VMWare app note at: http://communities.vmware.com/docs/DOC-8970 Fusion doesn't propagate the PAT MSR auto save-restore entry/exit control bits. Deal with this by noting that fact and setting up the PAT MSR to essentially be a no-op - it is init'd to power-on default, and a software shadow copy maintained. Since it is treated as a no-op, o/s settings are essentially ignored. This may not give correct results, but since the hypervisor is running nested, a number of bets are already off. On a quad-core/HT-enabled 'MacBook8,2', nested VMs with 1/2/4 vCPUs were fired up. 
The more nested vCPUs the worse the performance, unless the VMs were started up in multiplexed mode where things worked perfectly up to the limit of 8 vCPUs. Reviewed by: neel --- sys/amd64/vmm/intel/vmx.c | 72 +++++++++++++++++++++++++++++++++++------------ sys/amd64/vmm/vmm_msr.c | 37 ++++++++++++++++++++++++ sys/amd64/vmm/vmm_msr.h | 1 + 3 files changed, 92 insertions(+), 18 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 73d60c2..f8d5833 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -85,17 +86,22 @@ __FBSDID("$FreeBSD$"); #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 -#define VM_EXIT_CTLS_ONE_SETTING \ +#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ (VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT | \ VM_EXIT_LOAD_EFER) + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS +#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER + #define VM_ENTRY_CTLS_ONE_SETTING \ - (VM_ENTRY_LOAD_PAT | \ - VM_ENTRY_LOAD_EFER) + (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ + VM_ENTRY_LOAD_PAT) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_INTO_SMM | \ @@ -122,6 +128,8 @@ static uint64_t cr4_ones_mask, cr4_zeros_mask; static volatile u_int nextvpid; +static int vmx_no_patmsr; + /* * Virtual NMI blocking conditions. * @@ -476,16 +484,39 @@ vmx_init(void) VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { - printf("vmx_init: processor does not support desired " - "exit controls\n"); - return (error); + /* Try again without the PAT MSR bits */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, + MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING_NO_PAT, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } else { + if (bootverbose) + printf("vmm: PAT MSR access not supported\n"); + guest_msr_valid(MSR_PAT); + vmx_no_patmsr = 1; + } } /* Check support for VM-entry controls */ - error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, - VM_ENTRY_CTLS_ONE_SETTING, - VM_ENTRY_CTLS_ZERO_SETTING, - &entry_ctls); + if (!vmx_no_patmsr) { + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, + MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + } else { + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, + MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + } + if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); @@ -646,18 +677,23 @@ vmx_vminit(struct vm *vm) * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. - * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. 
*/ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_KGSBASE) || - guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT)) + guest_msr_rw(vmx, MSR_EFER)) panic("vmx_vminit: error setting guest msr access"); + /* + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. However, if running on a system with no + * MSR_PAT save/restore support, leave access disabled so accesses + * will be trapped. + */ + if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest pat msr access"); + for (i = 0; i < VM_MAXCPU; i++) { vmx->vmcs[i].identifier = vmx_revision(); error = vmclear(&vmx->vmcs[i]); diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index 99ac293..31bfcab 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #define VMM_MSR_F_EMULATE 0x01 #define VMM_MSR_F_READONLY 0x02 +#define VMM_MSR_F_INVALID 0x04 struct vmm_msr { int num; @@ -54,6 +55,7 @@ static struct vmm_msr vmm_msr[] = { { MSR_CSTAR, 0 }, { MSR_STAR, 0 }, { MSR_SF_MASK, 0 }, + { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID }, { MSR_APICBASE, VMM_MSR_F_EMULATE }, { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, @@ -68,6 +70,9 @@ CTASSERT(VMM_MSR_NUM >= vmm_msr_num); #define emulated_msr(idx) \ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) +#define invalid_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0) + void vmm_msr_init(void) { @@ -108,6 +113,16 @@ guest_msrs_init(struct vm *vm, int cpu) if (cpu == 0) guest_msrs[i] |= APICBASE_BSP; break; + case MSR_PAT: + guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + break; default: panic("guest_msrs_init: missing initialization for msr " "0x%0x", vmm_msr[i].num); @@ -165,6 +180,9 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) if (idx < 0) goto done; + if (invalid_msr(idx)) + goto done; + if (!readonly_msr(idx)) { guest_msrs = vm_guest_msrs(vm, cpu); @@ -206,6 +224,9 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) if (idx < 0) goto done; + if (invalid_msr(idx)) + goto done; + guest_msrs = vm_guest_msrs(vm, cpu); result = guest_msrs[idx]; @@ -263,3 +284,19 @@ restore_host_msrs(struct vm *vm, int cpu) wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); } } + +/* + * Must be called by the CPU-specific code before any guests are + * created + */ +void +guest_msr_valid(int msr) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (vmm_msr[i].num == msr && invalid_msr(i)) { + vmm_msr[i].flags &= ~VMM_MSR_F_INVALID; + } + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h index 1e15787..8a1fda3 100644 --- a/sys/amd64/vmm/vmm_msr.h +++ b/sys/amd64/vmm/vmm_msr.h @@ -36,6 +36,7 @@ void vmm_msr_init(void); int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); void guest_msrs_init(struct vm *vm, int cpu); +void guest_msr_valid(int msr); void restore_host_msrs(struct vm *vm, int cpu); void restore_guest_msrs(struct vm *vm, int cpu); -- cgit v1.1 From bd54a55c5a9baae09ba63f311e15c0d30f9ca683 Mon Sep 17 00:00:00 2001 From: emaste Date: Tue, 6 Mar 2012 21:13:12 +0000 
Subject: Remove duplicated license text. --- sys/amd64/vmm/io/vlapic.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index cf7cb0d..61adef9 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -26,34 +26,6 @@ * $FreeBSD$ */ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - #include __FBSDID("$FreeBSD$"); -- cgit v1.1 From 9f0c999f8126597eb572b80056df88335dbd0070 Mon Sep 17 00:00:00 2001 From: grehan Date: Sat, 28 Apr 2012 16:28:00 +0000 Subject: MSI-x interrupt support for PCI pass-thru devices. Includes instruction emulation for memory r/w access. This opens the door for io-apic, local apic, hpet timer, and legacy device emulation. Submitted by: ryan dot berryhill at sandvine dot com Reviewed by: grehan Obtained from: Sandvine --- sys/amd64/include/vmm.h | 6 +- sys/amd64/include/vmm_dev.h | 14 ++++ sys/amd64/vmm/intel/vmcs.h | 1 + sys/amd64/vmm/intel/vmx.c | 4 + sys/amd64/vmm/io/ppt.c | 181 +++++++++++++++++++++++++++++++++++++++++++- sys/amd64/vmm/io/ppt.h | 3 +- sys/amd64/vmm/io/vlapic.c | 1 + sys/amd64/vmm/vmm_dev.c | 9 +++ 8 files changed, 213 insertions(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 26646fb..1ad01c6 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -227,7 +227,8 @@ enum vm_exitcode { VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, - VM_EXITCODE_MAX, + VM_EXITCODE_PAGING, + VM_EXITCODE_MAX }; struct vm_exit { @@ -243,6 +244,9 @@ struct vm_exit { uint16_t port; uint32_t eax; /* valid for out */ } inout; + struct { + uint64_t cr3; + } paging; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. 
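
The new VM_EXITCODE_PAGING exit defined above carries only the guest %cr3 for now. A consumer of the exit structure would dispatch on it roughly as in the sketch below; the handler name is hypothetical, and the real handling of nested-paging faults lives outside this diff.

    /* Sketch only: handle_paging_fault() is a made-up name. */
    switch (vmexit->exitcode) {
    case VM_EXITCODE_PAGING:
            handle_paging_fault(vm, vcpu, vmexit->u.paging.cr3);
            break;
    default:
            break;
    }
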
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 1b143b5..d1a50d6 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -108,6 +108,17 @@ struct vm_pptdev_msi { int destcpu; }; +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint32_t msg; + uint32_t vector_control; + uint64_t addr; +}; + struct vm_nmi { int cpuid; }; @@ -143,6 +154,7 @@ enum { IOCNUM_UNBIND_PPTDEV, IOCNUM_MAP_PPTDEV_MMIO, IOCNUM_PPTDEV_MSI, + IOCNUM_PPTDEV_MSIX, IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, @@ -182,6 +194,8 @@ enum { _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index be2f29c..a7cf4f6 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #endif /* _KERNEL */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index f8d5833..4bbcea8 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) case EXIT_REASON_CPUID: handled = vmx_handle_cpuid(vcpu, vmxctx); break; + case EXIT_REASON_EPT_FAULT: + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.cr3 = vmcs_guest_cr3(); + break; default: break; } diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fcb36ad..ace2877 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$"); #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 +MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); + struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; - int msg; + int vec; + int vcpu; }; static struct pptdev { @@ -75,6 +79,16 @@ static struct pptdev { void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; + + struct { + int num_msgs; + int startrid; + int msix_table_rid; + struct resource *msix_table_res; + struct resource **res; + void **cookie; + struct pptintr_arg *arg; + } msix; } pptdevs[32]; static int num_pptdevs; @@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt) ppt->msi.num_msgs = 0; } +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + int rid; + struct resource *res; + void *cookie; + + rid = ppt->msix.startrid + idx; + res = ppt->msix.res[idx]; + cookie = ppt->msix.cookie[idx]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msix.res[idx] = NULL; + ppt->msix.cookie[idx] = NULL; +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + int i, error; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.msix_table_res) { + bus_release_resource(ppt->dev, SYS_RES_MEMORY, 
+ ppt->msix.msix_table_rid, + ppt->msix.msix_table_res); + ppt->msix.msix_table_res = NULL; + ppt->msix.msix_table_rid = 0; + } + + free(ppt->msix.res, M_PPTMSIX); + free(ppt->msix.cookie, M_PPTMSIX); + free(ppt->msix.arg, M_PPTMSIX); + + error = pci_release_msi(ppt->dev); + if (error) + printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); + + ppt->msix.num_msgs = 0; +} + int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { @@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func) return (EBUSY); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); ppt->vm = NULL; return (0); @@ -309,10 +375,10 @@ pptintr(void *arg) pptarg = arg; ppt = pptarg->pptdev; - vec = ppt->msi.vector + pptarg->msg; + vec = pptarg->vec; if (ppt->vm != NULL) - (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec); else { /* * XXX @@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, break; ppt->msi.arg[i].pptdev = ppt; - ppt->msi.arg[i].msg = i; + ppt->msi.arg[i].vec = vector + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, @@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, return (0); } + +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) +{ + struct pptdev *ppt; + struct pci_devinfo *dinfo; + int numvec, vector_count, rid, error; + size_t res_size, cookie_size, arg_size; + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + dinfo = device_get_ivars(ppt->dev); + if (!dinfo) + return (ENXIO); + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (!ppt->msix.msix_table_res) { + ppt->msix.res = NULL; + ppt->msix.cookie = NULL; + ppt->msix.arg = NULL; + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (ppt->msix.msix_table_res == NULL) + return (ENOSPC); + + ppt->msix.msix_table_rid = rid; + + vector_count = numvec = pci_msix_count(ppt->dev); + + error = pci_alloc_msix(ppt->dev, &numvec); + if (error) + return (error); + else if (vector_count != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.startrid = 1; + + res_size = numvec * sizeof(ppt->msix.res[0]); + cookie_size = numvec * sizeof(ppt->msix.cookie[0]); + arg_size = numvec * sizeof(ppt->msix.arg[0]); + + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); + if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || + ppt->msix.arg == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); + } + bzero(ppt->msix.res, res_size); + bzero(ppt->msix.cookie, cookie_size); + bzero(ppt->msix.arg, arg_size); + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + /* Allocate the IRQ resource */ + ppt->msix.cookie[idx] = NULL; + rid = ppt->msix.startrid + idx; + ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, 
RF_ACTIVE); + if (ppt->msix.res[idx] == NULL) + return (ENXIO); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].vec = msg; + ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF; + + /* Setup the MSI-X interrupt */ + error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msix.arg[idx], + &ppt->msix.cookie[idx]); + + if (error != 0) { + bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); + ppt->msix.cookie[idx] = NULL; + ppt->msix.res[idx] = NULL; + return (ENXIO); + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + + return (0); +} + diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h index 95f3ad0..63c8228 100644 --- a/sys/amd64/vmm/io/ppt.h +++ b/sys/amd64/vmm/io/ppt.h @@ -36,5 +36,6 @@ int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec); - +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 61adef9..6c91128 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid) void vlapic_cleanup(struct vlapic *vlapic) { + vlapic_op_halt(vlapic); vdev_unregister(vlapic); free(vlapic, M_VLAPIC); } diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 8f124a5..571c37c 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; + struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; @@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, pptmsi->destcpu, pptmsi->vector, pptmsi->numvec); break; + case VM_PPTDEV_MSIX: + pptmsix = (struct vm_pptdev_msix *)data; + error = ppt_setup_msix(sc->vm, pptmsix->vcpu, + pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, + pptmsix->msg, pptmsix->vector_control, + pptmsix->addr); + break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, -- cgit v1.1 From fc13a01d538ded0843702a871a58cba4147b6037 Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 3 May 2012 05:04:37 +0000 Subject: Until the issue of how to handle guest XCR0 state is resolved, prevent CURRENT guests from hitting unhandled xsetbv exits by hiding the xsave/osxsave/avx cpuid2 bits. 
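Illustrative only, not part of this commit: a minimal guest-side sketch of what the masking means in practice. With the bits below cleared, CPUID.1:ECX reports XSAVE, OSXSAVE and AVX as absent inside the VM; the bit positions used (26, 27, 28) are the standard architectural ones, everything else here is just an example program.

    /* Guest-side probe sketch; not part of the commit. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t eax = 1, ebx, ecx, edx;

            __asm__ __volatile__("cpuid"
                : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));

            /* CPUID.1:ECX bits 26/27/28 are XSAVE/OSXSAVE/AVX. */
            printf("xsave=%u osxsave=%u avx=%u\n",
                (ecx >> 26) & 1, (ecx >> 27) & 1, (ecx >> 28) & 1);
            return (0);
    }

Run inside a bhyve guest, all three values should print as 0 while this workaround is in place.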
--- sys/amd64/vmm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 93c21d7..669fa4b 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -105,6 +105,13 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, regs[2] |= CPUID2_X2APIC | CPUID2_HV; /* + * Hide xsave/osxsave/avx until the FPU save/restore + * issues are resolved + */ + regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | + CPUID2_AVX); + + /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); -- cgit v1.1 From add4e182f64694d0cd5926bbe0e8008e85eccece Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 25 Jul 2012 00:21:16 +0000 Subject: Verify that VMX operation has been enabled by BIOS before executing the VMXON instruction. Reported by "s vas" on freebsd-virtualization@ --- sys/amd64/vmm/intel/vmx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 4bbcea8..df28fe9 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -427,7 +427,7 @@ static int vmx_init(void) { int error; - uint64_t fixed0, fixed1; + uint64_t fixed0, fixed1, feature_control; uint32_t tmp; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ @@ -436,6 +436,16 @@ vmx_init(void) return (ENXIO); } + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). + */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & 0x5) != 0x5) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, -- cgit v1.1 From d40b98f60b39af7a76f82076ee5e53f389dc3cba Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 4 Aug 2012 02:06:55 +0000 Subject: Force certain bits in %cr4 to be hard-wired to '1' or '0' from a guest's perspective. If we don't do this some guest OSes (e.g. Linux) will reset the CR4_VMXE bit in %cr4 with disastrous consequences. 
Reported by: grehan --- sys/amd64/vmm/intel/vmx.c | 68 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index df28fe9..be58444 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -627,23 +627,38 @@ vmx_vpid(void) } static int -vmx_setup_cr0_shadow(struct vmcs *vmcs) +vmx_setup_cr_shadow(int which, struct vmcs *vmcs) { - int error; - uint64_t mask, shadow; + int error, mask_ident, shadow_ident; + uint64_t mask_value, shadow_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + shadow_value = cr0_ones_mask; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + shadow_value = cr4_ones_mask; + } - mask = cr0_ones_mask | cr0_zeros_mask; - error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); - shadow = cr0_ones_mask; - error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); if (error) return (error); return (0); } +#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) +#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) static void * vmx_vminit(struct vm *vm) @@ -744,6 +759,12 @@ vmx_vminit(struct vm *vm) panic("vmcs_set_msr_save error %d", error); error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); } return (vmx); @@ -1031,12 +1052,16 @@ cantinject: static int vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { - int error; - uint64_t regval; + int error, cr, vmcs_guest_cr; + uint64_t regval, ones_mask, zeros_mask; const struct vmxctx *vmxctx; - /* We only handle mov to %cr0 at this time */ - if ((exitqual & 0xff) != 0x00) + /* We only handle mov to %cr0 or %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + cr = exitqual & 0xf; + if (cr != 0 && cr != 4) return (UNHANDLED); vmxctx = &vmx->ctx[vcpu]; @@ -1100,11 +1125,22 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) break; } - regval |= cr0_ones_mask; - regval &= ~cr0_zeros_mask; - error = vmwrite(VMCS_GUEST_CR0, regval); - if (error) - panic("vmx_emulate_cr_access: error %d writing cr0", error); + if (cr == 0) { + ones_mask = cr0_ones_mask; + zeros_mask = cr0_zeros_mask; + vmcs_guest_cr = VMCS_GUEST_CR0; + } else { + ones_mask = cr4_ones_mask; + zeros_mask = cr4_zeros_mask; + vmcs_guest_cr = VMCS_GUEST_CR4; + } + regval |= ones_mask; + regval &= ~zeros_mask; + error = vmwrite(vmcs_guest_cr, regval); + if (error) { + panic("vmx_emulate_cr_access: error %d writing cr%d", + error, cr); + } return (HANDLED); } -- cgit v1.1 From 66c8120152f661ab4690b86ac87beeb00cc887e5 Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 4 Aug 2012 04:30:26 +0000 Subject: Include 'device uart' in the guest kernel. 
--- sys/amd64/conf/BHYVE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE index de36445..89c8ea2 100644 --- a/sys/amd64/conf/BHYVE +++ b/sys/amd64/conf/BHYVE @@ -176,7 +176,7 @@ device pci #device cardbus # CardBus (32-bit) bus # Serial (COM) ports -#device uart # Generic UART driver +device uart # Generic UART driver # Parallel port #device ppc -- cgit v1.1 From 6c5ad005bed33e80c94460b6694d199348dac472 Mon Sep 17 00:00:00 2001 From: grehan Date: Sun, 26 Aug 2012 01:41:41 +0000 Subject: Add sysctls to display the total and free amount of hard-wired mem for VMs # sysctl hw.vmm hw.vmm.mem_free: 2145386496 hw.vmm.mem_total: 2145386496 Submitted by: Takeshi HASEGAWA hasegaw at gmail com --- sys/amd64/vmm/vmm_dev.c | 19 +++++++++++++++++++ sys/amd64/vmm/vmm_mem.c | 24 ++++++++++++++++++++++++ sys/amd64/vmm/vmm_mem.h | 3 +++ 3 files changed, 46 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 571c37c..116b5f1 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include "vmm_lapic.h" #include "vmm_stat.h" +#include "vmm_mem.h" #include "io/ppt.h" #include @@ -458,6 +459,24 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); +static int +sysctl_vmm_mem_total(SYSCTL_HANDLER_ARGS) +{ + size_t val = vmm_mem_get_mem_total(); + return sysctl_handle_long(oidp, &val, 0, req); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_total, CTLTYPE_LONG | CTLFLAG_RD, + 0, 0, sysctl_vmm_mem_total, "LU", "Amount of Total memory"); + +static int +sysctl_vmm_mem_free(SYSCTL_HANDLER_ARGS) +{ + size_t val = vmm_mem_get_mem_free(); + return sysctl_handle_long(oidp, &val, 0, req); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_free, CTLTYPE_LONG | CTLFLAG_RD, + 0, 0, sysctl_vmm_mem_free, "LU", "Amount of Free memory"); + void vmmdev_init(void) { diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 764a6e9..54f98ac 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -63,6 +63,7 @@ static struct { } vmm_mem_avail[VMM_MEM_MAXSEGS]; static int vmm_mem_nsegs; +size_t vmm_mem_total_bytes; static vm_paddr_t maxaddr; @@ -96,6 +97,7 @@ vmm_mem_steal_memory(void) smapsize = *((uint32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + vmm_mem_total_bytes = 0; nsegs = 0; for (smap = smapbase; smap < smapend; smap++) { /* @@ -131,6 +133,7 @@ vmm_mem_steal_memory(void) vmm_mem_avail[nsegs].base = base; vmm_mem_avail[nsegs].length = length; + vmm_mem_total_bytes += length; if (base + length > maxaddr) maxaddr = base + length; @@ -344,6 +347,27 @@ vmm_mem_alloc(size_t size) return (addr); } +size_t +vmm_mem_get_mem_total(void) +{ + return vmm_mem_total_bytes; +} + +size_t +vmm_mem_get_mem_free(void) +{ + size_t length = 0; + int i; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + length += vmm_mem_avail[i].length; + } + mtx_unlock(&vmm_mem_mtx); + + return(length); +} + void vmm_mem_free(vm_paddr_t base, size_t length) { diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index ef1bf1a..a83e9be 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -35,4 +35,7 @@ void vmm_mem_free(vm_paddr_t start, size_t size); vm_paddr_t vmm_mem_maxaddr(void); void vmm_mem_dump(void); +size_t vmm_mem_get_mem_total(void); +size_t 
vmm_mem_get_mem_free(void); + #endif -- cgit v1.1 From c0caea8c2fc75a9ca5f5a67dd11462ef6542afc2 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 21 Sep 2012 03:09:23 +0000 Subject: Restructure the x2apic access code in preparation for supporting memory mapped access to the local apic. The vlapic code is now aware of the mode that the guest is using to access the local apic. Reviewed by: grehan@ --- sys/amd64/vmm/io/vlapic.c | 34 +++++++++++++++++++-- sys/amd64/vmm/io/vlapic.h | 3 ++ sys/amd64/vmm/vmm_lapic.c | 78 +++++++++++++++++++++++++++++++++++++++-------- sys/amd64/vmm/vmm_lapic.h | 6 ++-- sys/amd64/vmm/vmm_msr.c | 46 +++------------------------- 5 files changed, 108 insertions(+), 59 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 6c91128..f1d363f 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -86,6 +87,8 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define VLAPIC_VERSION (16) #define VLAPIC_MAXLVT_ENTRIES (5) +#define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) + struct vlapic { struct vm *vm; int vcpuid; @@ -107,6 +110,8 @@ struct vlapic { */ uint8_t isrvec_stk[ISRVEC_STK_SIZE]; int isrvec_stk_top; + + uint64_t msr_apicbase; }; static void @@ -161,7 +166,6 @@ vlapic_op_reset(void* dev) struct LAPIC *lapic = &vlapic->apic; memset(lapic, 0, sizeof(*lapic)); - lapic->id = vlapic->vcpuid << 24; lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); @@ -542,7 +546,10 @@ vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) switch(offset) { case APIC_OFFSET_ID: - *data = lapic->id; + if (x2apic(vlapic)) + *data = vlapic->vcpuid; + else + *data = vlapic->vcpuid << 24; break; case APIC_OFFSET_VER: *data = lapic->version; @@ -631,7 +638,6 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) switch(offset) { case APIC_OFFSET_ID: - lapic->id = data; break; case APIC_OFFSET_TPR: lapic->tpr = data & 0xff; @@ -760,6 +766,14 @@ vlapic_init(struct vm *vm, int vcpuid) vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | + APICBASE_ENABLED | + APICBASE_X2APIC; + + if (vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -782,3 +796,17 @@ vlapic_cleanup(struct vlapic *vlapic) vdev_unregister(vlapic); free(vlapic, M_VLAPIC); } + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +void +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) +{ + + vlapic->msr_apicbase = val; +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index 861ea8c..cecd4d3 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -102,4 +102,7 @@ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); void vlapic_timer_tick(struct vlapic *vlapic); +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); + #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 4aca087..13550b4 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -33,20 +33,18 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include "vmm_ipi.h" #include 
"vmm_lapic.h" #include "vlapic.h" -int -lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +static int +lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) { int handled; - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) handled = 1; else @@ -55,15 +53,11 @@ lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) return (handled); } -int -lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +static int +lapic_read(struct vlapic *vlapic, u_int offset, uint64_t *rv) { int handled; - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) handled = 1; else @@ -120,3 +114,63 @@ lapic_timer_tick(struct vm *vm, int cpu) vlapic_timer_tick(vlapic); } + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) +{ + int handled; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + handled = 1; + } else + handled = lapic_read(vlapic, x2apic_msr_to_regoff(msr), rval); + + return (handled); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) +{ + int handled; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + vlapic_set_apicbase(vlapic, val); + handled = 1; + } else + handled = lapic_write(vlapic, x2apic_msr_to_regoff(msr), val); + + return (handled); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 815b2f7..60f7696 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -31,8 +31,10 @@ struct vm; -int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); -int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); + void lapic_timer_tick(struct vm *vm, int cpu); /* diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index 31bfcab..bc67f98 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -34,7 +34,6 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include #include "vmm_lapic.h" @@ -56,7 +55,6 @@ static struct vmm_msr vmm_msr[] = { { MSR_STAR, 0 }, { MSR_SF_MASK, 0 }, { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID }, - { MSR_APICBASE, VMM_MSR_F_EMULATE }, { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, }; @@ -107,12 +105,6 @@ guest_msrs_init(struct vm *vm, int cpu) case MSR_MCG_CAP: guest_msrs[i] = 0; break; - case MSR_APICBASE: - guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | - APICBASE_X2APIC; - if (cpu == 0) - guest_msrs[i] |= APICBASE_BSP; - break; case MSR_PAT: guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | @@ -130,29 +122,6 @@ guest_msrs_init(struct vm *vm, int cpu) } } -static boolean_t -x2apic_msr(u_int num) -{ - - if (num >= 0x800 && num <= 0xBFF) - return (TRUE); - else - return (FALSE); -} - -static u_int -x2apic_msr_to_regoff(u_int msr) -{ - - return ((msr - 0x800) << 4); -} - -static 
boolean_t -x2apic_msr_id(u_int num) -{ - return (num == 0x802); -} - static int msr_num_to_idx(u_int num) { @@ -173,8 +142,8 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) handled = 0; - if (x2apic_msr(num)) - return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + if (lapic_msr(num)) + return (lapic_wrmsr(vm, cpu, num, val)); idx = msr_num_to_idx(num); if (idx < 0) @@ -208,15 +177,8 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) handled = 0; - if (x2apic_msr(num)) { - handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), - &result); - /* - * The version ID needs to be massaged - */ - if (x2apic_msr_id(num)) { - result = result >> 24; - } + if (lapic_msr(num)) { + handled = lapic_rdmsr(vm, cpu, num, &result); goto done; } -- cgit v1.1 From 34b672cc8af9ef3fbee45a3c9cc28a7e30c9ef16 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 24 Sep 2012 19:32:24 +0000 Subject: Stash the 'vm_exit' information in each 'struct vcpu'. There is no functional change at this time but this paves the way for vm exit handler functions to easily modify the exit reason going forward. --- sys/amd64/include/vmm.h | 4 ++-- sys/amd64/vmm/amd/amdv.c | 2 +- sys/amd64/vmm/intel/vmx.c | 5 ++++- sys/amd64/vmm/vmm.c | 23 ++++++++++++++++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 1ad01c6..61faf56 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -43,8 +43,7 @@ struct vlapic; typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, - struct vm_exit *vmexit); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, @@ -112,6 +111,7 @@ int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); void vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); /* * Return 1 if device indicated by bus/slot/func is supposed to be a diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 6844cc0..674337d 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm) } static int -amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +amdv_vmrun(void *arg, int vcpu, register_t rip) { printf("amdv_vmrun: not implemented\n"); diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index be58444..88f870c 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1272,19 +1272,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) } static int -vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +vmx_run(void *arg, int vcpu, register_t rip) { int error, vie, rc, handled, astpending; uint32_t exit_reason; struct vmx *vmx; struct vmxctx *vmxctx; struct vmcs *vmcs; + struct vm_exit *vmexit; vmx = arg; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vmxctx->launched = 0; + vmexit = vm_exitinfo(vmx->vm, vcpu); + /* * XXX Can we avoid doing this every time we do a vm run? 
*/ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 62cc2a2..d896f6d 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -72,6 +72,7 @@ struct vcpu { int vcpuid; struct savefpu *guestfpu; /* guest fpu state */ void *stats; + struct vm_exit exitinfo; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 @@ -110,8 +111,8 @@ static struct vmm_ops *ops; #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip, vmexit) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMRUN(vmi, vcpu, rip) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) @@ -164,6 +165,19 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->stats = vmm_stat_alloc(); } +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + static int vmm_init(void) { @@ -545,12 +559,15 @@ vm_run(struct vm *vm, struct vm_run *vmrun) restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); - error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + /* copy the exit information */ + bcopy(&vcpu->exitinfo, &vmrun->vm_exit, sizeof(struct vm_exit)); + critical_exit(); return (error); -- cgit v1.1 From c34be7b811ad199e64f66db339e7f64c773ca0a7 Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 02:33:25 +0000 Subject: Add an explicit exit code 'SPINUP_AP' to tell the controlling process that an AP needs to be activated by spinning up an execution context for it. The local apic emulation is now completely done in the hypervisor and it will detect writes to the ICR_LO register that try to bring up the AP. In response to such writes it will return to userspace with an exit code of SPINUP_AP. Reviewed by: grehan --- sys/amd64/include/vmm.h | 5 ++++ sys/amd64/vmm/intel/vmx.c | 8 +++++++ sys/amd64/vmm/io/vlapic.c | 59 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 67 insertions(+), 5 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 61faf56..e841963 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -228,6 +228,7 @@ enum vm_exitcode { VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, + VM_EXITCODE_SPINUP_AP, VM_EXITCODE_MAX }; @@ -260,6 +261,10 @@ struct vm_exit { uint32_t code; /* ecx value */ uint64_t wval; } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; } u; }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 88f870c..6689013 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1253,6 +1253,14 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; + + /* + * Special case for spinning up an AP - exit to userspace to + * give the controlling process a chance to intercept and + * spin up a thread for the AP. 
+ */ + if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) + handled = 0; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index f1d363f..9b7d3cb 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -89,6 +89,12 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + struct vlapic { struct vm *vm; int vcpuid; @@ -112,6 +118,7 @@ struct vlapic { int isrvec_stk_top; uint64_t msr_apicbase; + enum boot_state boot_state; }; static void @@ -168,6 +175,11 @@ vlapic_op_reset(void* dev) memset(lapic, 0, sizeof(*lapic)); lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ return 0; @@ -418,6 +430,8 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) int i; cpuset_t dmask; uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; dest = icrval >> 32; vec = icrval & APIC_VECTOR_MASK; @@ -452,11 +466,46 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) return (0); /* handled completely in the kernel */ } - /* - * XXX this assumes that the startup IPI always succeeds - */ - if (mode == APIC_DELMODE_STARTUP) - vm_activate_cpu(vlapic->vm, dest); + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + return (0); + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = dest; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + + /* + * XXX this assumes that the startup IPI always succeeds + */ + vlapic2->boot_state = BS_RUNNING; + vm_activate_cpu(vlapic2->vm, dest); + + return (0); + } + } /* * This will cause a return to userland. -- cgit v1.1 From ebdd69568d7fa97153aa47a86afe367476a0a1de Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 19:08:51 +0000 Subject: Add ioctls to control the X2APIC capability exposed by the virtual machine to the guest. At the moment this simply sets the state in the 'vcpu' instance but there is no code that acts upon these settings. 
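A hedged userspace sketch of how a control process could exercise the new ioctls. The struct, enum and ioctl names are taken from the diff below; the /dev/vmm/<name> device node and the <machine/vmm.h> / <machine/vmm_dev.h> header locations are assumptions about the usual bhyve layout, not something this commit establishes, and as the message notes the value is only recorded in the vcpu at this stage.

    /*
     * Sketch only: the device path and header locations are assumed,
     * not defined by this commit.
     */
    #include <sys/ioctl.h>
    #include <machine/vmm.h>        /* enum x2apic_state (assumed location) */
    #include <machine/vmm_dev.h>    /* VM_SET/GET_X2APIC_STATE (assumed location) */
    #include <fcntl.h>
    #include <stdio.h>

    int
    main(void)
    {
            struct vm_x2apic x2apic;
            int fd;

            fd = open("/dev/vmm/testvm", O_RDWR);   /* assumed device node */
            if (fd < 0) {
                    perror("open");
                    return (1);
            }

            x2apic.cpuid = 0;
            x2apic.state = X2APIC_DISABLED;
            if (ioctl(fd, VM_SET_X2APIC_STATE, &x2apic) == -1)
                    perror("VM_SET_X2APIC_STATE");

            if (ioctl(fd, VM_GET_X2APIC_STATE, &x2apic) == 0)
                    printf("vcpu0 x2apic state: %d\n", (int)x2apic.state);

            return (0);
    }
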
--- sys/amd64/include/vmm.h | 11 +++++++++++ sys/amd64/include/vmm_dev.h | 11 +++++++++++ sys/amd64/vmm/vmm.c | 27 +++++++++++++++++++++++++++ sys/amd64/vmm/vmm_dev.c | 12 ++++++++++++ 4 files changed, 61 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index e841963..0b3a29c 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -40,6 +40,8 @@ struct vm_exit; struct vm_run; struct vlapic; +enum x2apic_state; + typedef int (*vmm_init_func_t)(void); typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ @@ -109,6 +111,8 @@ uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); void vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); @@ -205,6 +209,13 @@ enum vm_cap_type { VM_CAP_MAX }; +enum x2apic_state { + X2APIC_ENABLED, + X2APIC_AVAILABLE, + X2APIC_DISABLED, + X2APIC_STATE_LAST +}; + /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index d1a50d6..fc64fd8 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -136,6 +136,11 @@ struct vm_stat_desc { char desc[128]; /* out */ }; +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + enum { IOCNUM_RUN, IOCNUM_SET_PINNING, @@ -158,6 +163,8 @@ enum { IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, + IOCNUM_SET_X2APIC_STATE, + IOCNUM_GET_X2APIC_STATE, }; #define VM_RUN \ @@ -202,4 +209,8 @@ enum { _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d896f6d..29dbe67 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -73,6 +73,7 @@ struct vcpu { struct savefpu *guestfpu; /* guest fpu state */ void *stats; struct vm_exit exitinfo; + enum x2apic_state x2apic_state; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 @@ -163,6 +164,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); + vcpu->x2apic_state = X2APIC_ENABLED; } struct vm_exit * @@ -745,3 +747,28 @@ vcpu_stats(struct vm *vm, int vcpuid) return (vm->vcpu[vcpuid].stats); } + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state < 0 || state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + return (0); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 116b5f1..686ddec 100644 --- 
a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -163,6 +163,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; + struct vm_x2apic *x2apic; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); @@ -185,6 +186,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: + case VM_SET_X2APIC_STATE: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. @@ -335,6 +337,16 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, vmcap->captype, vmcap->capval); break; + case VM_SET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_set_x2apic_state(sc->vm, + x2apic->cpuid, x2apic->state); + break; + case VM_GET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_get_x2apic_state(sc->vm, + x2apic->cpuid, &x2apic->state); + break; default: error = ENOTTY; break; -- cgit v1.1 From bc269b51afe43aab28df7ea0d543c167bb7c7d2e Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 25 Sep 2012 22:31:35 +0000 Subject: Add support for trapping MMIO writes to local apic registers and emulating them. The default behavior is still to present the local apic to the guest in the x2apic mode. --- sys/amd64/vmm/intel/vmcs.h | 10 + sys/amd64/vmm/intel/vmx.c | 74 +++++-- sys/amd64/vmm/io/vlapic.c | 39 +++- sys/amd64/vmm/vmm.c | 2 +- sys/amd64/vmm/vmm_instruction_emul.c | 385 +++++++++++++++++++++++++++++++++++ sys/amd64/vmm/vmm_instruction_emul.h | 91 +++++++++ sys/amd64/vmm/vmm_lapic.c | 71 +++++++ sys/amd64/vmm/vmm_lapic.h | 3 + sys/amd64/vmm/x86.c | 22 +- sys/amd64/vmm/x86.h | 4 +- 10 files changed, 676 insertions(+), 25 deletions(-) create mode 100644 sys/amd64/vmm/vmm_instruction_emul.c create mode 100644 sys/amd64/vmm/vmm_instruction_emul.h (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index a7cf4f6..84532f4 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -66,6 +66,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) #endif /* _KERNEL */ @@ -324,4 +325,13 @@ uint64_t vmcs_read(uint32_t encoding); */ #define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 6689013..ed0996e 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include "vmm_lapic.h" #include "vmm_msr.h" @@ -60,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "x86.h" #include "vmx_controls.h" +#include "vmm_instruction_emul.h" #define CR4_VMXE (1UL << 13) @@ -771,21 +774,17 @@ vmx_vminit(struct vm *vm) } static int -vmx_handle_cpuid(int vcpu, struct vmxctx *vmxctx) +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; - handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), - 
(uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), - (uint32_t*)(&vmxctx->guest_rdx), vcpu); -#if 0 - printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", - __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, - vmxctx->guest_rcx, vmxctx->guest_rdx, handled); -#endif - + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } @@ -1146,13 +1145,54 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) } static int +vmx_lapic_fault(struct vm *vm, int cpu, + uint64_t gpa, uint64_t rip, uint64_t cr3, uint64_t ept_qual) +{ + int read, write, handled; + + /* + * For this to be a legitimate access to the local apic: + * - the GPA in the local apic page + * - the GPA must be aligned on a 16 byte boundary + */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) + return (UNHANDLED); + + if ((gpa & 0xF) != 0) + return (UNHANDLED); + + /* EPT violation on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (UNHANDLED); + + /* EPT violation must be a read fault or a write fault but not both */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read ^ write) == 0) + return (UNHANDLED); + + /* + * The EPT violation must have been caused by accessing a guest-physical + * address that is a translation of a guest-linear address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (UNHANDLED); + } + + handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, rip, cr3); + + return (handled); +} + +static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int handled; struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual; + uint64_t qual, gpa, cr3; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1229,11 +1269,17 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); break; case EXIT_REASON_CPUID: - handled = vmx_handle_cpuid(vcpu, vmxctx); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: - vmexit->exitcode = VM_EXITCODE_PAGING; - vmexit->u.paging.cr3 = vmcs_guest_cr3(); + gpa = vmcs_gpa(); + cr3 = vmcs_guest_cr3(); + handled = vmx_lapic_fault(vmx->vm, vcpu, + gpa, vmexit->rip, cr3, qual); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.cr3 = cr3; + } break; default: break; diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9b7d3cb..aedc692 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -87,7 +87,7 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define VLAPIC_VERSION (16) #define VLAPIC_MAXLVT_ENTRIES (5) -#define x2apic(vlapic) ((vlapic)->msr_apicbase & APICBASE_X2APIC) +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 
1 : 0) enum boot_state { BS_INIT, @@ -433,7 +433,10 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) struct vlapic *vlapic2; struct vm_exit *vmexit; - dest = icrval >> 32; + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; @@ -703,8 +706,18 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) lapic->svr = data; break; case APIC_OFFSET_ICR_LOW: + if (!x2apic(vlapic)) { + data &= 0xffffffff; + data |= (uint64_t)lapic->icr_hi << 32; + } retval = lapic_process_icr(vlapic, data); break; + case APIC_OFFSET_ICR_HI: + if (!x2apic(vlapic)) { + retval = 0; + lapic->icr_hi = data; + } + break; case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: reg = vlapic_get_lvt(vlapic, offset); if (!(lapic->svr & APIC_SVR_ENABLE)) { @@ -810,19 +823,26 @@ static struct io_region vlapic_mmio[VM_MAXCPU]; struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { + int err; + enum x2apic_state state; struct vlapic *vlapic; + err = vm_get_x2apic_state(vm, vcpuid, &state); + if (err) + panic("vlapic_set_apicbase: err %d fetching x2apic state", err); + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; - vlapic->msr_apicbase = DEFAULT_APIC_BASE | - APICBASE_ENABLED | - APICBASE_X2APIC; + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; + if (state == X2APIC_ENABLED) + vlapic->msr_apicbase |= APICBASE_X2APIC; + vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -856,6 +876,15 @@ vlapic_get_apicbase(struct vlapic *vlapic) void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) { + int err; + enum x2apic_state state; + + err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state); + if (err) + panic("vlapic_set_apicbase: err %d fetching x2apic state", err); + + if (state == X2APIC_DISABLED) + val &= ~APICBASE_X2APIC; vlapic->msr_apicbase = val; } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 29dbe67..764ffbb 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -160,11 +160,11 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; + vcpu->x2apic_state = X2APIC_ENABLED; vcpu->vlapic = vlapic_init(vm, vcpu_id); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); - vcpu->x2apic_state = X2APIC_ENABLED; } struct vm_exit * diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000..fe01d69 --- /dev/null +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -0,0 +1,385 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "vmm_instruction_emul.h" + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static void +vie_init(struct vie *vie) +{ + + bzero(vie, sizeof(struct vie)); + + vie->op_size = VIE_OP_SIZE_32BIT; + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->operand_register = VM_REG_LAST; +} + +static int +gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, + uint64_t *gpa, uint64_t *gpaend) +{ + vm_paddr_t hpa; + int nlevels, ptpshift, ptpindex; + uint64_t *ptpbase, pte, pgsize; + + /* + * XXX assumes 64-bit guest with 4 page walk levels + */ + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE); + if (hpa == -1) + goto error; + + ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa); + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) + goto error; + + if (pte & PG_PS) { + if (pgsize > 1 * GB) + goto error; + else + break; + } + + ptpphys = pte; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); + *gpaend = pte + pgsize; + return (0); + +error: + return (-1); +} + +void +vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, + struct vie *vie) +{ + int n, err; + uint64_t hpa, gpa, gpaend; + + /* + * XXX cache previously fetched instructions using 'rip' as the tag + */ + + vie_init(vie); + + /* + * Copy up to 15 bytes of the instruction stream into 'vie' + */ + while (vie->num_valid < VIE_INST_SIZE) { + err = gla2gpa(vm, rip, cr3, &gpa, &gpaend); + if (err) + break; + + n = min(VIE_INST_SIZE - vie->num_valid, gpaend - gpa); + + hpa = vm_gpa2hpa(vm, gpa, n); + if (hpa == -1) + break; + + bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n); + + rip += n; + vie->num_valid += n; + } +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + if (vie->num_processed >= vie->num_valid) 
+ panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid); + + vie->num_processed++; +} + +static int +decode_rex(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + if (x >= 0x40 && x <= 0x4F) { + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + + vie_advance(vie); + } + + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + static const uint8_t flags[256] = { + [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM, + [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG, + [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM, + }; + + if (vie_peek(vie, &x)) + return (-1); + + vie->opcode_byte = x; + vie->opcode_flags = flags[x]; + + vie_advance(vie); + + if (vie->opcode_flags == 0) + return (-1); + else + return (0); +} + +/* + * XXX assuming 32-bit or 64-bit guest + */ +static int +decode_modrm(struct vie *vie) +{ + uint8_t x; + + if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB addressing not supported yet */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + return (-1); + + vie->base_register = gpr_map[vie->rm]; + + if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG)) + vie->operand_register = gpr_map[vie->reg]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + vie->base_register = VM_REG_LAST; /* no base */ + } + break; + } + + /* calculate the operand size */ + if (vie->rex_w) + vie->op_size = VIE_OP_SIZE_64BIT; + + if (vie->opcode_flags & VIE_F_FROM_IMM) + vie->imm_bytes = 4; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int32_t signed32; + } u; + + if ((n = vie->imm_bytes) == 0) + return (0); + + if (n != 4) + panic("decode_immediate: invalid imm_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + vie->immediate = u.signed32; /* sign-extended */ + + return (0); +} + +int +vmm_decode_instruction(struct vie *vie) +{ + if (decode_rex(vie)) + 
return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + return (0); +} diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h new file mode 100644 index 0000000..94937f2 --- /dev/null +++ b/sys/amd64/vmm/vmm_instruction_emul.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +enum vie_op_size { + VIE_OP_SIZE_32BIT, /* default */ + VIE_OP_SIZE_64BIT, + VIE_OP_SIZE_8BIT +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; + + uint8_t rex_w:1, + rex_r:1, + rex_x:1, + rex_b:1; + + uint8_t mod:2, + reg:4, + rm:4; + + + uint8_t opcode_byte; + uint16_t opcode_flags; + uint8_t disp_bytes; + uint8_t imm_bytes; + + int num_valid; + int num_processed; + + enum vm_reg_name base_register; + enum vm_reg_name index_register; + enum vm_reg_name operand_register; + + int op_size; + int64_t displacement; + int64_t immediate; +}; + +#define VIE_F_HAS_MODRM (1 << 0) +#define VIE_F_FROM_RM (1 << 1) +#define VIE_F_FROM_REG (1 << 2) +#define VIE_F_TO_RM (1 << 3) +#define VIE_F_TO_REG (1 << 4) +#define VIE_F_FROM_IMM (1 << 5) + +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +struct vm; + +void vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, + struct vie *vie); + +int vmm_decode_instruction(struct vie *vie); + +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 13550b4..0d797e6 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vlapic.h" +#include "vmm_instruction_emul.h" static int lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) @@ -174,3 +175,73 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) return (handled); } + +int +lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, + uint64_t rip, uint64_t cr3) +{ + int 
handled, error; + uint64_t val; + struct vie vie; + struct vlapic *vlapic; + + const int UNHANDLED = 0; + + vlapic = vm_lapic(vm, cpu); + + vmm_fetch_instruction(vm, rip, cr3, &vie); + + if (vmm_decode_instruction(&vie) != 0) + return (UNHANDLED); + + /* Only 32-bit accesses to local apic */ + if (vie.op_size != VIE_OP_SIZE_32BIT) + return (UNHANDLED); + + /* + * XXX + * The operand register in which we store the result of the + * read must be a GPR that we can modify even if the vcpu + * is "running". All the GPRs qualify except for %rsp. + * + * This is a limitation of the vm_set_register() API + * and can be fixed if necessary. + */ + if (vie.operand_register == VM_REG_GUEST_RSP) + return (UNHANDLED); + + if (read) { + if ((vie.opcode_flags & VIE_F_TO_REG) == 0) + return (UNHANDLED); + + if (vie.operand_register >= VM_REG_LAST) + return (UNHANDLED); + + handled = lapic_read(vlapic, offset, &val); + if (handled) { + error = vm_set_register(vm, cpu, vie.operand_register, + val); + if (error) + panic("lapic_mmio: error %d setting gpr %d", + error, vie.operand_register); + } + } else { + if ((vie.opcode_flags & VIE_F_FROM_REG) && + (vie.operand_register < VM_REG_LAST)) { + error = vm_get_register(vm, cpu, vie.operand_register, + &val); + if (error) { + panic("lapic_mmio: error %d getting gpr %d", + error, vie.operand_register); + } + } else if (vie.opcode_flags & VIE_F_FROM_IMM) { + val = vie.immediate; + } else { + return (UNHANDLED); + } + + handled = lapic_write(vlapic, offset, val); + } + + return (handled); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 60f7696..7bba4e3 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -35,6 +35,9 @@ boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); +int lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, + uint64_t rip, uint64_t cr3); + void lapic_timer_tick(struct vm *vm, int cpu); /* diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 669fa4b..47ba975 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -29,13 +29,17 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include +#include #include #include #include +#include + #include "x86.h" #define CPUID_VM_HIGH 0x40000000 @@ -43,10 +47,12 @@ __FBSDID("$FreeBSD$"); static const char bhyve_id[12] = "BHyVE BHyVE "; int -x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, - uint32_t vcpu_id) +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { + int error; unsigned int func, regs[4]; + enum x2apic_state x2apic_state; func = *eax; @@ -91,6 +97,12 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, case CPUID_0000_0001: do_cpuid(1, regs); + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + /* * Override the APIC ID only in ebx */ @@ -102,7 +114,11 @@ x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx, * Advertise x2APIC capability and Hypervisor guest. 
*/ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); - regs[2] |= CPUID2_X2APIC | CPUID2_HV; + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; /* * Hide xsave/osxsave/avx until the FPU save/restore diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index d672831..d19e1d8 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -57,7 +57,7 @@ */ #define CPUID_0000_0001_FEAT0_VMX (1<<5) -int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx, uint32_t vcpu_id); +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); #endif -- cgit v1.1 From 5dbc1ca26acaa3175dae7b9d0c45151fba0275ab Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 26 Sep 2012 00:06:17 +0000 Subject: Add an option "-a" to present the local apic in the XAPIC mode instead of the default X2APIC mode to the guest. --- sys/amd64/vmm/io/vlapic.c | 22 +++++++++++++--------- sys/amd64/vmm/io/vlapic.h | 3 +++ sys/amd64/vmm/vmm.c | 4 +++- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index aedc692..1e8a4e8 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -823,14 +823,8 @@ static struct io_region vlapic_mmio[VM_MAXCPU]; struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { - int err; - enum x2apic_state state; struct vlapic *vlapic; - err = vm_get_x2apic_state(vm, vcpuid, &state); - if (err) - panic("vlapic_set_apicbase: err %d fetching x2apic state", err); - vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vm; vlapic->vcpuid = vcpuid; @@ -840,9 +834,6 @@ vlapic_init(struct vm *vm, int vcpuid) if (vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; - if (state == X2APIC_ENABLED) - vlapic->msr_apicbase |= APICBASE_X2APIC; - vlapic->ops = &vlapic_dev_ops; vlapic->mmio = vlapic_mmio + vcpuid; @@ -888,3 +879,16 @@ vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) vlapic->msr_apicbase = val; } + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_ENABLED) + vlapic->msr_apicbase |= APICBASE_X2APIC; + else + vlapic->msr_apicbase &= ~APICBASE_X2APIC; +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index cecd4d3..f43289d 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -88,6 +88,8 @@ struct vm; */ #define ISRVEC_STK_SIZE (16 + 1) +enum x2apic_state; + struct vlapic *vlapic_init(struct vm *vm, int vcpuid); void vlapic_cleanup(struct vlapic *vlapic); @@ -104,5 +106,6 @@ void vlapic_timer_tick(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 764ffbb..db2f9b8 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -160,8 +160,8 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; - vcpu->x2apic_state = X2APIC_ENABLED; vcpu->vlapic = vlapic_init(vm, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); @@ -770,5 +770,7 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state 
state) vm->vcpu[vcpuid].x2apic_state = state; + vlapic_set_x2apic_state(vm, vcpuid, state); + return (0); } -- cgit v1.1 From b65259b285734eec4d40fe639b4e84a6f4bf9f02 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 27 Sep 2012 00:27:58 +0000 Subject: Intel VT-x provides the length of the instruction at the time of the nested page table fault. Use this when fetching the instruction bytes from the guest memory. Also modify the lapic_mmio() API so that a decoded instruction is fed into it instead of having it fetch the instruction bytes from the guest. This is useful for hardware assists like SVM that provide the faulting instruction as part of the vmexit. --- sys/amd64/vmm/intel/vmx.c | 16 +++++++++++++--- sys/amd64/vmm/vmm_instruction_emul.c | 22 ++++++++++++++-------- sys/amd64/vmm/vmm_instruction_emul.h | 4 ++-- sys/amd64/vmm/vmm_lapic.c | 33 +++++++++++++-------------------- sys/amd64/vmm/vmm_lapic.h | 4 ++-- 5 files changed, 44 insertions(+), 35 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ed0996e..a2c8e76 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1146,9 +1146,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) static int vmx_lapic_fault(struct vm *vm, int cpu, - uint64_t gpa, uint64_t rip, uint64_t cr3, uint64_t ept_qual) + uint64_t gpa, uint64_t rip, int inst_length, + uint64_t cr3, uint64_t ept_qual) { int read, write, handled; + struct vie vie; /* * For this to be a legitimate access to the local apic: @@ -1180,7 +1182,14 @@ vmx_lapic_fault(struct vm *vm, int cpu, return (UNHANDLED); } - handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, rip, cr3); + /* Fetch, decode and emulate the faulting instruction */ + if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0) + return (UNHANDLED); + + if (vmm_decode_instruction(&vie) != 0) + return (UNHANDLED); + + handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie); return (handled); } @@ -1275,7 +1284,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) gpa = vmcs_gpa(); cr3 = vmcs_guest_cr3(); handled = vmx_lapic_fault(vmx->vm, vcpu, - gpa, vmexit->rip, cr3, qual); + gpa, vmexit->rip, vmexit->inst_length, + cr3, qual); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index fe01d69..66af72c 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -128,9 +128,9 @@ error: return (-1); } -void -vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, - struct vie *vie) +int +vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, + uint64_t cr3, struct vie *vie) { int n, err; uint64_t hpa, gpa, gpaend; @@ -139,17 +139,18 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, * XXX cache previously fetched instructions using 'rip' as the tag */ + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid length %d", inst_length); + vie_init(vie); - /* - * Copy up to 15 bytes of the instruction stream into 'vie' - */ - while (vie->num_valid < VIE_INST_SIZE) { + /* Copy the instruction into 'vie' */ + while (vie->num_valid < inst_length) { err = gla2gpa(vm, rip, cr3, &gpa, &gpaend); if (err) break; - n = min(VIE_INST_SIZE - vie->num_valid, gpaend - gpa); + n = min(inst_length - vie->num_valid, gpaend - gpa); hpa = vm_gpa2hpa(vm, gpa, n); if (hpa == -1) @@ -160,6 +161,11 @@ 
vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, rip += n; vie->num_valid += n; } + + if (vie->num_valid == inst_length) + return (0); + else + return (-1); } static int diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h index 94937f2..1fa9e2b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.h +++ b/sys/amd64/vmm/vmm_instruction_emul.h @@ -83,8 +83,8 @@ struct vie { struct vm; -void vmm_fetch_instruction(struct vm *vm, uint64_t rip, uint64_t cr3, - struct vie *vie); +int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, + uint64_t cr3, struct vie *vie); int vmm_decode_instruction(struct vie *vie); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 0d797e6..ace6010 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -177,25 +177,18 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) } int -lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, - uint64_t rip, uint64_t cr3) +lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie) { int handled, error; uint64_t val; - struct vie vie; struct vlapic *vlapic; const int UNHANDLED = 0; vlapic = vm_lapic(vm, cpu); - vmm_fetch_instruction(vm, rip, cr3, &vie); - - if (vmm_decode_instruction(&vie) != 0) - return (UNHANDLED); - /* Only 32-bit accesses to local apic */ - if (vie.op_size != VIE_OP_SIZE_32BIT) + if (vie->op_size != VIE_OP_SIZE_32BIT) return (UNHANDLED); /* @@ -207,35 +200,35 @@ lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, * This is a limitation of the vm_set_register() API * and can be fixed if necessary. */ - if (vie.operand_register == VM_REG_GUEST_RSP) + if (vie->operand_register == VM_REG_GUEST_RSP) return (UNHANDLED); if (read) { - if ((vie.opcode_flags & VIE_F_TO_REG) == 0) + if ((vie->opcode_flags & VIE_F_TO_REG) == 0) return (UNHANDLED); - if (vie.operand_register >= VM_REG_LAST) + if (vie->operand_register >= VM_REG_LAST) return (UNHANDLED); handled = lapic_read(vlapic, offset, &val); if (handled) { - error = vm_set_register(vm, cpu, vie.operand_register, + error = vm_set_register(vm, cpu, vie->operand_register, val); if (error) panic("lapic_mmio: error %d setting gpr %d", - error, vie.operand_register); + error, vie->operand_register); } } else { - if ((vie.opcode_flags & VIE_F_FROM_REG) && - (vie.operand_register < VM_REG_LAST)) { - error = vm_get_register(vm, cpu, vie.operand_register, + if ((vie->opcode_flags & VIE_F_FROM_REG) && + (vie->operand_register < VM_REG_LAST)) { + error = vm_get_register(vm, cpu, vie->operand_register, &val); if (error) { panic("lapic_mmio: error %d getting gpr %d", - error, vie.operand_register); + error, vie->operand_register); } - } else if (vie.opcode_flags & VIE_F_FROM_IMM) { - val = vie.immediate; + } else if (vie->opcode_flags & VIE_F_FROM_IMM) { + val = vie->immediate; } else { return (UNHANDLED); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 7bba4e3..e9ff8fd 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -30,13 +30,13 @@ #define _VMM_LAPIC_H_ struct vm; +struct vie; boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); -int lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, - uint64_t rip, uint64_t cr3); +int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); void lapic_timer_tick(struct vm *vm, int cpu); -- cgit v1.1 From 
bc87f08e9822e6446dc91b0451317740259de95c Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 29 Sep 2012 01:15:45 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. In this case vm_malloc() was using vm_gpa2hpa() to indirectly infer whether or not the address range had already been allocated. Replace this instead with an explicit API 'vm_gpa_available()' that returns TRUE if a page is available for allocation in guest physical address space. --- sys/amd64/include/vmm.h | 2 +- sys/amd64/vmm/vmm.c | 60 +++++++++++++++++++++++++++++++++++++++++-------- sys/amd64/vmm/vmm_dev.c | 2 +- 3 files changed, 53 insertions(+), 11 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 0b3a29c..bb2f778 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -89,7 +89,7 @@ extern struct vmm_ops vmm_ops_amd; struct vm *vm_create(const char *name); void vm_destroy(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index db2f9b8..06109b1 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -315,20 +315,63 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) VM_PROT_NONE, spok)); } +/* + * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + */ +static boolean_t +vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +{ + int i; + vm_paddr_t gpabase, gpalimit; + + if (gpa & PAGE_MASK) + panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (FALSE); + } + + return (TRUE); +} + int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { - int error; - vm_paddr_t hpa; + int error, available, allocated; + vm_paddr_t g, hpa; const boolean_t spok = TRUE; /* superpage mappings are ok */ + + if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + return (EINVAL); + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_gpa_available(vm, g)) + available++; + else + allocated++; + + g += PAGE_SIZE; + } + /* - * find the hpa if already it was already vm_malloc'd. + * If there are some allocated and some available pages in the address + * range then it is an error. */ - hpa = vm_gpa2hpa(vm, gpa, len); - if (hpa != ((vm_paddr_t)-1)) - goto out; + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. 
+ */ + if (allocated && available == 0) + return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); @@ -350,8 +393,7 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) vm->mem_segs[vm->num_mem_segs].hpa = hpa; vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; -out: - *ret_hpa = hpa; + return (0); } diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 686ddec..b504e6b 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -295,7 +295,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; case VM_MAP_MEMORY: seg = (struct vm_memory_segment *)data; - error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + error = vm_malloc(sc->vm, seg->gpa, seg->len); break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; -- cgit v1.1 From 3e50e0220bcda77b0a8e06a5f6095a206368e01b Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 3 Oct 2012 00:46:30 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. Rewrite vm_gpa2hpa() to get the GPA to HPA mapping by querying the nested page tables. --- sys/amd64/include/vmm.h | 11 +++-- sys/amd64/vmm/amd/amdv.c | 15 ++++-- sys/amd64/vmm/intel/ept.c | 116 +++++++++++++++++++++++++++++++++++++++------- sys/amd64/vmm/intel/ept.h | 3 +- sys/amd64/vmm/intel/vmx.c | 3 +- sys/amd64/vmm/vmm.c | 31 ++++++------- 6 files changed, 134 insertions(+), 45 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index bb2f778..be22eec 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -47,9 +47,11 @@ typedef int (*vmm_cleanup_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, - size_t length, vm_memattr_t attr, - int prot, boolean_t superpages_ok); +typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, + vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, + boolean_t superpages_ok); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -72,7 +74,8 @@ struct vmm_ops { vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_func_t vmmmap; + vmi_mmap_set_func_t vmmmap_set; + vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 674337d..b50f972 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -78,11 +78,19 @@ amdv_vmcleanup(void *arg) } static int -amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, +amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, int prot, boolean_t spok) { - printf("amdv_vmmmap: not implemented\n"); + printf("amdv_vmmmap_set: not implemented\n"); + return (EINVAL); +} + +static vm_paddr_t +amdv_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + + printf("amdv_vmmmap_get: not implemented\n"); return (EINVAL); } @@ -157,7 +165,8 @@ struct vmm_ops vmm_ops_amd = { amdv_vminit, amdv_vmrun, amdv_vmcleanup, - amdv_vmmmap, + 
amdv_vmmmap_set, + amdv_vmmmap_get, amdv_getreg, amdv_setreg, amdv_getdesc, diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c index c9fca9d..4f91601 100644 --- a/sys/amd64/vmm/intel/ept.c +++ b/sys/amd64/vmm/intel/ept.c @@ -115,6 +115,40 @@ ept_init(void) return (0); } +#if 0 +static void +ept_dump(uint64_t *ptp, int nlevels) +{ + int i, t, tabs; + uint64_t *ptpnext, ptpval; + + if (--nlevels < 0) + return; + + tabs = 3 - nlevels; + for (t = 0; t < tabs; t++) + printf("\t"); + printf("PTP = %p\n", ptp); + + for (i = 0; i < 512; i++) { + ptpval = ptp[i]; + + if (ptpval == 0) + continue; + + for (t = 0; t < tabs; t++) + printf("\t"); + printf("%3d 0x%016lx\n", i, ptpval); + + if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { + ptpnext = (uint64_t *) + PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + ept_dump(ptpnext, nlevels); + } + } +} +#endif + static size_t ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, vm_prot_t prot, boolean_t spok) @@ -179,29 +213,64 @@ ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, "mismatch\n", gpa, ptpshift); } - /* Do the mapping */ - ptp[ptpindex] = hpa; + if (prot != VM_PROT_NONE) { + /* Do the mapping */ + ptp[ptpindex] = hpa; - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; - /* - * XXX should we enforce this memory type by setting the ignore PAT - * bit to 1. - */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + /* + * XXX should we enforce this memory type by setting the + * ignore PAT bit to 1. 
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + } else { + /* Remove the mapping */ + ptp[ptpindex] = 0; + } return (1UL << ptpshift); } +static vm_paddr_t +ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) +{ + int nlevels, ptpshift, ptpindex; + uint64_t ptpval, hpabase, pgmask; + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + ptpval = ptp[ptpindex]; + + /* Cannot make progress beyond this point */ + if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) + break; + + if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { + pgmask = (1UL << ptpshift) - 1; + hpabase = ptpval & ~pgmask; + return (hpabase | (gpa & pgmask)); + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + } + + return ((vm_paddr_t)-1); +} + static void ept_free_pt_entry(pt_entry_t pte) { @@ -276,8 +345,8 @@ ept_vmcleanup(struct vmx *vmx) } int -ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) { size_t n; struct vmx *vmx = arg; @@ -293,6 +362,17 @@ ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, return (0); } +vm_paddr_t +ept_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + vm_paddr_t hpa; + struct vmx *vmx; + + vmx = arg; + hpa = ept_lookup_mapping(vmx->pml4ept, gpa); + return (hpa); +} + static void invept_single_context(void *arg) { diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h index 013c330..2d7258d 100644 --- a/sys/amd64/vmm/intel/ept.h +++ b/sys/amd64/vmm/intel/ept.h @@ -35,8 +35,9 @@ struct vmx; #define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) int ept_init(void); -int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, +int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); void ept_invalidate_mappings(u_long ept_pml4); void ept_vmcleanup(struct vmx *vmx); #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index a2c8e76..3fbe5a1 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1813,7 +1813,8 @@ struct vmm_ops vmm_ops_intel = { vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap, + ept_vmmmap_set, + ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 06109b1..62bb753 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -115,8 +115,12 @@ static struct vmm_ops *ops; #define VMRUN(vmi, vcpu, rip) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? \ + (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ + ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -302,8 +306,8 @@ vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); } int @@ -311,8 +315,8 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_NONE, spok)); + return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, + VM_PROT_NONE, spok)); } /* @@ -380,8 +384,8 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (hpa == 0) return (ENOMEM); - error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, - VM_PROT_ALL, spok); + error = VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); if (error) { vmm_mem_free(hpa, len); return (error); @@ -400,17 +404,8 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { - int i; - vm_paddr_t gpabase, gpalimit, hpabase; - for (i = 0; i < vm->num_mem_segs; i++) { - hpabase = vm->mem_segs[i].hpa; - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa + len <= gpalimit) - return ((gpa - gpabase) + hpabase); - } - return ((vm_paddr_t)-1); + return (VMMMAP_GET(vm->cookie, gpa)); } int -- cgit v1.1 From 77ab4804ac42198ff996def6bc2d7acc841626a5 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 3 Oct 2012 01:18:51 +0000 Subject: Get rid of assumptions in the hypervisor that the host physical memory associated with guest physical memory is contiguous. Add check to vm_gpa2hpa() that the range indicated by [gpa,gpa+len) is all contained within a single 4KB page. 
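The bound added to vm_gpa2hpa() by this commit is plain page arithmetic. The following standalone userland sketch (not the committed kernel code; the function name, the macro definition and the assert-based harness are assumptions for illustration) shows the same single-page check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define rounddown(x, y)	(((x) / (y)) * (y))	/* mirrors the sys/param.h macro */

/*
 * Return 1 if the byte range [gpa, gpa + len) fits inside a single
 * 4KB page, 0 otherwise.  The kernel change panics on violation; the
 * arithmetic is the same.
 */
static int
range_in_one_page(uint64_t gpa, uint64_t len)
{
	uint64_t nextpage;

	nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
	return (len <= nextpage - gpa);
}

int
main(void)
{
	assert(range_in_one_page(0x1000, 4096));	/* exactly one page */
	assert(range_in_one_page(0x1ff8, 8));		/* ends on the page boundary */
	assert(!range_in_one_page(0x1ff8, 9));		/* spills into the next page */
	printf("single-page range checks passed\n");
	return (0);
}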
--- sys/amd64/vmm/vmm.c | 5 +++++ sys/amd64/vmm/vmm_instruction_emul.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 62bb753..3dabbd6 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -404,6 +404,11 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { + vm_paddr_t nextpage; + + nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); return (VMMMAP_GET(vm->cookie, gpa)); } diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 66af72c..7ef4dbb 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -133,7 +133,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { int n, err; - uint64_t hpa, gpa, gpaend; + uint64_t hpa, gpa, gpaend, off; /* * XXX cache previously fetched instructions using 'rip' as the tag @@ -150,7 +150,8 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, if (err) break; - n = min(inst_length - vie->num_valid, gpaend - gpa); + off = gpa & PAGE_MASK; + n = min(inst_length - vie->num_valid, PAGE_SIZE - off); hpa = vm_gpa2hpa(vm, gpa, n); if (hpa == -1) -- cgit v1.1 From 18dd2c0d511c600e708ac8f756e8e51151b43656 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 4 Oct 2012 02:27:14 +0000 Subject: Change vm_malloc() to map pages in the guest physical address space in 4KB chunks. This breaks the assumption that the entire memory segment is contiguously allocated in the host physical address space. This also paves the way to satisfy the 4KB page allocations by requesting free pages from the VM subsystem as opposed to hard-partitioning host memory at boot time. 
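To make the page-at-a-time strategy described above concrete, here is a minimal userland model (all names — toy_vm_malloc, gpa_map, MAX_PAGES — are invented for illustration, and calloc stands in for vmm_mem_alloc(); this is not the committed code): allocate and map one 4KB page per iteration, and unwind on failure the way vm_free_mem_seg() does.

#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define MAX_PAGES	1024

/* Toy "nested page table": guest page index -> host backing page. */
static void *gpa_map[MAX_PAGES];

/*
 * Back the guest range [gpa, gpa + len) one 4KB page at a time so the
 * host pages need not be contiguous.  On failure, free whatever was
 * mapped so far before returning.
 */
static int
toy_vm_malloc(uint64_t gpa, size_t len)
{
	size_t done;
	void *page;

	if ((gpa % PAGE_SIZE) != 0 || (len % PAGE_SIZE) != 0 || len == 0)
		return (-1);
	if ((gpa + len) / PAGE_SIZE > MAX_PAGES)
		return (-1);

	for (done = 0; done < len; done += PAGE_SIZE) {
		page = calloc(1, PAGE_SIZE);	/* stand-in for vmm_mem_alloc() */
		if (page == NULL)
			goto rollback;
		gpa_map[(gpa + done) / PAGE_SIZE] = page;
	}
	return (0);

rollback:
	while (done > 0) {
		done -= PAGE_SIZE;
		free(gpa_map[(gpa + done) / PAGE_SIZE]);
		gpa_map[(gpa + done) / PAGE_SIZE] = NULL;
	}
	return (-1);
}

int
main(void)
{
	return (toy_vm_malloc(0x100000, 16 * PAGE_SIZE) == 0 ? 0 : 1);
}

The kernel version additionally updates the per-page EPT and IOMMU mappings on each iteration, which the toy model omits.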
--- sys/amd64/include/vmm_dev.h | 1 - sys/amd64/vmm/io/ppt.c | 1 - sys/amd64/vmm/vmm.c | 61 +++++++++++++++++++++++++++++++++++---------- sys/amd64/vmm/vmm_dev.c | 2 +- sys/amd64/vmm/vmm_mem.c | 8 +++--- 5 files changed, 53 insertions(+), 20 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index fc64fd8..42ad236 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -35,7 +35,6 @@ void vmmdev_cleanup(void); #endif struct vm_memory_segment { - vm_paddr_t hpa; /* out */ vm_paddr_t gpa; /* in */ size_t len; /* in */ }; diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index ace2877..e81fdbc 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -356,7 +356,6 @@ ppt_map_mmio(struct vm *vm, int bus, int slot, int func, if (error == 0) { seg->gpa = gpa; seg->len = len; - seg->hpa = hpa; } return (error); } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 3dabbd6..7bd3f7f 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -275,6 +275,28 @@ vm_create(const char *name) return (vm); } +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + vm_paddr_t hpa; + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (vm_paddr_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; + } + + bzero(seg, sizeof(struct vm_memory_segment)); +} + void vm_destroy(struct vm *vm) { @@ -283,7 +305,9 @@ vm_destroy(struct vm *vm) ppt_unassign_all(vm); for (i = 0; i < vm->num_mem_segs; i++) - vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + vm_free_mem_seg(vm, &vm->mem_segs[i]); + + vm->num_mem_segs = 0; for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(&vm->vcpu[i]); @@ -345,6 +369,7 @@ int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { int error, available, allocated; + struct vm_memory_segment *seg; vm_paddr_t g, hpa; const boolean_t spok = TRUE; /* superpage mappings are ok */ @@ -380,22 +405,32 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); - hpa = vmm_mem_alloc(len); - if (hpa == 0) - return (ENOMEM); + seg = &vm->mem_segs[vm->num_mem_segs]; - error = VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, - VM_PROT_ALL, spok); - if (error) { - vmm_mem_free(hpa, len); - return (error); + seg->gpa = gpa; + seg->len = 0; + while (seg->len < len) { + hpa = vmm_mem_alloc(PAGE_SIZE); + if (hpa == 0) { + error = ENOMEM; + break; + } + + error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, + VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); + if (error) + break; + + iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); + + seg->len += PAGE_SIZE; } - iommu_create_mapping(vm->iommu, gpa, hpa, len); + if (seg->len != len) { + vm_free_mem_seg(vm, seg); + return (error); + } - vm->mem_segs[vm->num_mem_segs].gpa = gpa; - vm->mem_segs[vm->num_mem_segs].hpa = hpa; - vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; return (0); diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index b504e6b..91edbe8 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -299,7 +299,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; - seg->hpa = seg->len = 0; + seg->len = 0; 
(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); error = 0; break; diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 54f98ac..eb05b9d 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -318,9 +318,9 @@ vmm_mem_alloc(size_t size) int i; vm_paddr_t addr; - if ((size & PDRMASK) != 0) { + if ((size & PAGE_MASK) != 0) { panic("vmm_mem_alloc: size 0x%0lx must be " - "aligned on a 0x%0x boundary\n", size, NBPDR); + "aligned on a 0x%0x boundary\n", size, PAGE_SIZE); } addr = 0; @@ -373,9 +373,9 @@ vmm_mem_free(vm_paddr_t base, size_t length) { int i; - if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + if ((base & PAGE_MASK) != 0 || (length & PAGE_MASK) != 0) { panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " - "aligned on a 0x%0x boundary\n", base, length, NBPDR); + "aligned on a 0x%0x boundary\n", base, length, PAGE_SIZE); } mtx_lock(&vmm_mem_mtx); -- cgit v1.1 From ca6e3cf9305492be70c87be05119c96a49cbecf9 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 8 Oct 2012 23:41:26 +0000 Subject: Allocate memory pages for the guest from the host's free page queue. It is no longer necessary to hard-partition the memory between the host and guests at boot time. --- sys/amd64/vmm/amd/amdv.c | 17 ++ sys/amd64/vmm/intel/vtd.c | 50 +++++- sys/amd64/vmm/io/iommu.c | 51 +++++- sys/amd64/vmm/io/iommu.h | 8 + sys/amd64/vmm/vmm.c | 33 +++- sys/amd64/vmm/vmm_dev.c | 18 --- sys/amd64/vmm/vmm_mem.c | 386 +++++----------------------------------------- sys/amd64/vmm/vmm_mem.h | 4 - 8 files changed, 193 insertions(+), 374 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index b50f972..020743f 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -230,6 +230,14 @@ amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, return (0); } +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + static void amd_iommu_add_device(void *domain, int bus, int slot, int func) { @@ -244,6 +252,13 @@ amd_iommu_remove_device(void *domain, int bus, int slot, int func) printf("amd_iommu_remove_device: not implemented\n"); } +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + struct iommu_ops iommu_ops_amd = { amd_iommu_init, amd_iommu_cleanup, @@ -252,6 +267,8 @@ struct iommu_ops iommu_ops_amd = { amd_iommu_create_domain, amd_iommu_destroy_domain, amd_iommu_create_mapping, + amd_iommu_remove_mapping, amd_iommu_add_device, amd_iommu_remove_device, + amd_iommu_invalidate_tlb, }; diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index 24495a9..ef0e9bc 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -444,8 +444,12 @@ vtd_remove_device(void *arg, int bus, int slot, int func) } } +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + static uint64_t -vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) { struct domain *dom; int i, spshift, ptpshift, ptpindex, nlevels; @@ -513,16 +517,50 @@ vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); /* - * Create a 'gpa' -> 'hpa' mapping + * Update the 'gpa' -> 'hpa' mapping */ - ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + if (remove) { + 
ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; - if (nlevels > 0) - ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } return (1UL << ptpshift); } +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + static void * vtd_create_domain(vm_paddr_t maxaddr) { @@ -632,6 +670,8 @@ struct iommu_ops iommu_ops_intel = { vtd_create_domain, vtd_destroy_domain, vtd_create_mapping, + vtd_remove_mapping, vtd_add_device, vtd_remove_device, + vtd_invalidate_tlb, }; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c index baf2447..c8447cc 100644 --- a/sys/amd64/vmm/io/iommu.c +++ b/sys/amd64/vmm/io/iommu.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include "vmm_util.h" +#include "vmm_mem.h" #include "iommu.h" static boolean_t iommu_avail; @@ -90,6 +91,16 @@ IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) return (len); /* XXX */ } +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + static __inline void IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) { @@ -107,6 +118,14 @@ IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) } static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void IOMMU_ENABLE(void) { @@ -146,13 +165,13 @@ iommu_init(void) /* * Create a domain for the devices owned by the host */ - maxaddr = ptoa(Maxmem); + maxaddr = vmm_mem_maxaddr(); host_domain = IOMMU_CREATE_DOMAIN(maxaddr); if (host_domain == NULL) panic("iommu_init: unable to create a host domain"); /* - * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to * the host */ iommu_create_mapping(host_domain, 0, 0, maxaddr); @@ -216,6 +235,27 @@ iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) } void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void iommu_add_device(void *dom, int bus, int slot, int func) { @@ -228,3 +268,10 @@ iommu_remove_device(void *dom, int bus, int slot, int func) IOMMU_REMOVE_DEVICE(dom, bus, slot, func); } + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h index e4f7229..d5c1d6e 100644 --- a/sys/amd64/vmm/io/iommu.h +++ b/sys/amd64/vmm/io/iommu.h @@ -37,8 +37,11 @@ typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); typedef void (*iommu_destroy_domain_t)(void *domain); typedef 
uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); +typedef void (*iommu_invalidate_tlb_t)(void *dom); struct iommu_ops { iommu_init_func_t init; /* module wide */ @@ -49,8 +52,10 @@ struct iommu_ops { iommu_create_domain_t create_domain; /* domain-specific */ iommu_destroy_domain_t destroy_domain; iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; iommu_add_device_t add_device; iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; }; extern struct iommu_ops iommu_ops_intel; @@ -58,10 +63,13 @@ extern struct iommu_ops iommu_ops_amd; void iommu_init(void); void iommu_cleanup(void); +void *iommu_host_domain(void); void *iommu_create_domain(vm_paddr_t maxaddr); void iommu_destroy_domain(void *dom); void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); void iommu_add_device(void *dom, int bus, int slot, int func); void iommu_remove_device(void *dom, int bus, int slot, int func); +void iommu_invalidate_tlb(void *domain); #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 7bd3f7f..bcd322a 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -280,6 +280,9 @@ vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) { size_t len; vm_paddr_t hpa; + void *host_domain; + + host_domain = iommu_host_domain(); len = 0; while (len < seg->len) { @@ -289,11 +292,24 @@ vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) "associated with gpa 0x%016lx", seg->gpa + len); } + /* + * Remove the 'gpa' to 'hpa' mapping in VMs domain. + * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. + */ + iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); + iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); + vmm_mem_free(hpa, PAGE_SIZE); len += PAGE_SIZE; } + /* + * Invalidate cached translations associated with 'vm->iommu' since + * we have now moved some pages from it. + */ + iommu_invalidate_tlb(vm->iommu); + bzero(seg, sizeof(struct vm_memory_segment)); } @@ -371,6 +387,7 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) int error, available, allocated; struct vm_memory_segment *seg; vm_paddr_t g, hpa; + void *host_domain; const boolean_t spok = TRUE; /* superpage mappings are ok */ @@ -405,8 +422,11 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); + host_domain = iommu_host_domain(); + seg = &vm->mem_segs[vm->num_mem_segs]; + error = 0; seg->gpa = gpa; seg->len = 0; while (seg->len < len) { @@ -421,16 +441,27 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) if (error) break; + /* + * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. + * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. + */ + iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); seg->len += PAGE_SIZE; } - if (seg->len != len) { + if (error) { vm_free_mem_seg(vm, seg); return (error); } + /* + * Invalidate cached translations associated with 'host_domain' since + * we have now moved some pages from it. 
+ */ + iommu_invalidate_tlb(host_domain); + vm->num_mem_segs++; return (0); diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 91edbe8..66f5184 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -471,24 +471,6 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); -static int -sysctl_vmm_mem_total(SYSCTL_HANDLER_ARGS) -{ - size_t val = vmm_mem_get_mem_total(); - return sysctl_handle_long(oidp, &val, 0, req); -} -SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_total, CTLTYPE_LONG | CTLFLAG_RD, - 0, 0, sysctl_vmm_mem_total, "LU", "Amount of Total memory"); - -static int -sysctl_vmm_mem_free(SYSCTL_HANDLER_ARGS) -{ - size_t val = vmm_mem_get_mem_free(); - return sysctl_handle_long(oidp, &val, 0, req); -} -SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_free, CTLTYPE_LONG | CTLFLAG_RD, - 0, 0, sysctl_vmm_mem_free, "LU", "Amount of Free memory"); - void vmmdev_init(void) { diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index eb05b9d..8745339 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -36,9 +36,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#include +#include #include #include @@ -49,265 +52,21 @@ __FBSDID("$FreeBSD$"); #include "vmm_util.h" #include "vmm_mem.h" -static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); +SYSCTL_DECL(_hw_vmm); -#define MB (1024 * 1024) -#define GB (1024 * MB) - -#define VMM_MEM_MAXSEGS 64 - -/* protected by vmm_mem_mtx */ -static struct { - vm_paddr_t base; - vm_size_t length; -} vmm_mem_avail[VMM_MEM_MAXSEGS]; - -static int vmm_mem_nsegs; -size_t vmm_mem_total_bytes; - -static vm_paddr_t maxaddr; - -static struct mtx vmm_mem_mtx; - -/* - * Steal any memory that was deliberately hidden from FreeBSD either by - * the use of MAXMEM kernel config option or the hw.physmem loader tunable. - */ -static int -vmm_mem_steal_memory(void) -{ - int nsegs; - caddr_t kmdp; - uint32_t smapsize; - uint64_t base, length; - struct bios_smap *smapbase, *smap, *smapend; - - /* - * Borrowed from hammer_time() and getmemsize() in machdep.c - */ - kmdp = preload_search_by_type("elf kernel"); - if (kmdp == NULL) - kmdp = preload_search_by_type("elf64 kernel"); - - smapbase = (struct bios_smap *)preload_search_info(kmdp, - MODINFO_METADATA | MODINFOMD_SMAP); - if (smapbase == NULL) - panic("No BIOS smap info from loader!"); - - smapsize = *((uint32_t *)smapbase - 1); - smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); - - vmm_mem_total_bytes = 0; - nsegs = 0; - for (smap = smapbase; smap < smapend; smap++) { - /* - * XXX - * Assuming non-overlapping, monotonically increasing - * memory segments. - */ - if (smap->type != SMAP_TYPE_MEMORY) - continue; - if (smap->length == 0) - break; - - base = roundup(smap->base, NBPDR); - length = rounddown(smap->length, NBPDR); - - /* Skip this segment if FreeBSD is using all of it. */ - if (base + length <= ptoa(Maxmem)) - continue; - - /* - * If FreeBSD is using part of this segment then adjust - * 'base' and 'length' accordingly. 
- */ - if (base < ptoa(Maxmem)) { - uint64_t used; - used = roundup(ptoa(Maxmem), NBPDR) - base; - base += used; - length -= used; - } - - if (length == 0) - continue; - - vmm_mem_avail[nsegs].base = base; - vmm_mem_avail[nsegs].length = length; - vmm_mem_total_bytes += length; - - if (base + length > maxaddr) - maxaddr = base + length; - - if (0 && bootverbose) { - printf("vmm_mem_populate: index %d, base 0x%0lx, " - "length %ld\n", - nsegs, vmm_mem_avail[nsegs].base, - vmm_mem_avail[nsegs].length); - } - - nsegs++; - if (nsegs >= VMM_MEM_MAXSEGS) { - printf("vmm_mem_populate: maximum number of vmm memory " - "segments reached!\n"); - return (ENOSPC); - } - } - - vmm_mem_nsegs = nsegs; - - return (0); -} +static u_long pages_allocated; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD, + &pages_allocated, 0, "4KB pages allocated"); static void -vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +update_pages_allocated(int howmany) { - vm_paddr_t addr, remaining; - int pdpi, pdi, superpage_size; - pml4_entry_t *pml4p; - pdp_entry_t *pdp; - pd_entry_t *pd; - uint64_t page_attr_bits; - - if (end >= NBPML4) - panic("Cannot map memory beyond %ldGB", NBPML4 / GB); - - if (vmm_supports_1G_pages()) - superpage_size = NBPDP; - else - superpage_size = NBPDR; - - /* - * Get the page directory pointer page that contains the direct - * map address mappings. - */ - pml4p = kernel_pmap->pm_pml4; - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); - - page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; - addr = start; - while (addr < end) { - remaining = end - addr; - pdpi = addr / NBPDP; - if (superpage_size == NBPDP && - remaining >= NBPDP && - addr % NBPDP == 0) { - /* - * If there isn't a mapping for this address then - * create one but if there is one already make sure - * it matches what we expect it to be. - */ - if (pdp[pdpi] == 0) { - pdp[pdpi] = addr | page_attr_bits; - if (0 && bootverbose) { - printf("vmm_mem_populate: mapping " - "0x%lx with 1GB page at " - "pdpi %d\n", addr, pdpi); - } - } else { - pdp_entry_t pdpe = pdp[pdpi]; - if ((pdpe & ~PAGE_MASK) != addr || - (pdpe & page_attr_bits) != page_attr_bits) { - panic("An invalid mapping 0x%016lx " - "already exists for 0x%016lx\n", - pdpe, addr); - } - } - addr += NBPDP; - } else { - if (remaining < NBPDR) { - panic("vmm_mem_populate: remaining (%ld) must " - "be greater than NBPDR (%d)\n", - remaining, NBPDR); - } - if (pdp[pdpi] == 0) { - /* - * XXX we lose this memory forever because - * we do not keep track of the virtual address - * that would be required to free this page. - */ - pd = malloc(PAGE_SIZE, M_VMM_MEM, - M_WAITOK | M_ZERO); - if ((uintptr_t)pd & PAGE_MASK) { - panic("vmm_mem_populate: page directory" - "page not aligned on %d " - "boundary\n", PAGE_SIZE); - } - pdp[pdpi] = vtophys(pd); - pdp[pdpi] |= PG_RW | PG_V | PG_U; - if (0 && bootverbose) { - printf("Creating page directory " - "at pdp index %d for 0x%016lx\n", - pdpi, addr); - } - } - pdi = (addr % NBPDP) / NBPDR; - pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); - - /* - * Create a new mapping if one doesn't already exist - * or validate it if it does. 
- */ - if (pd[pdi] == 0) { - pd[pdi] = addr | page_attr_bits; - if (0 && bootverbose) { - printf("vmm_mem_populate: mapping " - "0x%lx with 2MB page at " - "pdpi %d, pdi %d\n", - addr, pdpi, pdi); - } - } else { - pd_entry_t pde = pd[pdi]; - if ((pde & ~PAGE_MASK) != addr || - (pde & page_attr_bits) != page_attr_bits) { - panic("An invalid mapping 0x%016lx " - "already exists for 0x%016lx\n", - pde, addr); - } - } - addr += NBPDR; - } - } -} - -static int -vmm_mem_populate(void) -{ - int seg, error; - vm_paddr_t start, end; - - /* populate the vmm_mem_avail[] array */ - error = vmm_mem_steal_memory(); - if (error) - return (error); - - /* - * Now map the memory that was hidden from FreeBSD in - * the direct map VA space. - */ - for (seg = 0; seg < vmm_mem_nsegs; seg++) { - start = vmm_mem_avail[seg].base; - end = start + vmm_mem_avail[seg].length; - if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { - panic("start (0x%016lx) and end (0x%016lx) must be " - "aligned on a %dMB boundary\n", - start, end, NBPDR / MB); - } - vmm_mem_direct_map(start, end); - } - - return (0); + pages_allocated += howmany; /* XXX locking? */ } int vmm_mem_init(void) { - int error; - - mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); - - error = vmm_mem_populate(); - if (error) - return (error); return (0); } @@ -315,122 +74,61 @@ vmm_mem_init(void) vm_paddr_t vmm_mem_alloc(size_t size) { - int i; - vm_paddr_t addr; + int flags; + vm_page_t m; + vm_paddr_t pa; - if ((size & PAGE_MASK) != 0) { - panic("vmm_mem_alloc: size 0x%0lx must be " - "aligned on a 0x%0x boundary\n", size, PAGE_SIZE); - } + if (size != PAGE_SIZE) + panic("vmm_mem_alloc: invalid allocation size %lu", size); - addr = 0; + flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO; - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - if (vmm_mem_avail[i].length >= size) { - addr = vmm_mem_avail[i].base; - vmm_mem_avail[i].base += size; - vmm_mem_avail[i].length -= size; - /* remove a zero length segment */ - if (vmm_mem_avail[i].length == 0) { - memmove(&vmm_mem_avail[i], - &vmm_mem_avail[i + 1], - (vmm_mem_nsegs - (i + 1)) * - sizeof(vmm_mem_avail[0])); - vmm_mem_nsegs--; - } + while (1) { + /* + * XXX need policy to determine when to back off the allocation + */ + m = vm_page_alloc(NULL, 0, flags); + if (m == NULL) + VM_WAIT; + else break; - } } - mtx_unlock(&vmm_mem_mtx); - - return (addr); -} - -size_t -vmm_mem_get_mem_total(void) -{ - return vmm_mem_total_bytes; -} -size_t -vmm_mem_get_mem_free(void) -{ - size_t length = 0; - int i; + pa = VM_PAGE_TO_PHYS(m); + + if ((m->flags & PG_ZERO) == 0) + pagezero((void *)PHYS_TO_DMAP(pa)); - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - length += vmm_mem_avail[i].length; - } - mtx_unlock(&vmm_mem_mtx); + update_pages_allocated(1); - return(length); + return (pa); } void vmm_mem_free(vm_paddr_t base, size_t length) { - int i; + vm_page_t m; - if ((base & PAGE_MASK) != 0 || (length & PAGE_MASK) != 0) { - panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " - "aligned on a 0x%0x boundary\n", base, length, PAGE_SIZE); + if (base & PAGE_MASK) { + panic("vmm_mem_free: base 0x%0lx must be aligned on a " + "0x%0x boundary\n", base, PAGE_SIZE); } - mtx_lock(&vmm_mem_mtx); - - for (i = 0; i < vmm_mem_nsegs; i++) { - if (vmm_mem_avail[i].base > base) - break; - } - - if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) - panic("vmm_mem_free: cannot free any more segments"); - - /* Create a new segment at index 'i' */ - memmove(&vmm_mem_avail[i + 1], 
&vmm_mem_avail[i], - (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); - - vmm_mem_avail[i].base = base; - vmm_mem_avail[i].length = length; + if (length != PAGE_SIZE) + panic("vmm_mem_free: invalid length %lu", length); - vmm_mem_nsegs++; + m = PHYS_TO_VM_PAGE(base); + m->wire_count--; + vm_page_free(m); + atomic_subtract_int(&cnt.v_wire_count, 1); -coalesce_some_more: - for (i = 0; i < vmm_mem_nsegs - 1; i++) { - if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == - vmm_mem_avail[i + 1].base) { - vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; - memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], - (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); - vmm_mem_nsegs--; - goto coalesce_some_more; - } - } - - mtx_unlock(&vmm_mem_mtx); + update_pages_allocated(-1); } vm_paddr_t vmm_mem_maxaddr(void) { - return (maxaddr); -} - -void -vmm_mem_dump(void) -{ - int i; - vm_paddr_t base; - vm_size_t length; - - mtx_lock(&vmm_mem_mtx); - for (i = 0; i < vmm_mem_nsegs; i++) { - base = vmm_mem_avail[i].base; - length = vmm_mem_avail[i].length; - printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); - } - mtx_unlock(&vmm_mem_mtx); + return (ptoa(Maxmem)); } diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index a83e9be..7d45c74 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -33,9 +33,5 @@ int vmm_mem_init(void); vm_paddr_t vmm_mem_alloc(size_t size); void vmm_mem_free(vm_paddr_t start, size_t size); vm_paddr_t vmm_mem_maxaddr(void); -void vmm_mem_dump(void); - -size_t vmm_mem_get_mem_total(void); -size_t vmm_mem_get_mem_free(void); #endif -- cgit v1.1 From d09cf38e2541aaf20c762b633d757f620e4ae745 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 11 Oct 2012 19:28:07 +0000 Subject: Deliver the MSI to the correct guest virtual cpu. Prior to this change the MSI was being delivered unconditionally to vcpu 0 regardless of how the guest programmed the MSI delivery. --- sys/amd64/vmm/io/ppt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index e81fdbc..d6fef9a 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -71,8 +71,6 @@ static struct pptdev { struct vm_memory_segment mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ - int vector; - int vcpu; int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; @@ -478,8 +476,6 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, } } - ppt->msi.vector = vector; - ppt->msi.vcpu = destcpu; ppt->msi.startrid = startrid; /* @@ -497,6 +493,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].vec = vector + i; + ppt->msi.arg[i].vcpu = destcpu; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, -- cgit v1.1 From 97c20149fa1e35f2bcc34d8b7058467aa8d51d80 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 11 Oct 2012 19:39:54 +0000 Subject: Fix warnings generated by 'debug.witness.watch' during VM creation and destruction for calling malloc() with M_WAITOK while holding a mutex. Do not allow vmm.ko to be unloaded until all virtual machines are destroyed. 
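The witness fix in this commit follows a standard pattern: perform the sleeping allocation with no locks held, then re-check for a racing duplicate once the lock is taken and back out if someone else won. A hedged userland sketch of that pattern (a pthread mutex stands in for the kernel mtx; the struct and function names are invented for illustration, not the committed code):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct softc {
	char name[32];
	struct softc *next;
};

static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct softc *head;

/* Must be called with list_mtx held. */
static struct softc *
lookup_locked(const char *name)
{
	struct softc *sc;

	for (sc = head; sc != NULL; sc = sc->next)
		if (strcmp(sc->name, name) == 0)
			return (sc);
	return (NULL);
}

static int
create(const char *name)
{
	struct softc *sc;

	/* Allocate while holding no locks (the allocator may sleep). */
	sc = calloc(1, sizeof(*sc));
	if (sc == NULL)
		return (-1);
	strncpy(sc->name, name, sizeof(sc->name) - 1);

	pthread_mutex_lock(&list_mtx);
	if (lookup_locked(name) != NULL) {
		pthread_mutex_unlock(&list_mtx);
		free(sc);	/* lost the race; undo outside the lock */
		return (-1);
	}
	sc->next = head;
	head = sc;
	pthread_mutex_unlock(&list_mtx);
	return (0);
}

int
main(void)
{
	/* Second create of the same name must fail as a duplicate. */
	return (create("beavis") == 0 && create("beavis") != 0 ? 0 : 1);
}

The same commit has MOD_UNLOAD fail with EBUSY while any virtual machine still exists, which is why vmmdev_cleanup() below is reduced to an emptiness check on the softc list.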
--- sys/amd64/include/vmm_dev.h | 2 +- sys/amd64/vmm/vmm.c | 10 +++--- sys/amd64/vmm/vmm_dev.c | 86 +++++++++++++++++++++++++++------------------ 3 files changed, 59 insertions(+), 39 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 42ad236..79f893d 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -31,7 +31,7 @@ #ifdef _KERNEL void vmmdev_init(void); -void vmmdev_cleanup(void); +int vmmdev_cleanup(void); #endif struct vm_memory_segment { diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index bcd322a..019b9a8 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -219,10 +219,12 @@ vmm_handler(module_t mod, int what, void *arg) error = vmm_init(); break; case MOD_UNLOAD: - vmmdev_cleanup(); - iommu_cleanup(); - vmm_ipi_cleanup(); - error = VMM_CLEANUP(); + error = vmmdev_cleanup(); + if (error == 0) { + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + } break; default: error = 0; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 66f5184..1eba226 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -88,18 +88,11 @@ vmmdev_lookup(const char *name) static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { - struct vmmdev_softc *sc; - #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif - SLIST_FOREACH(sc, &head, link) { - if (sc->cdev == cdev) - break; - } - - return (sc); + return (cdev->si_drv1); } static int @@ -114,6 +107,8 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) error = 0; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); + if (sc == NULL) + error = ENXIO; while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; @@ -380,20 +375,25 @@ vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, } static void -vmmdev_destroy(struct vmmdev_softc *sc) +vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink) { -#ifdef notyet /* XXX kernel is not compiled with invariants */ - mtx_assert(&vmmdev_mtx, MA_OWNED); -#endif - /* * XXX must stop virtual machine instances that may be still * running and cleanup their state. 
*/ - SLIST_REMOVE(&head, sc, vmmdev_softc, link); - destroy_dev(sc->cdev); - vm_destroy(sc->vm); + if (sc->cdev) + destroy_dev(sc->cdev); + + if (sc->vm) + vm_destroy(sc->vm); + + if (unlink) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + free(sc, M_VMMDEV); } @@ -409,14 +409,22 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) if (error != 0 || req->newptr == NULL) return (error); + /* + * XXX TODO if any process has this device open then fail + */ + mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL) { mtx_unlock(&vmmdev_mtx); return (EINVAL); } - vmmdev_destroy(sc); + + sc->cdev->si_drv1 = NULL; mtx_unlock(&vmmdev_mtx); + + vmmdev_destroy(sc, TRUE); + return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, @@ -436,7 +444,7 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { int error; struct vm *vm; - struct vmmdev_softc *sc; + struct vmmdev_softc *sc, *sc2; char buf[VM_MAX_NAMELEN]; strlcpy(buf, "beavis", sizeof(buf)); @@ -445,27 +453,37 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) return (error); mtx_lock(&vmmdev_mtx); - sc = vmmdev_lookup(buf); - if (sc != NULL) { - mtx_unlock(&vmmdev_mtx); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) return (EEXIST); - } vm = vm_create(buf); - if (vm == NULL) { - mtx_unlock(&vmmdev_mtx); + if (vm == NULL) return (EINVAL); - } sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) + SLIST_INSERT_HEAD(&head, sc, link); + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc, FALSE); + return (EEXIST); + } + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); sc->cdev->si_drv1 = sc; - SLIST_INSERT_HEAD(&head, sc, link); - mtx_unlock(&vmmdev_mtx); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, @@ -477,15 +495,15 @@ vmmdev_init(void) mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); } -void +int vmmdev_cleanup(void) { - struct vmmdev_softc *sc, *sc2; - - mtx_lock(&vmmdev_mtx); + int error; - SLIST_FOREACH_SAFE(sc, &head, link, sc2) - vmmdev_destroy(sc); + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; - mtx_unlock(&vmmdev_mtx); + return (error); } -- cgit v1.1 From e3e8a520e280f32230da3ddfa4c5260fea0e15a1 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 12 Oct 2012 18:32:44 +0000 Subject: Provide per-vcpu locks instead of relying on a single big lock. This also gets rid of all the witness.watch warnings related to calling malloc(M_WAITOK) while holding a mutex. 
Reviewed by: grehan --- sys/amd64/include/vmm.h | 19 ++++++---- sys/amd64/vmm/intel/vmx.c | 4 +-- sys/amd64/vmm/io/ppt.c | 2 ++ sys/amd64/vmm/vmm.c | 91 +++++++++++++++++++++++++++++++---------------- sys/amd64/vmm/vmm_dev.c | 67 ++++++++++++++++++++++++---------- sys/amd64/vmm/vmm_ipi.c | 24 ++++--------- sys/amd64/vmm/vmm_ipi.h | 3 +- 7 files changed, 134 insertions(+), 76 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index be22eec..4dfdd04 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -130,19 +130,24 @@ int vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); -#define VCPU_STOPPED 0 -#define VCPU_RUNNING 1 -void vm_set_run_state(struct vm *vm, int vcpu, int running); -int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); +enum vcpu_state { + VCPU_IDLE, + VCPU_RUNNING, + VCPU_CANNOT_RUN, +}; -void *vcpu_stats(struct vm *vm, int vcpu); +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu); static int __inline -vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +vcpu_is_running(struct vm *vm, int vcpu) { - return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); + return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING); } +void *vcpu_stats(struct vm *vm, int vcpu); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + #endif /* KERNEL */ #define VM_MAXCPU 8 /* maximum virtual cpus */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 3fbe5a1..6a1dbed 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1568,7 +1568,7 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause * the subsequent vmlaunch/vmresume to fail. */ - if (vcpu_is_running(vmx->vm, vcpu, NULL)) + if (vcpu_is_running(vmx->vm, vcpu)) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); @@ -1596,7 +1596,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause * the subsequent vmlaunch/vmresume to fail. */ - if (vcpu_is_running(vmx->vm, vcpu, NULL)) + if (vcpu_is_running(vmx->vm, vcpu)) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index d6fef9a..3044fc5 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -53,6 +53,8 @@ __FBSDID("$FreeBSD$"); #include "iommu.h" #include "ppt.h" +/* XXX locking */ + #define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 019b9a8..8d8f143 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -65,6 +66,8 @@ struct vlapic; struct vcpu { int flags; + enum vcpu_state state; + struct mtx mtx; int pincpu; /* host cpuid this vcpu is bound to */ int hostcpu; /* host cpuid this vcpu last ran on */ uint64_t guest_msrs[VMM_MSR_NUM]; @@ -76,7 +79,6 @@ struct vcpu { enum x2apic_state x2apic_state; }; #define VCPU_F_PINNED 0x0001 -#define VCPU_F_RUNNING 0x0002 #define VCPU_PINCPU(vm, vcpuid) \ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? 
vm->vcpu[vcpuid].pincpu : -1) @@ -89,6 +91,10 @@ do { \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_DEF) +#define vcpu_lock(v) mtx_lock(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock(&((v)->mtx)) + #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { @@ -162,7 +168,8 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu = &vm->vcpu[vcpu_id]; - vcpu->hostcpu = -1; + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vlapic = vlapic_init(vm, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); @@ -667,11 +674,13 @@ vm_run(struct vm *vm, struct vm_run *vmrun) pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); - vcpu->hostcpu = curcpu; - restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); + + vcpu->hostcpu = curcpu; error = VMRUN(vm->cookie, vcpuid, vmrun->rip); + vcpu->hostcpu = NOCPU; + save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); @@ -787,9 +796,10 @@ vm_iommu_domain(struct vm *vm) return (vm->iommu); } -void -vm_set_run_state(struct vm *vm, int vcpuid, int state) +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state) { + int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) @@ -797,43 +807,42 @@ vm_set_run_state(struct vm *vm, int vcpuid, int state) vcpu = &vm->vcpu[vcpuid]; - if (state == VCPU_RUNNING) { - if (vcpu->flags & VCPU_F_RUNNING) { - panic("vm_set_run_state: %s[%d] is already running", - vm_name(vm), vcpuid); - } - vcpu->flags |= VCPU_F_RUNNING; + vcpu_lock(vcpu); + + /* + * The following state transitions are allowed: + * IDLE -> RUNNING -> IDLE + * IDLE -> CANNOT_RUN -> IDLE + */ + if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) || + (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) { + error = 0; + vcpu->state = state; } else { - if ((vcpu->flags & VCPU_F_RUNNING) == 0) { - panic("vm_set_run_state: %s[%d] is already stopped", - vm_name(vm), vcpuid); - } - vcpu->flags &= ~VCPU_F_RUNNING; + error = EBUSY; } + + vcpu_unlock(vcpu); + + return (error); } -int -vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid) { - int retval, hostcpu; struct vcpu *vcpu; + enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; - if (vcpu->flags & VCPU_F_RUNNING) { - retval = VCPU_RUNNING; - hostcpu = vcpu->hostcpu; - } else { - retval = VCPU_STOPPED; - hostcpu = -1; - } - if (cpuptr) - *cpuptr = hostcpu; + vcpu_lock(vcpu); + state = vcpu->state; + vcpu_unlock(vcpu); - return (retval); + return (state); } void @@ -884,3 +893,25 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) return (0); } + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpuid) +{ + int hostcpu; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + /* + * XXX racy but the worst case is that we'll send an unnecessary IPI + * to the 'hostcpu'. + * + * We cannot use vcpu_is_running() here because it acquires vcpu->mtx + * which is not allowed inside a critical section. 
+ */ + hostcpu = vcpu->hostcpu; + if (hostcpu == NOCPU || hostcpu == curcpu) + return; + + ipi_cpu(hostcpu, vmm_ipinum); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 1eba226..0150ebd 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -88,9 +88,6 @@ vmmdev_lookup(const char *name) static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { -#ifdef notyet /* XXX kernel is not compiled with invariants */ - mtx_assert(&vmmdev_mtx, MA_OWNED); -#endif return (cdev->si_drv1); } @@ -141,7 +138,8 @@ static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { - int error, vcpu; + int error, vcpu, state_changed; + enum vcpu_state new_state; struct vmmdev_softc *sc; struct vm_memory_segment *seg; struct vm_register *vmreg; @@ -160,12 +158,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; - mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); - if (sc == NULL) { - mtx_unlock(&vmmdev_mtx); + if (sc == NULL) return (ENXIO); - } + + vcpu = -1; + state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. @@ -181,6 +179,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: /* * XXX fragile, handle with care @@ -192,11 +191,42 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, goto done; } - if (vcpu_is_running(sc->vm, vcpu, NULL)) { - error = EBUSY; + if (cmd == VM_RUN) + new_state = VCPU_RUNNING; + else + new_state = VCPU_CANNOT_RUN; + + error = vcpu_set_state(sc->vm, vcpu, new_state); + if (error) + goto done; + + state_changed = 1; + break; + + case VM_MAP_PPTDEV_MMIO: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_MEMORY: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = 0; + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); goto done; } + + state_changed = 2; break; + default: break; } @@ -204,14 +234,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; - - vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); - mtx_unlock(&vmmdev_mtx); - error = vm_run(sc->vm, vmrun); - - mtx_lock(&vmmdev_mtx); - vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); break; case VM_STAT_DESC: { const char *desc; @@ -346,9 +369,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = ENOTTY; break; } -done: - mtx_unlock(&vmmdev_mtx); + if (state_changed == 1) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); + } else if (state_changed == 2) { + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); + } + +done: return (error); } diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c index 055f86f..643d326 100644 --- a/sys/amd64/vmm/vmm_ipi.c +++ b/sys/amd64/vmm/vmm_ipi.c @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include "vmm_ipi.h" @@ -48,7 +47,7 @@ extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); /* * The default is to use the IPI_AST to interrupt a vcpu. 
*/ -static int ipinum = IPI_AST; +int vmm_ipinum = IPI_AST; CTASSERT(APIC_SPURIOUS_INT == 255); @@ -73,31 +72,22 @@ vmm_ipi_init(void) ip = &idt[idx]; func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func == (uintptr_t)&IDTVEC(rsvd)) { - ipinum = idx; - setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + vmm_ipinum = idx; + setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT, SEL_KPL, 0); break; } } - if (ipinum != IPI_AST && bootverbose) { + if (vmm_ipinum != IPI_AST && bootverbose) { printf("vmm_ipi_init: installing ipi handler to interrupt " - "vcpus at vector %d\n", ipinum); + "vcpus at vector %d\n", vmm_ipinum); } } void vmm_ipi_cleanup(void) { - if (ipinum != IPI_AST) - setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); -} - -void -vm_interrupt_hostcpu(struct vm *vm, int vcpu) -{ - int hostcpu; - - if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) - ipi_cpu(hostcpu, ipinum); + if (vmm_ipinum != IPI_AST) + setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h index 7ab94bf..91552e3 100644 --- a/sys/amd64/vmm/vmm_ipi.h +++ b/sys/amd64/vmm/vmm_ipi.h @@ -31,8 +31,9 @@ struct vm; +extern int vmm_ipinum; + void vmm_ipi_init(void); void vmm_ipi_cleanup(void); -void vm_interrupt_hostcpu(struct vm *vm, int vcpu); #endif -- cgit v1.1 From 8fb5b5f8de608d18362583be1e90150aab0b4d33 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 12 Oct 2012 23:12:19 +0000 Subject: Add the guest physical address and r/w/x bits to the paging exit in preparation for a rework of bhyve MMIO handling. Reviewed by: neel Obtained from: NetApp --- sys/amd64/include/vmm.h | 2 ++ sys/amd64/vmm/intel/vmx.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 4dfdd04..d0dfb04 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -266,6 +266,8 @@ struct vm_exit { } inout; struct { uint64_t cr3; + uint64_t gpa; + int rwx; } paging; /* * VMX specific payload. Used when there is no "better" diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 6a1dbed..81969ea 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1289,6 +1289,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.rwx = qual & 0x7; } break; default: -- cgit v1.1 From 26dd051c2cb82a04c38681e6726fef1fa8287c0d Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 20 Oct 2012 08:23:05 +0000 Subject: Calculate the number of host ticks until the next guest timer interrupt. This information will be used in conjunction with guest "HLT exiting" to yield the thread hosting the virtual cpu. 
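
As a rough illustration of the calculation this change introduces (the helper below is a stand-in written for this log, not code from the patch): the per-tick decrement corresponds to (VLAPIC_BUS_FREQ / divisor) / hz in vlapic_timer_tick(), and the return value is what a later change in this series passes to a timed sleep before re-entering the guest.

#include <stdint.h>

/*
 * Estimate, in whole host clock ticks, how long until the guest's local
 * APIC timer counts down to zero.  Mirrors the "(ccr / decrement) + 1"
 * expression in the patch; the real code additionally returns 0 when the
 * interrupt fired during this call and -1 when the timer is disabled.
 */
static int
host_ticks_until_guest_timer(uint32_t ccr, uint32_t decrement_per_tick)
{
	if (decrement_per_tick == 0)	/* defensive; hz and the APIC bus */
		decrement_per_tick = 1;	/* frequency keep this nonzero */
	return ((int)(ccr / decrement_per_tick) + 1);
}
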
Obtained from: NetApp --- sys/amd64/vmm/io/vlapic.c | 113 +++++++++++++++++++++++++--------------------- sys/amd64/vmm/io/vlapic.h | 2 +- sys/amd64/vmm/vmm_lapic.c | 4 +- sys/amd64/vmm/vmm_lapic.h | 2 +- 4 files changed, 65 insertions(+), 56 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 1e8a4e8..911ed64 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -121,6 +121,31 @@ struct vlapic { enum boot_state boot_state; }; +#define VLAPIC_BUS_FREQ tsc_freq + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + static void vlapic_mask_lvts(uint32_t *lvts, int num_lvt) { @@ -175,6 +200,7 @@ vlapic_op_reset(void* dev) memset(lapic, 0, sizeof(*lapic)); lapic->apr = vlapic->vcpuid; vlapic_init_ipi(vlapic); + vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ @@ -218,32 +244,6 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector) VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); } -#define VLAPIC_BUS_FREQ tsc_freq -#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) - -static int -vlapic_timer_divisor(uint32_t dcr) -{ - switch (dcr & 0xB) { - case APIC_TDCR_2: - return (2); - case APIC_TDCR_4: - return (4); - case APIC_TDCR_8: - return (8); - case APIC_TDCR_16: - return (16); - case APIC_TDCR_32: - return (32); - case APIC_TDCR_64: - return (64); - case APIC_TDCR_128: - return (128); - default: - panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); - } -} - static void vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) { @@ -755,59 +755,68 @@ vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) return (retval); } -void +int vlapic_timer_tick(struct vlapic *vlapic) { - int curticks, delta, periodic; + int curticks, delta, periodic, fired; uint32_t ccr; - uint32_t decrement, remainder; + uint32_t decrement, leftover; +restart: curticks = ticks; - - /* Common case */ delta = curticks - vlapic->ccr_ticks; - if (delta == 0) - return; /* Local APIC timer is disabled */ if (vlapic->apic.icr_timer == 0) - return; + return (-1); /* One-shot mode and timer has already counted down to zero */ periodic = vlapic_periodic_timer(vlapic); if (!periodic && vlapic->apic.ccr_timer == 0) - return; + return (-1); /* * The 'curticks' and 'ccr_ticks' are out of sync by more than * 2^31 ticks. We deal with this by restarting the timer. */ if (delta < 0) { vlapic_start_timer(vlapic, 0); - return; + goto restart; } - ccr = vlapic->apic.ccr_timer; + fired = 0; decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + + vlapic->ccr_ticks = curticks; + ccr = vlapic->apic.ccr_timer; + while (delta-- > 0) { - if (ccr <= decrement) { - remainder = decrement - ccr; - vlapic_fire_timer(vlapic); - if (periodic) { - vlapic_start_timer(vlapic, remainder); - ccr = vlapic->apic.ccr_timer; - } else { - /* - * One-shot timer has counted down to zero. 
- */ - ccr = 0; - break; - } - } else + if (ccr > decrement) { ccr -= decrement; + continue; + } + + /* Trigger the local apic timer interrupt */ + vlapic_fire_timer(vlapic); + if (periodic) { + leftover = decrement - ccr; + vlapic_start_timer(vlapic, leftover); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + } + fired = 1; + break; } - vlapic->ccr_ticks = curticks; vlapic->apic.ccr_timer = ccr; + + if (!fired) + return ((ccr / decrement) + 1); + else + return (0); } struct vdev_ops vlapic_dev_ops = { diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index f43289d..00de019 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -102,7 +102,7 @@ int vlapic_op_mem_read(void* dev, uint64_t gpa, int vlapic_pending_intr(struct vlapic *vlapic); void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); -void vlapic_timer_tick(struct vlapic *vlapic); +int vlapic_timer_tick(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index ace6010..bb22122 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -106,14 +106,14 @@ lapic_set_intr(struct vm *vm, int cpu, int vector) return (0); } -void +int lapic_timer_tick(struct vm *vm, int cpu) { struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); - vlapic_timer_tick(vlapic); + return (vlapic_timer_tick(vlapic)); } static boolean_t diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index e9ff8fd..59fc016 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -38,7 +38,7 @@ int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); -void lapic_timer_tick(struct vm *vm, int cpu); +int lapic_timer_tick(struct vm *vm, int cpu); /* * Returns a vector between 32 and 255 if an interrupt is pending in the -- cgit v1.1 From a74007510aa98cb51b2d7cc4056a994e3bf64763 Mon Sep 17 00:00:00 2001 From: neel Date: Tue, 23 Oct 2012 02:20:42 +0000 Subject: Test for AST pending with interrupts disabled right before entering the guest. If an IPI was delivered to this cpu before interrupts were disabled then return right away via vmx_setjmp() with a return value of VMX_RETURN_AST. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 65 +++++++++++++++++++++++++------------- sys/amd64/vmm/intel/vmx.h | 2 ++ sys/amd64/vmm/intel/vmx_genassym.c | 7 ++++ sys/amd64/vmm/intel/vmx_support.S | 40 +++++++++++++++++++---- 4 files changed, 86 insertions(+), 28 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 81969ea..11b8c9f 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -290,6 +290,8 @@ vmx_setjmp_rc2str(int rc) return "vmresume"; case VMX_RETURN_VMLAUNCH: return "vmlaunch"; + case VMX_RETURN_AST: + return "ast"; default: return "unknown"; } @@ -798,15 +800,20 @@ vmx_run_trace(struct vmx *vmx, int vcpu) static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, - int handled, int astpending) + int handled) { #ifdef KTR VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); +#endif +} - if (astpending) - VMM_CTR0(vmx->vm, vcpu, "astpending"); +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } @@ -981,19 +988,19 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu) const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; -#if 1 /* - * XXX - * If an event is being injected from userland then just return. - * For e.g. we may inject a breakpoint exception to cause the - * guest to enter the debugger so we can inspect its state. + * If there is already an interrupt pending then just return. + * + * This could happen if an interrupt was injected on a prior + * VM entry but the actual entry into guest mode was aborted + * because of a pending AST. */ error = vmread(VMCS_ENTRY_INTR_INFO, &info); if (error) panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); if (info & VMCS_INTERRUPTION_INFO_VALID) return; -#endif + /* * NMI injection has priority so deal with those first */ @@ -1301,7 +1308,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the - * kernel (for e.g. 'astpending' is set in the run loop). + * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* @@ -1352,6 +1359,7 @@ vmx_run(void *arg, int vcpu, register_t rip) vmxctx = &vmx->ctx[vcpu]; vmxctx->launched = 0; + astpending = 0; vmexit = vm_exitinfo(vmx->vm, vcpu); /* @@ -1395,6 +1403,9 @@ vmx_run(void *arg, int vcpu, register_t rip) break; case VMX_RETURN_LONGJMP: break; /* vm exit */ + case VMX_RETURN_AST: + astpending = 1; + break; case VMX_RETURN_VMRESUME: vie = vmcs_instruction_error(); if (vmxctx->launch_error == VM_FAIL_INVALID || @@ -1417,14 +1428,6 @@ vmx_run(void *arg, int vcpu, register_t rip) panic("vmx_setjmp returned %d", rc); } - /* - * XXX locking? - * See comments in exception.S about checking for ASTs - * atomically while interrupts are disabled. But it is - * not clear that they apply in our case. - */ - astpending = curthread->td_flags & TDF_ASTPENDING; - /* enable interrupts */ enable_intr(); @@ -1434,11 +1437,18 @@ vmx_run(void *arg, int vcpu, register_t rip) vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + if (astpending) { + handled = 1; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmx_astpending_trace(vmx, vcpu, rip); + break; + } + handled = vmx_exit_process(vmx, vcpu, vmexit); + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); - vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, - astpending); - } while (handled && !astpending); + } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS @@ -1646,7 +1656,7 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, int code_valid) { int error; - uint32_t info; + uint64_t info; struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; @@ -1660,6 +1670,17 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 0x6, /* VM_SW_EXCEPTION */ }; + /* + * If there is already an exception pending to be delivered to the + * vcpu then just return. 
+ */ + error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info); + if (error) + return (error); + + if (info & VMCS_INTERRUPTION_INFO_VALID) + return (EAGAIN); + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); info |= VMCS_INTERRUPTION_INFO_VALID; error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 61d72a8..d4c90fa 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -101,12 +101,14 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); #define VMX_RETURN_LONGJMP 1 #define VMX_RETURN_VMRESUME 2 #define VMX_RETURN_VMLAUNCH 3 +#define VMX_RETURN_AST 4 /* * vmx_setjmp() returns: * - 0 when it returns directly * - 1 when it returns from vmx_longjmp * - 2 when it returns from vmx_resume (which would only be in the error case) * - 3 when it returns from vmx_launch (which would only be in the error case) + * - 4 when it returns from vmx_resume or vmx_launch because of AST pending */ int vmx_setjmp(struct vmxctx *ctx); void vmx_longjmp(void); /* returns via vmx_setjmp */ diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c index c5b5bf9..823a05d 100644 --- a/sys/amd64/vmm/intel/vmx_genassym.c +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -80,3 +81,9 @@ ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); +ASSYM(VMX_RETURN_AST, VMX_RETURN_AST); + +ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); +ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); +ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); +ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index 8bdba86..4ba582a 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -31,6 +31,32 @@ #include "vmx_assym.s" /* + * Disable interrupts before updating %rsp in VMX_CHECK_AST or + * VMX_GUEST_RESTORE. + * + * The location that %rsp points to is a 'vmxctx' and not a + * real stack so we don't want an interrupt handler to trash it + */ +#define VMX_DISABLE_INTERRUPTS cli + +/* + * If the thread hosting the vcpu has an ast pending then take care of it + * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. + * + * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts + * are disabled. + */ +#define VMX_CHECK_AST \ + movq PCPU(CURTHREAD),%rax; \ + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: + +/* * Assumes that %rdi holds a pointer to the 'vmxctx'. * * On "return" all registers are updated to reflect guest state. The two @@ -41,12 +67,6 @@ * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ - /* \ - * Disable interrupts before updating %rsp. The location that \ - * %rsp points to is a 'vmxctx' and not a real stack so we \ - * don't want an interrupt handler to trash it. \ - */ \ - cli; \ movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ @@ -169,6 +189,10 @@ END(vmx_longjmp) * through vmx_setjmp() with a return value of 2. 
*/ ENTRY(vmx_resume) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + /* * Restore guest state that is not automatically loaded from the vmcs. */ @@ -197,6 +221,10 @@ END(vmx_resume) * through vmx_setjmp() with a return value of 3. */ ENTRY(vmx_launch) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + /* * Restore guest state that is not automatically loaded from the vmcs. */ -- cgit v1.1 From 583a9ef76d9ec8f3bb8e7927281cfe79fc0c0584 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 24 Oct 2012 02:54:21 +0000 Subject: Maintain state regarding NMI delivery to guest vcpu in VT-x independent manner. Also add a stats counter to count the number of NMIs delivered per vcpu. Obtained from: NetApp --- sys/amd64/include/vmm.h | 4 ++-- sys/amd64/vmm/amd/amdv.c | 9 --------- sys/amd64/vmm/intel/vmx.c | 16 ++-------------- sys/amd64/vmm/intel/vmx.h | 1 - sys/amd64/vmm/vmm.c | 49 +++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 45 insertions(+), 34 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index d0dfb04..8f78b8f 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -63,7 +63,6 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, int type, int vector, uint32_t code, int code_valid); -typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); @@ -81,7 +80,6 @@ struct vmm_ops { vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_inject_event_t vminject; - vmi_inject_nmi_t vmnmi; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; }; @@ -110,6 +108,8 @@ int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_inject_event(struct vm *vm, int vcpu, int type, int vector, uint32_t error_code, int error_code_valid); int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 020743f..dc071d3 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -136,14 +136,6 @@ amdv_inject_event(void *vmi, int vcpu, int type, int vector, } static int -amdv_nmi(void *arg, int vcpu) -{ - - printf("amdv_nmi: not implemented\n"); - return (EINVAL); -} - -static int amdv_getcap(void *arg, int vcpu, int type, int *retval) { @@ -172,7 +164,6 @@ struct vmm_ops vmm_ops_amd = { amdv_getdesc, amdv_setdesc, amdv_inject_event, - amdv_nmi, amdv_getcap, amdv_setcap }; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 11b8c9f..16acfff 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -751,7 +751,6 @@ vmx_vminit(struct vm *vm) vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; - vmx->state[i].request_nmi = 0; vmx->state[i].lastcpu = -1; vmx->state[i].vpid = vpid; @@ -940,7 +939,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) uint64_t info, interruptibility; /* Bail out if no NMI requested */ - if (vmx->state[vcpu].request_nmi == 0) + if (!vm_nmi_pending(vmx->vm, vcpu)) return (0); error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); @@ -965,7 +964,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ - 
vmx->state[vcpu].request_nmi = 0; + vm_nmi_clear(vmx->vm, vcpu); return (1); nmiblocked: @@ -1696,16 +1695,6 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, } static int -vmx_nmi(void *arg, int vcpu) -{ - struct vmx *vmx = arg; - - atomic_set_int(&vmx->state[vcpu].request_nmi, 1); - - return (0); -} - -static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; @@ -1843,7 +1832,6 @@ struct vmm_ops vmm_ops_intel = { vmx_getdesc, vmx_setdesc, vmx_inject, - vmx_nmi, vmx_getcap, vmx_setcap }; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index d4c90fa..c7cd567 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -76,7 +76,6 @@ struct vmxcap { }; struct vmxstate { - int request_nmi; int lastcpu; /* host cpu that this 'vcpu' last ran on */ uint16_t vpid; }; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 8d8f143..6c6df21 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -77,6 +77,7 @@ struct vcpu { void *stats; struct vm_exit exitinfo; enum x2apic_state x2apic_state; + int nmi_pending; }; #define VCPU_F_PINNED 0x0001 @@ -137,8 +138,6 @@ static struct vmm_ops *ops; (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) -#define VMNMI(vmi, vcpu) \ - (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ @@ -710,17 +709,51 @@ vm_inject_event(struct vm *vm, int vcpuid, int type, return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } +VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + int -vm_inject_nmi(struct vm *vm, int vcpu) +vm_inject_nmi(struct vm *vm, int vcpuid) { - int error; + struct vcpu *vcpu; - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); - error = VMNMI(vm->cookie, vcpu); - vm_interrupt_hostcpu(vm, vcpu); - return (error); + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vm_interrupt_hostcpu(vm, vcpuid); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } int -- cgit v1.1 From 80aee5fb8aa2abb172630d47efcb1f8f26f6bcc4 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 25 Oct 2012 04:08:26 +0000 Subject: Hide the monitor/mwait instruction capability from the guest until we know how to properly intercept it. Obtained from: NetApp --- sys/amd64/vmm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 47ba975..ca0d785 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -128,6 +128,12 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, CPUID2_AVX); /* + * Hide monitor/mwait until we know how to deal with + * these instructions. 
+ */ + regs[2] &= ~CPUID2_MON; + + /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); -- cgit v1.1 From bcb3589583c269dcc88504fcf7c0dedc7c03f123 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 25 Oct 2012 04:29:21 +0000 Subject: If the guest vcpu wants to idle then use that opportunity to relinquish the host cpu to the scheduler until the guest is ready to run again. This implies that the host cpu utilization will now closely mirror the actual load imposed by the guest vcpu. Also, the vcpu mutex now needs to be of type MTX_SPIN since we need to acquire it inside a critical section. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 21 ++++++++-- sys/amd64/vmm/vmm.c | 97 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 23 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 16acfff..2052dc9 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -159,6 +159,8 @@ static int cap_monitor_trap; /* statistics */ static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); +static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); +static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted"); #ifdef KTR static const char * @@ -1203,11 +1205,11 @@ vmx_lapic_fault(struct vm *vm, int cpu, static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int handled; + int error, handled; struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual, gpa, cr3; + uint64_t qual, gpa, cr3, intr_info; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1240,7 +1242,20 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) } break; case EXIT_REASON_HLT: - vmexit->exitcode = VM_EXITCODE_HLT; + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + /* + * If there is an event waiting to be injected then there is + * no need to 'hlt'. 
+ */ + error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); + if (error) + panic("vmx_exit_process: vmread(intrinfo) %d", error); + + if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { + handled = 1; + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); + } else + vmexit->exitcode = VM_EXITCODE_HLT; break; case EXIT_REASON_MTF: vmexit->exitcode = VM_EXITCODE_MTRAP; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6c6df21..8bc9581 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" +#include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" @@ -92,9 +93,9 @@ do { \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) -#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_DEF) -#define vcpu_lock(v) mtx_lock(&((v)->mtx)) -#define vcpu_unlock(v) mtx_unlock(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define VM_MAX_MEMORY_SEGMENTS 2 @@ -651,13 +652,16 @@ save_guest_fpustate(struct vcpu *vcpu) fpu_start_emulating(); } +static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + int vm_run(struct vm *vm, struct vm_run *vmrun) { - int error, vcpuid; + int error, vcpuid, sleepticks, t; struct vcpu *vcpu; struct pcb *pcb; - uint64_t tscval; + uint64_t tscval, rip; + struct vm_exit *vme; vcpuid = vmrun->cpuid; @@ -665,7 +669,9 @@ vm_run(struct vm *vm, struct vm_run *vmrun) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; - + vme = &vmrun->vm_exit; + rip = vmrun->rip; +restart: critical_enter(); tscval = rdtsc(); @@ -677,7 +683,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) restore_guest_fpustate(vcpu); vcpu->hostcpu = curcpu; - error = VMRUN(vm->cookie, vcpuid, vmrun->rip); + error = VMRUN(vm->cookie, vcpuid, rip); vcpu->hostcpu = NOCPU; save_guest_fpustate(vcpu); @@ -686,10 +692,52 @@ vm_run(struct vm *vm, struct vm_run *vmrun) vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); /* copy the exit information */ - bcopy(&vcpu->exitinfo, &vmrun->vm_exit, sizeof(struct vm_exit)); + bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit)); critical_exit(); + /* + * Oblige the guest's desire to 'hlt' by sleeping until the vcpu + * is ready to run. + */ + if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) { + vcpu_lock(vcpu); + + /* + * Figure out the number of host ticks until the next apic + * timer interrupt in the guest. + */ + sleepticks = lapic_timer_tick(vm, vcpuid); + + /* + * If the guest local apic timer is disabled then sleep for + * a long time but not forever. + */ + if (sleepticks < 0) + sleepticks = hz; + + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. + * + * These interrupts could have happened any time after we + * returned from VMRUN() and before we grabbed the vcpu lock. 
+ */ + if (!vm_nmi_pending(vm, vcpuid) && + lapic_pending_intr(vm, vcpuid) < 0) { + if (sleepticks <= 0) + panic("invalid sleepticks %d", sleepticks); + t = ticks; + msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + + vcpu_unlock(vcpu); + + rip = vme->rip + vme->inst_length; + goto restart; + } + return (error); } @@ -709,7 +757,7 @@ vm_inject_event(struct vm *vm, int vcpuid, int type, return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } -VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); +static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) @@ -935,16 +983,25 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; - /* - * XXX racy but the worst case is that we'll send an unnecessary IPI - * to the 'hostcpu'. - * - * We cannot use vcpu_is_running() here because it acquires vcpu->mtx - * which is not allowed inside a critical section. - */ + vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; - if (hostcpu == NOCPU || hostcpu == curcpu) - return; - - ipi_cpu(hostcpu, vmm_ipinum); + if (hostcpu == NOCPU) { + /* + * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then + * the host thread must be sleeping waiting for an event to + * kick the vcpu out of 'hlt'. + * + * XXX this is racy because the condition exists right before + * and after calling VMRUN() in vm_run(). The wakeup() is + * benign in this case. + */ + if (vcpu->state == VCPU_RUNNING) + wakeup_one(vcpu); + } else { + if (vcpu->state != VCPU_RUNNING) + panic("invalid vcpu state %d", vcpu->state); + if (hostcpu != curcpu) + ipi_cpu(hostcpu, vmm_ipinum); + } + vcpu_unlock(vcpu); } -- cgit v1.1 From cbd59fc940c5caaf0cde3410c8772176220fd1a1 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 26 Oct 2012 03:12:40 +0000 Subject: Unconditionally enable fpu emulation by setting CR0.TS in the host after the guest does a vm exit. This allows us to trap any fpu access in the host context while the fpu still has "dirty" state belonging to the guest. Reported by: "s vas" on freebsd-virtualization@ Obtained from: NetApp --- sys/amd64/vmm/intel/vmcs.c | 10 +++++++++- sys/amd64/vmm/vmm.c | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 8c53465..26ac5f8 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -367,7 +367,15 @@ vmcs_set_defaults(struct vmcs *vmcs, goto done; /* Load the control registers */ - cr0 = rcr0(); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + cr0 = rcr0() | CR0_TS; if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 8bc9581..6efc01f 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -640,14 +640,27 @@ restore_guest_fpustate(struct vcpu *vcpu) /* flush host state to the pcb */ fpuexit(curthread); + + /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. 
+ */ + fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest FPU state */ + fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } -- cgit v1.1 From dc37578ed255be09cc4e4fcd2ebf48781c91eabc Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 26 Oct 2012 22:32:26 +0000 Subject: Set the valid field of the newly allocated field as all other vm page allocators do. This fixes a panic when a virtio block device is mounted as root, with the host system dying in vm_page_dirty with invalid bits. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/vmm_mem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 8745339..04f99b1 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -99,6 +99,7 @@ vmm_mem_alloc(size_t size) if ((m->flags & PG_ZERO) == 0) pagezero((void *)PHYS_TO_DMAP(pa)); + m->valid = VM_PAGE_BITS_ALL; update_pages_allocated(1); -- cgit v1.1 From 9631d598ccea6dd526400bad0a438a10c8294542 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 29 Oct 2012 01:51:24 +0000 Subject: Corral all the host state associated with the virtual machine into its own file. This state is independent of the type of hardware assist used so there is really no need for it to be in Intel-specific code. Obtained from: NetApp --- sys/amd64/vmm/intel/vmcs.c | 30 +++++------ sys/amd64/vmm/intel/vmx.c | 11 ++-- sys/amd64/vmm/vmm.c | 2 + sys/amd64/vmm/vmm_host.c | 124 +++++++++++++++++++++++++++++++++++++++++++++ sys/amd64/vmm/vmm_host.h | 75 +++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 24 deletions(-) create mode 100644 sys/amd64/vmm/vmm_host.c create mode 100644 sys/amd64/vmm/vmm_host.h (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 26ac5f8..a5784dd 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmcs.h" #include "vmx_cpufunc.h" #include "ept.h" @@ -314,12 +315,12 @@ vmcs_set_defaults(struct vmcs *vmcs, { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat; + uint64_t eptp, pat, fsbase, idtrbase; uint32_t exc_bitmap; - codesel = GSEL(GCODE_SEL, SEL_KPL); - datasel = GSEL(GDATA_SEL, SEL_KPL); - tsssel = GSEL(GPROC0_SEL, SEL_KPL); + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); /* * Make sure we have a "current" VMCS to work with. @@ -357,29 +358,22 @@ vmcs_set_defaults(struct vmcs *vmcs, /* Host state */ /* Initialize host IA32_PAT MSR */ - pat = rdmsr(MSR_PAT); + pat = vmm_get_host_pat(); if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) goto done; /* Load the IA32_EFER MSR */ - efer = rdmsr(MSR_EFER); + efer = vmm_get_host_efer(); if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) goto done; /* Load the control registers */ - /* - * We always want CR0.TS to be set when the processor does a VM exit. - * - * With emulation turned on unconditionally after a VM exit, we are - * able to trap inadvertent use of the FPU until the guest FPU state - * has been safely squirreled away. 
- */ - cr0 = rcr0() | CR0_TS; + cr0 = vmm_get_host_cr0(); if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; - cr4 = rcr4(); + cr4 = vmm_get_host_cr4() | CR4_VMXE; if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) goto done; @@ -411,10 +405,12 @@ vmcs_set_defaults(struct vmcs *vmcs, * Note that we exclude %gs, tss and gdtr here because their base * address is pcpu specific. */ - if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; - if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; /* instruction pointer */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2052dc9..ace2683 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmm_lapic.h" #include "vmm_msr.h" #include "vmm_ktr.h" @@ -64,8 +65,6 @@ __FBSDID("$FreeBSD$"); #include "vmx_controls.h" #include "vmm_instruction_emul.h" -#define CR4_VMXE (1UL << 13) - #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ @@ -118,8 +117,6 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_VMX, "vmx", "vmx"); -extern struct pcpu __pcpu[]; - int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); @@ -836,15 +833,15 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); - error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); if (error != 0) goto done; - error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); if (error != 0) goto done; - error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); if (error != 0) goto done; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6efc01f..eae9ccc 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include @@ -196,6 +197,7 @@ vmm_init(void) { int error; + vmm_host_state_init(); vmm_ipi_init(); error = vmm_mem_init(); diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c new file mode 100644 index 0000000..8dfef73 --- /dev/null +++ b/sys/amd64/vmm/vmm_host.c @@ -0,0 +1,124 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; + +void +vmm_host_state_init(void) +{ + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + vmm_host_cr4 = rcr4(); +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_datasel(void) +{ + + return (GSEL(GDATA_SEL, SEL_KPL)); + +} + +uint64_t +vmm_get_host_codesel(void) +{ + + return (GSEL(GCODE_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_tsssel(void) +{ + + return (GSEL(GPROC0_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_fsbase(void) +{ + + return (0); +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + + return (r_idt.rd_base); +} diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h new file mode 100644 index 0000000..839f54a --- /dev/null +++ b/sys/amd64/vmm/vmm_host.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef _KERNEL +#error "no user-servicable parts inside" +#endif + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + + return ((uint64_t)PCPU_GET(tssp)); +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + + return ((uint64_t)&gdt[NGDT * curcpu]); +} + +struct pcpu; +extern struct pcpu __pcpu[]; + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + + return ((uint64_t)&__pcpu[curcpu]); +} + +#endif -- cgit v1.1 From aee862ac3fd36264249b7160eaecaeacab119ac3 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 29 Oct 2012 23:58:15 +0000 Subject: Convert VMCS_ENTRY_INTR_INFO field into a vmcs identifier before passing it to vmcs_getreg(). Without this conversion vmcs_getreg() will return EINVAL. In particular this prevented injection of the breakpoint exception into the guest via the "-B" option to /usr/sbin/bhyve which is hugely useful when debugging guest hangs. This was broken in r241921. Pointy hat: me Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ace2683..7a9cfb8 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1685,7 +1685,7 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, * If there is already an exception pending to be delivered to the * vcpu then just return. */ - error = vmcs_getreg(vmcs, VMCS_ENTRY_INTR_INFO, &info); + error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); if (error) return (error); -- cgit v1.1 From 091578815ab0408c9aa2133e259263351101a008 Mon Sep 17 00:00:00 2001 From: grehan Date: Tue, 6 Nov 2012 02:43:41 +0000 Subject: Fix issue found with clang build. Avoid code insertion by the compiler between inline asm statements that would in turn modify the flags value set by the first asm, and used by the second. Solve by making the common error block a string that can be pulled into the first inline asm, and using symbolic labels for asm variables. bhyve can now build/run fine when compiled with clang. 
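
To make the hazard and the fix easier to see outside of the VMX code, here is a small self-contained amd64 example in the same style; the 'add' instruction and the values are made up for illustration, and only the pattern taken from the patch matters: the flags test is folded into the same asm statement as the instruction that sets the flags, using a shared string and symbolic [names].

#include <stdio.h>

/*
 * Same shape as VMX_SET_ERROR_CODE after the fix: a bare string that is
 * concatenated into the asm statement which sets the flags, so the
 * compiler cannot schedule flag-clobbering code in between.
 */
#define SET_ERROR_CODE				\
	"	jnc 1f;"			\
	"	mov $1, %[error];"		\
	"	jmp 3f;"			\
	"1:	jnz 2f;"			\
	"	mov $2, %[error];"		\
	"	jmp 3f;"			\
	"2:	mov $0, %[error];"		\
	"3:"

static int
add_and_classify(unsigned long a, unsigned long b)
{
	int error;

	/* 'add' sets CF/ZF; the classification runs in the same asm. */
	__asm __volatile("add %[b], %[a];"
	    SET_ERROR_CODE
	    : [error] "=r" (error), [a] "+r" (a)
	    : [b] "r" (b)
	    : "cc");
	return (error);
}

int
main(void)
{
	/* Expected output on amd64: 0 (no flags), 1 (carry), 2 (zero). */
	printf("%d %d %d\n", add_and_classify(1, 2),
	    add_and_classify(~0UL, 1), add_and_classify(0, 0));
	return (0);
}
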
Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx_cpufunc.h | 77 ++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 29 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h index e9f6c6d..2e66443 100644 --- a/sys/amd64/vmm/intel/vmx_cpufunc.h +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -42,18 +42,15 @@ struct vmcs; #define VM_SUCCESS 0 #define VM_FAIL_INVALID 1 #define VM_FAIL_VALID 2 -#define VMX_SET_ERROR_CODE(varname) \ - do { \ - __asm __volatile(" jnc 1f;" \ - " mov $1, %0;" /* CF: error = 1 */ \ - " jmp 3f;" \ - "1: jnz 2f;" \ - " mov $2, %0;" /* ZF: error = 2 */ \ - " jmp 3f;" \ - "2: mov $0, %0;" \ - "3: nop" \ - :"=r" (varname)); \ - } while (0) +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" /* returns 0 on success and non-zero on failure */ static __inline int @@ -63,8 +60,12 @@ vmxon(char *region) uint64_t addr; addr = vtophys(region); - __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); } @@ -76,21 +77,26 @@ vmclear(struct vmcs *vmcs) uint64_t addr; addr = vtophys(vmcs); - __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); return (error); } static __inline void vmxoff(void) { + __asm __volatile("vmxoff"); } static __inline void vmptrst(uint64_t *addr) { - __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); } static __inline int @@ -100,8 +106,11 @@ vmptrld(struct vmcs *vmcs) uint64_t addr; addr = vtophys(vmcs); - __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); return (error); } @@ -110,9 +119,11 @@ vmwrite(uint64_t reg, uint64_t val) { int error; - __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); - - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); return (error); } @@ -122,9 +133,11 @@ vmread(uint64_t r, uint64_t *addr) { int error; - __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); - - VMX_SET_ERROR_CODE(error); + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [r] "r" (r), [addr] "m" (*addr) + : "memory"); return (error); } @@ -170,9 +183,12 @@ invvpid(uint64_t type, struct invvpid_desc desc) { int error; - __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); - VMX_SET_ERROR_CODE(error); if (error) panic("invvpid error %d", error); } @@ -190,9 +206,12 @@ invept(uint64_t type, struct invept_desc desc) { int error; - __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + __asm 
__volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); - VMX_SET_ERROR_CODE(error); if (error) panic("invept error %d", error); } -- cgit v1.1 From 5a600cdfe44adae619eca970bef1539b3ac6ae35 Mon Sep 17 00:00:00 2001 From: grehan Date: Tue, 20 Nov 2012 06:01:03 +0000 Subject: Handle CPUID leaf 0x7 now that FreeBSD is using it. Return 0's for now. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/x86.c | 1 + sys/amd64/vmm/x86.h | 1 + 2 files changed, 2 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index ca0d785..94abe09 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -162,6 +162,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; case CPUID_0000_0006: + case CPUID_0000_0007: /* * Handle the access, but report 0 for * all options diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h index d19e1d8..368e967 100644 --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -35,6 +35,7 @@ #define CPUID_0000_0003 (0x3) #define CPUID_0000_0004 (0x4) #define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) #define CPUID_0000_000A (0xA) #define CPUID_0000_000B (0xB) #define CPUID_8000_0000 (0x80000000) -- cgit v1.1 From 575baa2d8a6961e9a82ca9272a78d3c01cfcbdf1 Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 22 Nov 2012 00:08:20 +0000 Subject: Get rid of redundant comparision which is guaranteed to be "true" for unsigned integers. Obtained from: NetApp --- sys/amd64/vmm/intel/vmx_msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index 1e9a837..2aba63c 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -148,7 +148,7 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access) { int byte, bit; - if (msr >= 0x00000000 && msr <= 0x00001FFF) + if (msr <= 0x00001FFF) byte = msr / 8; else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) byte = 1024 + (msr - 0xC0000000) / 8; -- cgit v1.1 From d8bfa0f5754e3a60b0d0e1d425a6038be554e73d Mon Sep 17 00:00:00 2001 From: neel Date: Thu, 22 Nov 2012 04:07:18 +0000 Subject: Fix a bug in the MSI-X resource allocation for PCI passthrough devices. In the case where the underlying host had disabled MSI-X via the "hw.pci.enable_msix" tunable, the ppt_setup_msix() function would fail and return an error without properly cleaning up. This in turn would cause a page fault on the next boot of the guest. Fix this by calling ppt_teardown_msix() in all the error return paths. 
Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 63 +++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 3044fc5..fdf136b 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -247,7 +247,7 @@ ppt_teardown_msix_intr(struct pptdev *ppt, int idx) static void ppt_teardown_msix(struct pptdev *ppt) { - int i, error; + int i; if (ppt->msix.num_msgs == 0) return; @@ -267,9 +267,7 @@ ppt_teardown_msix(struct pptdev *ppt) free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); - error = pci_release_msi(ppt->dev); - if (error) - printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); + pci_release_msi(ppt->dev); ppt->msix.num_msgs = 0; } @@ -519,7 +517,7 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, { struct pptdev *ppt; struct pci_devinfo *dinfo; - int numvec, vector_count, rid, error; + int numvec, alloced, rid, error; size_t res_size, cookie_size, arg_size; ppt = ppt_find(bus, slot, func); @@ -538,48 +536,39 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, * Allocate the IRQ resources * Set up some variables in ppt->msix */ - if (!ppt->msix.msix_table_res) { - ppt->msix.res = NULL; - ppt->msix.cookie = NULL; - ppt->msix.arg = NULL; - - rid = dinfo->cfg.msix.msix_table_bar; - ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, - &rid, RF_ACTIVE); - if (ppt->msix.msix_table_res == NULL) - return (ENOSPC); - - ppt->msix.msix_table_rid = rid; - - vector_count = numvec = pci_msix_count(ppt->dev); - - error = pci_alloc_msix(ppt->dev, &numvec); - if (error) - return (error); - else if (vector_count != numvec) { - pci_release_msi(ppt->dev); - return (ENOSPC); - } - - ppt->msix.num_msgs = numvec; + if (ppt->msix.num_msgs == 0) { + numvec = pci_msix_count(ppt->dev); + if (numvec <= 0) + return (EINVAL); ppt->msix.startrid = 1; + ppt->msix.num_msgs = numvec; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); - ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); - ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); - ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); - if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || - ppt->msix.arg == NULL) { + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, + M_WAITOK | M_ZERO); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, + SYS_RES_MEMORY, &rid, RF_ACTIVE); + + if (ppt->msix.msix_table_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } - bzero(ppt->msix.res, res_size); - bzero(ppt->msix.cookie, cookie_size); - bzero(ppt->msix.arg, arg_size); + ppt->msix.msix_table_rid = rid; + + alloced = numvec; + error = pci_alloc_msix(ppt->dev, &alloced); + if (error || alloced != numvec) { + ppt_teardown_msix(ppt); + return (error == 0 ? ENOSPC: error); + } } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { -- cgit v1.1 From 36ab9a2e1ab7d2b1884270275584f989cfd65e2b Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 00:02:17 +0000 Subject: Revamp the x86 instruction emulation in bhyve. 
On a nested page table fault the hypervisor will: - fetch the instruction using the guest %rip and %cr3 - decode the instruction in 'struct vie' - emulate the instruction in host kernel context for local apic accesses - any other type of mmio access is punted up to user-space (e.g. ioapic) The decoded instruction is passed as collateral to the user-space process that is handling the PAGING exit. The emulation code is fleshed out to include more addressing modes (e.g. SIB) and more types of operands (e.g. imm8). The source code is unified into a single file (vmm_instruction_emul.c) that is compiled into vmm.ko as well as /usr/sbin/bhyve. Reviewed by: grehan Obtained from: NetApp --- sys/amd64/include/vmm.h | 3 + sys/amd64/include/vmm_instruction_emul.h | 113 ++++++++ sys/amd64/vmm/intel/vmcs.h | 1 + sys/amd64/vmm/intel/vmx.c | 45 ++- sys/amd64/vmm/vmm_instruction_emul.c | 481 +++++++++++++++++++++++++++---- sys/amd64/vmm/vmm_instruction_emul.h | 91 ------ sys/amd64/vmm/vmm_lapic.c | 83 ++---- sys/amd64/vmm/vmm_lapic.h | 6 +- 8 files changed, 605 insertions(+), 218 deletions(-) create mode 100644 sys/amd64/include/vmm_instruction_emul.h delete mode 100644 sys/amd64/vmm/vmm_instruction_emul.h (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 8f78b8f..2fb2194 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -150,6 +150,8 @@ void vm_interrupt_hostcpu(struct vm *vm, int vcpu); #endif /* KERNEL */ +#include + #define VM_MAXCPU 8 /* maximum virtual cpus */ /* @@ -268,6 +270,7 @@ struct vm_exit { uint64_t cr3; uint64_t gpa; int rwx; + struct vie vie; } paging; /* * VMX specific payload. Used when there is no "better" diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h new file mode 100644 index 0000000..4cc494b --- /dev/null +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. 
The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1; + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * s + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t mrr, mem_region_write_t mrw, + void *mrarg); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. 
+ */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + uint64_t rip, int inst_length, uint64_t cr3, + struct vie *vie); + +int vmm_decode_instruction(struct vm *vm, int cpuid, + uint64_t gla, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 84532f4..f39eed2 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -67,6 +67,7 @@ uint64_t vmcs_read(uint32_t encoding); #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) #endif /* _KERNEL */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 7a9cfb8..b185c57 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -63,7 +63,6 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "x86.h" #include "vmx_controls.h" -#include "vmm_instruction_emul.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ @@ -1150,23 +1149,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) } static int -vmx_lapic_fault(struct vm *vm, int cpu, - uint64_t gpa, uint64_t rip, int inst_length, - uint64_t cr3, uint64_t ept_qual) +vmx_ept_fault(struct vm *vm, int cpu, + uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, + uint64_t cr3, uint64_t ept_qual, struct vie *vie) { - int read, write, handled; - struct vie vie; - - /* - * For this to be a legitimate access to the local apic: - * - the GPA in the local apic page - * - the GPA must be aligned on a 16 byte boundary - */ - if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) - return (UNHANDLED); - - if ((gpa & 0xF) != 0) - return (UNHANDLED); + int read, write, error; /* EPT violation on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) @@ -1188,15 +1175,22 @@ vmx_lapic_fault(struct vm *vm, int cpu, } /* Fetch, decode and emulate the faulting instruction */ - if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0) + if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) return (UNHANDLED); - if (vmm_decode_instruction(&vie) != 0) + if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) return (UNHANDLED); - handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie); + /* + * Check if this is a local apic access + */ + if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) + return (UNHANDLED); - return (handled); + error = vmm_emulate_instruction(vm, cpu, gpa, vie, + lapic_mmio_read, lapic_mmio_write, 0); + + return (error ? 
UNHANDLED : HANDLED); } static int @@ -1206,7 +1200,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx; - uint64_t qual, gpa, cr3, intr_info; + uint64_t qual, gla, gpa, cr3, intr_info; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1299,11 +1293,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: + gla = vmcs_gla(); gpa = vmcs_gpa(); cr3 = vmcs_guest_cr3(); - handled = vmx_lapic_fault(vmx->vm, vcpu, - gpa, vmexit->rip, vmexit->inst_length, - cr3, qual); + handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, + vmexit->rip, vmexit->inst_length, + cr3, qual, &vmexit->u.paging.vie); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.cr3 = cr3; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 7ef4dbb..5e5399b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -30,6 +30,7 @@ #include __FBSDID("$FreeBSD$"); +#ifdef _KERNEL #include #include #include @@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$"); #include #include #include +#else /* !_KERNEL */ +#include +#include -#include "vmm_instruction_emul.h" +#include + +#include +#endif /* _KERNEL */ + + + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ + +static const struct vie_op one_byte_opcodes[256] = { + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 -#define GB (1024 * 1024 * 1024) +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, @@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_R15 }; +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_valid_register(enum vm_reg_name reg) +{ +#ifdef _KERNEL + /* + * XXX + * The operand register in which we store the result of the + * read must be a GPR that we can modify even if the vcpu + * is "running". All the GPRs qualify except for %rsp. + * + * This is a limitation of the vm_set_register() API + * and can be fixed if necessary. 
+ */ + if (reg == VM_REG_GUEST_RSP) + return (0); +#endif + return (1); +} + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + if (!vie_valid_register(reg)) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + if (!vie_valid_register(reg)) + return (EINVAL); + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +/* + * The following simplifying assumptions are made during emulation: + * + * - guest is in 64-bit mode + * - default address size is 64-bits + * - default operand size is 32-bits + * + * - operand size override is not supported + * + * - address size override is not supported + */ +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + if (vie->rex_w) + size = 8; + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + if (vie->rex_w) + size = 8; + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xC7: + /* + * MOV from imm32 to mem (ModRM:r/m) + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate; /* already sign-extended */ + + if (vie->rex_w) + size = 8; + + if (size != 8) + val &= size2mask[size]; + + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val1, val2; + + size = 4; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. 
+ * + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + if (vie->rex_w) + size = 8; + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + val1 &= val2; + error = vie_update_register(vm, vcpuid, reg, val1, size); + break; + default: + break; + } + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +#ifdef _KERNEL static void vie_init(struct vie *vie) { bzero(vie, sizeof(struct vie)); - vie->op_size = VIE_OP_SIZE_32BIT; - vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; - vie->operand_register = VM_REG_LAST; } static int @@ -129,7 +395,7 @@ error: } int -vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, +vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, uint64_t cr3, struct vie *vie) { int n, err; @@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, static int vie_peek(struct vie *vie, uint8_t *x) { + if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); @@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x) static void vie_advance(struct vie *vie) { - if (vie->num_processed >= vie->num_valid) - panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid); vie->num_processed++; } @@ -213,24 +478,16 @@ decode_opcode(struct vie *vie) { uint8_t x; - static const uint8_t flags[256] = { - [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM, - [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG, - [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM, - }; - if (vie_peek(vie, &x)) return (-1); - vie->opcode_byte = x; - vie->opcode_flags = flags[x]; + vie->op = one_byte_opcodes[x]; - vie_advance(vie); - - if (vie->opcode_flags == 0) + if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); - else - return (0); + + vie_advance(vie); + return (0); } /* @@ -241,9 +498,6 @@ decode_modrm(struct vie *vie) { uint8_t x; - if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0) - return (0); - if (vie_peek(vie, &x)) return (-1); @@ -251,35 +505,40 @@ decode_modrm(struct vie *vie) vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { - /* - * Table 2-5: Special Cases of REX Encodings - * - * mod=0, r/m=5 is used in the compatibility mode to - * indicate a disp32 without a base register. - * - * mod!=3, r/m=4 is used in the compatibility mode to - * indicate that the SIB byte is present. - * - * The 'b' bit in the REX prefix is don't care in - * this case. 
- */ + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); - /* SIB addressing not supported yet */ + /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) - return (-1); + goto done; vie->base_register = gpr_map[vie->rm]; - if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG)) - vie->operand_register = gpr_map[vie->reg]; - switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; @@ -295,12 +554,76 @@ decode_modrm(struct vie *vie) break; } - /* calculate the operand size */ - if (vie->rex_w) - vie->op_size = VIE_OP_SIZE_64BIT; - - if (vie->opcode_flags & VIE_F_FROM_IMM) + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) vie->imm_bytes = 4; + else if (vie->op.op_flags & VIE_OP_F_IMM8) + vie->imm_bytes = 1; + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; vie_advance(vie); @@ -348,13 +671,14 @@ decode_immediate(struct vie *vie) uint8_t x; union { char buf[4]; + int8_t signed8; int32_t signed32; } u; if ((n = vie->imm_bytes) == 0) return (0); - if (n != 4) + if (n != 1 && n != 4) panic("decode_immediate: invalid imm_bytes %d", n); for (i = 0; i < n; i++) { @@ -365,14 +689,62 @@ decode_immediate(struct vie *vie) vie_advance(vie); } - vie->immediate = u.signed32; /* sign-extended */ + if (n == 1) + vie->immediate = u.signed8; /* sign-extended */ + else + vie->immediate = u.signed32; /* sign-extended */ return (0); } +#define VERIFY_GLA +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
+ */ +#ifdef VERIFY_GLA +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx; + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + if (base + vie->scale * idx + vie->displacement != gla) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla); + return (-1); + } + + return (0); +} +#endif /* VERIFY_GLA */ + int -vmm_decode_instruction(struct vie *vie) +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { + if (decode_rex(vie)) return (-1); @@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie) if (decode_modrm(vie)) return (-1); + if (decode_sib(vie)) + return (-1); + if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); +#ifdef VERIFY_GLA + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); +#endif + + vie->decoded = 1; /* success */ + return (0); } +#endif /* _KERNEL */ diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h deleted file mode 100644 index 1fa9e2b..0000000 --- a/sys/amd64/vmm/vmm_instruction_emul.h +++ /dev/null @@ -1,91 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _VMM_INSTRUCTION_EMUL_H_ -#define _VMM_INSTRUCTION_EMUL_H_ - -enum vie_op_size { - VIE_OP_SIZE_32BIT, /* default */ - VIE_OP_SIZE_64BIT, - VIE_OP_SIZE_8BIT -}; - -#define VIE_INST_SIZE 15 -struct vie { - uint8_t inst[VIE_INST_SIZE]; - - uint8_t rex_w:1, - rex_r:1, - rex_x:1, - rex_b:1; - - uint8_t mod:2, - reg:4, - rm:4; - - - uint8_t opcode_byte; - uint16_t opcode_flags; - uint8_t disp_bytes; - uint8_t imm_bytes; - - int num_valid; - int num_processed; - - enum vm_reg_name base_register; - enum vm_reg_name index_register; - enum vm_reg_name operand_register; - - int op_size; - int64_t displacement; - int64_t immediate; -}; - -#define VIE_F_HAS_MODRM (1 << 0) -#define VIE_F_FROM_RM (1 << 1) -#define VIE_F_FROM_REG (1 << 2) -#define VIE_F_TO_RM (1 << 3) -#define VIE_F_TO_REG (1 << 4) -#define VIE_F_FROM_IMM (1 << 5) - -#define VIE_MOD_INDIRECT 0 -#define VIE_MOD_INDIRECT_DISP8 1 -#define VIE_MOD_INDIRECT_DISP32 2 -#define VIE_MOD_DIRECT 3 - -#define VIE_RM_SIB 4 -#define VIE_RM_DISP32 5 - -struct vm; - -int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length, - uint64_t cr3, struct vie *vie); - -int vmm_decode_instruction(struct vie *vie); - -#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index bb22122..dabcf06 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -34,12 +34,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vlapic.h" -#include "vmm_instruction_emul.h" static int lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) @@ -177,64 +177,45 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) } int -lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie) +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) { - int handled, error; - uint64_t val; + int error; + uint64_t off; struct vlapic *vlapic; - const int UNHANDLED = 0; + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_write(vlapic, off, DWORD, wval); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; - /* Only 32-bit accesses to local apic */ - if (vie->op_size != VIE_OP_SIZE_32BIT) - return (UNHANDLED); + off = gpa - DEFAULT_APIC_BASE; /* - * XXX - * The operand register in which we store the result of the - * read must be a GPR that we can modify even if the vcpu - * is "running". All the GPRs qualify except for %rsp. - * - * This is a limitation of the vm_set_register() API - * and can be fixed if necessary. + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
*/ - if (vie->operand_register == VM_REG_GUEST_RSP) - return (UNHANDLED); - - if (read) { - if ((vie->opcode_flags & VIE_F_TO_REG) == 0) - return (UNHANDLED); - - if (vie->operand_register >= VM_REG_LAST) - return (UNHANDLED); - - handled = lapic_read(vlapic, offset, &val); - if (handled) { - error = vm_set_register(vm, cpu, vie->operand_register, - val); - if (error) - panic("lapic_mmio: error %d setting gpr %d", - error, vie->operand_register); - } - } else { - if ((vie->opcode_flags & VIE_F_FROM_REG) && - (vie->operand_register < VM_REG_LAST)) { - error = vm_get_register(vm, cpu, vie->operand_register, - &val); - if (error) { - panic("lapic_mmio: error %d getting gpr %d", - error, vie->operand_register); - } - } else if (vie->opcode_flags & VIE_F_FROM_IMM) { - val = vie->immediate; - } else { - return (UNHANDLED); - } - - handled = lapic_write(vlapic, offset, val); - } + if (size != 4 || off & 0xf) + return (EINVAL); - return (handled); + vlapic = vm_lapic(vm, cpu); + error = vlapic_op_mem_read(vlapic, off, DWORD, rval); + return (error); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 59fc016..a79912e 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -30,13 +30,15 @@ #define _VMM_LAPIC_H_ struct vm; -struct vie; boolean_t lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); -int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *); +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); int lapic_timer_tick(struct vm *vm, int cpu); -- cgit v1.1 From 308122a0f1279a0305554f864d6b01f9c132511a Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 13:10:18 +0000 Subject: Change emulate_rdmsr() and emulate_wrmsr() to return 0 on sucess and errno on failure. The conversion from the return value to HANDLED or UNHANDLED can be done locally in vmx_exit_process(). 
Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 14 +++++++------ sys/amd64/vmm/vmm_lapic.c | 52 +++++++++++++++-------------------------------- sys/amd64/vmm/vmm_msr.c | 36 ++++++++++++-------------------- 3 files changed, 37 insertions(+), 65 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index b185c57..af4a03f 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1214,23 +1214,25 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_RDMSR: ecx = vmxctx->guest_rcx; - handled = emulate_rdmsr(vmx->vm, vcpu, ecx); - if (!handled) { + error = emulate_rdmsr(vmx->vm, vcpu, ecx); + if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; - } + } else + handled = 1; break; case EXIT_REASON_WRMSR: eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; - handled = emulate_wrmsr(vmx->vm, vcpu, ecx, + error = emulate_wrmsr(vmx->vm, vcpu, ecx, (uint64_t)edx << 32 | eax); - if (!handled) { + if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; - } + } else + handled = 1; break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index dabcf06..d024b71 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -41,32 +41,6 @@ __FBSDID("$FreeBSD$"); #include "vmm_lapic.h" #include "vlapic.h" -static int -lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val) -{ - int handled; - - if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) - handled = 1; - else - handled = 0; - - return (handled); -} - -static int -lapic_read(struct vlapic *vlapic, u_int offset, uint64_t *rv) -{ - int handled; - - if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) - handled = 1; - else - handled = 0; - - return (handled); -} - int lapic_pending_intr(struct vm *vm, int cpu) { @@ -145,35 +119,41 @@ lapic_msr(u_int msr) int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) { - int handled; + int error; + u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { *rval = vlapic_get_apicbase(vlapic); - handled = 1; - } else - handled = lapic_read(vlapic, x2apic_msr_to_regoff(msr), rval); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_op_mem_read(vlapic, offset, DWORD, rval); + } - return (handled); + return (error); } int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) { - int handled; + int error; + u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { vlapic_set_apicbase(vlapic, val); - handled = 1; - } else - handled = lapic_write(vlapic, x2apic_msr_to_regoff(msr), val); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_op_mem_write(vlapic, offset, DWORD, val); + } - return (handled); + return (error); } int diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index bc67f98..d97c819 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -41,7 +41,7 @@ __FBSDID("$FreeBSD$"); #define VMM_MSR_F_EMULATE 0x01 #define VMM_MSR_F_READONLY 0x02 -#define VMM_MSR_F_INVALID 0x04 +#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */ struct vmm_msr { int num; @@ -137,20 +137,15 @@ msr_num_to_idx(u_int num) int emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) { - int handled, idx; + int 
idx; uint64_t *guest_msrs; - handled = 0; - if (lapic_msr(num)) return (lapic_wrmsr(vm, cpu, num, val)); idx = msr_num_to_idx(num); - if (idx < 0) - goto done; - - if (invalid_msr(idx)) - goto done; + if (idx < 0 || invalid_msr(idx)) + return (EINVAL); if (!readonly_msr(idx)) { guest_msrs = vm_guest_msrs(vm, cpu); @@ -163,31 +158,26 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) wrmsr(vmm_msr[idx].num, val); } - handled = 1; -done: - return (handled); + return (0); } int emulate_rdmsr(struct vm *vm, int cpu, u_int num) { - int error, handled, idx; + int error, idx; uint32_t eax, edx; uint64_t result, *guest_msrs; - handled = 0; - if (lapic_msr(num)) { - handled = lapic_rdmsr(vm, cpu, num, &result); + error = lapic_rdmsr(vm, cpu, num, &result); goto done; } idx = msr_num_to_idx(num); - if (idx < 0) - goto done; - - if (invalid_msr(idx)) + if (idx < 0 || invalid_msr(idx)) { + error = EINVAL; goto done; + } guest_msrs = vm_guest_msrs(vm, cpu); result = guest_msrs[idx]; @@ -202,10 +192,10 @@ emulate_rdmsr(struct vm *vm, int cpu, u_int num) result, rdmsr(num)); } - handled = 1; + error = 0; done: - if (handled) { + if (error == 0) { eax = result; edx = result >> 32; error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); @@ -215,7 +205,7 @@ done: if (error) panic("vm_set_register(rdx) error %d", error); } - return (handled); + return (error); } void -- cgit v1.1 From da4e87dfd614fffb88e5a93c988e1caec9c9efe7 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 28 Nov 2012 13:34:44 +0000 Subject: Cleanup the user-space paging exit handler now that the unified instruction emulation is in place. Obtained from: NetApp --- sys/amd64/include/vmm.h | 2 -- sys/amd64/vmm/intel/vmx.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 2fb2194..024c30e 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -267,9 +267,7 @@ struct vm_exit { uint32_t eax; /* valid for out */ } inout; struct { - uint64_t cr3; uint64_t gpa; - int rwx; struct vie vie; } paging; /* diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index af4a03f..2b6ef35 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1303,9 +1303,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) cr3, qual, &vmexit->u.paging.vie); if (!handled) { vmexit->exitcode = VM_EXITCODE_PAGING; - vmexit->u.paging.cr3 = cr3; vmexit->u.paging.gpa = gpa; - vmexit->u.paging.rwx = qual & 0x7; } break; default: -- cgit v1.1 From ffd1f089c33d0e59c0cb85b52bc683272f7880dd Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 29 Nov 2012 06:26:42 +0000 Subject: Add support for the 0x81 AND instruction, now generated by clang in the local APIC code. 0x81 is a read-modify-write instruction - the EPT check that only allowed read or write and not both has been relaxed to allow read and write. 
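Because the destination operand is memory, emulating this form means reading the old value from the faulting GPA, applying the immediate, and writing the result back; the shape of it, reusing the callback interface introduced earlier (a sketch, not the literal vmm_instruction_emul.c hunk):

    /* Read-modify-write: "and r/m32, imm32" against guest-physical memory. */
    error = memread(vm, vcpuid, gpa, &val, size, arg);        /* read   */
    if (error == 0) {
            val &= vie->immediate;                             /* modify */
            error = memwrite(vm, vcpuid, gpa, val, size, arg); /* write  */
    }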
Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 9 +++++---- sys/amd64/vmm/vmm_instruction_emul.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2b6ef35..4f267bb 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1159,15 +1159,16 @@ vmx_ept_fault(struct vm *vm, int cpu, if (ept_qual & EPT_VIOLATION_INST_FETCH) return (UNHANDLED); - /* EPT violation must be a read fault or a write fault but not both */ + /* EPT violation must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; - if ((read ^ write) == 0) + if ((read | write) == 0) return (UNHANDLED); /* - * The EPT violation must have been caused by accessing a guest-physical - * address that is a translation of a guest-linear address. + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 5e5399b..0a7286b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -81,6 +81,11 @@ static const struct vie_op one_byte_opcodes[256] = { [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, + }, + [0x81] = { + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_AND, + .op_flags = VIE_OP_F_IMM, } }; @@ -299,6 +304,30 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, val1 &= val2; error = vie_update_register(vm, vcpuid, reg, val1, size); break; + case 0x81: + printf("0x81 AND\n"); + /* + * AND reg (ModRM:reg) with immediate and store the + * result in reg + * + * 81/ and r/m32, imm32 + * REX.W + 81/ and r/m64, imm32 sign-extended to 64 + */ + if (vie->rex_w) + size = 8; + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + val1 &= vie->immediate; + error = memwrite(vm, vcpuid, gpa, val1, size, arg); + break; default: break; } -- cgit v1.1 From f59654890648245951f48676dc390a6d95f03aae Mon Sep 17 00:00:00 2001 From: grehan Date: Thu, 29 Nov 2012 15:08:13 +0000 Subject: Remove debug printf. Pointed out by: emaste --- sys/amd64/vmm/vmm_instruction_emul.c | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 0a7286b..1c4abf8 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -305,7 +305,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = vie_update_register(vm, vcpuid, reg, val1, size); break; case 0x81: - printf("0x81 AND\n"); /* * AND reg (ModRM:reg) with immediate and store the * result in reg -- cgit v1.1 From 7f24aaf567c1daf5f2478b28960fa3f98e18e374 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 30 Nov 2012 05:40:24 +0000 Subject: Properly screen for the AND 0x81 instruction from the set of group1 0x81 instructions that use the reg bits as an extended opcode. Still todo: properly update rflags. 
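For the group-1 opcodes (0x80/0x81/0x83) the ModRM reg field is not a register operand but an opcode extension: /0 ADD, /1 OR, /2 ADC, /3 SBB, /4 AND, /5 SUB, /6 XOR, /7 CMP. Screening therefore reduces to a check on that field before emulating, roughly:

    /* ModRM.reg selects the group-1 operation; only 81 /4 (AND) is handled. */
    if ((vie->reg & 7) != 4)
            return (EINVAL);    /* anything else is punted as unhandled */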
Pointed out by: jilles@ --- sys/amd64/vmm/vmm_instruction_emul.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 1c4abf8..e73f6bb 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -83,6 +83,7 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_AND, }, [0x81] = { + /* XXX Group 1 extended opcode - not just AND */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_AND, .op_flags = VIE_OP_F_IMM, @@ -311,7 +312,13 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * * 81/ and r/m32, imm32 * REX.W + 81/ and r/m64, imm32 sign-extended to 64 + * + * Currently, only the AND operation of the 0x81 opcode + * is implemented (ModRM:reg = b100). */ + if ((vie->reg & 7) != 4) + break; + if (vie->rex_w) size = 8; -- cgit v1.1 From 7d7f92fbade54e46285282d2c5f456298084d794 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 16 Dec 2012 00:57:14 +0000 Subject: Prefer x2apic mode when running inside a virtual machine. Provide a tunable 'machdep.x2apic_desired' to let the administrator override the default behavior. Provide a read-only sysctl 'machdep.x2apic' to let the administrator know whether the kernel is using x2apic or legacy mmio to access local apic. Tested with Parallels Desktop 8 and bhyve hypervisors. Also tested running on bare metal Intel Xeon E5-2658. Obtained from: NetApp Discussed with: jhb, attilio, avg, grehan --- sys/amd64/amd64/mp_machdep.c | 2 ++ sys/amd64/include/apicvar.h | 1 + 2 files changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index b4a0be4..f7423be 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -708,6 +708,8 @@ init_secondary(void) wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); + lapic_init_ap(); + /* Disable local APIC just to be sure. */ lapic_disable(); diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index ae2f5b9..dee5900 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -209,6 +209,7 @@ int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); +void lapic_init_ap(void); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); -- cgit v1.1 From bc64633d9d492d8fadbd7972ea2ae81a660233a3 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 16 Dec 2012 01:20:08 +0000 Subject: Modify the default behavior of bhyve such that it no longer forces the use of x2apic mode on the guest. The guest can decide whether or not it wants to use legacy mmio or x2apic access to the APIC by writing to the MSR_APICBASE register. 
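The selection hinges on the x2APIC-enable (EXTD) bit of the IA32_APIC_BASE MSR, which the FreeBSD headers spell APICBASE_X2APIC; a minimal sketch of the check the hypervisor ends up honoring (hypothetical helper name, not the vlapic.c code):

    #include <stdint.h>

    #define X2APIC_ENABLE_BIT   (1ULL << 10)    /* EXTD bit of IA32_APIC_BASE */

    /* Guest opts into x2apic by setting the bit; clearing it keeps MMIO access. */
    static int
    guest_wants_x2apic(uint64_t msr_apicbase)
    {
            return ((msr_apicbase & X2APIC_ENABLE_BIT) != 0);
    }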
Obtained from: NetApp --- sys/amd64/vmm/io/vlapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 911ed64..15fc6c2 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -896,8 +896,6 @@ vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) vlapic = vm_lapic(vm, vcpuid); - if (state == X2APIC_ENABLED) - vlapic->msr_apicbase |= APICBASE_X2APIC; - else + if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; } -- cgit v1.1 From 01173b0b4a9b00c153489a51f2cba1b3d0cfc119 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 02:04:41 +0000 Subject: The "unrestricted guest" capability is a feature of Intel VT-x that allows the guest to execute real or unpaged protected mode code - bhyve relies on this feature to execute the AP bootstrap code. Get rid of the hack that allowed bhyve to support SMP guests on processors that do not have the "unrestricted guest" capability. This hack was entirely FreeBSD-specific and would not work with any other guest OS. Instead, limit the number of vcpus to 1 when executing on processors without "unrestricted guest" capability. Suggested by: grehan Obtained from: NetApp --- sys/amd64/amd64/mp_machdep.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index f7423be..33e0814 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -145,26 +145,6 @@ struct cpu_info { int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; -/* - * Trampoline for hypervisor direct 64-bit jump. - * - * 0 - signature for guest->host verification - * 8 - virtual address of this page - * 16 - instruction virtual address - * 24 - stack pointer virtual address - * 32 - CR3, physical address of kernel page table - * 40 - 24-byte area for null/code/data GDT entries - */ -#define MP_V64T_SIG 0xcafebabecafebabeULL -struct mp_v64tramp { - uint64_t mt_sig; - uint64_t mt_virt; - uint64_t mt_eip; - uint64_t mt_rsp; - uint64_t mt_cr3; - uint64_t mt_gdtr[3]; -}; - /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; @@ -967,29 +947,6 @@ start_all_aps(void) bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; - /* - * If running in a VM that doesn't support the unrestricted - * guest 16-bit mode, forget most of the above and create - * the data block that allows the hypervisor to direct-jump - * into 64-bit mode. Copy this over the top of the 16-bit - * bootstrap. The startup-IPI informs the hypervisor which - * physical page this data block lies in. The hypervisor - * will then use the block to initialise register state of - * the AP in an almost identical fashion to how it builds - * the BSP initial register state. 
- */ - if (testenv("hw.use_bvm_mptramp")) { - struct mp_v64tramp mv; - - bzero(&mv, sizeof(mv)); - mv.mt_sig = MP_V64T_SIG; - mv.mt_virt = (uint64_t) va; - mv.mt_eip = (uint64_t) init_secondary; - mv.mt_rsp = (uint64_t) bootSTK; - mv.mt_cr3 = KPML4phys; - bcopy(&mv, (void *) va, sizeof(mv)); - } - /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ -- cgit v1.1 From fec8c768eb1c50ae42da476ee843414b7f87b8f5 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 02:49:12 +0000 Subject: There is no need for 'start_emulating()' and 'stop_emulating()' to be defined in so remove them from there. Obtained from: NetApp --- sys/amd64/include/cpufunc.h | 17 ----------------- sys/amd64/vmm/vmm.c | 4 ++-- 2 files changed, 2 insertions(+), 19 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index 7243173..881fcd2 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -705,23 +705,6 @@ intr_disable(void) return (rflags); } -#ifndef CR0_TS -/* Defined in */ -#define CR0_TS 0x00000008 -#endif -static __inline void -start_emulating(void) -{ - __asm __volatile("smsw %%ax; orb %0,%%al; lmsw %%ax" - : : "n" (CR0_TS) : "ax"); -} - -static __inline void -stop_emulating(void) -{ - __asm __volatile("clts"); -} - static __inline void intr_restore(register_t rflags) { diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index eae9ccc..a4dea79 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -145,8 +145,8 @@ static struct vmm_ops *ops; #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) -#define fpu_start_emulating() start_emulating() -#define fpu_stop_emulating() stop_emulating() +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ -- cgit v1.1 From 736fc919674c3c284d5611e7e9b572385c4dbc0e Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 4 Jan 2013 03:02:43 +0000 Subject: There is no need for a special 'BHYVE' kernel configuration file anymore - 'GENERIC' works fine. Obtained from: NetApp --- sys/amd64/conf/BHYVE | 345 --------------------------------------------------- 1 file changed, 345 deletions(-) delete mode 100644 sys/amd64/conf/BHYVE (limited to 'sys/amd64') diff --git a/sys/amd64/conf/BHYVE b/sys/amd64/conf/BHYVE deleted file mode 100644 index 89c8ea2..0000000 --- a/sys/amd64/conf/BHYVE +++ /dev/null @@ -1,345 +0,0 @@ -# -# BHYVE -- Kernel configuration file for FreeBSD/amd64 bhyve guest -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. 
-# -# $FreeBSD: projects/bhyve/sys/amd64/conf/GENERIC 221914 2011-05-14 20:35:01Z jhb $ - -cpu HAMMER -ident BHYVE - -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols - -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options SCTP # Stream Control Transmission Protocol -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options UFS_GJOURNAL # Enable gjournal-based UFS journaling -options MD_ROOT # MD is a potential root device -options NFSCL # New Network Filesystem Client -options NFSD # New Network Filesystem Server -options NFSLOCKD # Network Lock Manager -options NFS_ROOT # NFS usable as /, requires NFSCLIENT -options MSDOSFS # MSDOS Filesystem -options CD9660 # ISO 9660 Filesystem -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_GPT # GUID Partition Tables. -options GEOM_LABEL # Provides labelization -options COMPAT_FREEBSD32 # Compatible with i386 binaries -options COMPAT_FREEBSD4 # Compatible with FreeBSD4 -options COMPAT_FREEBSD5 # Compatible with FreeBSD5 -options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -options COMPAT_FREEBSD7 # Compatible with FreeBSD7 -options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI -options KTRACE # ktrace(1) support -options STACK # stack(9) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions -options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -#options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) -options AUDIT # Security event auditing -options MAC # TrustedBSD MAC Framework -#options KDTRACE_FRAME # Ensure frames are compiled in -#options KDTRACE_HOOKS # Kernel DTrace hooks -options INCLUDE_CONFIG_FILE # Include this file in kernel - -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options DEADLKRES # Enable the deadlock resolver -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed -options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones - -# Make an SMP-capable kernel by default -options SMP # Symmetric MultiProcessor Kernel - -# CPU frequency control -#device cpufreq - -# Bus support. -#device acpi -device pci - -# Floppy drives -#device fdc - -# ATA controllers -#device ahci # AHCI-compatible SATA controllers -#device ata # Legacy ATA/SATA controllers -#options ATA_CAM # Handle legacy controllers with CAM -#options ATA_STATIC_ID # Static device numbering -#device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA -#device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA - -# SCSI Controllers -#device ahc # AHA2940 and onboard AIC7xxx devices -#options AHC_REG_PRETTY_PRINT # Print register bitfields in debug - # output. Adds ~128k to driver. 
-#device ahd # AHA39320/29320 and onboard AIC79xx devices -#options AHD_REG_PRETTY_PRINT # Print register bitfields in debug - # output. Adds ~215k to driver. -#device amd # AMD 53C974 (Tekram DC-390(T)) -#device hptiop # Highpoint RocketRaid 3xxx series -#device isp # Qlogic family -#device ispfw # Firmware for QLogic HBAs- normally a module -#device mpt # LSI-Logic MPT-Fusion -#device mps # LSI-Logic MPT-Fusion 2 -#device ncr # NCR/Symbios Logic -#device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') -#device trm # Tekram DC395U/UW/F DC315U adapters - -#device adv # Advansys SCSI adapters -#device adw # Advansys wide SCSI adapters -#device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. -#device bt # Buslogic/Mylex MultiMaster SCSI adapters - -# ATA/SCSI peripherals -#device scbus # SCSI bus (required for ATA/SCSI) -#device ch # SCSI media changers -#device da # Direct Access (disks) -#device sa # Sequential Access (tape etc) -#device cd # CD -#device pass # Passthrough device (direct ATA/SCSI access) -#device ses # SCSI Environmental Services (and SAF-TE) - -# RAID controllers interfaced to the SCSI subsystem -#device amr # AMI MegaRAID -#device arcmsr # Areca SATA II RAID -#XXX it is not 64-bit clean, -scottl -#device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID -#device ciss # Compaq Smart RAID 5* -#device dpt # DPT Smartcache III, IV - See NOTES for options -#device hptmv # Highpoint RocketRAID 182x -#device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx -#device iir # Intel Integrated RAID -#device ips # IBM (Adaptec) ServeRAID -#device mly # Mylex AcceleRAID/eXtremeRAID -#device twa # 3ware 9000 series PATA/SATA RAID - -# RAID controllers -#device aac # Adaptec FSA RAID -#device aacp # SCSI passthrough for aac (requires CAM) -#device ida # Compaq Smart RAID -#device mfi # LSI MegaRAID SAS -#device mlx # Mylex DAC960 family -#XXX pointer/int warnings -#device pst # Promise Supertrak SX6000 -#device twe # 3ware ATA RAID - -# atkbdc0 controls both the keyboard and the PS/2 mouse -#device atkbdc # AT keyboard controller -#device atkbd # AT keyboard -#device psm # PS/2 mouse - -#device kbdmux # keyboard multiplexer - -#device vga # VGA video card driver - -#device splash # Splash screen and screen saver support - -# syscons is the default console driver, resembling an SCO console -#device sc -#options SC_PIXEL_MODE # add support for the raster text mode - -#device agp # support several AGP chipsets - -# PCCARD (PCMCIA) support -# PCMCIA and cardbus bridge support -#device cbb # cardbus (yenta) bridge -#device pccard # PC Card (16-bit) bus -#device cardbus # CardBus (32-bit) bus - -# Serial (COM) ports -device uart # Generic UART driver - -# Parallel port -#device ppc -#device ppbus # Parallel port bus (required) -#device lpt # Printer -#device plip # TCP/IP over parallel -#device ppi # Parallel port interface device -#device vpo # Requires scbus and da - -# If you've got a "dumb" serial or parallel PCI card that is -# supported by the puc(4) glue driver, uncomment the following -# line to enable it (connects to sio, uart and/or ppc drivers): -#device puc - -# PCI Ethernet NICs. 
-#device bxe # Broadcom BCM57710/BCM57711/BCM57711E 10Gb Ethernet -#device de # DEC/Intel DC21x4x (``Tulip'') -#device em # Intel PRO/1000 Gigabit Ethernet Family -#device igb # Intel PRO/1000 PCIE Server Gigabit Family -#device ixgbe # Intel PRO/10GbE PCIE Ethernet Family -#device le # AMD Am7900 LANCE and Am79C9xx PCnet -#device ti # Alteon Networks Tigon I/II gigabit Ethernet -#device txp # 3Com 3cR990 (``Typhoon'') -#device vx # 3Com 3c590, 3c595 (``Vortex'') - -# PCI Ethernet NICs that use the common MII bus controller code. -# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! -#device miibus # MII bus support -#device ae # Attansic/Atheros L2 FastEthernet -#device age # Attansic/Atheros L1 Gigabit Ethernet -#device alc # Atheros AR8131/AR8132 Ethernet -#device ale # Atheros AR8121/AR8113/AR8114 Ethernet -#device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet -#device bfe # Broadcom BCM440x 10/100 Ethernet -#device bge # Broadcom BCM570xx Gigabit Ethernet -#device dc # DEC/Intel 21143 and various workalikes -#device et # Agere ET1310 10/100/Gigabit Ethernet -#device fxp # Intel EtherExpress PRO/100B (82557, 82558) -#device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet -#device lge # Level 1 LXT1001 gigabit Ethernet -#device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet -#device nfe # nVidia nForce MCP on-board Ethernet -#device nge # NatSemi DP83820 gigabit Ethernet -#device nve # nVidia nForce MCP on-board Ethernet Networking -#device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') -#device re # RealTek 8139C+/8169/8169S/8110S -#device rl # RealTek 8129/8139 -#device sf # Adaptec AIC-6915 (``Starfire'') -#device sge # Silicon Integrated Systems SiS190/191 -#device sis # Silicon Integrated Systems SiS 900/SiS 7016 -#device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet -#device ste # Sundance ST201 (D-Link DFE-550TX) -#device stge # Sundance/Tamarack TC9021 gigabit Ethernet -#device tl # Texas Instruments ThunderLAN -#device tx # SMC EtherPower II (83c170 ``EPIC'') -#device vge # VIA VT612x gigabit Ethernet -#device vr # VIA Rhine, Rhine II -#device wb # Winbond W89C840F -#device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') - -# ISA Ethernet NICs. pccard NICs included. -#device cs # Crystal Semiconductor CS89x0 NIC -# 'device ed' requires 'device miibus' -#device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards -#device ex # Intel EtherExpress Pro/10 and Pro/10+ -#device ep # Etherlink III based cards -#device fe # Fujitsu MB8696x based cards -#device sn # SMC's 9000 series of Ethernet chips -#device xe # Xircom pccard Ethernet - -# Wireless NIC cards -#device wlan # 802.11 support -#options IEEE80211_DEBUG # enable debug msgs -#options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's -#options IEEE80211_SUPPORT_MESH # enable 802.11s draft support -#device wlan_wep # 802.11 WEP support -#device wlan_ccmp # 802.11 CCMP support -#device wlan_tkip # 802.11 TKIP support -#device wlan_amrr # AMRR transmit rate control algorithm -#device an # Aironet 4500/4800 802.11 wireless NICs. -#device ath # Atheros NIC's -#device ath_pci # Atheros pci/cardbus glue -#device ath_hal # pci/cardbus chip support -#options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors -#device ath_rate_sample # SampleRate tx rate control for ath -#device bwi # Broadcom BCM430x/BCM431x wireless NICs. -#device bwn # Broadcom BCM43xx wireless NICs. -#device ipw # Intel 2100 wireless NICs. -#device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. 
-#device iwn # Intel 4965/1000/5000/6000 wireless NICs. -#device malo # Marvell Libertas wireless NICs. -#device mwl # Marvell 88W8363 802.11n wireless NICs. -#device ral # Ralink Technology RT2500 wireless NICs. -#device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. -#device wpi # Intel 3945ABG wireless NICs. - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device vlan # 802.1Q VLAN support -device tun # Packet tunnel. -device pty # BSD-style compatibility pseudo ttys -device md # Memory "disks" -device gif # IPv6 and IPv4 tunneling -device faith # IPv6-to-IPv4 relaying (translation) -device firmware # firmware assist module - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter - -# USB support -#options USB_DEBUG # enable debug msgs -#device uhci # UHCI PCI->USB interface -#device ohci # OHCI PCI->USB interface -#device ehci # EHCI PCI->USB interface (USB 2.0) -#device usb # USB Bus (required) -#device udbp # USB Double Bulk Pipe devices (needs netgraph) -#device uhid # "Human Interface Devices" -#device ukbd # Keyboard -#device ulpt # Printer -#device umass # Disks/Mass storage - Requires scbus and da -#device ums # Mouse -#device urio # Diamond Rio 500 MP3 player -# USB Serial devices -#device u3g # USB-based 3G modems (Option, Huawei, Sierra) -#device uark # Technologies ARK3116 based serial adapters -#device ubsa # Belkin F5U103 and compatible serial adapters -#device uftdi # For FTDI usb serial adapters -#device uipaq # Some WinCE based devices -#device uplcom # Prolific PL-2303 serial adapters -#device uslcom # SI Labs CP2101/CP2102 serial adapters -#device uvisor # Visor and Palm devices -#device uvscom # USB serial support for DDI pocket's PHS -# USB Ethernet, requires miibus -#device aue # ADMtek USB Ethernet -#device axe # ASIX Electronics USB Ethernet -#device cdce # Generic USB over Ethernet -#device cue # CATC USB Ethernet -#device kue # Kawasaki LSI USB Ethernet -#device rue # RealTek RTL8150 USB Ethernet -#device udav # Davicom DM9601E USB -# USB Wireless -#device rum # Ralink Technology RT2501USB wireless NICs -#device run # Ralink Technology RT2700/RT2800/RT3000 NICs. -#device uath # Atheros AR5523 wireless NICs -#device upgt # Conexant/Intersil PrismGT wireless NICs. -#device ural # Ralink Technology RT2500USB wireless NICs -#device urtw # Realtek RTL8187B/L wireless NICs -#device zyd # ZyDAS zb1211/zb1211b wireless NICs - -# FireWire support -#device firewire # FireWire bus code -#device sbp # SCSI over FireWire (Requires scbus and da) -#device fwe # Ethernet over FireWire (non-standard!) -#device fwip # IP over FireWire (RFC 2734,3146) -#device dcons # Dumb console driver -#device dcons_crom # Configuration ROM for dcons - -device bvmconsole # brain dead simple bvm console -device bvmdebug # brain dead simple bvm gdb pipe - -device mptable -options NKPT=256 -- cgit v1.1 From eda0d7f2563ff44f103201dbb5a841351f0c024a Mon Sep 17 00:00:00 2001 From: neel Date: Sat, 5 Jan 2013 03:35:30 +0000 Subject: bhyve does not require a custom configuration file anymore so make the GENERIC identical to the one in HEAD. 
Obtained from: NetApp --- sys/amd64/conf/GENERIC | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 9c72500..7aff1e8 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -74,7 +74,7 @@ options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging support. Always need this: options KDB # Enable kernel debugger support. # For minimum debugger support (stable branch) use: -options KDB_TRACE # Print a stack trace for a panic. +#options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use this instead: options DDB # Support DDB. options GDB # Support remote GDB. @@ -326,11 +326,6 @@ device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver device dcons_crom # Configuration ROM for dcons -# bhyve options -device bvmconsole # brain dead simple bvm console -device bvmdebug # brain dead simple bvm gdb pipe -device mptable - # Sound support device sound # Generic sound driver (required) device snd_cmi # CMedia CMI8338/CMI8738 -- cgit v1.1 From d184bb1077cf7d96f98f7b5b1fb24951ff6a80e7 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 6 Jan 2013 05:37:26 +0000 Subject: Revert changes for x2apic support from projects/bhyve. During the early days of bhyve it did not support instruction emulation which necessitated the use of x2apic to access the local apic. This is no longer the case and the dependency on x2apic has gone away. The x2apic patches can be considered independently of bhyve and will be merged into head via projects/x2apic. Discussed with: grehan --- sys/amd64/amd64/apic_vector.S | 55 ++++++++++++++----------------------------- sys/amd64/amd64/mp_machdep.c | 2 -- sys/amd64/include/apicvar.h | 1 - 3 files changed, 18 insertions(+), 40 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 8004153..6465247 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -57,15 +57,8 @@ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ FAKE_MCOUNT(TF_RIP(%rsp)) ; \ movq lapic, %rdx ; /* pointer to local APIC */ \ - testq %rdx, %rdx; \ - jnz 3f; \ - movl $MSR_APIC_ISR ## index, %ecx; \ - rdmsr; \ - jmp 4f; \ -3: ; \ movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ -4: ; \ - bsrl %eax, %eax ; /* index of highset set bit in ISR */ \ + bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ jz 1f ; \ addl $(32 * index),%eax ; \ movq %rsp, %rsi ; \ @@ -136,26 +129,6 @@ IDTVEC(errorint) jmp doreti #ifdef SMP - -/* - * We assume that %rax is being saved/restored outside of this macro - */ -#define DO_EOI \ - movq lapic, %rax; \ - testq %rax, %rax; \ - jz 8f; \ - movl $0, LA_EOI(%rax); \ - jmp 9f; \ -8:; \ - pushq %rcx; \ - pushq %rdx; \ - xorl %edx, %edx; /* eax is already zero */ \ - movl $MSR_APIC_EOI, %ecx; \ - wrmsr; \ - popq %rdx; \ - popq %rcx; \ -9: - /* * Global address space TLB shootdown. 
*/ @@ -180,7 +153,8 @@ IDTVEC(invltlb) movq %cr3, %rax /* invalidate the TLB */ movq %rax, %cr3 - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -212,7 +186,8 @@ IDTVEC(invlpg) movq smp_tlb_addr1, %rax invlpg (%rax) /* invalidate single page */ - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -249,7 +224,8 @@ IDTVEC(invlrng) cmpq %rax, %rdx jb 1b - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -276,7 +252,8 @@ IDTVEC(invlcache) wbinvd - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait @@ -292,8 +269,9 @@ IDTVEC(invlcache) IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME - DO_EOI - + movq lapic, %rdx + movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ + FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler @@ -308,7 +286,8 @@ IDTVEC(ipi_intr_bitmap_handler) IDTVEC(cpustop) PUSH_FRAME - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ call cpustop_handler jmp doreti @@ -322,7 +301,8 @@ IDTVEC(cpususpend) PUSH_FRAME call cpususpend_handler - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ jmp doreti /* @@ -340,6 +320,7 @@ IDTVEC(rendezvous) incq (%rax) #endif call smp_rendezvous_action - DO_EOI + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 33e0814..d2e4aad 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -688,8 +688,6 @@ init_secondary(void) wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); - lapic_init_ap(); - /* Disable local APIC just to be sure. */ lapic_disable(); diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index dee5900..ae2f5b9 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -209,7 +209,6 @@ int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); -void lapic_init_ap(void); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); -- cgit v1.1 From 4c17637f9cfd75b0ebb0474d3f2f39483a453913 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 9 Jan 2013 03:32:23 +0000 Subject: IFC @ r245205 --- sys/amd64/amd64/vm_machdep.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index a40eaba..9883715 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -574,10 +574,9 @@ cpu_reset_proxy() cpuset_t tcrp; cpu_reset_proxy_active = 1; - while (cpu_reset_proxy_active == 1) { - ia32_pause(); - ; /* Wait for other cpu to see that we've started */ - } + while (cpu_reset_proxy_active == 1) + ia32_pause(); /* Wait for other cpu to see that we've started */ + CPU_SETOF(cpu_reset_proxyid, &tcrp); stop_cpus(tcrp); printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); -- cgit v1.1 From df2fc90f0e744447190cbd4a0d67474ddadfa96c Mon Sep 17 00:00:00 2001 From: bryanv Date: Sun, 13 Jan 2013 07:14:16 +0000 Subject: Add VirtIO to the i386 and amd64 GENERIC kernels This also removes the kludge from r239009 that covered only the network driver. 
Reviewed by: grehan Approved by: grehan (mentor) MFC after: 1 week --- sys/amd64/conf/GENERIC | 8 ++++++++ sys/amd64/conf/NOTES | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 48f41b3..e53f692 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -332,3 +332,11 @@ device snd_via8233 # VIA VT8233x Audio device mmc # MMC/SD bus device mmcsd # MMC/SD memory card device sdhci # Generic PCI SD Host Controller + +# VirtIO support +device virtio # Generic VirtIO bus (required) +device virtio_pci # VirtIO PCI device +device vtnet # VirtIO Ethernet device +device virtio_blk # VirtIO Block device +device virtio_scsi # VirtIO SCSI device +device virtio_balloon # VirtIO Memory Balloon device diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 6562981..a4371f7 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -440,6 +440,15 @@ device safe # SafeNet 1141 options SAFE_DEBUG # enable debugging support: hw.safe.debug options SAFE_RNDTEST # enable rndtest support +# +# VirtIO support +device virtio # Generic VirtIO bus (required) +device virtio_pci # VirtIO PCI Interface +device vtnet # VirtIO Ethernet device +device virtio_blk # VirtIO Block device +device virtio_scsi # VirtIO SCSI device +device virtio_balloon # VirtIO Memory Balloon device + ##################################################################### # -- cgit v1.1 From 4a3c4478d3346235378985c52457d9bd03d7f401 Mon Sep 17 00:00:00 2001 From: jhb Date: Thu, 17 Jan 2013 21:32:25 +0000 Subject: Don't attempt to use clflush on the local APIC register window. Various CPUs exhibit bad behavior if this is done (Intel Errata AAJ3, hangs on Pentium-M, and trashing of the local APIC registers on a VIA C7). The local APIC is implicitly mapped UC already via MTRRs, so the clflush isn't necessary anyway. MFC after: 2 weeks --- sys/amd64/amd64/pmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 8e06ff9..352cb34 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1150,6 +1150,15 @@ pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) eva - sva < PMAP_CLFLUSH_THRESHOLD) { /* + * XXX: Some CPUs fault, hang, or trash the local APIC + * registers if we use CLFLUSH on the local APIC + * range. The local APIC is always uncached, so we + * don't need to flush for that range anyway. + */ + if (pmap_kextract(sva) == lapic_paddr) + return; + + /* * Otherwise, do per-cache line flush. Use the mfence * instruction to insure that previous stores are * included in the write-back. The processor -- cgit v1.1 From fe9918fd5583c14112273668674d7275662c4961 Mon Sep 17 00:00:00 2001 From: jhb Date: Sat, 19 Jan 2013 01:18:22 +0000 Subject: Fix build with SMP disabled.` Reported by: bf --- sys/amd64/amd64/pmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 352cb34..f73e956 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include +#include #include #include #include @@ -133,6 +134,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include #include #include -- cgit v1.1 From a0f44a6f49338c9ec7e274ad4e8a272008cd1f60 Mon Sep 17 00:00:00 2001 From: neel Date: Sun, 20 Jan 2013 03:42:49 +0000 Subject: Add svn properties to the recently merged bhyve source files. 
The pre-commit hook will not allow any commits without the svn:keywords property in head. --- sys/amd64/include/vmm.h | 2 +- sys/amd64/include/vmm_dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 024c30e..ec94083 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + * $FreeBSD$ */ #ifndef _VMM_H_ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 79f893d..2311673 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + * $FreeBSD$ */ #ifndef _VMM_DEV_H_ -- cgit v1.1 From 92a8d9884de2b1441cb846d50856e44cfa2b0024 Mon Sep 17 00:00:00 2001 From: neel Date: Mon, 21 Jan 2013 01:33:10 +0000 Subject: Postpone vmm module initialization until after SMP is initialized - particularly that 'smp_started != 0'. This is required because the VT-x initialization calls smp_rendezvous() to set the CR4_VMXE bit on all the cpus. With this change we can preload vmm.ko from the loader. Reported by: alfred@, sbruno@ Obtained from: NetApp --- sys/amd64/vmm/vmm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index a4dea79..d0e6427 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -249,11 +249,15 @@ static moduledata_t vmm_kmod = { }; /* - * Execute the module load handler after the pci passthru driver has had - * a chance to claim devices. We need this information at the time we do - * iommu initialization. + * vmm initialization has the following dependencies: + * + * - iommu initialization must happen after the pci passthru driver has had + * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). */ -DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); -- cgit v1.1 From af17a55dfd7a008dea74152e32f5d6c803b46bdd Mon Sep 17 00:00:00 2001 From: jhb Date: Wed, 23 Jan 2013 21:44:48 +0000 Subject: Don't assume that all Linux TCP-level socket options are identical to FreeBSD TCP-level socket options (only the first two are). Instead, using a mapping function and fail unsupported options as we do for other socket option levels. 
MFC after: 2 weeks --- sys/amd64/linux32/linux.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index 2c269d3..c18ee22 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -725,6 +725,13 @@ union l_semun { #define LINUX_IP_ADD_MEMBERSHIP 35 #define LINUX_IP_DROP_MEMBERSHIP 36 +#define LINUX_TCP_NODELAY 1 +#define LINUX_TCP_MAXSEG 2 +#define LINUX_TCP_KEEPIDLE 4 +#define LINUX_TCP_KEEPINTVL 5 +#define LINUX_TCP_KEEPCNT 6 +#define LINUX_TCP_MD5SIG 14 + struct l_sockaddr { l_ushort sa_family; char sa_data[14]; -- cgit v1.1 From 94554367a02e21be52d85147b7190cad036ebd31 Mon Sep 17 00:00:00 2001 From: grehan Date: Fri, 25 Jan 2013 21:38:31 +0000 Subject: Always allow access to the sysenter cs/esp/eip MSRs since they are automatically saved and restored in the VMCS. Reviewed by: neel Obtained from: NetApp --- sys/amd64/vmm/intel/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 4f267bb..287ac8c 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -696,6 +696,10 @@ vmx_vminit(struct vm *vm) * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * * Guest KGSBASE is saved and restored in the guest MSR save area. * Host KGSBASE is restored before returning to userland from the pcb. * There will be a window of time when we are executing in the host @@ -708,6 +712,9 @@ vmx_vminit(struct vm *vm) */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_KGSBASE) || guest_msr_rw(vmx, MSR_EFER)) panic("vmx_vminit: error setting guest msr access"); -- cgit v1.1 From 2617d9f095bb1dfa934ef021a7237482304fcdb9 Mon Sep 17 00:00:00 2001 From: jhb Date: Tue, 29 Jan 2013 18:41:30 +0000 Subject: Reduce duplication between i386/linux/linux.h and amd64/linux32/linux.h by moving bits that are MI out into headers in compat/linux. Reviewed by: Chagin Dmitry dmitry | gmail MFC after: 2 weeks --- sys/amd64/linux32/linux.h | 160 ------------------------------------- sys/amd64/linux32/linux32_sysvec.c | 1 + 2 files changed, 1 insertion(+), 160 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index c18ee22..7b52a64 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -107,11 +107,6 @@ typedef struct { /* * Miscellaneous */ -#define LINUX_NAME_MAX 255 -#define LINUX_MAX_UTSNAME 65 - -#define LINUX_CTL_MAXNAME 10 - #define LINUX_AT_COUNT 16 /* Count of used aux entry types. * Keep this synchronized with * elf_linux_fixup() code. 
@@ -127,11 +122,6 @@ struct l___sysctl_args l_ulong __spare[4]; } __packed; -/* Scheduling policies */ -#define LINUX_SCHED_OTHER 0 -#define LINUX_SCHED_FIFO 1 -#define LINUX_SCHED_RR 2 - /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 @@ -265,15 +255,6 @@ struct l_statfs64 { l_int f_spare[6]; } __packed; -struct l_new_utsname { - char sysname[LINUX_MAX_UTSNAME]; - char nodename[LINUX_MAX_UTSNAME]; - char release[LINUX_MAX_UTSNAME]; - char version[LINUX_MAX_UTSNAME]; - char machine[LINUX_MAX_UTSNAME]; - char domainname[LINUX_MAX_UTSNAME]; -} __packed; - /* * Signalling */ @@ -535,27 +516,9 @@ struct l_rt_sigframe { l_handler_t sf_handler; } __packed; -extern int bsd_to_linux_signal[]; -extern int linux_to_bsd_signal[]; extern struct sysentvec elf_linux_sysvec; /* - * Pluggable ioctl handlers - */ -struct linux_ioctl_args; -struct thread; - -typedef int linux_ioctl_function_t(struct thread *, struct linux_ioctl_args *); - -struct linux_ioctl_handler { - linux_ioctl_function_t *func; - int low, high; -}; - -int linux_ioctl_register_handler(struct linux_ioctl_handler *h); -int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h); - -/* * open/fcntl flags */ #define LINUX_O_RDONLY 00000000 @@ -597,65 +560,6 @@ int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h); #define LINUX_F_WRLCK 1 #define LINUX_F_UNLCK 2 -/* - * posix_fadvise advice - */ -#define LINUX_POSIX_FADV_NORMAL 0 -#define LINUX_POSIX_FADV_RANDOM 1 -#define LINUX_POSIX_FADV_SEQUENTIAL 2 -#define LINUX_POSIX_FADV_WILLNEED 3 -#define LINUX_POSIX_FADV_DONTNEED 4 -#define LINUX_POSIX_FADV_NOREUSE 5 - -/* - * mount flags - */ -#define LINUX_MS_RDONLY 0x0001 -#define LINUX_MS_NOSUID 0x0002 -#define LINUX_MS_NODEV 0x0004 -#define LINUX_MS_NOEXEC 0x0008 -#define LINUX_MS_REMOUNT 0x0020 - -/* - * SystemV IPC defines - */ -#define LINUX_SEMOP 1 -#define LINUX_SEMGET 2 -#define LINUX_SEMCTL 3 -#define LINUX_MSGSND 11 -#define LINUX_MSGRCV 12 -#define LINUX_MSGGET 13 -#define LINUX_MSGCTL 14 -#define LINUX_SHMAT 21 -#define LINUX_SHMDT 22 -#define LINUX_SHMGET 23 -#define LINUX_SHMCTL 24 - -#define LINUX_IPC_RMID 0 -#define LINUX_IPC_SET 1 -#define LINUX_IPC_STAT 2 -#define LINUX_IPC_INFO 3 - -#define LINUX_SHM_LOCK 11 -#define LINUX_SHM_UNLOCK 12 -#define LINUX_SHM_STAT 13 -#define LINUX_SHM_INFO 14 - -#define LINUX_SHM_RDONLY 0x1000 -#define LINUX_SHM_RND 0x2000 -#define LINUX_SHM_REMAP 0x4000 - -/* semctl commands */ -#define LINUX_GETPID 11 -#define LINUX_GETVAL 12 -#define LINUX_GETALL 13 -#define LINUX_GETNCNT 14 -#define LINUX_GETZCNT 15 -#define LINUX_SETVAL 16 -#define LINUX_SETALL 17 -#define LINUX_SEM_STAT 18 -#define LINUX_SEM_INFO 19 - union l_semun { l_int val; l_uintptr_t buf; @@ -667,25 +571,6 @@ union l_semun { /* * Socket defines */ -#define LINUX_SOCKET 1 -#define LINUX_BIND 2 -#define LINUX_CONNECT 3 -#define LINUX_LISTEN 4 -#define LINUX_ACCEPT 5 -#define LINUX_GETSOCKNAME 6 -#define LINUX_GETPEERNAME 7 -#define LINUX_SOCKETPAIR 8 -#define LINUX_SEND 9 -#define LINUX_RECV 10 -#define LINUX_SENDTO 11 -#define LINUX_RECVFROM 12 -#define LINUX_SHUTDOWN 13 -#define LINUX_SETSOCKOPT 14 -#define LINUX_GETSOCKOPT 15 -#define LINUX_SENDMSG 16 -#define LINUX_RECVMSG 17 -#define LINUX_ACCEPT4 18 - #define LINUX_SOL_SOCKET 1 #define LINUX_SOL_IP 0 #define LINUX_SOL_IPX 256 @@ -714,24 +599,6 @@ union l_semun { #define LINUX_SO_TIMESTAMP 29 #define LINUX_SO_ACCEPTCONN 30 -#define LINUX_IP_TOS 1 -#define LINUX_IP_TTL 2 -#define LINUX_IP_HDRINCL 3 -#define LINUX_IP_OPTIONS 4 - 
-#define LINUX_IP_MULTICAST_IF 32 -#define LINUX_IP_MULTICAST_TTL 33 -#define LINUX_IP_MULTICAST_LOOP 34 -#define LINUX_IP_ADD_MEMBERSHIP 35 -#define LINUX_IP_DROP_MEMBERSHIP 36 - -#define LINUX_TCP_NODELAY 1 -#define LINUX_TCP_MAXSEG 2 -#define LINUX_TCP_KEEPIDLE 4 -#define LINUX_TCP_KEEPINTVL 5 -#define LINUX_TCP_KEEPCNT 6 -#define LINUX_TCP_MD5SIG 14 - struct l_sockaddr { l_ushort sa_family; char sa_data[14]; @@ -897,30 +764,6 @@ struct l_user_desc { #define LINUX_GET_USEABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_USEABLE) & 1) -#define LINUX_CLOCK_REALTIME 0 -#define LINUX_CLOCK_MONOTONIC 1 -#define LINUX_CLOCK_PROCESS_CPUTIME_ID 2 -#define LINUX_CLOCK_THREAD_CPUTIME_ID 3 -#define LINUX_CLOCK_REALTIME_HR 4 -#define LINUX_CLOCK_MONOTONIC_HR 5 - -#define LINUX_CLONE_VM 0x00000100 -#define LINUX_CLONE_FS 0x00000200 -#define LINUX_CLONE_FILES 0x00000400 -#define LINUX_CLONE_SIGHAND 0x00000800 -#define LINUX_CLONE_PID 0x00001000 /* No longer exist in Linux */ -#define LINUX_CLONE_VFORK 0x00004000 -#define LINUX_CLONE_PARENT 0x00008000 -#define LINUX_CLONE_THREAD 0x00010000 -#define LINUX_CLONE_SETTLS 0x00080000 -#define LINUX_CLONE_PARENT_SETTID 0x00100000 -#define LINUX_CLONE_CHILD_CLEARTID 0x00200000 -#define LINUX_CLONE_CHILD_SETTID 0x01000000 - -#define LINUX_THREADING_FLAGS \ - (LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES | \ - LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD) - struct iovec; struct l_iovec32 { @@ -942,7 +785,4 @@ struct linux_robust_list_head { l_uintptr_t pending_list; }; -int linux_set_upcall_kse(struct thread *td, register_t stack); -int linux_set_cloned_tls(struct thread *td, void *desc); - #endif /* !_AMD64_LINUX_H_ */ diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 5afc9ce..42500da 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include -- cgit v1.1 From 1ae7af0ed87a074e560452e9f87dc50964a20275 Mon Sep 17 00:00:00 2001 From: neel Date: Wed, 30 Jan 2013 04:09:09 +0000 Subject: Add emulation support for instruction "88/r: mov r/m8, r8". This instruction moves a byte from a register to a memory location. 
Tested by: tycho nightingale at pluribusnetworks com --- sys/amd64/include/vmm_instruction_emul.h | 3 +- sys/amd64/vmm/vmm_instruction_emul.c | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index 4cc494b..4c7a346 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -49,7 +49,8 @@ struct vie { uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, - rex_b:1; + rex_b:1, + rex_present:1; uint8_t mod:2, /* ModRM byte */ reg:4, diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index e73f6bb..40748ea 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -65,6 +65,10 @@ enum { #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ static const struct vie_op one_byte_opcodes[256] = { + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, @@ -161,6 +165,46 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) } static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, rshift; + enum vm_reg_name reg; + + rshift = 0; + reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy byte registers. + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + /* + * Obtain the value of %ah by reading %rax and shifting + * right by 8 bits (same for %bh, %ch and %dh). + */ + rshift = 8; + reg = gpr_map[vie->reg & 0x3]; + } + } + + if (!vie_valid_register(reg)) + return (EINVAL); + + error = vm_get_register(vm, vcpuid, reg, &val); + *rval = val >> rshift; + return (error); +} + +static int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { @@ -209,12 +253,24 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, { int error, size; enum vm_reg_name reg; + uint8_t byte; uint64_t val; size = 4; error = EINVAL; switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) @@ -497,6 +553,8 @@ decode_rex(struct vie *vie) return (-1); if (x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; -- cgit v1.1 From c9a45ab898c8adbadbd15cf73d00a9dbf1d4ba52 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 1 Feb 2013 01:16:26 +0000 Subject: Increase the number of passthru devices supported by bhyve. The maximum length of an environment variable puts a limitation on the number of passthru devices that can be specified via a single variable. 
The workaround is to allow user to specify passthru devices via multiple environment variables instead of a single one. Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 2 +- sys/amd64/vmm/vmm.c | 44 ++++++++++++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 17 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fdf136b..d3ec8d1 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -89,7 +89,7 @@ static struct pptdev { void **cookie; struct pptintr_arg *arg; } msix; -} pptdevs[32]; +} pptdevs[64]; static int num_pptdevs; diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d0e6427..82d4baa 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -862,30 +862,42 @@ vm_lapic(struct vm *vm, int cpu) boolean_t vmm_is_pptdev(int bus, int slot, int func) { - int found, b, s, f, n; + int found, i, n; + int b, s, f; char *val, *cp, *cp2; /* - * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; - cp = val = getenv("pptdevs"); - while (cp != NULL && *cp != '\0') { - if ((cp2 = strchr(cp, ' ')) != NULL) - *cp2 = '\0'; - - n = sscanf(cp, "%d/%d/%d", &b, &s, &f); - if (n == 3 && bus == b && slot == s && func == f) { - found = 1; - break; - } + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } - if (cp2 != NULL) - *cp2++ = ' '; + if (cp2 != NULL) + *cp2++ = ' '; - cp = cp2; + cp = cp2; + } + freeenv(val); } - freeenv(val); return (found); } -- cgit v1.1 From 81de6f5cc49043ac5e2135ad996dfb05f2bd2a32 Mon Sep 17 00:00:00 2001 From: neel Date: Fri, 1 Feb 2013 03:49:09 +0000 Subject: Fix a broken assumption in the passthru implementation that the MSI-X table can only be located at the beginning or the end of the BAR. If the MSI-table is located in the middle of a BAR then we will split the BAR into two and create two mappings - one before the table and one after the table - leaving a hole in place of the table so accesses to it can be trapped and emulated. Obtained from: NetApp --- sys/amd64/vmm/io/ppt.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index d3ec8d1..4a05985 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -56,9 +56,18 @@ __FBSDID("$FreeBSD$"); /* XXX locking */ #define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) -#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. 
+ */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ -- cgit v1.1 From 6a1efe1ad9984c8085ef28facb9d7f1cc2f01b6a Mon Sep 17 00:00:00 2001 From: eadler Date: Fri, 1 Feb 2013 20:17:11 +0000 Subject: Remove support for plip from the GENERIC kernel as no systems in the last 10 years require this support. Discussed with: db Discussed with: kib Reviewed by: imp Reviewed by: jhb Reviewed by: -hackers Approved by: cperciva (mentor) --- sys/amd64/conf/GENERIC | 1 - 1 file changed, 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index e53f692..5819a0d 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -197,7 +197,6 @@ device uart # Generic UART driver device ppc device ppbus # Parallel port bus (required) device lpt # Printer -device plip # TCP/IP over parallel device ppi # Parallel port interface device #device vpo # Requires scbus and da -- cgit v1.1 From 09a43450b8e300637ed1d8238be2e28d3a727adb Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 2 Feb 2013 12:02:42 +0000 Subject: x86 suspend/resume: suspend pics and pseudo-pics in reverse order - change 'pics' from STAILQ to TAILQ - ensure that Local APIC is always first in 'pics' Reviewed by: jhb Tested by: Sergey V. Dyatko , KAHO Toshikazu MFC after: 12 days --- sys/amd64/include/intr_machdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sys/amd64') diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h index 700e35f..8671605 100644 --- a/sys/amd64/include/intr_machdep.h +++ b/sys/amd64/include/intr_machdep.h @@ -94,7 +94,7 @@ struct pic { int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); - STAILQ_ENTRY(pic) pics; + TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ -- cgit v1.1 From 2e2156704e3464a21d9828a2a25672095f24255d Mon Sep 17 00:00:00 2001 From: avg Date: Sat, 2 Feb 2013 12:04:32 +0000 Subject: cpususpend_handler: mark AP as resumed only after fully setting up lapic Reviewed by: jhb Tested by: Sergey V. Dyatko , KAHO Toshikazu MFC after: 12 days --- sys/amd64/amd64/mp_machdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'sys/amd64') diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index d2e4aad..31dbb3f 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1431,11 +1431,11 @@ cpususpend_handler(void) while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); - CPU_CLR_ATOMIC(cpu, &started_cpus); - /* Resume MCA and local APIC */ mca_resume(); lapic_setup(0); + + CPU_CLR_ATOMIC(cpu, &started_cpus); } /* -- cgit v1.1
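The two suspend/resume commits above share one rule: an AP must not advertise itself as resumed until its local APIC has been fully reprogrammed, because clearing the CPU's bit in started_cpus is what marks the AP as resumed. The fragment below is only an illustrative sketch of the corrected AP-side ordering, not a verbatim copy of cpususpend_handler(); the helper name ap_resume_sketch() and the explicit 'cpu' parameter are invented for clarity, while ia32_pause(), mca_resume(), lapic_setup() and the started_cpus cpuset are the existing symbols visible in the diff (all already in scope inside sys/amd64/amd64/mp_machdep.c).

/*
 * Sketch of the AP side of the suspend/resume handshake after the fix.
 * In the real code this runs at the end of cpususpend_handler() with
 * cpu = PCPU_GET(cpuid).
 */
static void
ap_resume_sketch(int cpu)
{

	/* Spin until the BSP sets our bit in started_cpus to release us. */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	/* Restore machine-check state and reprogram the local APIC first. */
	mca_resume();
	lapic_setup(0);

	/*
	 * Only now clear our bit: this is the "I have resumed" signal, so
	 * doing it before lapic_setup() would expose a window in which the
	 * AP looks resumed but still has an unconfigured local APIC.
	 */
	CPU_CLR_ATOMIC(cpu, &started_cpus);
}

The ordering is the entire point of the last change; the companion change that keeps the local APIC at the head of the 'pics' TAILQ and walks that list in reverse on suspend is the suspend-side counterpart of the same idea.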