Diffstat (limited to 'sys/amd64')
27 files changed, 336 insertions, 1398 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index c3aac33..4455cab 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -174,6 +174,22 @@ IDTVEC(xen_intr_upcall) jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rdi + call hv_vector_handler + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index c81495a..83ca548 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -81,28 +81,11 @@ __FBSDID("$FreeBSD$"); #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) -/* lock region used by kernel profiling */ -int mcount_lock; - -int mp_naps; /* # of Applications processors */ -int boot_cpu_id = -1; /* designated BSP */ - -extern struct pcpu __pcpu[]; - -/* AP uses this during bootstrap. Do not staticize. */ -char *bootSTK; -int bootAP; - -/* Free these after use */ -void *bootstacks[MAXCPU]; +extern struct pcpu __pcpu[]; /* Temporary variables for init_secondary() */ char *doublefault_stack; char *nmi_stack; -void *dpcpu; - -struct pcb stoppcbs[MAXCPU]; -struct susppcb **susppcbs; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr2; @@ -112,309 +95,16 @@ uint64_t pcid_cr3; pmap_t smp_tlb_pmap; extern int invpcid_works; -#ifdef COUNT_IPIS -/* Interrupt counts. */ -static u_long *ipi_preempt_counts[MAXCPU]; -static u_long *ipi_ast_counts[MAXCPU]; -u_long *ipi_invltlb_counts[MAXCPU]; -u_long *ipi_invlrng_counts[MAXCPU]; -u_long *ipi_invlpg_counts[MAXCPU]; -u_long *ipi_invlcache_counts[MAXCPU]; -u_long *ipi_rendezvous_counts[MAXCPU]; -static u_long *ipi_hardclock_counts[MAXCPU]; -#endif - -/* Default cpu_ops implementation. */ -struct cpu_ops cpu_ops; - extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); -extern int pmap_pcid_enabled; - /* * Local data and functions. */ -static volatile cpuset_t ipi_nmi_pending; - -/* used to hold the AP's until we are ready to release them */ -struct mtx ap_boot_mtx; - -/* Set to 1 once we're ready to let the APs out of the pen. */ -static volatile int aps_ready = 0; - -/* - * Store data from cpu_add() until later in the boot when we actually setup - * the APs. - */ -struct cpu_info { - int cpu_present:1; - int cpu_bsp:1; - int cpu_disabled:1; - int cpu_hyperthread:1; -} static cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; -int apic_cpuids[MAX_APIC_ID + 1]; - -/* Holds pending bitmap based IPIs per CPU */ -volatile u_int cpu_ipi_pending[MAXCPU]; - -static u_int boot_address; -static int cpu_logical; /* logical cpus per core */ -static int cpu_cores; /* cores per package */ - -static void assign_cpu_ids(void); -static void set_interrupt_apic_ids(void); static int start_ap(int apic_id); -static void release_aps(void *dummy); -static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ -static int hyperthreading_allowed = 1; static u_int bootMP_size; - -static void -mem_range_AP_init(void) -{ - if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) - mem_range_softc.mr_op->initAP(&mem_range_softc); -} - -static void -topo_probe_amd(void) -{ - int core_id_bits; - int id; - - /* AMD processors do not support HTT. 
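The hv_vmbus_callback stub above only builds a trap frame, points %rdi at it, and jumps to C. A minimal sketch of the C-side handler the stub expects — the body below is hypothetical; the real hv_vector_handler is supplied by the Hyper-V vmbus driver, not by this diff:

void
hv_vector_handler(struct trapframe *frame)
{

	critical_enter();
	/* Dispatch the pending vmbus channel event for this CPU here. */
	lapic_eoi();		/* acknowledge the interrupt at the local APIC */
	critical_exit();
}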
*/ - cpu_logical = 1; - - if ((amd_feature2 & AMDID2_CMP) == 0) { - cpu_cores = 1; - return; - } - - core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> - AMDID_COREID_SIZE_SHIFT; - if (core_id_bits == 0) { - cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; - return; - } - - /* Fam 10h and newer should get here. */ - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) - continue; - cpu_cores++; - } -} - -/* - * Round up to the next power of two, if necessary, and then - * take log2. - * Returns -1 if argument is zero. - */ -static __inline int -mask_width(u_int x) -{ - - return (fls(x << (1 - powerof2(x))) - 1); -} - -static void -topo_probe_0x4(void) -{ - u_int p[4]; - int pkg_id_bits; - int core_id_bits; - int max_cores; - int max_logical; - int id; - - /* Both zero and one here mean one logical processor per package. */ - max_logical = (cpu_feature & CPUID_HTT) != 0 ? - (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; - if (max_logical <= 1) - return; - - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. Further, we count number of - * logical processors that belong to the same core - * as BSP thus deducing number of threads per core. - */ - if (cpu_high >= 0x4) { - cpuid_count(0x04, 0, p); - max_cores = ((p[0] >> 26) & 0x3f) + 1; - } else - max_cores = 1; - core_id_bits = mask_width(max_logical/max_cores); - if (core_id_bits < 0) - return; - pkg_id_bits = core_id_bits + mask_width(max_cores); - - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) - continue; - cpu_cores++; - /* Check if logical CPU has the same package and core IDs. */ - if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) - cpu_logical++; - } - - KASSERT(cpu_cores >= 1 && cpu_logical >= 1, - ("topo_probe_0x4 couldn't find BSP")); - - cpu_cores /= cpu_logical; - hyperthreading_cpus = cpu_logical; -} - -static void -topo_probe_0xb(void) -{ - u_int p[4]; - int bits; - int cnt; - int i; - int logical; - int type; - int x; - - /* We only support three levels for now. */ - for (i = 0; i < 3; i++) { - cpuid_count(0x0b, i, p); - - /* Fall back if CPU leaf 11 doesn't really exist. */ - if (i == 0 && p[1] == 0) { - topo_probe_0x4(); - return; - } - - bits = p[0] & 0x1f; - logical = p[1] &= 0xffff; - type = (p[2] >> 8) & 0xff; - if (type == 0 || logical == 0) - break; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. - */ - for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || - cpu_info[x].cpu_disabled) - continue; - if (x >> bits == boot_cpu_id >> bits) - cnt++; - } - if (type == CPUID_TYPE_SMT) - cpu_logical = cnt; - else if (type == CPUID_TYPE_CORE) - cpu_cores = cnt; - } - if (cpu_logical == 0) - cpu_logical = 1; - cpu_cores /= cpu_logical; -} - -/* - * Both topology discovery code and code that consumes topology - * information assume top-down uniformity of the topology. - * That is, all physical packages must be identical and each - * core in a package must have the same number of threads. 
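The mask_width() helper above computes ceil(log2(x)), i.e. the number of APIC-ID bits needed to number x objects; it is the workhorse behind the package/core ID shifts in topo_probe_0x4(). A self-contained usermode check of the same expression (fls() from <strings.h>, powerof2() from <sys/param.h>):

#include <strings.h>
#include <sys/param.h>

static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/* mask_width(6) == 3, mask_width(8) == 3, mask_width(1) == 0, mask_width(0) == -1 */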
- * Topology information is queried only on BSP, on which this - * code runs and for which it can query CPUID information. - * Then topology is extrapolated on all packages using the - * uniformity assumption. - */ -static void -topo_probe(void) -{ - static int cpu_topo_probed = 0; - - if (cpu_topo_probed) - return; - - CPU_ZERO(&logical_cpus_mask); - if (mp_ncpus <= 1) - cpu_cores = cpu_logical = 1; - else if (cpu_vendor_id == CPU_VENDOR_AMD) - topo_probe_amd(); - else if (cpu_vendor_id == CPU_VENDOR_INTEL) { - /* - * See Intel(R) 64 Architecture Processor - * Topology Enumeration article for details. - * - * Note that 0x1 <= cpu_high < 4 case should be - * compatible with topo_probe_0x4() logic when - * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) - * or it should trigger the fallback otherwise. - */ - if (cpu_high >= 0xb) - topo_probe_0xb(); - else if (cpu_high >= 0x1) - topo_probe_0x4(); - } - - /* - * Fallback: assume each logical CPU is in separate - * physical package. That is, no multi-core, no SMT. - */ - if (cpu_cores == 0 || cpu_logical == 0) - cpu_cores = cpu_logical = 1; - cpu_topo_probed = 1; -} - -struct cpu_group * -cpu_topo(void) -{ - int cg_flags; - - /* - * Determine whether any threading flags are - * necessry. - */ - topo_probe(); - if (cpu_logical > 1 && hyperthreading_cpus) - cg_flags = CG_FLAG_HTT; - else if (cpu_logical > 1) - cg_flags = CG_FLAG_SMT; - else - cg_flags = 0; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); - } - /* - * No multi-core or hyper-threaded. - */ - if (cpu_logical * cpu_cores == 1) - return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_L2, cpu_cores, - CG_SHARE_L1, cpu_logical, cg_flags)); -} +static u_int boot_address; /* * Calculate usable address in base memory for AP trampoline code. @@ -433,85 +123,6 @@ mp_bootaddress(u_int basemem) return mptramp_pagetables; } -void -cpu_add(u_int apic_id, char boot_cpu) -{ - - if (apic_id > MAX_APIC_ID) { - panic("SMP: APIC ID %d too high", apic_id); - return; - } - KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", - apic_id)); - cpu_info[apic_id].cpu_present = 1; - if (boot_cpu) { - KASSERT(boot_cpu_id == -1, - ("CPU %d claims to be BSP, but CPU %d already is", apic_id, - boot_cpu_id)); - boot_cpu_id = apic_id; - cpu_info[apic_id].cpu_bsp = 1; - } - if (mp_ncpus < MAXCPU) { - mp_ncpus++; - mp_maxid = mp_ncpus - 1; - } - if (bootverbose) - printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : - "AP"); -} - -void -cpu_mp_setmaxid(void) -{ - - /* - * mp_maxid should be already set by calls to cpu_add(). - * Just sanity check its value here. - */ - if (mp_ncpus == 0) - KASSERT(mp_maxid == 0, - ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); - else if (mp_ncpus == 1) - mp_maxid = 0; - else - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, - mp_maxid, mp_ncpus)); -} - -int -cpu_mp_probe(void) -{ - - /* - * Always record BSP in CPU map so that the mbuf init code works - * correctly. 
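To make the cpu_topo() decision tree above concrete: on a hypothetical uniform machine with 4 cores per package and 2 SMT threads per core, the probe leaves cpu_cores = 4 and cpu_logical = 2, and the final branch fires. A sketch of the resulting call, assuming the leaf-0xb path was taken (which leaves hyperthreading_cpus unset, so the flag is CG_FLAG_SMT):

/* mp_ncpus % (4 * 2) == 0 and both factors > 1, so: */
return (smp_topo_2level(CG_SHARE_L2, 4 /* cores */,
    CG_SHARE_L1, 2 /* threads */, CG_FLAG_SMT));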
- */ - CPU_SETOF(0, &all_cpus); - if (mp_ncpus == 0) { - /* - * No CPUs were found, so this must be a UP system. Setup - * the variables to represent a system with a single CPU - * with an id of 0. - */ - mp_ncpus = 1; - return (0); - } - - /* At least one CPU was found. */ - if (mp_ncpus == 1) { - /* - * One CPU was found, so this must be a UP system with - * an I/O APIC. - */ - mp_maxid = 0; - return (0); - } - - /* At least two CPUs were found. */ - return (1); -} - /* * Initialize the IPI handlers and start up the AP's. */ @@ -575,47 +186,6 @@ cpu_mp_start(void) /* - * Print various information about the SMP system hardware and setup. - */ -void -cpu_mp_announce(void) -{ - const char *hyperthread; - int i; - - printf("FreeBSD/SMP: %d package(s) x %d core(s)", - mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); - if (hyperthreading_cpus > 1) - printf(" x %d HTT threads", cpu_logical); - else if (cpu_logical > 1) - printf(" x %d SMT threads", cpu_logical); - printf("\n"); - - /* List active CPUs first. */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1; i < mp_ncpus; i++) { - if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, - cpu_apic_ids[i]); - } - - /* List disabled CPUs last. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) - continue; - if (cpu_info[i].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, - i); - } -} - -/* * AP CPU's call this to initialize themselves. */ void @@ -624,7 +194,6 @@ init_secondary(void) struct pcpu *pc; struct nmi_pcpu *np; u_int64_t msr, cr0; - u_int cpuid; int cpu, gsel_tss, x; struct region_descriptor ap_gdt; @@ -712,94 +281,7 @@ init_secondary(void) while (!aps_ready) ia32_pause(); - /* - * On real hardware, switch to x2apic mode if possible. Do it - * after aps_ready was signalled, to avoid manipulating the - * mode while BSP might still want to send some IPI to us - * (second startup IPI is ignored on modern hardware etc). - */ - lapic_xapic_mode(); - - /* Initialize the PAT MSR. */ - pmap_init_pat(); - - /* set up CPU registers and state */ - cpu_setregs(); - - /* set up SSE/NX */ - initializecpu(); - - /* set up FPU state on the AP */ - fpuinit(); - - if (cpu_ops.cpu_init) - cpu_ops.cpu_init(); - - /* A quick check from sanity claus */ - cpuid = PCPU_GET(cpuid); - if (PCPU_GET(apic_id) != lapic_id()) { - printf("SMP: cpuid = %d\n", cpuid); - printf("SMP: actual apic_id = %d\n", lapic_id()); - printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - panic("cpuid mismatch! boom!!"); - } - - /* Initialize curthread. */ - KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); - PCPU_SET(curthread, PCPU_GET(idlethread)); - - mca_init(); - - mtx_lock_spin(&ap_boot_mtx); - - /* Init local apic for irq's */ - lapic_setup(1); - - /* Set memory range attributes for this CPU to match the BSP */ - mem_range_AP_init(); - - smp_cpus++; - - CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); - printf("SMP: AP CPU #%d Launched!\n", cpuid); - - /* Determine if we are a logical CPU. */ - /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 
2 */ - if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) - CPU_SET(cpuid, &logical_cpus_mask); - - if (bootverbose) - lapic_dump("AP"); - - if (smp_cpus == mp_ncpus) { - /* enable IPI's, tlb shootdown, freezes etc */ - atomic_store_rel_int(&smp_started, 1); - } - - /* - * Enable global pages TLB extension - * This also implicitly flushes the TLB - */ - - load_cr4(rcr4() | CR4_PGE); - if (pmap_pcid_enabled) - load_cr4(rcr4() | CR4_PCIDE); - load_ds(_udatasel); - load_es(_udatasel); - load_fs(_ufssel); - mtx_unlock_spin(&ap_boot_mtx); - - /* Wait until all the AP's are up. */ - while (smp_started == 0) - ia32_pause(); - - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); - - sched_throw(NULL); - - panic("scheduler returned us to %s", __func__); - /* NOTREACHED */ + init_secondary_tail(); } /******************************************************************* @@ -807,108 +289,6 @@ init_secondary(void) */ /* - * We tell the I/O APIC code about all the CPUs we want to receive - * interrupts. If we don't want certain CPUs to receive IRQs we - * can simply not tell the I/O APIC code about them in this function. - * We also do not tell it about the BSP since it tells itself about - * the BSP internally to work with UP kernels and on UP machines. - */ -static void -set_interrupt_apic_ids(void) -{ - u_int i, apic_id; - - for (i = 0; i < MAXCPU; i++) { - apic_id = cpu_apic_ids[i]; - if (apic_id == -1) - continue; - if (cpu_info[apic_id].cpu_bsp) - continue; - if (cpu_info[apic_id].cpu_disabled) - continue; - - /* Don't let hyperthreads service interrupts. */ - if (cpu_logical > 1 && - apic_id % cpu_logical != 0) - continue; - - intr_add_cpu(i); - } -} - -/* - * Assign logical CPU IDs to local APICs. - */ -static void -assign_cpu_ids(void) -{ - u_int i; - - TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", - &hyperthreading_allowed); - - /* Check for explicitly disabled CPUs. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { - cpu_info[i].cpu_hyperthread = 1; - - /* - * Don't use HT CPU if it has been disabled by a - * tunable. - */ - if (hyperthreading_allowed == 0) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { - hyperthreading_cpus = 0; - cpu_logical = 1; - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 is always assigned to the BSP. - * - * To minimize confusion for userland, we attempt to number - * CPUs such that all threads and cores in a package are - * grouped together. For now we assume that the BSP is always - * the first thread in a package and just start adding APs - * starting with the BSP's APIC ID. - */ - mp_ncpus = 1; - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - for (i = boot_cpu_id + 1; i != boot_cpu_id; - i == MAX_APIC_ID ? 
i = 0 : i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} - -/* * start each AP in our list */ int @@ -1026,129 +406,6 @@ start_ap(int apic_id) return 0; /* return FAILURE */ } -#ifdef COUNT_XINVLTLB_HITS -u_int xhits_gbl[MAXCPU]; -u_int xhits_pg[MAXCPU]; -u_int xhits_rng[MAXCPU]; -static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, - sizeof(xhits_gbl), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, - sizeof(xhits_pg), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, - sizeof(xhits_rng), "IU", ""); - -u_int ipi_global; -u_int ipi_page; -u_int ipi_range; -u_int ipi_range_size; -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, - &ipi_range_size, 0, ""); - -u_int ipi_masked_global; -u_int ipi_masked_page; -u_int ipi_masked_range; -u_int ipi_masked_range_size; -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, - &ipi_masked_global, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, - &ipi_masked_page, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, - &ipi_masked_range, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, - &ipi_masked_range_size, 0, ""); -#endif /* COUNT_XINVLTLB_HITS */ - -/* - * Init and startup IPI. - */ -void -ipi_startup(int apic_id, int vector) -{ - - /* - * This attempts to follow the algorithm described in the - * Intel Multiprocessor Specification v1.4 in section B.4. - * For each IPI, we allow the local APIC ~20us to deliver the - * IPI. If that times out, we panic. - */ - - /* - * first we do an INIT IPI: this INIT IPI might be run, resetting - * and running the target CPU. OR this INIT IPI might be latched (P5 - * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be - * ignored. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); - lapic_ipi_wait(100); - - /* Explicitly deassert the INIT IPI. */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, - apic_id); - - DELAY(10000); /* wait ~10mS */ - - /* - * next we do a STARTUP IPI: the previous INIT IPI might still be - * latched, (P5 bug) this 1st STARTUP would then terminate - * immediately, and the previously started INIT IPI would continue. OR - * the previous INIT IPI has already run. and this STARTUP IPI will - * run. OR the previous INIT IPI was ignored. and this STARTUP IPI - * will run. 
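The vector passed through this sequence is the physical page number of the AP trampoline, which is why mp_bootaddress() earlier in this diff places the trampoline page-aligned below 1 MB. A sketch of the derivation, mirroring what start_ap() hands to ipi_startup():

/* STARTUP IPI vector = trampoline page number; must fit in 8 bits. */
u_int vector = (boot_address >> PAGE_SHIFT) & 0xff;

ipi_startup(apic_id, vector);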
- */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver first STARTUP IPI to APIC %d", - apic_id); - DELAY(200); /* wait ~200uS */ - - /* - * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF - * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR - * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is - * recognized after hardware RESET or INIT IPI. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver second STARTUP IPI to APIC %d", - apic_id); - - DELAY(200); /* wait ~200uS */ -} - -/* - * Send an IPI to specified CPU handling the bitmap logic. - */ -static void -ipi_send_cpu(int cpu, u_int ipi) -{ - u_int bitmap, old_pending, new_pending; - - KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); - - if (IPI_IS_BITMAPED(ipi)) { - bitmap = 1 << ipi; - ipi = IPI_BITMAP_VECTOR; - do { - old_pending = cpu_ipi_pending[cpu]; - new_pending = old_pending | bitmap; - } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], - old_pending, new_pending)); - if (old_pending) - return; - } - lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); -} - /* * Flush the TLB on all other CPU's */ @@ -1228,26 +485,6 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, } void -smp_cache_flush(void) -{ - - if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); -} - -void -smp_invltlb(pmap_t pmap) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_global++; -#endif - } -} - -void smp_invlpg(pmap_t pmap, vm_offset_t addr) { @@ -1312,210 +549,23 @@ smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, } void -ipi_bitmap_handler(struct trapframe frame) -{ - struct trapframe *oldframe; - struct thread *td; - int cpu = PCPU_GET(cpuid); - u_int ipi_bitmap; - - critical_enter(); - td = curthread; - td->td_intr_nesting_level++; - oldframe = td->td_intr_frame; - td->td_intr_frame = &frame; - ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); - if (ipi_bitmap & (1 << IPI_PREEMPT)) { -#ifdef COUNT_IPIS - (*ipi_preempt_counts[cpu])++; -#endif - sched_preempt(td); - } - if (ipi_bitmap & (1 << IPI_AST)) { -#ifdef COUNT_IPIS - (*ipi_ast_counts[cpu])++; -#endif - /* Nothing to do for AST */ - } - if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { -#ifdef COUNT_IPIS - (*ipi_hardclock_counts[cpu])++; -#endif - hardclockintr(); - } - td->td_intr_frame = oldframe; - td->td_intr_nesting_level--; - critical_exit(); -} - -/* - * send an IPI to a set of cpus. - */ -void -ipi_selected(cpuset_t cpus, u_int ipi) -{ - int cpu; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); - - while ((cpu = CPU_FFS(&cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &cpus); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); - } -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_cpu(int cpu, u_int ipi) -{ - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. 
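For callers, the relocated IPI primitives keep their usual shape: build a cpuset_t for a group, or address one CPU by its logical ID. An illustrative fragment — the particular IPIs chosen here are just examples:

cpuset_t set;

CPU_ZERO(&set);
CPU_SET(1, &set);
CPU_SET(2, &set);
ipi_selected(set, IPI_AST);	/* post an AST to CPUs 1 and 2 */
ipi_cpu(3, IPI_PREEMPT);	/* ask CPU 3 to reschedule */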
- */ - if (ipi == IPI_STOP_HARD) - CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); - - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); -} - -/* - * send an IPI to all CPUs EXCEPT myself - */ -void -ipi_all_but_self(u_int ipi) +smp_cache_flush(void) { - cpuset_t other_cpus; - - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - if (IPI_IS_BITMAPED(ipi)) { - ipi_selected(other_cpus, ipi); - return; - } - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); - - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); + if (smp_started) + smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); } -int -ipi_nmi_handler() -{ - u_int cpuid; - - /* - * As long as there is not a simple way to know about a NMI's - * source, if the bitmask for the current CPU is present in - * the global pending bitword an IPI_STOP_HARD has been issued - * and should be handled. - */ - cpuid = PCPU_GET(cpuid); - if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) - return (1); - - CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); - cpustop_handler(); - return (0); -} - -/* - * Handle an IPI_STOP by saving our current context and spinning until we - * are resumed. - */ void -cpustop_handler(void) -{ - u_int cpu; - - cpu = PCPU_GET(cpuid); - - savectx(&stoppcbs[cpu]); - - /* Indicate that we are stopped */ - CPU_SET_ATOMIC(cpu, &stopped_cpus); - - /* Wait for restart */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - CPU_CLR_ATOMIC(cpu, &stopped_cpus); +smp_invltlb(pmap_t pmap) +{ -#ifdef DDB - amd64_db_resume_dbreg(); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; #endif - - if (cpu == 0 && cpustop_restartfunc != NULL) { - cpustop_restartfunc(); - cpustop_restartfunc = NULL; - } -} - -/* - * Handle an IPI_SUSPEND by saving our current context and spinning until we - * are resumed. - */ -void -cpususpend_handler(void) -{ - u_int cpu; - - mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); - - cpu = PCPU_GET(cpuid); - if (savectx(&susppcbs[cpu]->sp_pcb)) { - fpususpend(susppcbs[cpu]->sp_fpususpend); - wbinvd(); - CPU_SET_ATOMIC(cpu, &suspended_cpus); - } else { - fpuresume(susppcbs[cpu]->sp_fpususpend); - pmap_init_pat(); - initializecpu(); - PCPU_SET(switchtime, 0); - PCPU_SET(switchticks, ticks); - - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); } - - /* Wait for resume */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - if (cpu_ops.cpu_resume) - cpu_ops.cpu_resume(); - if (vmm_resume_p) - vmm_resume_p(); - - /* Resume MCA and local APIC */ - lapic_xapic_mode(); - mca_resume(); - lapic_setup(0); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* @@ -1678,63 +728,3 @@ invlrng_handler(void) atomic_add_int(&smp_tlb_wait, 1); } - -void -invlcache_handler(void) -{ -#ifdef COUNT_IPIS - (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); -} - -/* - * This is called once the rest of the system is up and running and we're - * ready to let the AP's out of the pen. 
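The pen described here is a plain release/acquire gate between the BSP and the APs; condensed, the two sides (both shown in full elsewhere in this diff) pair up like this:

/* BSP side (release_aps): publish the flag with release semantics. */
atomic_store_rel_int(&aps_ready, 1);

/* AP side (init_secondary): spin politely until released. */
while (!aps_ready)
	ia32_pause();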
- */ -static void -release_aps(void *dummy __unused) -{ - - if (mp_ncpus == 1) - return; - atomic_store_rel_int(&aps_ready, 1); - while (smp_started == 0) - ia32_pause(); -} -SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); - -#ifdef COUNT_IPIS -/* - * Setup interrupt counters for IPI handlers. - */ -static void -mp_ipi_intrcnt(void *dummy) -{ - char buf[64]; - int i; - - CPU_FOREACH(i) { - snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); - intrcnt_add(buf, &ipi_invltlb_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); - intrcnt_add(buf, &ipi_invlrng_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); - intrcnt_add(buf, &ipi_invlpg_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); - intrcnt_add(buf, &ipi_invlcache_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:preempt", i); - intrcnt_add(buf, &ipi_preempt_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:ast", i); - intrcnt_add(buf, &ipi_ast_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); - intrcnt_add(buf, &ipi_rendezvous_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); - intrcnt_add(buf, &ipi_hardclock_counts[i]); - } -} -SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); -#endif - diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index bdaca33..c24dd5a 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -340,7 +340,9 @@ device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enhancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 9b697f0..e0fe465 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -494,6 +494,8 @@ device virtio_balloon # VirtIO Memory Balloon device device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Microsoft Hyper-V enhancement support +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 3a4b6b3..034a693 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -35,6 +35,39 @@ extern int mp_naps; extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; +extern int bootAP; +extern void *dpcpu; +extern char *bootSTK; +extern int bootAP; +extern void *bootstacks[]; +extern volatile u_int cpu_ipi_pending[]; +extern volatile int aps_ready; +extern struct mtx ap_boot_mtx; +extern int cpu_logical; +extern int cpu_cores; +extern int pmap_pcid_enabled; +extern u_int xhits_gbl[]; +extern u_int xhits_pg[]; +extern u_int xhits_rng[]; +extern u_int ipi_global; +extern u_int ipi_page; +extern u_int ipi_range; +extern u_int ipi_range_size; +extern u_int ipi_masked_global; +extern u_int ipi_masked_page; +extern u_int ipi_masked_range; +extern u_int ipi_masked_range_size; + +extern volatile int smp_tlb_wait; + +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; + int cpu_hyperthread:1; +}; +extern struct cpu_info cpu_info[]; + #ifdef COUNT_IPIS extern u_long *ipi_invltlb_counts[MAXCPU]; extern u_long *ipi_invlrng_counts[MAXCPU]; @@ -60,9 +93,11 @@ inthand_t struct pmap; /* functions in mp_machdep.c */ +void assign_cpu_ids(void); void 
cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); +void init_secondary_tail(void); void invltlb_handler(void); void invltlb_pcid_handler(void); void invlpg_handler(void); @@ -77,6 +112,7 @@ void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); +void set_interrupt_apic_ids(void); void smp_cache_flush(void); void smp_invlpg(struct pmap *pmap, vm_offset_t addr); void smp_masked_invlpg(cpuset_t mask, struct pmap *pmap, vm_offset_t addr); @@ -87,6 +123,9 @@ void smp_masked_invlpg_range(cpuset_t mask, struct pmap *pmap, void smp_invltlb(struct pmap *pmap); void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap); int native_start_all_aps(void); +void mem_range_AP_init(void); +void topo_probe(void); +void ipi_send_cpu(int cpu, u_int ipi); #endif /* !LOCORE */ #endif /* SMP */ diff --git a/sys/amd64/include/vm.h b/sys/amd64/include/vm.h index 6573e37..22d2eca 100644 --- a/sys/amd64/include/vm.h +++ b/sys/amd64/include/vm.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 52294bd..7c617be 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -204,13 +204,12 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -cpuset_t vm_active_cpus(struct vm *vm); -cpuset_t vm_suspended_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +#ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. 
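A usage sketch for the relocated rendezvous prototypes — flush_cb and its body are hypothetical, and the declarations are now visible only when cpuset_t is already defined, which is exactly what the new #ifdef _SYS__CPUSET_H_ guard enforces:

static void
flush_cb(struct vm *vm, int vcpuid, void *arg)
{

	/* Per-vcpu work; must not block (see the restriction below). */
}

/* ... later, from a vcpu thread: */
cpuset_t dest = vm_active_cpus(vm);

vm_smp_rendezvous(vm, vcpuid, dest, flush_cb, NULL);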
* The rendezvous 'func(arg)' is not allowed to do anything that will @@ -228,6 +227,9 @@ void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(void *rendezvous_cookie) diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h index d03d4f6..d8a6b5c 100644 --- a/sys/amd64/include/xen/xenfunc.h +++ b/sys/amd64/include/xen/xenfunc.h @@ -29,12 +29,7 @@ #ifndef _XEN_XENFUNC_H_ #define _XEN_XENFUNC_H_ -#ifdef XENHVM #include <machine/xen/xenvar.h> -#else -#include <machine/xen/xenpmap.h> -#include <machine/segments.h> -#endif #define BKPT __asm__("int3"); #define XPQ_CALL_DEPTH 5 @@ -64,10 +59,6 @@ void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line); #define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0) #endif -#ifndef XENHVM -void xen_update_descriptor(union descriptor *, union descriptor *); -#endif - extern struct mtx balloon_lock; #if 0 #define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags) diff --git a/sys/amd64/include/xen/xenpmap.h b/sys/amd64/include/xen/xenpmap.h deleted file mode 100644 index d768dad..0000000 --- a/sys/amd64/include/xen/xenpmap.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004,2005 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - -#ifndef _XEN_XENPMAP_H_ -#define _XEN_XENPMAP_H_ - -#include <machine/xen/features.h> - -void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); -void xen_pt_switch(vm_paddr_t); -void xen_set_ldt(vm_paddr_t, unsigned long); -void xen_pgdpt_pin(vm_paddr_t); -void xen_pgd_pin(vm_paddr_t); -void xen_pgd_unpin(vm_paddr_t); -void xen_pt_pin(vm_paddr_t); -void xen_pt_unpin(vm_paddr_t); -void xen_flush_queue(void); -void xen_check_queue(void); -#if 0 -void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); -#endif - -#ifdef INVARIANTS -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) -#else -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) -#endif - -#ifdef PMAP_DEBUG -#define PMAP_REF pmap_ref -#define PMAP_DEC_REF_PAGE pmap_dec_ref_page -#define PMAP_MARK_PRIV pmap_mark_privileged -#define PMAP_MARK_UNPRIV pmap_mark_unprivileged -#else -#define PMAP_MARK_PRIV(a) -#define PMAP_MARK_UNPRIV(a) -#define PMAP_REF(a, b) -#define PMAP_DEC_REF_PAGE(a) -#endif - -#define ALWAYS_SYNC 0 - -#ifdef PT_DEBUG -#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) -#else -#define PT_LOG() -#endif - -#define INVALID_P2M_ENTRY (~0UL) - -#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ - -#define SH_PD_SET_VA 1 -#define SH_PD_SET_VA_MA 2 -#define SH_PD_SET_VA_CLEAR 3 - -struct pmap; -void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); -#ifdef notyet -static vm_paddr_t -vptetomachpte(vm_paddr_t *pte) -{ - vm_offset_t offset, ppte; - vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; - int pgindex; - - ppte = (vm_offset_t)pte; - pgoffset = (ppte & PAGE_MASK); - offset = ppte - (vm_offset_t)PTmap; - pgindex = ppte >> PDRSHIFT; - - pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); - retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; - return (retval); -} -#endif -#define PT_GET(_ptp) \ - (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : (0)) - -#ifdef WRITABLE_PAGETABLES - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - PT_LOG(); \ - *(_ptp) = xpmap_ptom((_npte)); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - PT_LOG(); \ - *(_ptp) = (_npte); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - PT_LOG(); \ - *(_ptp) = 0; \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#else /* !WRITABLE_PAGETABLES */ - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - xen_queue_pt_update(vtomach(_ptp), \ - xpmap_ptom(_npte)); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - xen_queue_pt_update(vtomach(_ptp), _npte); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - xen_queue_pt_update(vtomach(_ptp), 0); \ - if (sync || ALWAYS_SYNC) \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#endif - -#define PT_SET_MA(_va, _ma) \ -do { \ - PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ - (_ma), \ - UVMF_INVLPG| UVMF_ALL) < 0); \ -} while (/*CONSTCOND*/0) - -#define PT_UPDATES_FLUSH() do { \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -static __inline vm_paddr_t -xpmap_mtop(vm_paddr_t mpa) -{ - vm_paddr_t tmp = (mpa & PG_FRAME); - - return machtophys(tmp) | (mpa & ~PG_FRAME); -} - -static __inline vm_paddr_t -xpmap_ptom(vm_paddr_t ppa) -{ - vm_paddr_t tmp = (ppa & PG_FRAME); - - return phystomach(tmp) | (ppa & ~PG_FRAME); -} - -static __inline void -set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ -#ifdef notyet - PANIC_IF(max_mapnr && pfn >= max_mapnr); -#endif - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef notyet - PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); -#endif - return; - } - xen_phys_machine[pfn] = mfn; -} - - - - -#endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h index d9dbc5d..110a351 100644 --- a/sys/amd64/include/xen/xenvar.h 
+++ b/sys/amd64/include/xen/xenvar.h @@ -48,68 +48,7 @@ if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__ #define TRACE_DEBUG(argflags, _f, _a...) #endif -#ifdef XENHVM - -static inline vm_paddr_t -phystomach(vm_paddr_t pa) -{ - - return (pa); -} - -static inline vm_paddr_t -machtophys(vm_paddr_t ma) -{ - - return (ma); -} - #define vtomach(va) pmap_kextract((vm_offset_t) (va)) -#define PFNTOMFN(pa) (pa) -#define MFNTOPFN(ma) (ma) - -#define set_phys_to_machine(pfn, mfn) ((void)0) -#define phys_to_machine_mapping_valid(pfn) (TRUE) -#define PT_UPDATES_FLUSH() ((void)0) - -#else - -extern xen_pfn_t *xen_phys_machine; - - -extern xen_pfn_t *xen_machine_phys; -/* Xen starts physical pages after the 4MB ISA hole - - * FreeBSD doesn't - */ - - -#undef ADD_ISA_HOLE /* XXX */ - -#ifdef ADD_ISA_HOLE -#define ISA_INDEX_OFFSET 1024 -#define ISA_PDR_OFFSET 1 -#else -#define ISA_INDEX_OFFSET 0 -#define ISA_PDR_OFFSET 0 -#endif - - -#define PFNTOMFN(i) (xen_phys_machine[(i)]) -#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) - -#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) -#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) - -#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) -#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) - -#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) - -#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) -#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) - -#endif void xpq_init(void); diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index acb3a3d..3157e21 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> -#include <sys/smp.h> #include <machine/vmm.h> #include "io/iommu.h" diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index f505ea1..7cc13ca 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -802,6 +802,7 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c index 100af4b..d3a6fe8 100644 --- a/sys/amd64/vmm/amd/svm_msr.c +++ b/sys/amd64/vmm/amd/svm_msr.c @@ -27,12 +27,17 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <sys/types.h> +#include <sys/param.h> #include <sys/errno.h> +#include <sys/systm.h> #include <machine/cpufunc.h> #include <machine/specialreg.h> +#include <machine/vmm.h> +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" #include "svm_msr.h" #ifndef MSR_AMDK8_IPM @@ -105,6 +110,14 @@ svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, int error = 0; switch (num) { + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + *result = 0; + break; case MSR_AMDK8_IPM: *result = 0; break; @@ -122,6 +135,15 @@ svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) int error = 0; switch (num) { + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ case MSR_AMDK8_IPM: /* * Ignore writes to the "Interrupt Pending Message" MSR. diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c index fb4b2c8..d860169 100644 --- a/sys/amd64/vmm/amd/vmcb.c +++ b/sys/amd64/vmm/amd/vmcb.c @@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <machine/segments.h> #include <machine/specialreg.h> diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index e517778..526b0d1 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <machine/clock.h> #include <machine/cpufunc.h> @@ -396,6 +395,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) error = 0; switch (num) { + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; case MSR_IA32_MISC_ENABLE: *val = misc_enable; break; @@ -427,6 +433,14 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = 0; switch (num) { + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ case MSR_IA32_MISC_ENABLE: changed = val ^ misc_enable; /* diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index 0df6e7c..6e94f5b 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c index 842253d..173ef1f 100644 --- a/sys/amd64/vmm/io/vatpit.c +++ b/sys/amd64/vmm/io/vatpit.c @@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index a4c96cd..1db1c51 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -36,7 +36,6 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <dev/acpica/acpi_hpet.h> diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 411887d..e6b8b5a 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/systm.h> diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 7097248..3451e1e 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -547,6 +547,8 @@ vlapic_update_ppr(struct vlapic *vlapic) VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + static void vlapic_process_eoi(struct vlapic *vlapic) { @@ -557,11 +559,7 @@ vlapic_process_eoi(struct vlapic *vlapic) isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; 
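The hunks that follow change the scan loop bound from i > 0 to i >= 0, so the lowest ISR/IRR word is no longer skipped and in-service vectors below 32 get processed. The index arithmetic is easier to see spelled out — the eight 32-bit words sit 16 bytes (four u_ints) apart in the APIC page, hence idx = i * 4:

/* Sketch of the vector <-> (i, bitpos) mapping used by the loop: */
int i = vector / 32;			/* word index, scanned 7..0 */
int bitpos = vector % 32;
uint32_t *reg = &lapic->isr0 + (i * 4);	/* 16-byte register stride */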
- /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { @@ -570,17 +568,21 @@ vlapic_process_eoi(struct vlapic *vlapic) vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { - vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int @@ -1092,11 +1094,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) irrptr = &lapic->irr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c index 09f763f..1e7bb93 100644 --- a/sys/amd64/vmm/io/vpmtmr.c +++ b/sys/amd64/vmm/io/vpmtmr.c @@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/systm.h> diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index ab9cabb..18ebc4b 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/lock.h> @@ -63,9 +62,12 @@ struct rtcdev { uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; - uint8_t nvram[128 - 14]; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; @@ -139,20 +141,23 @@ update_enabled(struct vrtc *vrtc) } static time_t -vrtc_curtime(struct vrtc *vrtc) +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; - time_t t; + time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); - t += delta / SBT_1S; + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; } return (t); } @@ -245,6 +250,7 @@ secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); } static int @@ -274,7 +280,7 @@ rtc_to_secs(struct vrtc *vrtc) struct timespec ts; struct rtcdev *rtc; struct vm *vm; - int error, hour, pm, year; + int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); @@ -358,10 +364,14 @@ rtc_to_secs(struct vrtc *vrtc) VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } - if (year >= 70) - ct.year = 1900 + year; - else - ct.year = 2000 + year; + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = 
century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { @@ -373,13 +383,19 @@ rtc_to_secs(struct vrtc *vrtc) } return (ts.tv_sec); /* success */ fail: - return (VRTC_BROKEN_TIME); /* failure */ + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); } static int -vrtc_time_update(struct vrtc *vrtc, time_t newtime) +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; + sbintime_t oldbase; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; @@ -391,16 +407,21 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime) alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; - VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx", + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); + vrtc->base_uptime = newbase; + if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt - * processing or update 'base_uptime' in this case. + * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; @@ -446,8 +467,6 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime) if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); - vrtc->base_uptime = sbinuptime(); - return (0); } @@ -518,7 +537,7 @@ static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; - sbintime_t freqsbt; + sbintime_t freqsbt, basetime; time_t rtctime; int error; @@ -540,8 +559,8 @@ vrtc_callout_handler(void *arg) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { - rtctime = vrtc_curtime(vrtc); - error = vrtc_time_update(vrtc, rtctime); + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } @@ -606,7 +625,7 @@ static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; - sbintime_t oldfreq, newfreq; + sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error; uint8_t oldval, changed; @@ -627,19 +646,13 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { - /* - * Stop updating the RTC if the date/time - * programmed by the guest is not correct. 
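The common thread in these vrtc changes is that base_uptime now advances only by the whole seconds actually consumed, so the sub-second remainder is carried forward instead of being discarded at each resynchronization. A worked example of the new vrtc_curtime() arithmetic, assuming delta corresponds to 2.7 seconds:

secs = delta / SBT_1S;		/* 2                                */
t += secs;			/* RTC time advances by 2 s         */
*basetime += secs * SBT_1S;	/* the 0.7 s remainder stays banked */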
- */ - VM_CTR0(vrtc->vm, "Invalid RTC date/time " - "programming detected"); - if (rtc_flag_broken_time) return (-1); } } else { - curtime = vrtc_curtime(vrtc); + curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); @@ -658,7 +671,7 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } - error = vrtc_time_update(vrtc, rtctime); + error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } @@ -738,7 +751,7 @@ vrtc_set_time(struct vm *vm, time_t secs) vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); - error = vrtc_time_update(vrtc, secs); + error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { @@ -755,11 +768,12 @@ time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; + sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); - t = vrtc_curtime(vrtc); + t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); @@ -777,7 +791,7 @@ vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || - offset >= sizeof(struct rtcdev)) { + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); @@ -796,6 +810,7 @@ int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; + sbintime_t basetime; time_t curtime; uint8_t *ptr; @@ -811,8 +826,8 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) /* * Update RTC date/time fields if necessary. */ - if (offset < 10) { - curtime = vrtc_curtime(vrtc); + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } @@ -852,6 +867,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, { struct vrtc *vrtc; struct rtcdev *rtc; + sbintime_t basetime; time_t curtime; int error, offset; @@ -869,16 +885,20 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } error = 0; - curtime = vrtc_curtime(vrtc); - vrtc_time_update(vrtc, curtime); + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); - if (in) { - /* - * Update RTC date/time fields if necessary. - */ - if (offset < 10) - secs_to_rtc(curtime, vrtc, 0); + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. + */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + if (in) { if (offset == 12) { /* * XXX @@ -922,6 +942,18 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, *((uint8_t *)rtc + offset) = *val; break; } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. 
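With the RTC's default BCD data mode, the century/year decode that rtc_to_secs() now performs works out as in this example (bcd2bin() is the stock kernel helper that rtcget() relies on):

/* Century byte 0x20 and year byte 0x15, both BCD-encoded: */
century = bcd2bin(0x20);		/* 20   */
year = bcd2bin(0x15);			/* 15   */
ct.year = century * 100 + year;		/* 2015 */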
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 6bd5bce..bca9b98 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -1293,8 +1293,12 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 	else if (error != 0)
 		panic("%s: vmm_fetch_instruction error %d", __func__, error);
 
-	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
-		return (EFAULT);
+	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
+		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
+		    vme->rip + cs_base);
+		*retu = true;	/* dump instruction bytes in userspace */
+		return (0);
+	}
 
 	/*
 	 * If the instruction length was not specified then update it now
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 0b50e92..7172365 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -72,6 +72,8 @@ enum {
 	VIE_OP_TYPE_POP,
 	VIE_OP_TYPE_MOVS,
 	VIE_OP_TYPE_GROUP1,
+	VIE_OP_TYPE_STOS,
+	VIE_OP_TYPE_BITTEST,
 	VIE_OP_TYPE_LAST
 };
 
@@ -91,6 +93,11 @@ static const struct vie_op two_byte_opcodes[256] = {
 		.op_byte = 0xB7,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
+	[0xBA] = {
+		.op_byte = 0xBA,
+		.op_type = VIE_OP_TYPE_BITTEST,
+		.op_flags = VIE_OP_F_IMM8,
+	},
 	[0xBE] = {
 		.op_byte = 0xBE,
 		.op_type = VIE_OP_TYPE_MOVSX,
@@ -146,6 +153,16 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_type = VIE_OP_TYPE_MOVS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
+	[0xAA] = {
+		.op_byte = 0xAA,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xAB] = {
+		.op_byte = 0xAB,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
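Note: for orientation, these are the kinds of guest byte sequences that the new table entries route to the handlers below. The encodings are illustrative examples, not taken from the diff:

/*
 *   aa               stosb      -> VIE_OP_TYPE_STOS, 1-byte stores
 *   f3 ab            rep stosd  -> VIE_OP_TYPE_STOS, vie->opsize stores
 *                                  (stosw/stosq with a 66 or REX.W prefix)
 *   0f ba 25 .. 07   btl $7,mem -> VIE_OP_TYPE_BITTEST, imm8 fetched via
 *                                  VIE_OP_F_IMM8
 *
 * Register-only forms of these instructions never touch guest memory and
 * therefore never fault into the MMIO emulation path.
 */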
@@ -803,6 +820,68 @@ done:
 }
 
 static int
+emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+	int error, opsize, repeat;
+	uint64_t val;
+	uint64_t rcx, rdi, rflags;
+
+	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
+	repeat = vie->repz_present | vie->repnz_present;
+
+	if (repeat) {
+		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+		/*
+		 * The count register is %rcx, %ecx or %cx depending on the
+		 * address size of the instruction.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+			return (0);
+	}
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
+
+	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+	if (error)
+		return (error);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	if (rflags & PSL_D)
+		rdi -= opsize;
+	else
+		rdi += opsize;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+	    vie->addrsize);
+	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+	if (repeat) {
+		rcx = rcx - 1;
+		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+		    rcx, vie->addrsize);
+		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+		/*
+		 * Repeat the instruction if the count register is not zero.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+			vm_restart_instruction(vm, vcpuid);
+	}
+
+	return (0);
+}
+
+static int
 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
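Note: emulate_stos() masks %rcx with vie_size2mask(vie->addrsize) because a REP loop must terminate on %cx or %ecx when the guest runs with a 16- or 32-bit address size, not on the full 64-bit register. vie_size2mask() predates this diff and is not shown in it; a sketch of its assumed contract, inferred from the call sites:

static uint64_t
vie_size2mask(int size)
{
	/* 'size' is the operand/address size in bytes: 1, 2, 4 or 8. */
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	default:
		return (0xffffffffffffffff);
	}
}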
@@ -1262,6 +1341,48 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	return (error);
 }
 
+static int
+emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
+{
+	uint64_t val, rflags;
+	int error, bitmask, bitoff;
+
+	/*
+	 * 0F BA is a Group 8 extended opcode.
+	 *
+	 * Currently we only emulate the 'Bit Test' instruction which is
+	 * identified by a ModR/M:reg encoding of 100b.
+	 */
+	if ((vie->reg & 7) != 4)
+		return (EINVAL);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
+	if (error)
+		return (error);
+
+	/*
+	 * Intel SDM, Vol 2, Table 3-2:
+	 * "Range of Bit Positions Specified by Bit Offset Operands"
+	 */
+	bitmask = vie->opsize * 8 - 1;
+	bitoff = vie->immediate & bitmask;
+
+	/* Copy the bit into the Carry flag in %rflags */
+	if (val & (1UL << bitoff))
+		rflags |= PSL_C;
+	else
+		rflags &= ~PSL_C;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
+
+	return (0);
+}
+
 int
 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
@@ -1302,6 +1423,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg);
 		break;
+	case VIE_OP_TYPE_STOS:
+		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
+		    memwrite, memarg);
+		break;
 	case VIE_OP_TYPE_AND:
 		error = emulate_and(vm, vcpuid, gpa, vie, memread,
 		    memwrite, memarg);
 		break;
@@ -1314,6 +1439,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		error = emulate_sub(vm, vcpuid, gpa, vie, memread,
 		    memwrite, memarg);
 		break;
+	case VIE_OP_TYPE_BITTEST:
+		error = emulate_bittest(vm, vcpuid, gpa, vie,
+		    memread, memwrite, memarg);
+		break;
 	default:
 		error = EINVAL;
 		break;
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc68a61..63044e8 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -28,16 +28,10 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
-#include <sys/types.h>
-#include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/systm.h>
 
-#include <vm/vm.h>
-
 #include <machine/vmm.h>
 #include <machine/vmm_instruction_emul.h>
-#include <x86/psl.h>
 
 #include "vatpic.h"
 #include "vatpit.h"
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index 9ecf9af..4ae5fb9 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
-#include <sys/smp.h>
 
 #include <machine/vmm.h>
 #include "vmm_util.h"
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c37d21c..45e08b5 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
-#include <sys/cpuset.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
@@ -289,9 +288,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 
 		/*
 		 * Machine check handling is done in the host.
-		 * Hide MTRR capability.
 		 */
-		regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+		regs[3] &= ~(CPUID_MCA | CPUID_MCE);
 
 		/*
 		 * Hide the debug store capability.
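Note: emulate_bittest() only updates PSL_C. That is a legal implementation choice: the SDM defines CF as the value of the selected bit for BT and leaves OF, SF, AF and PF undefined, so the stale values may be preserved. A worked example of the offset math, as a hypothetical helper that mirrors the two lines computing bitmask and bitoff above:

/* Illustrative only: which bit a BT imm8 offset selects. */
static int
bt_selects(int opsize, uint8_t imm8)
{
	int bitmask = opsize * 8 - 1;	/* 7, 15, 31 or 63 */

	/* e.g. opsize 4, imm8 35 -> bit 35 & 31 = 3 of the dword operand */
	return (imm8 & bitmask);
}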