summaryrefslogtreecommitdiffstats
path: root/usr.sbin
diff options
context:
space:
mode:
authorneel <neel@FreeBSD.org>2013-01-19 04:18:52 +0000
committerneel <neel@FreeBSD.org>2013-01-19 04:18:52 +0000
commit363335d53e3c955602378aa434d0054c48a6e0d6 (patch)
tree5af6fe77acd4da3002c907484fd64133deb95c8d /usr.sbin
parent3600c83b820e00959d61600e67e9dcb32ef6b518 (diff)
parentdde8bf641fc7c8e9541167cd7c01523973d0b569 (diff)
downloadFreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.zip
FreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.tar.gz
Merge projects/bhyve to head.
'bhyve' was developed by grehan@ and myself at NetApp (thanks!). Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for their support and encouragement. Obtained from: NetApp
Diffstat (limited to 'usr.sbin')
-rw-r--r--usr.sbin/Makefile.amd643
-rw-r--r--usr.sbin/bhyve/Makefile27
-rw-r--r--usr.sbin/bhyve/acpi.c844
-rw-r--r--usr.sbin/bhyve/acpi.h34
-rw-r--r--usr.sbin/bhyve/atpic.c68
-rw-r--r--usr.sbin/bhyve/bhyverun.c788
-rw-r--r--usr.sbin/bhyve/bhyverun.h53
-rw-r--r--usr.sbin/bhyve/consport.c140
-rw-r--r--usr.sbin/bhyve/dbgport.c138
-rw-r--r--usr.sbin/bhyve/dbgport.h36
-rw-r--r--usr.sbin/bhyve/elcr.c65
-rw-r--r--usr.sbin/bhyve/inout.c151
-rw-r--r--usr.sbin/bhyve/inout.h67
-rw-r--r--usr.sbin/bhyve/ioapic.c324
-rw-r--r--usr.sbin/bhyve/ioapic.h38
-rw-r--r--usr.sbin/bhyve/mem.c218
-rw-r--r--usr.sbin/bhyve/mem.h57
-rw-r--r--usr.sbin/bhyve/mevent.c432
-rw-r--r--usr.sbin/bhyve/mevent.h49
-rw-r--r--usr.sbin/bhyve/mevent_test.c180
-rw-r--r--usr.sbin/bhyve/mptbl.c398
-rw-r--r--usr.sbin/bhyve/mptbl.h35
-rw-r--r--usr.sbin/bhyve/pci_emul.c1117
-rw-r--r--usr.sbin/bhyve/pci_emul.h216
-rw-r--r--usr.sbin/bhyve/pci_hostbridge.c52
-rw-r--r--usr.sbin/bhyve/pci_passthru.c724
-rw-r--r--usr.sbin/bhyve/pci_uart.c626
-rw-r--r--usr.sbin/bhyve/pci_virtio_block.c534
-rw-r--r--usr.sbin/bhyve/pci_virtio_net.c781
-rw-r--r--usr.sbin/bhyve/pit_8254.c198
-rw-r--r--usr.sbin/bhyve/pit_8254.h45
-rw-r--r--usr.sbin/bhyve/pmtmr.c108
-rw-r--r--usr.sbin/bhyve/post.c51
-rw-r--r--usr.sbin/bhyve/rtc.c274
-rw-r--r--usr.sbin/bhyve/spinup_ap.c119
-rw-r--r--usr.sbin/bhyve/spinup_ap.h34
-rw-r--r--usr.sbin/bhyve/uart.c60
-rw-r--r--usr.sbin/bhyve/virtio.h85
-rw-r--r--usr.sbin/bhyve/xmsr.c48
-rw-r--r--usr.sbin/bhyve/xmsr.h34
-rw-r--r--usr.sbin/bhyvectl/Makefile17
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.c1524
-rw-r--r--usr.sbin/bhyveload/Makefile14
-rw-r--r--usr.sbin/bhyveload/bhyveload.8130
-rw-r--r--usr.sbin/bhyveload/bhyveload.c652
45 files changed, 11588 insertions, 0 deletions
diff --git a/usr.sbin/Makefile.amd64 b/usr.sbin/Makefile.amd64
index 1a1bffe..5ee2165 100644
--- a/usr.sbin/Makefile.amd64
+++ b/usr.sbin/Makefile.amd64
@@ -10,6 +10,9 @@ SUBDIR+= acpi
SUBDIR+= apm
.endif
SUBDIR+= asf
+SUBDIR+= bhyve
+SUBDIR+= bhyvectl
+SUBDIR+= bhyveload
SUBDIR+= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
SUBDIR+= btxld
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
new file mode 100644
index 0000000..078ef9a
--- /dev/null
+++ b/usr.sbin/bhyve/Makefile
@@ -0,0 +1,27 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyve
+
+DEBUG_FLAGS= -g -O0
+
+SRCS= acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c
+SRCS+= ioapic.c mem.c mevent.c mptbl.c
+SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
+SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
+SRCS+= xmsr.c spinup_ap.c
+
+.PATH: ${.CURDIR}/../../sys/amd64/vmm
+SRCS+= vmm_instruction_emul.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
+LDADD= -lvmmapi -lmd -lpthread
+
+WARNS?= 2
+
+CFLAGS+= -I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
new file mode 100644
index 0000000..32effdc
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.c
@@ -0,0 +1,844 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * bhyve ACPI table generator.
+ *
+ * Create the minimal set of ACPI tables required to boot FreeBSD (and
+ * hopefully other o/s's) by writing out ASL template files for each of
+ * the tables and the compiling them to AML with the Intel iasl compiler.
+ * The AML files are then read into guest memory.
+ *
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout
+ * ------
+ * RSDP -> 0xf0400 (36 bytes fixed)
+ * RSDT -> 0xf0440 (36 bytes + 4*N table addrs, 2 used)
+ * XSDT -> 0xf0480 (36 bytes + 8*N table addrs, 2 used)
+ * MADT -> 0xf0500 (depends on #CPUs)
+ * FADT -> 0xf0600 (268 bytes)
+ * FACS -> 0xf0780 (64 bytes)
+ * DSDT -> 0xf0800 (variable - can go up to 0x100000)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+
+/*
+ * Define the base address of the ACPI tables, and the offsets to
+ * the individual tables
+ */
+#define BHYVE_ACPI_BASE 0xf0400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+#define FADT_OFFSET 0x200
+#define FACS_OFFSET 0x380
+#define DSDT_OFFSET 0x400
+
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+#define BHYVE_PM_TIMER_ADDR 0x408
+
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+static int basl_ncpu;
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp/mktemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+struct basl_fio {
+ int fd;
+ FILE *fp;
+ char f_name[MAXPATHLEN];
+};
+
+#define EFPRINTF(...) \
+ err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit;
+
+#define EFFLUSH(x) \
+ err = fflush(x); if (err != 0) goto err_exit;
+
+static int
+basl_fwrite_rsdp(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDP template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
+ EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
+ basl_acpi_base + RSDT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
+ EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
+ basl_acpi_base + XSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_rsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_xsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve XSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_madt(FILE *fp)
+{
+ int err;
+ int i;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add a Processor Local APIC entry for each CPU */
+ for (i = 0; i < basl_ncpu; i++) {
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 08\n");
+ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02d\n", i);
+ EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02d\n", i);
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
+ EFPRINTF(fp, "\n");
+ }
+
+ /* Always a single IOAPIC entry, with ID ncpu+1 */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
+ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02d\n", basl_ncpu);
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Override the 8259 chained vector. XXX maybe not needed */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
+ EFPRINTF(fp, "[0001]\t\tBus : 00\n");
+ EFPRINTF(fp, "[0001]\t\tSource : 09\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000009\n");
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 0\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_fadt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tModel : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
+ EFPRINTF(fp, "[0002]\t\tSCI Interrupt : 0009\n");
+ EFPRINTF(fp, "[0004]\t\tSMI Command Port : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Enable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Disable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
+ EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
+ EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
+ EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n");
+ EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
+ EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
+ EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tReset Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0001]\t\tValue to cause reset : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Valid for bhyve */
+ EFPRINTF(fp,
+ "[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_facs(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FACS template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
+ EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
+ EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
+ EFPRINTF(fp,
+ "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
+ EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_dsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve DSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
+ "\"BHYVE \", \"BVDSDT \", 0x00000001)\n");
+ EFPRINTF(fp, "{\n");
+ EFPRINTF(fp, " Scope (_SB)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0A03\"))\n");
+ EFPRINTF(fp, " Name (_ADR, Zero)\n");
+ EFPRINTF(fp, " Name (_UID, One)\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " WordBusNumber (ResourceProducer, MinFixed,"
+ "MaxFixed, PosDecode,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x00FF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0100, // Length\n");
+ EFPRINTF(fp, " ,, )\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Maximum\n");
+ EFPRINTF(fp, " 0x01, // Alignment\n");
+ EFPRINTF(fp, " 0x08, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF7, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0CF8, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0D00, // Range Minimum\n");
+ EFPRINTF(fp, " 0xFFFF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0xF300, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_ADR, 0x00010000)\n");
+ EFPRINTF(fp, " OperationRegion (P40C, PCI_Config, 0x60, 0x04)\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0.ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (RTC)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0B00\"))\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0070, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0070, // Range Maximum\n");
+ EFPRINTF(fp, " 0x10, // Alignment\n");
+ EFPRINTF(fp, " 0x02, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " IRQNoFlags ()\n");
+ EFPRINTF(fp, " {8}\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0072, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0072, // Range Maximum\n");
+ EFPRINTF(fp, " 0x02, // Alignment\n");
+ EFPRINTF(fp, " 0x06, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "}\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_open(struct basl_fio *bf, int suffix)
+{
+ int err;
+
+ err = 0;
+
+ if (suffix) {
+ strncpy(bf->f_name, basl_stemplate, MAXPATHLEN);
+ bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
+ } else {
+ strncpy(bf->f_name, basl_template, MAXPATHLEN);
+ bf->fd = mkstemp(bf->f_name);
+ }
+
+ if (bf->fd > 0) {
+ bf->fp = fdopen(bf->fd, "w+");
+ if (bf->fp == NULL) {
+ unlink(bf->f_name);
+ close(bf->fd);
+ }
+ } else {
+ err = 1;
+ }
+
+ return (err);
+}
+
+static void
+basl_close(struct basl_fio *bf)
+{
+
+ if (!basl_keep_temps)
+ unlink(bf->f_name);
+ fclose(bf->fp);
+}
+
+static int
+basl_start(struct basl_fio *in, struct basl_fio *out)
+{
+ int err;
+
+ err = basl_open(in, 0);
+ if (!err) {
+ err = basl_open(out, 1);
+ if (err) {
+ basl_close(in);
+ }
+ }
+
+ return (err);
+}
+
+static void
+basl_end(struct basl_fio *in, struct basl_fio *out)
+{
+
+ basl_close(in);
+ basl_close(out);
+}
+
+static int
+basl_load(int fd, uint64_t off)
+{
+ struct stat sb;
+ int err;
+
+ err = 0;
+
+ if (fstat(fd, &sb) < 0 ||
+ read(fd, paddr_guest2host(basl_acpi_base + off), sb.st_size) < 0)
+ err = errno;
+
+ return (err);
+}
+
+static int
+basl_compile(int (*fwrite_section)(FILE *fp), uint64_t offset)
+{
+ struct basl_fio io[2];
+ static char iaslbuf[3*MAXPATHLEN + 10];
+ char *fmt;
+ int err;
+
+ err = basl_start(&io[0], &io[1]);
+ if (!err) {
+ err = (*fwrite_section)(io[0].fp);
+
+ if (!err) {
+ /*
+ * iasl sends the results of the compilation to
+ * stdout. Shut this down by using the shell to
+ * redirect stdout to /dev/null, unless the user
+ * has requested verbose output for debugging
+ * purposes
+ */
+ fmt = basl_verbose_iasl ?
+ "%s -p %s %s" :
+ "/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
+
+ snprintf(iaslbuf, sizeof(iaslbuf),
+ fmt,
+ BHYVE_ASL_COMPILER,
+ io[1].f_name, io[0].f_name);
+ err = system(iaslbuf);
+
+ if (!err) {
+ /*
+ * Copy the aml output file into guest
+ * memory at the specified location
+ */
+ err = basl_load(io[1].fd, offset);
+ }
+ }
+ basl_end(&io[0], &io[1]);
+ }
+
+ return (err);
+}
+
+static int
+basl_make_templates(void)
+{
+ const char *tmpdir;
+ int err;
+ int len;
+
+ err = 0;
+
+ /*
+ *
+ */
+ if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' ||
+ (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') {
+ tmpdir = _PATH_TMP;
+ }
+
+ len = strlen(tmpdir);
+
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+ strcpy(basl_template, tmpdir);
+ while (len > 0 && basl_template[len - 1] == '/')
+ len--;
+ basl_template[len] = '/';
+ strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+ } else
+ err = E2BIG;
+
+ if (!err) {
+ /*
+ * len has been intialized (and maybe adjusted) above
+ */
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+ sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+ strcpy(basl_stemplate, tmpdir);
+ basl_stemplate[len] = '/';
+ strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+ len = strlen(basl_stemplate);
+ strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+ } else
+ err = E2BIG;
+ }
+
+ return (err);
+}
+
+static struct {
+ int (*wsect)(FILE *fp);
+ uint64_t offset;
+} basl_ftables[] =
+{
+ { basl_fwrite_rsdp, 0},
+ { basl_fwrite_rsdt, RSDT_OFFSET },
+ { basl_fwrite_xsdt, XSDT_OFFSET },
+ { basl_fwrite_madt, MADT_OFFSET },
+ { basl_fwrite_fadt, FADT_OFFSET },
+ { basl_fwrite_facs, FACS_OFFSET },
+ { basl_fwrite_dsdt, DSDT_OFFSET },
+ { NULL }
+};
+
+int
+acpi_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ int err;
+ int i;
+
+ err = 0;
+ basl_ncpu = ncpu;
+
+ if (!ioapic) {
+ fprintf(stderr, "ACPI tables require an ioapic\n");
+ return (EINVAL);
+ }
+
+ /*
+ * For debug, allow the user to have iasl compiler output sent
+ * to stdout rather than /dev/null
+ */
+ if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
+ basl_verbose_iasl = 1;
+
+ /*
+ * Allow the user to keep the generated ASL files for debugging
+ * instead of deleting them following use
+ */
+ if (getenv("BHYVE_ACPI_KEEPTMPS"))
+ basl_keep_temps = 1;
+
+ i = 0;
+ err = basl_make_templates();
+
+ /*
+ * Run through all the ASL files, compiling them and
+ * copying them into guest memory
+ */
+ while (!err && basl_ftables[i].wsect != NULL) {
+ err = basl_compile(basl_ftables[i].wsect,
+ basl_ftables[i].offset);
+ i++;
+ }
+
+ return (err);
+}
diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
new file mode 100644
index 0000000..fec6c9d
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACPI_H_
+#define _ACPI_H_
+
+int acpi_build(struct vmctx *ctx, int ncpu, int ioapic);
+
+#endif /* _ACPI_H_ */
diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c
new file mode 100644
index 0000000..a9fb084
--- /dev/null
+++ b/usr.sbin/bhyve/atpic.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * FreeBSD only writes to the 8259 interrupt controllers to put them in a
+ * shutdown state.
+ *
+ * So, we just ignore the writes.
+ */
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+#define ICU_IMR_OFFSET 1
+
+static int
+atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in)
+ return (-1);
+
+ /* Pretend all writes to the 8259 are alright */
+ return (0);
+}
+
+INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
new file mode 100644
index 0000000..999040f
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -0,0 +1,788 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "xmsr.h"
+#include "ioapic.h"
+#include "spinup_ap.h"
+
+#define DEFAULT_GUEST_HZ 100
+#define DEFAULT_GUEST_TSLICE 200
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_RESET 4 /* guest machine has reset */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+int guest_tslice = DEFAULT_GUEST_TSLICE;
+int guest_hz = DEFAULT_GUEST_HZ;
+char *vmname;
+
+u_long lomem_sz;
+u_long himem_sz;
+
+int guest_ncpus;
+
+static int pincpu = -1;
+static int guest_vcpu_mux;
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
+
+static int foundcpus;
+
+static int strictio;
+
+static int acpi;
+
+static char *lomem_addr;
+static char *himem_addr;
+
+static char *progname;
+static const int BSP = 0;
+
+static int cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct fbsdstats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t vmexit_paging;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+ fprintf(stderr,
+ "Usage: %s [-aehABHIP][-g <gdb port>][-z <hz>][-s <pci>]"
+ "[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem] <vm>\n"
+ " -a: local apic is in XAPIC mode (default is X2APIC)\n"
+ " -A: create an ACPI table\n"
+ " -g: gdb port (default is %d and 0 means don't open)\n"
+ " -c: # cpus (default 1)\n"
+ " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
+ " -B: inject breakpoint exception on vm entry\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -I: present an ioapic to the guest\n"
+ " -P: vmexit from the guest on pause\n"
+ " -e: exit on unhandled i/o access\n"
+ " -h: help\n"
+ " -z: guest hz (default is %d)\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -S: <slot,driver,configinfo> legacy PCI slot config\n"
+ " -m: lowmem in MB\n"
+ " -M: highmem in MB\n"
+ " -x: mux vcpus to 1 hcpu\n"
+ " -t: mux vcpu timeslice hz (default %d)\n",
+ progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
+ DEFAULT_GUEST_TSLICE);
+ exit(code);
+}
+
+void *
+paddr_guest2host(uintptr_t gaddr)
+{
+ if (lomem_sz == 0)
+ return (NULL);
+
+ if (gaddr < lomem_sz) {
+ return ((void *)(lomem_addr + gaddr));
+ } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
+ return ((void *)(himem_addr + gaddr - 4*GB));
+ } else
+ return (NULL);
+}
+
+int
+fbsdrun_disable_x2apic(void)
+{
+
+ return (disable_x2apic);
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_muxed(void)
+{
+
+ return (guest_vcpu_mux);
+}
+
+static void *
+fbsdrun_start_thread(void *param)
+{
+ char tname[MAXCOMLEN + 1];
+ struct mt_vmm_info *mtp;
+ int vcpu;
+
+ mtp = param;
+ vcpu = mtp->mt_vcpu;
+
+ snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
+ pthread_set_name_np(mtp->mt_thr, tname);
+
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error;
+
+ if (cpumask & (1 << vcpu)) {
+ fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
+ vcpu);
+ exit(1);
+ }
+
+ cpumask |= 1 << vcpu;
+ foundcpus++;
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[vcpu].rip = rip;
+ vmexit[vcpu].inst_length = 0;
+
+ if (vcpu == BSP || !guest_vcpu_mux){
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
+ }
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+ /*
+ * Get the next available CPU. Assumes they arrive
+ * in ascending order with no gaps.
+ */
+ return ((curcpu + 1) % foundcpus);
+}
+
+static int
+vmexit_catch_reset(void)
+{
+ stats.io_reset++;
+ return (VMEXIT_RESET);
+}
+
+static int
+vmexit_catch_inout(void)
+{
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if PG_DEBUG /* put all types of debug here */
+ if (eax == 0) {
+ pause_noswitch = 1;
+ } else if (eax == 1) {
+ pause_noswitch = 0;
+ } else {
+ pause_noswitch = 0;
+ if (eax == 5) {
+ vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
+ }
+ }
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out;
+ uint32_t eax;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ eax = vme->u.inout.eax;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* We don't deal with these */
+ if (vme->u.inout.string || vme->u.inout.rep)
+ return (VMEXIT_ABORT);
+
+ /* Special case of guest reset */
+ if (out && port == 0x64 && (uint8_t)eax == 0xFE)
+ return (vmexit_catch_reset());
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT)
+ return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+
+ error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
+ if (error == 0 && in)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+
+ if (error == 0)
+ return (VMEXIT_CONTINUE);
+ else {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ return (vmexit_catch_inout());
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
+ *pvcpu);
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = spinup_ap(ctx, *pvcpu,
+ vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tVMX\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
+ fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ fprintf(stderr, "\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+
+ return (VMEXIT_ABORT);
+}
+
+static int bogus_noswitch = 1;
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_bogus++;
+
+ if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
+ return (VMEXIT_RESTART);
+ } else {
+ stats.vmexit_bogus_switch++;
+ vmexit->inst_length = 0;
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ }
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_hlt++;
+ if (fbsdrun_muxed()) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int pause_noswitch;
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_pause++;
+
+ if (fbsdrun_muxed() && !pause_noswitch) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_RESTART);
+}
+
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ int err;
+ stats.vmexit_paging++;
+
+ err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
+ &vmexit->u.paging.vie);
+
+ if (err) {
+ if (err == EINVAL) {
+ fprintf(stderr,
+ "Failed to emulate instruction at 0x%lx\n",
+ vmexit->rip);
+ } else if (err == ESRCH) {
+ fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+ vmexit->u.paging.gpa);
+ }
+
+ return (VMEXIT_ABORT);
+ }
+
+ return (VMEXIT_CONTINUE);
+}
+
+static void
+sigalrm(int sig)
+{
+ return;
+}
+
+static void
+setup_timeslice(void)
+{
+ struct sigaction sa;
+ struct itimerval itv;
+ int error;
+
+ /*
+ * Setup a realtime timer to generate a SIGALRM at a
+ * frequency of 'guest_tslice' ticks per second.
+ */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = sigalrm;
+
+ error = sigaction(SIGALRM, &sa, NULL);
+ assert(error == 0);
+
+ itv.it_interval.tv_sec = 0;
+ itv.it_interval.tv_usec = 1000000 / guest_tslice;
+ itv.it_value.tv_sec = 0;
+ itv.it_value.tv_usec = 1000000 / guest_tslice;
+
+ error = setitimer(ITIMER_REAL, &itv, NULL);
+ assert(error == 0);
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_PAGING] = vmexit_paging,
+ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error, rc, prevcpu;
+
+ if (guest_vcpu_mux)
+ setup_timeslice();
+
+ if (pincpu >= 0) {
+ error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
+ assert(error == 0);
+ }
+
+ while (1) {
+ error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ if (error != 0) {
+ /*
+ * It is possible that 'vmmctl' or some other process
+ * has transitioned the vcpu to CANNOT_RUN state right
+ * before we tried to transition it to RUNNING.
+ *
+ * This is expected to be temporary so just retry.
+ */
+ if (errno == EBUSY)
+ continue;
+ else
+ break;
+ }
+
+ prevcpu = vcpu;
+ rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
+ &vcpu);
+ switch (rc) {
+ case VMEXIT_SWITCH:
+ assert(guest_vcpu_mux);
+ if (vcpu == -1) {
+ stats.cpu_switch_rotate++;
+ vcpu = fbsdrun_get_next_cpu(prevcpu);
+ } else {
+ stats.cpu_switch_direct++;
+ }
+ /* fall through */
+ case VMEXIT_CONTINUE:
+ rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+ break;
+ case VMEXIT_RESTART:
+ rip = vmexit[vcpu].rip;
+ break;
+ case VMEXIT_RESET:
+ exit(0);
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+ int tmp, error;
+
+ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
+
+ /*
+ * The guest is allowed to spinup more than one processor only if the
+ * UNRESTRICTED_GUEST capability is available.
+ */
+ if (error == 0)
+ return (VM_MAXCPU);
+ else
+ return (1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, inject_bkpt, tmp, err, ioapic, bvmcons;
+ int max_vcpus;
+ struct vmctx *ctx;
+ uint64_t rip;
+
+ bvmcons = 0;
+ inject_bkpt = 0;
+ progname = basename(argv[0]);
+ gdb_port = DEFAULT_GDB_PORT;
+ guest_ncpus = 1;
+ ioapic = 0;
+
+ while ((c = getopt(argc, argv, "abehABHIPxp:g:c:z:s:S:n:m:M:")) != -1) {
+ switch (c) {
+ case 'a':
+ disable_x2apic = 1;
+ break;
+ case 'A':
+ acpi = 1;
+ break;
+ case 'b':
+ bvmcons = 1;
+ break;
+ case 'B':
+ inject_bkpt = 1;
+ break;
+ case 'x':
+ guest_vcpu_mux = 1;
+ break;
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+ case 'z':
+ guest_hz = atoi(optarg);
+ break;
+ case 't':
+ guest_tslice = atoi(optarg);
+ break;
+ case 's':
+ pci_parse_slot(optarg, 0);
+ break;
+ case 'S':
+ pci_parse_slot(optarg, 1);
+ break;
+ case 'm':
+ lomem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'M':
+ himem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'I':
+ ioapic = 1;
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'e':
+ strictio = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ /* No need to mux if guest is uni-processor */
+ if (guest_ncpus <= 1)
+ guest_vcpu_mux = 0;
+
+ /* vmexit on hlt if guest is muxed */
+ if (guest_vcpu_mux) {
+ guest_vmexit_on_hlt = 1;
+ guest_vmexit_on_pause = 1;
+ }
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ max_vcpus = num_vcpus_allowed(ctx);
+ if (guest_ncpus > max_vcpus) {
+ fprintf(stderr, "%d vCPUs requested but only %d available\n",
+ guest_ncpus, max_vcpus);
+ exit(1);
+ }
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr, "VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr,
+ "SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (fbsdrun_disable_x2apic())
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
+ else
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
+
+ if (err) {
+ fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
+ exit(1);
+ }
+
+ if (lomem_sz != 0) {
+ lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
+ if (lomem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ } else if (himem_sz != 0) {
+ himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
+ if (himem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ himem_sz = 0;
+ }
+ }
+ }
+
+ init_inout();
+ init_pci(ctx);
+ if (ioapic)
+ ioapic_init(0);
+
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+
+ if (bvmcons)
+ init_bvmcons();
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ if (inject_bkpt) {
+ error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
+ assert(error == 0);
+ }
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ mptable_build(ctx, guest_ncpus, ioapic);
+
+ if (acpi) {
+ error = acpi_build(ctx, guest_ncpus, ioapic);
+ assert(error == 0);
+ }
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+ mevent_dispatch();
+
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
new file mode 100644
index 0000000..45033b8
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+struct vmctx;
+extern int guest_hz;
+extern int guest_tslice;
+extern int guest_ncpus;
+extern char *vmname;
+
+extern u_long lomem_sz, himem_sz;
+
+void *paddr_guest2host(uintptr_t);
+
+void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+int fbsdrun_disable_x2apic(void);
+#endif
diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c
new file mode 100644
index 0000000..3915b6d
--- /dev/null
+++ b/usr.sbin/bhyve/consport.c
@@ -0,0 +1,140 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+
+#define BVM_CONSOLE_PORT 0x220
+#define BVM_CONS_SIG ('b' << 8 | 'v')
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_CONS_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+
+static struct inout_port consport = {
+ "bvmcons",
+ BVM_CONSOLE_PORT,
+ IOPORT_F_INOUT,
+ console_handler
+};
+
+void
+init_bvmcons(void)
+{
+
+ register_inout(&consport);
+}
diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c
new file mode 100644
index 0000000..034531c
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.c
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+#include "dbgport.h"
+
+#define BVM_DBG_PORT 0x224
+#define BVM_DBG_SIG ('B' << 8 | 'V')
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ char ch;
+ int nwritten, nread, printonce;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_DBG_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept(listen_fd, NULL, NULL);
+ if (conn_fd >= 0)
+ fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+ else if (errno != EINTR)
+ perror("accept");
+ }
+
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = ch;
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+static struct inout_port dbgport = {
+ "bvmdbg",
+ BVM_DBG_PORT,
+ IOPORT_F_INOUT,
+ dbg_handler
+};
+
+void
+init_dbgport(int sport)
+{
+ conn_fd = -1;
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ register_inout(&dbgport);
+}
diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h
new file mode 100644
index 0000000..8c7dab7
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+#define DEFAULT_GDB_PORT 6466
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c
new file mode 100644
index 0000000..2417ae1
--- /dev/null
+++ b/usr.sbin/bhyve/elcr.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "inout.h"
+
+/*
+ * EISA interrupt Level Control Register.
+ *
+ * This is a 16-bit register with one bit for each of the IRQ0 through IRQ15.
+ * A level triggered irq is indicated by setting the corresponding bit to '1'.
+ */
+#define ELCR_PORT 0x4d0
+
+static uint8_t elcr[2] = { 0x00, 0x00 };
+
+static int
+elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int idx;
+
+ if (bytes != 1)
+ return (-1);
+
+ idx = port - ELCR_PORT;
+
+ if (in)
+ *eax = elcr[idx];
+ else
+ elcr[idx] = *eax;
+
+ return (0);
+}
+INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
+INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
new file mode 100644
index 0000000..5f47a89f
--- /dev/null
+++ b/usr.sbin/bhyve/inout.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (in) {
+ switch (bytes) {
+ case 4:
+ *eax = 0xffffffff;
+ break;
+ case 2:
+ *eax = 0xffff;
+ break;
+ case 1:
+ *eax = 0xff;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+int
+emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict)
+{
+ int flags;
+ uint32_t mask;
+ inout_func_t handler;
+ void *arg;
+
+ assert(port < MAX_IOPORTS);
+
+ handler = inout_handlers[port].handler;
+
+ if (strict && handler == default_inout)
+ return (-1);
+
+ if (!in) {
+ switch (bytes) {
+ case 1:
+ mask = 0xff;
+ break;
+ case 2:
+ mask = 0xffff;
+ break;
+ default:
+ mask = 0xffffffff;
+ break;
+ }
+ *eax = *eax & mask;
+ }
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
+ return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
+ else
+ return (-1);
+}
+
+void
+init_inout(void)
+{
+ struct inout_port **iopp, *iop;
+ int i;
+
+ /*
+ * Set up the default handler for all ports
+ */
+ for (i = 0; i < MAX_IOPORTS; i++) {
+ inout_handlers[i].name = "default";
+ inout_handlers[i].flags = IOPORT_F_IN | IOPORT_F_OUT;
+ inout_handlers[i].handler = default_inout;
+ inout_handlers[i].arg = NULL;
+ }
+
+ /*
+ * Overwrite with specified handlers
+ */
+ SET_FOREACH(iopp, inout_port_set) {
+ iop = *iopp;
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = NULL;
+ }
+}
+
+int
+register_inout(struct inout_port *iop)
+{
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = iop->arg;
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
new file mode 100644
index 0000000..a73b78d
--- /dev/null
+++ b/usr.sbin/bhyve/inout.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT 0x3
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ (flags), \
+ (handler), \
+ 0 \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict);
+int register_inout(struct inout_port *iop);
+
+void init_bvmcons(void);
+
+#endif /* _INOUT_H_ */
diff --git a/usr.sbin/bhyve/ioapic.c b/usr.sbin/bhyve/ioapic.c
new file mode 100644
index 0000000..c712692
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.c
@@ -0,0 +1,324 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <x86/apicreg.h>
+#include <machine/vmm.h>
+
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <vmmapi.h>
+
+#include "inout.h"
+#include "mem.h"
+#include "bhyverun.h"
+
+#include <stdio.h>
+
+#define IOAPIC_PADDR 0xFEC00000
+
+#define IOREGSEL 0x00
+#define IOWIN 0x10
+
+#define REDIR_ENTRIES 16
+#define INTR_ASSERTED(ioapic, pin) ((ioapic)->pinstate[(pin)] == true)
+
+struct ioapic {
+ int inited;
+ uint32_t id;
+ uint64_t redtbl[REDIR_ENTRIES];
+ bool pinstate[REDIR_ENTRIES];
+
+ uintptr_t paddr; /* gpa where the ioapic is mapped */
+ uint32_t ioregsel;
+ struct memory_region *region;
+};
+
+static struct ioapic ioapics[1]; /* only a single ioapic for now */
+
+static int ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t *data);
+static int ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t data);
+static int ioapic_region_handler(struct vmctx *vm, int vcpu, int dir,
+ uintptr_t paddr, int size, uint64_t *val,
+ void *arg1, long arg2);
+
+static void
+ioapic_set_pinstate(struct vmctx *ctx, int pin, bool newstate)
+{
+ int vector, apicid, vcpu;
+ uint32_t low, high;
+ struct ioapic *ioapic;
+
+ ioapic = &ioapics[0]; /* assume a single ioapic */
+
+ if (pin < 0 || pin >= REDIR_ENTRIES)
+ return;
+
+ /* Nothing to do if interrupt pin has not changed state */
+ if (ioapic->pinstate[pin] == newstate)
+ return;
+
+ ioapic->pinstate[pin] = newstate; /* record it */
+
+ /* Nothing to do if interrupt pin is deasserted */
+ if (!INTR_ASSERTED(ioapic, pin))
+ return;
+
+ /*
+ * XXX
+ * We only deal with:
+ * - edge triggered interrupts
+ * - physical destination mode
+ * - fixed delivery mode
+ */
+ low = ioapic->redtbl[pin];
+ high = ioapic->redtbl[pin] >> 32;
+ if ((low & IOART_INTMASK) == IOART_INTMCLR &&
+ (low & IOART_TRGRMOD) == IOART_TRGREDG &&
+ (low & IOART_DESTMOD) == IOART_DESTPHY &&
+ (low & IOART_DELMOD) == IOART_DELFIXED) {
+ vector = low & IOART_INTVEC;
+ apicid = high >> APIC_ID_SHIFT;
+ if (apicid != 0xff) {
+ /* unicast */
+ vcpu = vm_apicid2vcpu(ctx, apicid);
+ vm_lapic_irq(ctx, vcpu, vector);
+ } else {
+ /* broadcast */
+ vcpu = 0;
+ while (vcpu < guest_ncpus) {
+ vm_lapic_irq(ctx, vcpu, vector);
+ vcpu++;
+ }
+ }
+ }
+}
+
+void
+ioapic_deassert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, false);
+}
+
+void
+ioapic_assert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, true);
+}
+
+void
+ioapic_init(int which)
+{
+ struct mem_range memp;
+ struct ioapic *ioapic;
+ int error;
+ int i;
+
+ assert(which == 0);
+
+ ioapic = &ioapics[which];
+ assert(ioapic->inited == 0);
+
+ bzero(ioapic, sizeof(struct ioapic));
+
+ /* Initialize all redirection entries to mask all interrupts */
+ for (i = 0; i < REDIR_ENTRIES; i++)
+ ioapic->redtbl[i] = 0x0001000000010000UL;
+
+ ioapic->paddr = IOAPIC_PADDR;
+
+ /* Register emulated memory region */
+ memp.name = "ioapic";
+ memp.flags = MEM_F_RW;
+ memp.handler = ioapic_region_handler;
+ memp.arg1 = ioapic;
+ memp.arg2 = which;
+ memp.base = ioapic->paddr;
+ memp.size = sizeof(struct IOAPIC);
+ error = register_mem(&memp);
+
+ assert (error == 0);
+
+ ioapic->inited = 1;
+}
+
+static uint32_t
+ioapic_read(struct ioapic *ioapic, uint32_t addr)
+{
+ int regnum, pin, rshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ return (ioapic->id);
+ break;
+ case IOAPIC_VER:
+ return ((REDIR_ENTRIES << MAXREDIRSHIFT) | 0x11);
+ break;
+ case IOAPIC_ARB:
+ return (ioapic->id);
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ rshift = 32;
+ else
+ rshift = 0;
+
+ return (ioapic->redtbl[pin] >> rshift);
+ }
+
+ return (0);
+}
+
+static void
+ioapic_write(struct ioapic *ioapic, uint32_t addr, uint32_t data)
+{
+ int regnum, pin, lshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ ioapic->id = data & APIC_ID_MASK;
+ break;
+ case IOAPIC_VER:
+ case IOAPIC_ARB:
+ /* readonly */
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ lshift = 32;
+ else
+ lshift = 0;
+
+ ioapic->redtbl[pin] &= ~((uint64_t)0xffffffff << lshift);
+ ioapic->redtbl[pin] |= ((uint64_t)data << lshift);
+ }
+}
+
+static int
+ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t *data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+ * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ *data = 0;
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ *data = ioapic->ioregsel;
+ else
+ *data = ioapic_read(ioapic, ioapic->ioregsel);
+
+ return (0);
+}
+
+static int
+ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+ * The ioapic specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ ioapic->ioregsel = data;
+ else
+ ioapic_write(ioapic, ioapic->ioregsel, data);
+
+ return (0);
+}
+
+static int
+ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, uintptr_t paddr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct ioapic *ioapic;
+ int which;
+
+ ioapic = arg1;
+ which = arg2;
+
+ assert(ioapic == &ioapics[which]);
+
+ if (dir == MEM_F_READ)
+ ioapic_region_read(ioapic, paddr, size, val);
+ else
+ ioapic_region_write(ioapic, paddr, size, *val);
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/ioapic.h b/usr.sbin/bhyve/ioapic.h
new file mode 100644
index 0000000..4696f9a
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOAPIC_H_
+#define _IOAPIC_H_
+
+struct vmctx;
+
+void ioapic_init(int num);
+void ioapic_deassert_pin(struct vmctx *ctx, int pin);
+void ioapic_assert_pin(struct vmctx *ctx, int pin);
+
+#endif
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
new file mode 100644
index 0000000..27f4782
--- /dev/null
+++ b/usr.sbin/bhyve/mem.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Memory ranges are represented with an RB tree. On insertion, the range
+ * is checked for overlaps. On lookup, the key has the same base and limit
+ * so it can be searched within the range.
+ *
+ * It is assumed that all setup of ranges takes place in single-threaded
+ * mode before vCPUs have been started. As such, no locks are used on the
+ * RB tree. If this is no longer the case, then a r/w lock could be used,
+ * with readers on the lookup and a writer if the tree needs to be changed
+ * (and per vCPU caches flushed)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/tree.h>
+#include <sys/errno.h>
+#include <machine/vmm.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "mem.h"
+
+struct mmio_rb_range {
+ RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
+ struct mem_range mr_param;
+ uint64_t mr_base;
+ uint64_t mr_end;
+};
+
+struct mmio_rb_tree;
+RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot;
+
+/*
+ * Per-vCPU cache. Since most accesses from a vCPU will be to
+ * consecutive addresses in a range, it makes sense to cache the
+ * result of a lookup.
+ */
+static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
+
+static int
+mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
+{
+ if (a->mr_end < b->mr_base)
+ return (-1);
+ else if (a->mr_base > b->mr_end)
+ return (1);
+ return (0);
+}
+
+static int
+mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry)
+{
+ struct mmio_rb_range find, *res;
+
+ find.mr_base = find.mr_end = addr;
+
+ res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find);
+
+ if (res != NULL) {
+ *entry = res;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static int
+mmio_rb_add(struct mmio_rb_range *new)
+{
+ struct mmio_rb_range *overlap;
+
+ overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new);
+
+ if (overlap != NULL) {
+#ifdef RB_DEBUG
+ printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
+ new->mr_base, new->mr_end,
+ overlap->mr_base, overlap->mr_end);
+#endif
+
+ return (EEXIST);
+ }
+
+ return (0);
+}
+
+#if 0
+static void
+mmio_rb_dump(void)
+{
+ struct mmio_rb_range *np;
+
+ RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) {
+ printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
+ np->mr_param.name);
+ }
+}
+#endif
+
+RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+ rval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+ &wval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+{
+ struct mmio_rb_range *entry;
+ int err;
+
+ /*
+ * First check the per-vCPU cache
+ */
+ if (mmio_hint[vcpu] &&
+ paddr >= mmio_hint[vcpu]->mr_base &&
+ paddr <= mmio_hint[vcpu]->mr_end) {
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(paddr, &entry))
+ return (ESRCH);
+
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
+ }
+
+ assert(entry != NULL && entry == mmio_hint[vcpu]);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+ mem_read, mem_write, &entry->mr_param);
+ return (err);
+}
+
+int
+register_mem(struct mem_range *memp)
+{
+ struct mmio_rb_range *mrp;
+ int err;
+
+ err = 0;
+
+ mrp = malloc(sizeof(struct mmio_rb_range));
+
+ if (mrp != NULL) {
+ mrp->mr_param = *memp;
+ mrp->mr_base = memp->base;
+ mrp->mr_end = memp->base + memp->size - 1;
+
+ err = mmio_rb_add(mrp);
+ if (err)
+ free(mrp);
+ } else
+ err = ENOMEM;
+
+ return (err);
+}
+
+void
+init_mem(void)
+{
+
+ RB_INIT(&mmio_rbroot);
+}
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
new file mode 100644
index 0000000..88fafe1
--- /dev/null
+++ b/usr.sbin/bhyve/mem.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEM_H_
+#define _MEM_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2);
+
+struct mem_range {
+ const char *name;
+ int flags;
+ mem_func_t handler;
+ void *arg1;
+ long arg2;
+ uint64_t base;
+ uint64_t size;
+};
+#define MEM_F_READ 0x1
+#define MEM_F_WRITE 0x2
+#define MEM_F_RW 0x3
+
+void init_mem(void);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+
+int register_mem(struct mem_range *memp);
+
+#endif /* _MEM_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
new file mode 100644
index 0000000..a6109db
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.c
@@ -0,0 +1,432 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, and having events be persistent by default.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ENABLE 1
+#define MEV_DISABLE 2
+#define MEV_DEL_PENDING 3
+
+extern char *vmname;
+
+static pthread_t mevent_tid;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ void (*me_func)(int, enum ev_type, void *);
+ int me_fd;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state;
+ int me_closefd;
+ LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c;
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int ret;
+
+ switch (mevp->me_state) {
+ case MEV_ENABLE:
+ ret = EV_ADD;
+ break;
+ case MEV_DISABLE:
+ ret = EV_DISABLE;
+ break;
+ case MEV_DEL_PENDING:
+ ret = EV_DELETE;
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ /* XXX nothing yet, perhaps EV_EOF for reads ? */
+ return (0);
+}
+
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ kev[i].ident = mevp->me_fd;
+ kev[i].filter = mevent_kq_filter(mevp);
+ kev[i].flags = mevent_kq_flags(mevp);
+ kev[i].fflags = mevent_kq_fflags(mevp);
+ kev[i].data = 0;
+ kev[i].udata = mevp;
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+ }
+}
+
+struct mevent *
+mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *), void *param)
+{
+ struct mevent *lp, *mevp;
+
+ if (fd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry, populate it, and add it to the change list.
+ */
+ mevp = malloc(sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ memset(mevp, 0, sizeof(struct mevent));
+ mevp->me_fd = fd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+
+ LIST_INSERT_HEAD(&change_head, mevp, me_list);
+ mevp->me_cq = 1;
+ mevp->me_state = MEV_ENABLE;
+ mevent_notify();
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+ /*
+ * It's not possible to enable/disable a deleted event
+ */
+ if (evp->me_state == MEV_DEL_PENDING)
+ return (EINVAL);
+
+ /*
+ * No update needed if state isn't changing
+ */
+ if (evp->me_state == newstate)
+ return (0);
+
+ mevent_qlock();
+
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_ENABLE));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_DISABLE));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = MEV_DEL_PENDING;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+ char tname[MAXCOMLEN + 1];
+
+ snprintf(tname, sizeof(tname), "%s mevent", vmname);
+ pthread_set_name_np(mevent_tid, tname);
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int mfd;
+ int numev;
+ int ret;
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+ ret = pipe(mevent_pipefd);
+ if (ret < 0) {
+ perror("pipe");
+ exit(0);
+ }
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(mfd, changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h
new file mode 100644
index 0000000..32a9d74
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c
new file mode 100644
index 0000000..c72a497
--- /dev/null
+++ b/usr.sbin/bhyve/mevent_test.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Test program for the micro event library. Set up a simple TCP echo
+ * service.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+#define MEVENT_ECHO
+
+#ifdef MEVENT_ECHO
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+ struct esync *sync = param;
+
+ pthread_mutex_lock(&sync->e_mt);
+ pthread_cond_signal(&sync->e_cond);
+ pthread_mutex_unlock(&sync->e_mt);
+}
+
+static void *
+echoer(void *param)
+{
+ struct esync sync;
+ struct mevent *mev;
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ pthread_mutex_init(&sync.e_mt, NULL);
+ pthread_cond_init(&sync.e_cond, NULL);
+
+ pthread_mutex_lock(&sync.e_mt);
+
+ mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+ if (mev == NULL) {
+ printf("Could not allocate echoer event\n");
+ exit(1);
+ }
+
+ while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
+ len = read(fd, buf, sizeof(buf));
+ if (len > 0) {
+ write(fd, buf, len);
+ write(0, buf, len);
+ } else {
+ break;
+ }
+ }
+
+ mevent_delete_close(mev);
+
+ pthread_mutex_unlock(&sync.e_mt);
+ pthread_mutex_destroy(&sync.e_mt);
+ pthread_cond_destroy(&sync.e_cond);
+}
+
+#else
+
+static void *
+echoer(void *param)
+{
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ while ((len = read(fd, buf, sizeof(buf))) > 0) {
+ write(1, buf, len);
+ }
+}
+#endif /* MEVENT_ECHO */
+
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+ pthread_mutex_lock(&accept_mutex);
+ pthread_cond_signal(&accept_condvar);
+ pthread_mutex_unlock(&accept_mutex);
+}
+
+static void *
+acceptor(void *param)
+{
+ struct sockaddr_in sin;
+ pthread_t tid;
+ int news;
+ int s;
+
+ if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(TEST_PORT);
+
+ if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(s, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ (void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
+
+ pthread_mutex_lock(&accept_mutex);
+
+ while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+ news = accept(s, NULL, NULL);
+ if (news < 0) {
+ perror("accept error");
+ } else {
+ printf("incoming connection, spawning thread\n");
+ pthread_create(&tid, NULL, echoer,
+ (void *)(uintptr_t)news);
+ }
+ }
+}
+
+main()
+{
+ pthread_t tid;
+
+ pthread_create(&tid, NULL, acceptor, NULL);
+
+ mevent_dispatch();
+}
diff --git a/usr.sbin/bhyve/mptbl.c b/usr.sbin/bhyve/mptbl.c
new file mode 100644
index 0000000..52790f3
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.c
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <x86/mptable.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "bhyverun.h"
+#include "mptbl.h"
+
+#define MPTABLE_BASE 0xF0000
+
+#define LAPIC_PADDR 0xFEE00000
+#define LAPIC_VERSION 16
+
+#define IOAPIC_PADDR 0xFEC00000
+#define IOAPIC_VERSION 0x11
+
+#define MP_SPECREV 4
+#define MPFP_SIG "_MP_"
+
+/* Configuration header defines */
+#define MPCH_SIG "PCMP"
+#define MPCH_OEMID "BHyVe "
+#define MPCH_OEMID_LEN 8
+#define MPCH_PRODID "Hypervisor "
+#define MPCH_PRODID_LEN 12
+
+/* Processor entry defines */
+#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
+#define MPEP_SIG_MODEL 26
+#define MPEP_SIG_STEPPING 5
+#define MPEP_SIG \
+ ((MPEP_SIG_FAMILY << 8) | \
+ (MPEP_SIG_MODEL << 4) | \
+ (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
+
+/* Define processor entry struct since <x86/mptable.h> gets it wrong */
+typedef struct BPROCENTRY {
+ u_char type;
+ u_char apic_id;
+ u_char apic_version;
+ u_char cpu_flags;
+ uint32_t cpu_signature;
+ uint32_t feature_flags;
+ uint32_t reserved1;
+ uint32_t reserved2;
+} *bproc_entry_ptr;
+CTASSERT(sizeof(struct BPROCENTRY) == 20);
+
+/* Bus entry defines */
+#define MPE_NUM_BUSES 2
+#define MPE_BUSNAME_LEN 6
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static uint8_t
+mpt_compute_checksum(void *base, size_t len)
+{
+ uint8_t *bytes;
+ uint8_t sum;
+
+ for(bytes = base, sum = 0; len > 0; len--) {
+ sum += *bytes++;
+ }
+
+ return (256 - sum);
+}
+
+static void
+mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
+{
+
+ memset(mpfp, 0, sizeof(*mpfp));
+ memcpy(mpfp->signature, MPFP_SIG, 4);
+ mpfp->pap = gpa + sizeof(*mpfp);
+ mpfp->length = 1;
+ mpfp->spec_rev = MP_SPECREV;
+ mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mpt_build_mpch(mpcth_t mpch)
+{
+
+ memset(mpch, 0, sizeof(*mpch));
+ memcpy(mpch->signature, MPCH_SIG, 4);
+ mpch->spec_rev = MP_SPECREV;
+ memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
+ memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
+ mpch->apic_address = LAPIC_PADDR;
+}
+
+static void
+mpt_build_proc_entries(bproc_entry_ptr mpep, int ncpu)
+{
+ int i;
+
+ for (i = 0; i < ncpu; i++) {
+ memset(mpep, 0, sizeof(*mpep));
+ mpep->type = MPCT_ENTRY_PROCESSOR;
+ mpep->apic_id = i; // XXX
+ mpep->apic_version = LAPIC_VERSION;
+ mpep->cpu_flags = PROCENTRY_FLAG_EN;
+ if (i == 0)
+ mpep->cpu_flags |= PROCENTRY_FLAG_BP;
+ mpep->cpu_signature = MPEP_SIG;
+ mpep->feature_flags = MPEP_FEATURES;
+ mpep++;
+ }
+}
+
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = ISA;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+ mpeb++;
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = PCI;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+}
+
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+
+ memset(mpei, 0, sizeof(*mpei));
+ mpei->type = MPCT_ENTRY_IOAPIC;
+ mpei->apic_id = id;
+ mpei->apic_version = IOAPIC_VERSION;
+ mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+ mpei->apic_address = IOAPIC_PADDR;
+}
+
+#ifdef notyet
+static void
+mpt_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins, int id)
+{
+ int pin;
+
+ /*
+ * The following config is taken from kernel mptable.c
+ * mptable_parse_default_config_ints(...), for now
+ * just use the default config, tweek later if needed.
+ */
+
+
+ /* Run through all 16 pins. */
+ for (pin = 0; pin < num_pins; pin++) {
+ memset(mpeii, 0, sizeof(*mpeii));
+ mpeii->entry_type = MP_ENTRY_IOINT;
+ mpeii->src_bus_id = MPE_BUSID_ISA;
+ mpeii->dst_apic_id = id;
+
+ /*
+ * All default configs route IRQs from bus 0 to the first 16
+ * pins of the first I/O APIC with an APIC ID of 2.
+ */
+ mpeii->dst_apic_intin = pin;
+ switch (pin) {
+ case 0:
+ /* Pin 0 is an ExtINT pin. */
+ mpeii->intr_type = MPEII_INTR_EXTINT;
+ break;
+ case 2:
+ /* IRQ 0 is routed to pin 2. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = 0;
+ break;
+ case 5:
+ case 10:
+ case 11:
+ /*
+ * PCI Irqs set to level triggered.
+ */
+ mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
+ mpeii->src_bus_id = MPE_BUSID_PCI;
+ default:
+ /* All other pins are identity mapped. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = pin;
+ break;
+ }
+ mpeii++;
+ }
+
+}
+
+#define COPYSTR(dest, src, bytes) \
+ memcpy(dest, src, bytes); \
+ str[bytes] = 0;
+
+static void
+mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
+{
+ static char str[16];
+ int i;
+ char *cur;
+
+ union mpe {
+ struct mpe_proc *proc;
+ struct mpe_bus *bus;
+ struct mpe_ioapic *ioapic;
+ struct mpe_ioint *ioint;
+ struct mpe_lint *lnit;
+ char *p;
+ };
+
+ union mpe mpe;
+
+ printf(" MP Floating Pointer :\n");
+ COPYSTR(str, mpfp->signature, 4);
+ printf("\tsignature:\t%s\n", str);
+ printf("\tmpch paddr:\t%x\n", mpfp->mptable_paddr);
+ printf("\tlength:\t%x\n", mpfp->length);
+ printf("\tspecrec:\t%x\n", mpfp->specrev);
+ printf("\tchecksum:\t%x\n", mpfp->checksum);
+ printf("\tfeature1:\t%x\n", mpfp->feature1);
+ printf("\tfeature2:\t%x\n", mpfp->feature2);
+ printf("\tfeature3:\t%x\n", mpfp->feature3);
+ printf("\tfeature4:\t%x\n", mpfp->feature4);
+
+ printf(" MP Configuration Header :\n");
+ COPYSTR(str, mpch->signature, 4);
+ printf(" signature: %s\n", str);
+ printf(" length: %x\n", mpch->length);
+ printf(" specrec: %x\n", mpch->specrev);
+ printf(" checksum: %x\n", mpch->checksum);
+ COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
+ printf(" oemid: %s\n", str);
+ COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
+ printf(" prodid: %s\n", str);
+ printf(" oem_ptr: %x\n", mpch->oem_ptr);
+ printf(" oem_sz: %x\n", mpch->oem_sz);
+ printf(" nr_entries: %x\n", mpch->nr_entries);
+ printf(" apic paddr: %x\n", mpch->lapic_paddr);
+ printf(" ext_length: %x\n", mpch->ext_length);
+ printf(" ext_checksum: %x\n", mpch->ext_checksum);
+
+ cur = (char *)mpch + sizeof(*mpch);
+ for (i = 0; i < mpch->nr_entries; i++) {
+ mpe.p = cur;
+ switch(*mpe.p) {
+ case MP_ENTRY_PROC:
+ printf(" MP Processor Entry :\n");
+ printf(" lapic_id: %x\n", mpe.proc->lapic_id);
+ printf(" lapic_version: %x\n", mpe.proc->lapic_version);
+ printf(" proc_flags: %x\n", mpe.proc->proc_flags);
+ printf(" proc_signature: %x\n", mpe.proc->proc_signature);
+ printf(" feature_flags: %x\n", mpe.proc->feature_flags);
+ cur += sizeof(struct mpe_proc);
+ break;
+ case MP_ENTRY_BUS:
+ printf(" MP Bus Entry :\n");
+ printf(" busid: %x\n", mpe.bus->busid);
+ COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
+ printf(" busname: %s\n", str);
+ cur += sizeof(struct mpe_bus);
+ break;
+ case MP_ENTRY_IOAPIC:
+ printf(" MP IOAPIC Entry :\n");
+ printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
+ printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
+ printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
+ printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
+ cur += sizeof(struct mpe_ioapic);
+ break;
+ case MP_ENTRY_IOINT:
+ printf(" MP IO Interrupt Entry :\n");
+ printf(" intr_type: %x\n", mpe.ioint->intr_type);
+ printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
+ printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
+ printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
+ printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
+ printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
+ cur += sizeof(struct mpe_ioint);
+ break;
+ case MP_ENTRY_LINT:
+ printf(" MP Local Interrupt Entry :\n");
+ cur += sizeof(struct mpe_lint);
+ break;
+ }
+
+ }
+}
+#endif
+
+void
+mptable_add_oemtbl(void *tbl, int tblsz)
+{
+
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+mptable_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ mpcth_t mpch;
+ bus_entry_ptr mpeb;
+ io_apic_entry_ptr mpei;
+ bproc_entry_ptr mpep;
+ mpfps_t mpfp;
+ char *curraddr;
+ char *startaddr;
+
+ if (paddr_guest2host(0) == NULL) {
+ printf("mptable requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ startaddr = curraddr = paddr_guest2host(MPTABLE_BASE);
+
+ mpfp = (mpfps_t)curraddr;
+ mpt_build_mpfp(mpfp, MPTABLE_BASE);
+ curraddr += sizeof(*mpfp);
+
+ mpch = (mpcth_t)curraddr;
+ mpt_build_mpch(mpch);
+ curraddr += sizeof(*mpch);
+
+ mpep = (bproc_entry_ptr)curraddr;
+ mpt_build_proc_entries(mpep, ncpu);
+ curraddr += sizeof(*mpep) * ncpu;
+ mpch->entry_count += ncpu;
+
+ mpeb = (bus_entry_ptr) curraddr;
+ mpt_build_bus_entries(mpeb);
+ curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
+ mpch->entry_count += MPE_NUM_BUSES;
+
+ if (ioapic) {
+ mpei = (io_apic_entry_ptr)curraddr;
+ mpt_build_ioapic_entries(mpei, ncpu + 1);
+ curraddr += sizeof(*mpei);
+ mpch->entry_count++;
+ }
+
+#ifdef notyet
+ mpt_build_ioint_entries((struct mpe_ioint*)curraddr, MPEII_MAX_IRQ,
+ ncpu + 1);
+ curraddr += sizeof(struct mpe_ioint) * MPEII_MAX_IRQ;
+ mpch->entry_count += MPEII_MAX_IRQ;
+#endif
+
+ if (oem_tbl_start) {
+ mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
+ mpch->oem_table_size = oem_tbl_size;
+ memcpy(curraddr, oem_tbl_start, oem_tbl_size);
+ }
+
+ mpch->base_table_length = curraddr - (char *)mpch;
+ mpch->checksum = mpt_compute_checksum(mpch, sizeof(*mpch));
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/mptbl.h b/usr.sbin/bhyve/mptbl.h
new file mode 100644
index 0000000..3c4c527
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MPTBL_H_
+#define _MPTBL_H_
+
+int mptable_build(struct vmctx *ctx, int ncpu, int ioapic);
+void mptable_add_oemtbl(void *tbl, int tblsz);
+
+#endif /* _MPTBL_H_ */
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 0000000..e086aeb
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,1117 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "mem.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "ioapic.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXSLOTS (PCI_SLOTMAX + 1)
+#define MAXFUNCS (PCI_FUNCMAX + 1)
+
+static struct slotinfo {
+ char *si_name;
+ char *si_param;
+ struct pci_devinst *si_devi;
+ int si_legacy;
+} pci_slotinfo[MAXSLOTS][MAXFUNCS];
+
+/*
+ * Used to keep track of legacy interrupt owners/requestors
+ */
+#define NLIRQ 16
+
+static struct lirqinfo {
+ int li_generic;
+ int li_acount;
+ struct pci_devinst *li_owner; /* XXX should be a list */
+} lirq[NLIRQ];
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_MEMBASE32 (lomem_sz)
+#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static int pci_emul_devices;
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+ printf("Invalid PCI slot info field \"%s\"\n", aopt);
+ free(aopt);
+}
+
+void
+pci_parse_slot(char *opt, int legacy)
+{
+ char *slot, *func, *emul, *config;
+ char *str, *cpy;
+ int snum, fnum;
+
+ str = cpy = strdup(opt);
+
+ config = NULL;
+
+ if (strchr(str, ':') != NULL) {
+ slot = strsep(&str, ":");
+ func = strsep(&str, ",");
+ } else {
+ slot = strsep(&str, ",");
+ func = NULL;
+ }
+
+ emul = strsep(&str, ",");
+ if (str != NULL) {
+ config = strsep(&str, ",");
+ }
+
+ if (emul == NULL) {
+ pci_parse_slot_usage(cpy);
+ return;
+ }
+
+ snum = atoi(slot);
+ fnum = func ? atoi(func) : 0;
+ if (snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) {
+ pci_parse_slot_usage(cpy);
+ } else {
+ pci_slotinfo[snum][fnum].si_name = emul;
+ pci_slotinfo[snum][fnum].si_param = config;
+ pci_slotinfo[snum][fnum].si_legacy = legacy;
+ }
+}
+
+static int
+pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pdi = arg;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int i;
+
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ if (pdi->pi_bar[i].type == PCIBAR_IO &&
+ port >= pdi->pi_bar[i].addr &&
+ port + bytes <=
+ pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ offset = port - pdi->pi_bar[i].addr;
+ if (in)
+ *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
+ offset, bytes);
+ else
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
+ bytes, *eax);
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static int
+pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct pci_devinst *pdi = arg1;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int bidx = (int) arg2;
+
+ assert(bidx <= PCI_BARMAX);
+ assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
+ pdi->pi_bar[bidx].type == PCIBAR_MEM64);
+ assert(addr >= pdi->pi_bar[bidx].addr &&
+ addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
+
+ offset = addr - pdi->pi_bar[bidx].addr;
+
+ if (dir == MEM_F_WRITE)
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val);
+ else
+ *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size);
+
+ return (0);
+}
+
+
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+ uint64_t *addr)
+{
+ uint64_t base;
+
+ assert((size & (size - 1)) == 0); /* must be a power of 2 */
+
+ base = roundup2(*baseptr, size);
+
+ if (base + size <= limit) {
+ *addr = base;
+ *baseptr = base + size;
+ return (0);
+ } else
+ return (-1);
+}
+
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+ uint64_t size)
+{
+
+ return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
+}
+
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int i, error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+ struct inout_port iop;
+ struct mem_range memp;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl(size); /* round up to a power of 2 */
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ if (hostbase &&
+ pci_slotinfo[pdi->pi_slot][pdi->pi_func].si_legacy) {
+ assert(hostbase < PCI_EMUL_IOBASE);
+ baseptr = &hostbase;
+ } else {
+ baseptr = &pci_emul_iobase;
+ }
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+ * 4GB unless then allocation size is larger than some arbitrary
+ * number (32MB currently).
+ */
+ if (size > 32 * 1024 * 1024) {
+ /*
+ * XXX special case for device requiring peer-peer DMA
+ */
+ if (size == 0x100000000UL)
+ baseptr = &hostbase;
+ else
+ baseptr = &pci_emul_membase64;
+ limit = PCI_EMUL_MEMLIMIT64;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ } else {
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+ }
+ break;
+ case PCIBAR_MEM32:
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ default:
+ printf("pci_emul_alloc_base: invalid bar type %d\n", type);
+ assert(0);
+ }
+
+ if (baseptr != NULL) {
+ error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+ if (error != 0)
+ return (error);
+ }
+
+ pdi->pi_bar[idx].type = type;
+ pdi->pi_bar[idx].addr = addr;
+ pdi->pi_bar[idx].size = size;
+
+ /* Initialize the BAR register in config space */
+ bar = (addr & mask) | lobits;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+ if (type == PCIBAR_MEM64) {
+ assert(idx + 1 <= PCI_BARMAX);
+ pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+ }
+
+ /* add a handler to intercept accesses to the I/O bar */
+ if (type == PCIBAR_IO) {
+ iop.name = pdi->pi_name;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pdi;
+
+ for (i = 0; i < size; i++) {
+ iop.port = addr + i;
+ register_inout(&iop);
+ }
+ } else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) {
+ /* add memory bar intercept handler */
+ memp.name = pdi->pi_name;
+ memp.flags = MEM_F_RW;
+ memp.base = addr;
+ memp.size = size;
+ memp.handler = pci_emul_mem_handler;
+ memp.arg1 = pdi;
+ memp.arg2 = idx;
+
+ error = register_mem(&memp);
+ assert(error == 0);
+ }
+
+ return (0);
+}
+
+#define CAP_START_OFFSET 0x40
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+ int i, capoff, capid, reallen;
+ uint16_t sts;
+
+ static u_char endofcap[4] = {
+ PCIY_RESERVED, 0, 0, 0
+ };
+
+ assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
+
+ reallen = roundup2(caplen, 4); /* dword aligned */
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+ capoff = CAP_START_OFFSET;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+ pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+ } else {
+ capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((capoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ capoff = pci_get_cfgdata8(pi, capoff + 1);
+ }
+ }
+
+ /* Check if we have enough space */
+ if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
+ return (-1);
+
+ /* Copy the capability */
+ for (i = 0; i < caplen; i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ /* Set the next capability pointer */
+ pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
+
+ /* Copy of the reserved capability which serves as the end marker */
+ for (i = 0; i < sizeof(endofcap); i++)
+ pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
+
+ return (0);
+}
+
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+ struct pci_devemu **pdpp, *pdp;
+
+ SET_FOREACH(pdpp, pci_devemu_set) {
+ pdp = *pdpp;
+ if (!strcmp(pdp->pe_emu, name)) {
+ return (pdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, int func,
+ char *params)
+{
+ struct pci_devinst *pdi;
+ pdi = malloc(sizeof(struct pci_devinst));
+ bzero(pdi, sizeof(*pdi));
+
+ pdi->pi_vmctx = ctx;
+ pdi->pi_bus = 0;
+ pdi->pi_slot = slot;
+ pdi->pi_func = func;
+ pdi->pi_d = pde;
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+ /* Disable legacy interrupts */
+ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+ pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+ pci_set_cfgdata8(pdi, PCIR_COMMAND,
+ PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+ if ((*pde->pe_init)(ctx, pdi, params) != 0) {
+ free(pdi);
+ } else {
+ pci_emul_devices++;
+ pci_slotinfo[slot][func].si_devi = pdi;
+ }
+}
+
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+ int mmc;
+
+ CTASSERT(sizeof(struct msicap) == 14);
+
+ /* Number of msi messages must be a power of 2 between 1 and 32 */
+ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+ mmc = ffs(msgnum) - 1;
+
+ bzero(msicap, sizeof(struct msicap));
+ msicap->capid = PCIY_MSI;
+ msicap->nextptr = nextptr;
+ msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
+
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+ struct msicap msicap;
+
+ pci_populate_msicap(&msicap, msgnum, 0);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
+}
+
+void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask;
+ int off, table_bar;
+
+ off = offset - capoff;
+ table_bar = pi->pi_msix.table_bar;
+ /* Message Control Register */
+ if (off == 2 && bytes == 2) {
+ rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask, msgdata, mme;
+ uint32_t addrlo;
+
+ /*
+ * If guest is writing to the message control register make sure
+ * we do not overwrite read-only fields.
+ */
+ if ((offset - capoff) == 2 && bytes == 2) {
+ rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ addrlo = pci_get_cfgdata32(pi, capoff + 4);
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ msgdata = pci_get_cfgdata16(pi, capoff + 12);
+ else
+ msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+ /*
+ * XXX check delivery mode, destination mode etc
+ */
+ mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+ pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+ if (pi->pi_msi.enabled) {
+ pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
+ pi->pi_msi.vector = msgdata & 0xff;
+ pi->pi_msi.msgnum = 1 << (mme >> 4);
+ } else {
+ pi->pi_msi.cpu = 0;
+ pi->pi_msi.vector = 0;
+ pi->pi_msi.msgnum = 0;
+ }
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+/*
+ * This function assumes that 'coff' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly
+ */
+ if (offset == capoff || offset == capoff + 1)
+ return;
+
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+ int found;
+ uint16_t sts;
+ uint8_t capid, lastoff;
+
+ found = 0;
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+ lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((lastoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, lastoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ lastoff = pci_get_cfgdata8(pi, lastoff + 1);
+ }
+ if (offset >= CAP_START_OFFSET && offset <= lastoff)
+ found = 1;
+ }
+ return (found);
+}
+
+void
+init_pci(struct vmctx *ctx)
+{
+ struct pci_devemu *pde;
+ struct slotinfo *si;
+ int slot, func;
+
+ pci_emul_iobase = PCI_EMUL_IOBASE;
+ pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+ pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ for (func = 0; func < MAXFUNCS; func++) {
+ si = &pci_slotinfo[slot][func];
+ if (si->si_name != NULL) {
+ pde = pci_emul_finddev(si->si_name);
+ if (pde != NULL) {
+ pci_emul_init(ctx, pde, slot, func,
+ si->si_param);
+ }
+ }
+ }
+ }
+
+ /*
+ * Allow ISA IRQs 5,10,11,12, and 15 to be available for
+ * generic use
+ */
+ lirq[5].li_generic = 1;
+ lirq[10].li_generic = 1;
+ lirq[11].li_generic = 1;
+ lirq[12].li_generic = 1;
+ lirq[15].li_generic = 1;
+}
+
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+ return (pi->pi_msi.enabled);
+}
+
+int
+pci_msi_msgnum(struct pci_devinst *pi)
+{
+ if (pi->pi_msi.enabled)
+ return (pi->pi_msi.msgnum);
+ else
+ return (0);
+}
+
+void
+pci_generate_msi(struct pci_devinst *pi, int msg)
+{
+
+ if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
+ vm_lapic_irq(pi->pi_vmctx,
+ pi->pi_msi.cpu,
+ pi->pi_msi.vector + msg);
+ }
+}
+
+int
+pci_is_legacy(struct pci_devinst *pi)
+{
+
+ return (pci_slotinfo[pi->pi_slot][pi->pi_func].si_legacy);
+}
+
+static int
+pci_lintr_alloc(struct pci_devinst *pi, int vec)
+{
+ int i;
+
+ assert(vec < NLIRQ);
+
+ if (vec == -1) {
+ for (i = 0; i < NLIRQ; i++) {
+ if (lirq[i].li_generic &&
+ lirq[i].li_owner == NULL) {
+ vec = i;
+ break;
+ }
+ }
+ } else {
+ if (lirq[vec].li_owner != NULL) {
+ vec = -1;
+ }
+ }
+ assert(vec != -1);
+
+ lirq[vec].li_owner = pi;
+ pi->pi_lintr_pin = vec;
+
+ return (vec);
+}
+
+int
+pci_lintr_request(struct pci_devinst *pi, int vec)
+{
+
+ vec = pci_lintr_alloc(pi, vec);
+ pci_set_cfgdata8(pi, PCIR_INTLINE, vec);
+ pci_set_cfgdata8(pi, PCIR_INTPIN, 1);
+ return (0);
+}
+
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_assert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_deassert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+/*
+ * Return 1 if the emulated device in 'slot' is a multi-function device.
+ * Return 0 otherwise.
+ */
+static int
+pci_emul_is_mfdev(int slot)
+{
+ int f, numfuncs;
+
+ numfuncs = 0;
+ for (f = 0; f < MAXFUNCS; f++) {
+ if (pci_slotinfo[slot][f].si_devi != NULL) {
+ numfuncs++;
+ }
+ }
+ return (numfuncs > 1);
+}
+
+/*
+ * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
+ * whether or not is a multi-function being emulated in the pci 'slot'.
+ */
+static void
+pci_emul_hdrtype_fixup(int slot, int off, int bytes, uint32_t *rv)
+{
+ int mfdev;
+
+ if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
+ mfdev = pci_emul_is_mfdev(slot);
+ switch (bytes) {
+ case 1:
+ case 2:
+ *rv &= ~PCIM_MFDEV;
+ if (mfdev) {
+ *rv |= PCIM_MFDEV;
+ }
+ break;
+ case 4:
+ *rv &= ~(PCIM_MFDEV << 16);
+ if (mfdev) {
+ *rv |= (PCIM_MFDEV << 16);
+ }
+ break;
+ }
+ }
+}
+
+static int cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ assert(!in);
+
+ if (bytes != 4)
+ return (-1);
+
+ x = *eax;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int coff, idx, needcfg;
+ uint64_t mask, bar;
+
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ if (cfgbus == 0)
+ pi = pci_slotinfo[cfgslot][cfgfunc].si_devi;
+ else
+ pi = NULL;
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+
+#if 0
+ printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
+ in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
+#endif
+
+ /*
+ * Just return if there is no device at this cfgslot:cfgfunc or
+ * if the guest is doing an un-aligned access
+ */
+ if (pi == NULL || (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return (0);
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi,
+ coff, bytes, eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg) {
+ if (bytes == 1)
+ *eax = pci_get_cfgdata8(pi, coff);
+ else if (bytes == 2)
+ *eax = pci_get_cfgdata16(pi, coff);
+ else
+ *eax = pci_get_cfgdata32(pi, coff);
+ }
+
+ pci_emul_hdrtype_fixup(cfgslot, coff, bytes, eax);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return (0);
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return (0);
+ idx = (coff - PCIR_BAR(0)) / 4;
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ bar = 0;
+ break;
+ case PCIBAR_IO:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_IO_BASE;
+ bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM32:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ case PCIBAR_MEM64:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ case PCIBAR_MEMHI64:
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = ((uint64_t)*eax << 32) & mask;
+ bar = bar >> 32;
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+/*
+ * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
+ */
+static int
+pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+ return (0);
+}
+INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
+INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DIOSZ 20
+#define DMEMSZ 4096
+struct pci_emul_dsoftc {
+ uint8_t ioregs[DIOSZ];
+ uint8_t memregs[DMEMSZ];
+};
+
+#define PCI_EMUL_MSI_MSGS 4
+#define PCI_EMUL_MSIX_MSGS 16
+
+static int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int error;
+ struct pci_emul_dsoftc *sc;
+
+ sc = malloc(sizeof(struct pci_emul_dsoftc));
+ memset(sc, 0, sizeof(struct pci_emul_dsoftc));
+
+ pi->pi_arg = sc;
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+ error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+ assert(error == 0);
+
+ return (0);
+}
+
+static void
+pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ int i;
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("diow: iow too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->ioregs[offset] = value & 0xff;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->ioregs[offset] = value;
+ } else {
+ printf("diow: iow unknown size %d\n", size);
+ }
+
+ /*
+ * Special magic value to generate an interrupt
+ */
+ if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+ pci_generate_msi(pi, value % pci_msi_msgnum(pi));
+
+ if (value == 0xabcdef) {
+ for (i = 0; i < pci_msi_msgnum(pi); i++)
+ pci_generate_msi(pi, i);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("diow: memw too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->memregs[offset] = value;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->memregs[offset] = value;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->memregs[offset] = value;
+ } else if (size == 8) {
+ *(uint64_t *)&sc->memregs[offset] = value;
+ } else {
+ printf("diow: memw unknown size %d\n", size);
+ }
+
+ /*
+ * magic interrupt ??
+ */
+ }
+
+ if (baridx > 1) {
+ printf("diow: unknown bar idx %d\n", baridx);
+ }
+}
+
+static uint64_t
+pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("dior: ior too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->ioregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->ioregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->ioregs[offset];
+ } else {
+ printf("dior: ior unknown size %d\n", size);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("dior: memr too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->memregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->memregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->memregs[offset];
+ } else if (size == 8) {
+ value = *(uint64_t *) &sc->memregs[offset];
+ } else {
+ printf("dior: ior unknown size %d\n", size);
+ }
+ }
+
+
+ if (baridx > 1) {
+ printf("dior: unknown bar idx %d\n", baridx);
+ return (0);
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_barwrite = pci_emul_diow,
+ .pe_barread = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
new file mode 100644
index 0000000..e924475
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+#define PCIY_RESERVED 0x00
+
+struct vmctx;
+struct pci_devinst;
+struct memory_region;
+
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *,
+ char *opts);
+
+ /* config space read/write callbacks */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* BAR read/write callbacks */
+ void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value);
+ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size);
+};
+#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
+
+enum pcibar_type {
+ PCIBAR_NONE,
+ PCIBAR_IO,
+ PCIBAR_MEM32,
+ PCIBAR_MEM64,
+ PCIBAR_MEMHI64
+};
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size;
+ uint64_t addr;
+};
+
+#define PI_NAMESZ 40
+
+struct msix_table_entry {
+ uint64_t addr;
+ uint32_t msg_data;
+ uint32_t vector_control;
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_SIZE 2048
+
+struct pci_devinst {
+ struct pci_devemu *pi_d;
+ struct vmctx *pi_vmctx;
+ uint8_t pi_bus, pi_slot, pi_func;
+ uint8_t pi_lintr_pin;
+ char pi_name[PI_NAMESZ];
+ uint16_t pi_iobase;
+ int pi_bar_getsize;
+
+ struct {
+ int enabled;
+ int cpu;
+ int vector;
+ int msgnum;
+ } pi_msi;
+
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ size_t table_offset;
+ size_t table_size;
+ int table_count;
+ size_t pba_offset;
+ struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
+ } pi_msix;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1];
+ struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_offset;
+ uint32_t pba_offset;
+} __packed;
+
+void init_pci(struct vmctx *ctx);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
+ uint64_t hostbase, enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+int pci_is_legacy(struct pci_devinst *pi);
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+void pci_generate_msix(struct pci_devinst *pi, int msgnum);
+void pci_lintr_assert(struct pci_devinst *pi);
+void pci_lintr_deassert(struct pci_devinst *pi);
+int pci_lintr_request(struct pci_devinst *pi, int ivec);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msix_enabled(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+void pci_parse_slot(char *opt, int legacy);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+ assert(offset <= PCI_REGMAX);
+ *(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ *(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ *(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= PCI_REGMAX);
+ return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000..c77762d
--- /dev/null
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /* config space */
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
new file mode 100644
index 0000000..28abb6b
--- /dev/null
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -0,0 +1,724 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <dev/io/iodev.h>
+#include <machine/iodev.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include "pci_emul.h"
+#include "mem.h"
+
+#ifndef _PATH_DEVPCI
+#define _PATH_DEVPCI "/dev/pci"
+#endif
+
+#ifndef _PATH_DEVIO
+#define _PATH_DEVIO "/dev/io"
+#endif
+
+#define LEGACY_SUPPORT 1
+
+#define MSIX_TABLE_BIR_MASK 7
+#define MSIX_TABLE_OFFSET_MASK (~MSIX_TABLE_BIR_MASK);
+#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
+#define MSIX_CAPLEN 12
+
+static int pcifd = -1;
+static int iofd = -1;
+
+struct passthru_softc {
+ struct pci_devinst *psc_pi;
+ struct pcibar psc_bar[PCI_BARMAX + 1];
+ struct {
+ int capoff;
+ int msgctrl;
+ int emulated;
+ } psc_msi;
+ struct {
+ int capoff;
+ } psc_msix;
+ struct pcisel psc_sel;
+};
+
+static int
+msi_caplen(int msgctrl)
+{
+ int len;
+
+ len = 10; /* minimum length of msi capability */
+
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ len += 4;
+
+#if 0
+ /*
+ * Ignore the 'mask' and 'pending' bits in the MSI capability.
+ * We'll let the guest manipulate them directly.
+ */
+ if (msgctrl & PCIM_MSICTRL_VECTOR)
+ len += 10;
+#endif
+
+ return (len);
+}
+
+static uint32_t
+read_config(const struct pcisel *sel, long reg, int width)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+
+ if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
+ return (0); /* XXX */
+ else
+ return (pi.pi_data);
+}
+
+static void
+write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+ pi.pi_data = data;
+
+ (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
+}
+
+#ifdef LEGACY_SUPPORT
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+ int capoff, i;
+ struct msicap msicap;
+ u_char *capdata;
+
+ pci_populate_msicap(&msicap, msgnum, nextptr);
+
+ /*
+ * XXX
+ * Copy the msi capability structure in the last 16 bytes of the
+ * config space. This is wrong because it could shadow something
+ * useful to the device.
+ */
+ capoff = 256 - roundup(sizeof(msicap), 4);
+ capdata = (u_char *)&msicap;
+ for (i = 0; i < sizeof(msicap); i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ return (capoff);
+}
+#endif /* LEGACY_SUPPORT */
+
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+ int ptr, capptr, cap, sts, caplen;
+ uint32_t u32;
+ struct pcisel sel;
+ struct pci_devinst *pi;
+ struct msixcap msixcap;
+ uint32_t *msixcap_ptr;
+
+ pi = sc->psc_pi;
+ sel = sc->psc_sel;
+
+ /*
+ * Parse the capabilities and cache the location of the MSI
+ * and MSI-X capabilities.
+ */
+ sts = read_config(&sel, PCIR_STATUS, 2);
+ if (sts & PCIM_STATUS_CAPPRESENT) {
+ ptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ while (ptr != 0 && ptr != 0xff) {
+ cap = read_config(&sel, ptr + PCICAP_ID, 1);
+ if (cap == PCIY_MSI) {
+ /*
+ * Copy the MSI capability into the config
+ * space of the emulated pci device
+ */
+ sc->psc_msi.capoff = ptr;
+ sc->psc_msi.msgctrl = read_config(&sel,
+ ptr + 2, 2);
+ sc->psc_msi.emulated = 0;
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ }
+ } else if (cap == PCIY_MSIX) {
+ /*
+ * Copy the MSI-X capability
+ */
+ sc->psc_msix.capoff = ptr;
+ caplen = 12;
+ msixcap_ptr = (uint32_t*) &msixcap;
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ *msixcap_ptr = u32;
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ msixcap_ptr++;
+ }
+ }
+ ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
+ }
+ }
+
+ if (sc->psc_msix.capoff != 0) {
+ pi->pi_msix.pba_bar =
+ msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.pba_offset =
+ msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_bar =
+ msixcap.table_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.table_offset =
+ msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If the passthrough device does not support MSI then craft a
+ * MSI capability for it. We link the new MSI capability at the
+ * head of the list of capabilities.
+ */
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+ int origptr, msiptr;
+ origptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ msiptr = passthru_add_msicap(pi, 1, origptr);
+ sc->psc_msi.capoff = msiptr;
+ sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+ sc->psc_msi.emulated = 1;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+ }
+#endif
+
+ /* Make sure one of the capabilities is present */
+ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
+ return (-1);
+ else
+ return (0);
+}
+
+static uint64_t
+msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint8_t *src8;
+ uint16_t *src16;
+ uint32_t *src32;
+ uint64_t *src64;
+ uint64_t data;
+ size_t entry_offset;
+ int index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ switch(size) {
+ case 1:
+ src8 = (uint8_t *)((void *)entry + entry_offset);
+ data = *src8;
+ break;
+ case 2:
+ src16 = (uint16_t *)((void *)entry + entry_offset);
+ data = *src16;
+ break;
+ case 4:
+ src32 = (uint32_t *)((void *)entry + entry_offset);
+ data = *src32;
+ break;
+ case 8:
+ src64 = (uint64_t *)((void *)entry + entry_offset);
+ data = *src64;
+ break;
+ default:
+ return (-1);
+ }
+
+ return (data);
+}
+
+static void
+msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
+ uint64_t offset, int size, uint64_t data)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint32_t *dest;
+ size_t entry_offset;
+ uint32_t vector_control;
+ int error, index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ /* Only 4 byte naturally-aligned writes are supported */
+ assert(size == 4);
+ assert(entry_offset % 4 == 0);
+
+ vector_control = entry->vector_control;
+ dest = (uint32_t *)((void *)entry + entry_offset);
+ *dest = data;
+ /* If MSI-X hasn't been enabled, do nothing */
+ if (pi->pi_msix.enabled) {
+ /* If the entry is masked, don't set it up */
+ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
+ (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func,
+ index, entry->msg_data,
+ entry->vector_control,
+ entry->addr);
+ }
+ }
+}
+
+static int
+init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
+{
+ int idx;
+ size_t table_size;
+ vm_paddr_t start;
+ size_t len;
+ struct pci_devinst *pi = sc->psc_pi;
+
+ /*
+ * If the MSI-X table BAR maps memory intended for
+ * other uses, it is at least assured that the table
+ * either resides in its own page within the region,
+ * or it resides in a page shared with only the PBA.
+ */
+ if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar &&
+ ((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) {
+ /* Need to also emulate the PBA, not supported yet */
+ printf("Unsupported MSI-X table and PBA in same page\n");
+ return (-1);
+ }
+
+ /*
+ * May need to split the BAR into 3 regions:
+ * Before the MSI-X table, the MSI-X table, and after it
+ * XXX for now, assume that the table is not in the middle
+ */
+ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+ pi->pi_msix.table_size = table_size;
+ idx = pi->pi_msix.table_bar;
+
+ /* Round up to page size */
+ table_size = (table_size + 0x1000) & ~0xFFF;
+ if (pi->pi_msix.table_offset == 0) {
+ /* Map everything after the MSI-X table */
+ start = pi->pi_bar[idx].addr + table_size;
+ len = pi->pi_bar[idx].size - table_size;
+ } else {
+ /* Map everything before the MSI-X table */
+ start = pi->pi_bar[idx].addr;
+ len = pi->pi_msix.table_offset;
+ }
+ return (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ start, len, base + table_size));
+}
+
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+ int i, error;
+ struct pci_devinst *pi;
+ struct pci_bar_io bar;
+ enum pcibar_type bartype;
+ uint64_t base;
+
+ pi = sc->psc_pi;
+
+ /*
+ * Initialize BAR registers
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ bzero(&bar, sizeof(bar));
+ bar.pbi_sel = sc->psc_sel;
+ bar.pbi_reg = PCIR_BAR(i);
+
+ if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
+ continue;
+
+ if (PCI_BAR_IO(bar.pbi_base)) {
+ bartype = PCIBAR_IO;
+ base = bar.pbi_base & PCIM_BAR_IO_BASE;
+ } else {
+ switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
+ case PCIM_BAR_MEM_64:
+ bartype = PCIBAR_MEM64;
+ break;
+ default:
+ bartype = PCIBAR_MEM32;
+ break;
+ }
+ base = bar.pbi_base & PCIM_BAR_MEM_BASE;
+ }
+
+ /* Cache information about the "real" BAR */
+ sc->psc_bar[i].type = bartype;
+ sc->psc_bar[i].size = bar.pbi_length;
+ sc->psc_bar[i].addr = base;
+
+ /* Allocate the BAR in the guest I/O or MMIO space */
+ error = pci_emul_alloc_pbar(pi, i, base, bartype,
+ bar.pbi_length);
+ if (error)
+ return (-1);
+
+ /* The MSI-X table needs special handling */
+ if (i == pi->pi_msix.table_bar) {
+ error = init_msix_table(ctx, sc, base);
+ if (error)
+ return (-1);
+ } else if (bartype != PCIBAR_IO) {
+ /* Map the physical MMIO space in the guest MMIO space */
+ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+ if (error)
+ return (-1);
+ }
+
+ /*
+ * 64-bit BAR takes up two slots so skip the next one.
+ */
+ if (bartype == PCIBAR_MEM64) {
+ i++;
+ assert(i <= PCI_BARMAX);
+ sc->psc_bar[i].type = PCIBAR_MEMHI64;
+ }
+ }
+ return (0);
+}
+
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+ int error;
+ struct passthru_softc *sc;
+
+ error = 1;
+ sc = pi->pi_arg;
+
+ bzero(&sc->psc_sel, sizeof(struct pcisel));
+ sc->psc_sel.pc_bus = bus;
+ sc->psc_sel.pc_dev = slot;
+ sc->psc_sel.pc_func = func;
+
+ if (cfginitmsi(sc) != 0)
+ goto done;
+
+ if (cfginitbar(ctx, sc) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ return (error);
+}
+
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int bus, slot, func, error;
+ struct passthru_softc *sc;
+
+ sc = NULL;
+ error = 1;
+
+ if (pcifd < 0) {
+ pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+ if (pcifd < 0)
+ goto done;
+ }
+
+ if (iofd < 0) {
+ iofd = open(_PATH_DEVIO, O_RDWR, 0);
+ if (iofd < 0)
+ goto done;
+ }
+
+ if (opts == NULL ||
+ sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+ goto done;
+
+ if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+ goto done;
+
+ sc = malloc(sizeof(struct passthru_softc));
+ memset(sc, 0, sizeof(struct passthru_softc));
+
+ pi->pi_arg = sc;
+ sc->psc_pi = pi;
+
+ /* initialize config space */
+ if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ if (error) {
+ free(sc);
+ vm_unassign_pptdev(ctx, bus, slot, func);
+ }
+ return (error);
+}
+
+static int
+bar_access(int coff)
+{
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+ return (1);
+ else
+ return (0);
+}
+
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+ int caplen;
+
+ if (sc->psc_msi.capoff == 0)
+ return (0);
+
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+
+ if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
+ return (1);
+ else
+ return (0);
+}
+
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+ if (sc->psc_msix.capoff == 0)
+ return (0);
+
+ return (coff >= sc->psc_msix.capoff &&
+ coff < sc->psc_msix.capoff + MSIX_CAPLEN);
+}
+
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t *rv)
+{
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs and MSI capability is emulated.
+ */
+ if (bar_access(coff) || msicap_access(sc, coff))
+ return (-1);
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * Emulate PCIR_CAP_PTR if this device does not support MSI capability
+ * natively.
+ */
+ if (sc->psc_msi.emulated) {
+ if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+ return (-1);
+ }
+#endif
+
+ /* Everything else just read from the device's config space */
+ *rv = read_config(&sc->psc_sel, coff, bytes);
+
+ return (0);
+}
+
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t val)
+{
+ int error, msix_table_entries, i;
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs are emulated
+ */
+ if (bar_access(coff))
+ return (-1);
+
+ /*
+ * MSI capability is emulated
+ */
+ if (msicap_access(sc, coff)) {
+ msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+ error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
+ pi->pi_msi.vector, pi->pi_msi.msgnum);
+ if (error != 0) {
+ printf("vm_setup_msi returned error %d\r\n", errno);
+ exit(1);
+ }
+ return (0);
+ }
+
+ if (msixcap_access(sc, coff)) {
+ msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+ if (pi->pi_msix.enabled) {
+ msix_table_entries = pi->pi_msix.table_count;
+ for (i = 0; i < msix_table_entries; i++) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func, i,
+ pi->pi_msix.table[i].msg_data,
+ pi->pi_msix.table[i].vector_control,
+ pi->pi_msix.table[i].addr);
+
+ if (error) {
+ printf("vm_setup_msix returned error %d\r\n", errno);
+ exit(1);
+ }
+ }
+ }
+ return (0);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If this device does not support MSI natively then we cannot let
+ * the guest disable legacy interrupts from the device. It is the
+ * legacy interrupt that is triggering the virtual MSI to the guest.
+ */
+ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+ if (coff == PCIR_COMMAND && bytes == 2)
+ val &= ~PCIM_CMD_INTxDIS;
+ }
+#endif
+
+ write_config(&sc->psc_sel, coff, bytes, val);
+
+ return (0);
+}
+
+static void
+passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ msix_table_write(ctx, vcpu, sc, offset, size, value);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_WRITE;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = value;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+ }
+}
+
+static uint64_t
+passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+ uint64_t val;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ val = msix_table_read(sc, offset, size);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_READ;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = 0;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+
+ val = pio.val;
+ }
+
+ return (val);
+}
+
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_barwrite = passthru_write,
+ .pe_barread = passthru_read,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c
new file mode 100644
index 0000000..dd30551
--- /dev/null
+++ b/usr.sbin/bhyve/pci_uart.c
@@ -0,0 +1,626 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+#include <dev/ic/ns16550.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+
+#define COM1_BASE 0x3F8
+#define COM1_IRQ 4
+#define COM2_BASE 0x2F8
+#define COM2_IRQ 3
+
+#define DEFAULT_RCLK 1843200
+#define DEFAULT_BAUD 9600
+
+#define FCR_RX_MASK 0xC0
+
+#define MCR_OUT1 0x04
+#define MCR_OUT2 0x08
+
+#define MSR_DELTA_MASK 0x0f
+
+#ifndef REG_SCR
+#define REG_SCR com_scr
+#endif
+
+#define FIFOSZ 16
+
+/*
+ * Pick a PCI vid/did of a chip with a single uart at
+ * BAR0, that most versions of FreeBSD can understand:
+ * Siig CyberSerial 1-port.
+ */
+#define COM_VENDOR 0x131f
+#define COM_DEV 0x2000
+
+static int pci_uart_stdio; /* stdio in use for i/o */
+
+static int pci_uart_nldevs; /* number of legacy devices - 2 max */
+
+static struct {
+ uint64_t baddr;
+ int vector;
+} pci_uart_lres[] = {
+ { COM1_BASE, COM1_IRQ},
+ { COM2_BASE, COM2_IRQ},
+ { 0, 0 }
+};
+
+struct fifo {
+ uint8_t buf[FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of characters in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct pci_uart_softc {
+ struct pci_devinst *pi;
+ pthread_mutex_t mtx; /* protects all softc elements */
+ uint8_t data; /* Data register (R/W) */
+ uint8_t ier; /* Interrupt enable register (R/W) */
+ uint8_t lcr; /* Line control register (R/W) */
+ uint8_t mcr; /* Modem control register (R/W) */
+ uint8_t lsr; /* Line status register (R/W) */
+ uint8_t msr; /* Modem status register (R/W) */
+ uint8_t fcr; /* FIFO control register (W) */
+ uint8_t scr; /* Scratch register (R/W) */
+
+ uint8_t dll; /* Baudrate divisor latch LSB */
+ uint8_t dlh; /* Baudrate divisor latch MSB */
+
+ struct fifo rxfifo;
+
+ int opened;
+ int stdio;
+ bool thre_int_pending; /* THRE interrupt pending */
+};
+
+static void pci_uart_drain(int fd, enum ev_type ev, void *arg);
+
+static struct termios tio_orig, tio_new; /* I/O Terminals */
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDIN_FILENO, &wb, 1);
+}
+
+static void
+fifo_reset(struct fifo *fifo, int size)
+{
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = size;
+}
+
+static int
+fifo_putchar(struct fifo *fifo, uint8_t ch)
+{
+
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = ch;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ return (0);
+ } else
+ return (-1);
+}
+
+static int
+fifo_getchar(struct fifo *fifo)
+{
+ int c;
+
+ if (fifo->num > 0) {
+ c = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (c);
+ } else
+ return (-1);
+}
+
+static int
+fifo_numchars(struct fifo *fifo)
+{
+
+ return (fifo->num);
+}
+
+static int
+fifo_available(struct fifo *fifo)
+{
+
+ return (fifo->num < fifo->size);
+}
+
+static void
+pci_uart_opentty(struct pci_uart_softc *sc)
+{
+ struct mevent *mev;
+
+ assert(sc->opened == 0);
+ assert(sc->stdio);
+
+ ttyopen();
+ mev = mevent_add(STDIN_FILENO, EVF_READ, pci_uart_drain, sc);
+ assert(mev);
+}
+
+static void
+pci_uart_legacy_res(uint64_t *bar, int *ivec)
+{
+ if (pci_uart_lres[pci_uart_nldevs].baddr != 0) {
+ *bar = pci_uart_lres[pci_uart_nldevs].baddr;
+ *ivec = pci_uart_lres[pci_uart_nldevs].vector;
+ pci_uart_nldevs++;
+ } else {
+ /* TODO: print warning ? */
+ *bar = 0;
+ *ivec= -1;
+ }
+}
+
+/*
+ * The IIR returns a prioritized interrupt reason:
+ * - receive data available
+ * - transmit holding register empty
+ * - modem status change
+ *
+ * Return an interrupt reason if one is available.
+ */
+static int
+pci_uart_intr_reason(struct pci_uart_softc *sc)
+{
+
+ if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
+ return (IIR_RLS);
+ else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
+ return (IIR_RXTOUT);
+ else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
+ return (IIR_TXRDY);
+ else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
+ return (IIR_MLSC);
+ else
+ return (IIR_NOPEND);
+}
+
+static void
+pci_uart_reset(struct pci_uart_softc *sc)
+{
+ uint16_t divisor;
+
+ divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+ sc->dll = divisor;
+ sc->dlh = divisor >> 16;
+
+ fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */
+}
+
+/*
+ * Toggle the COM port's intr pin depending on whether or not we have an
+ * interrupt condition to report to the processor.
+ */
+static void
+pci_uart_toggle_intr(struct pci_uart_softc *sc)
+{
+ uint8_t intr_reason;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ if (intr_reason == IIR_NOPEND)
+ pci_lintr_deassert(sc->pi);
+ else
+ pci_lintr_assert(sc->pi);
+}
+
+static void
+pci_uart_drain(int fd, enum ev_type ev, void *arg)
+{
+ struct pci_uart_softc *sc;
+ int ch;
+
+ sc = arg;
+
+ assert(fd == STDIN_FILENO);
+ assert(ev == EVF_READ);
+
+ /*
+ * This routine is called in the context of the mevent thread
+ * to take out the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) ttyread();
+ } else {
+ while (fifo_available(&sc->rxfifo) &&
+ ((ch = ttyread()) != -1)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ pci_uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_uart_softc *sc;
+ int fifosz;
+ uint8_t msr;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ sc->dll = value;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ sc->dlh = value;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ if (sc->mcr & MCR_LOOPBACK) {
+ if (fifo_putchar(&sc->rxfifo, value) != 0)
+ sc->lsr |= LSR_OE;
+ } else if (sc->stdio) {
+ ttywrite(value);
+ } /* else drop on floor */
+ sc->thre_int_pending = true;
+ break;
+ case REG_IER:
+ /*
+ * Apply mask so that bits 4-7 are 0
+ * Also enables bits 0-3 only if they're 1
+ */
+ sc->ier = value & 0x0F;
+ break;
+ case REG_FCR:
+ /*
+ * When moving from FIFO and 16450 mode and vice versa,
+ * the FIFO contents are reset.
+ */
+ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
+ fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
+ fifo_reset(&sc->rxfifo, fifosz);
+ }
+
+ /*
+ * The FCR_ENABLE bit must be '1' for the programming
+ * of other FCR bits to be effective.
+ */
+ if ((value & FCR_ENABLE) == 0) {
+ sc->fcr = 0;
+ } else {
+ if ((value & FCR_RCV_RST) != 0)
+ fifo_reset(&sc->rxfifo, FIFOSZ);
+
+ sc->fcr = value &
+ (FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
+ }
+ break;
+ case REG_LCR:
+ sc->lcr = value;
+ break;
+ case REG_MCR:
+ /* Apply mask so that bits 5-7 are 0 */
+ sc->mcr = value & 0x1F;
+
+ msr = 0;
+ if (sc->mcr & MCR_LOOPBACK) {
+ /*
+ * In the loopback mode certain bits from the
+ * MCR are reflected back into MSR
+ */
+ if (sc->mcr & MCR_RTS)
+ msr |= MSR_CTS;
+ if (sc->mcr & MCR_DTR)
+ msr |= MSR_DSR;
+ if (sc->mcr & MCR_OUT1)
+ msr |= MSR_RI;
+ if (sc->mcr & MCR_OUT2)
+ msr |= MSR_DCD;
+ }
+
+ /*
+ * Detect if there has been any change between the
+ * previous and the new value of MSR. If there is
+ * then assert the appropriate MSR delta bit.
+ */
+ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
+ sc->msr |= MSR_DCTS;
+ if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
+ sc->msr |= MSR_DDSR;
+ if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
+ sc->msr |= MSR_DDCD;
+ if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
+ sc->msr |= MSR_TERI;
+
+ /*
+ * Update the value of MSR while retaining the delta
+ * bits.
+ */
+ sc->msr &= MSR_DELTA_MASK;
+ sc->msr |= msr;
+ break;
+ case REG_LSR:
+ /*
+ * Line status register is not meant to be written to
+ * during normal operation.
+ */
+ break;
+ case REG_MSR:
+ /*
+ * As far as I can tell MSR is a read-only register.
+ */
+ break;
+ case REG_SCR:
+ sc->scr = value;
+ break;
+ default:
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_uart_softc *sc;
+ uint8_t iir, intr_reason;
+ uint64_t reg;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ reg = sc->dll;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ reg = sc->dlh;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ reg = fifo_getchar(&sc->rxfifo);
+ break;
+ case REG_IER:
+ reg = sc->ier;
+ break;
+ case REG_IIR:
+ iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ /*
+ * Deal with side effects of reading the IIR register
+ */
+ if (intr_reason == IIR_TXRDY)
+ sc->thre_int_pending = false;
+
+ iir |= intr_reason;
+
+ reg = iir;
+ break;
+ case REG_LCR:
+ reg = sc->lcr;
+ break;
+ case REG_MCR:
+ reg = sc->mcr;
+ break;
+ case REG_LSR:
+ /* Transmitter is always ready for more data */
+ sc->lsr |= LSR_TEMT | LSR_THRE;
+
+ /* Check for new receive data */
+ if (fifo_numchars(&sc->rxfifo) > 0)
+ sc->lsr |= LSR_RXRDY;
+ else
+ sc->lsr &= ~LSR_RXRDY;
+
+ reg = sc->lsr;
+
+ /* The LSR_OE bit is cleared on LSR read */
+ sc->lsr &= ~LSR_OE;
+ break;
+ case REG_MSR:
+ /*
+ * MSR delta bits are cleared on read
+ */
+ reg = sc->msr;
+ sc->msr &= ~MSR_DELTA_MASK;
+ break;
+ case REG_SCR:
+ reg = sc->scr;
+ break;
+ default:
+ reg = 0xFF;
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (reg);
+}
+
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct pci_uart_softc *sc;
+ uint64_t bar;
+ int ivec;
+
+ sc = malloc(sizeof(struct pci_uart_softc));
+ memset(sc, 0, sizeof(struct pci_uart_softc));
+
+ pi->pi_arg = sc;
+ sc->pi = pi;
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+ if (pci_is_legacy(pi)) {
+ pci_uart_legacy_res(&bar, &ivec);
+ pci_emul_alloc_pbar(pi, 0, bar, PCIBAR_IO, 8);
+ } else {
+ ivec = -1;
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, 8);
+ }
+ pci_lintr_request(pi, ivec);
+
+ if (opts != NULL && !strcmp("stdio", opts) && !pci_uart_stdio) {
+ pci_uart_stdio = 1;
+ sc->stdio = 1;
+ }
+
+ pci_uart_reset(sc);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_com = {
+ .pe_emu = "uart",
+ .pe_init = pci_uart_init,
+ .pe_barwrite = pci_uart_write,
+ .pe_barread = pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000..3382097
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -0,0 +1,534 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ 64
+
+#define VTBLK_CFGSZ 28
+
+#define VTBLK_R_CFG VTCFG_R_CFG0
+#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1
+#define VTBLK_R_MAX VTBLK_R_CFG_END
+
+#define VTBLK_REGSZ VTBLK_R_MAX+1
+
+#define VTBLK_MAXSEGS 32
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( 0x00000004 | /* host maximum request segments */ \
+ 0x10000000 ) /* supports indirect descriptors */
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Config space
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ uint16_t vbc_geom_c;
+ uint8_t vbc_geom_h;
+ uint8_t vbc_geom_s;
+ uint32_t vbc_blk_size;
+ uint32_t vbc_sectors_max;
+} __packed;
+CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+ struct pci_devinst *vbsc_pi;
+ int vbsc_fd;
+ int vbsc_status;
+ int vbsc_isr;
+ int vbsc_lastq;
+ uint32_t vbsc_features;
+ uint64_t vbsc_pfn;
+ struct vring_hqueue vbsc_q;
+ struct vtblk_config vbsc_cfg;
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static void
+pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+{
+ if (value == 0) {
+ DPRINTF(("vtblk: device reset requested !\n"));
+ }
+
+ sc->vbsc_status = value;
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTBLK_MAXSEGS];
+ struct virtio_blk_hdr *vbh;
+ struct virtio_desc *vd, *vid;
+ struct virtio_used *vu;
+ uint8_t *status;
+ int i;
+ int err;
+ int iolen;
+ int nsegs;
+ int uidx, aidx, didx;
+ int writeop;
+ off_t offset;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Verify that the descriptor is indirect, and obtain
+ * the pointer to the indirect descriptor.
+ * There has to be space for at least 3 descriptors
+ * in the indirect descriptor array: the block header,
+ * 1 or more data descriptors, and a status byte.
+ */
+ assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
+
+ nsegs = vd->vd_len / sizeof(struct virtio_desc);
+ assert(nsegs >= 3);
+ assert(nsegs < VTBLK_MAXSEGS + 2);
+
+ vid = paddr_guest2host(vd->vd_addr);
+ assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * The first descriptor will be the read-only fixed header
+ */
+ vbh = paddr_guest2host(vid[0].vd_addr);
+ assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
+ assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
+
+ writeop = (vbh->vbh_type == VBH_OP_WRITE);
+
+ offset = vbh->vbh_sector * DEV_BSIZE;
+
+ /*
+ * Build up the iovec based on the guest's data descriptors
+ */
+ for (i = 1, iolen = 0; i < nsegs - 1; i++) {
+ iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
+ iov[i-1].iov_len = vid[i].vd_len;
+ iolen += vid[i].vd_len;
+
+ assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * - write op implies read-only descriptor,
+ * - read op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
+ writeop);
+ }
+
+ /* Lastly, get the address of the status byte */
+ status = paddr_guest2host(vid[nsegs - 1].vd_addr);
+ assert(vid[nsegs - 1].vd_len == 1);
+ assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
+ assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
+
+ DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read", iolen, nsegs - 2, offset));
+
+ if (writeop){
+ err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
+ } else {
+ err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
+ }
+
+ *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
+
+ /*
+ * Return the single indirect descriptor back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = 1;
+ hq->hq_cur_aidx++;
+ *hq->hq_used_idx += 1;
+}
+
+static void
+pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vbsc_q;
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtblk_proc(sc, hq);
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
+ sc->vbsc_isr == 0) {
+ sc->vbsc_isr = 1;
+ pci_generate_msi(sc->vbsc_pi, 0);
+ }
+
+}
+
+static void
+pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+
+ sc->vbsc_pfn = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vbsc_q;
+ hq->hq_size = VTBLK_RINGSZ;
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct stat sbuf;
+ struct pci_vtblk_softc *sc;
+ off_t size;
+ int fd;
+ int sectsz;
+
+ if (opts == NULL) {
+ printf("virtio-block: backing device required\n");
+ return (1);
+ }
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ /*
+ * The supplied backing file has to exist
+ */
+ fd = open(opts, O_RDWR);
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (1);
+ }
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ close(fd);
+ return (1);
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ }
+
+ sc = malloc(sizeof(struct pci_vtblk_softc));
+ memset(sc, 0, sizeof(struct pci_vtblk_softc));
+
+ pi->pi_arg = sc;
+ sc->vbsc_pi = pi;
+ sc->vbsc_fd = fd;
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg.vbc_capacity = size / sectsz;
+ sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+ sc->vbsc_cfg.vbc_blk_size = sectsz;
+ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
+ sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
+ sc->vbsc_cfg.vbc_geom_h = 0;
+ sc->vbsc_cfg.vbc_geom_s = 0;
+ sc->vbsc_cfg.vbc_sectors_max = 0;
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ);
+
+ return (0);
+}
+
+static void
+pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtblk_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ sc->vbsc_lastq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value == 0);
+ pci_vtblk_qnotify(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtblk_update_status(sc, value);
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+}
+
+uint64_t
+pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint32_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vbsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vbsc_pfn >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vbsc_lastq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = 0; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vbsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vbsc_isr;
+ sc->vbsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ assert(size + offset <= (VTBLK_R_CFG_END + 1));
+ ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG;
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vblk = {
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init,
+ .pe_barwrite = pci_vtblk_write,
+ .pe_barread = pci_vtblk_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000..3f6f88a
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -0,0 +1,781 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ 256
+
+#define VTNET_MAXSEGS 32
+
+/*
+ * PCI config-space register offsets
+ */
+#define VTNET_R_CFG0 20
+#define VTNET_R_CFG1 21
+#define VTNET_R_CFG2 22
+#define VTNET_R_CFG3 23
+#define VTNET_R_CFG4 24
+#define VTNET_R_CFG5 25
+#define VTNET_R_CFG6 26
+#define VTNET_R_CFG7 27
+#define VTNET_R_MAX 27
+
+#define VTNET_REGSZ VTNET_R_MAX+1
+
+/*
+ * Host capabilities
+ */
+#define VTNET_S_HOSTCAPS \
+ ( 0x00000020 | /* host supplies MAC */ \
+ 0x00008000 | /* host can merge Rx buffers */ \
+ 0x00010000 ) /* config status available */
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2
+
+#define VTNET_MAXQ 3
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+ struct mevent *vsc_mevp;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+ int vsc_tapfd;
+ int vsc_rx_ready;
+ int vsc_rxpend;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ uint64_t vsc_pfn[VTNET_MAXQ];
+ struct vring_hqueue vsc_hq[VTNET_MAXQ];
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static uint16_t
+pci_vtnet_qsize(int qnum)
+{
+ /* XXX no ctl queue currently */
+ if (qnum == VTNET_CTLQ) {
+ return (0);
+ }
+
+ /* XXX fixed currently. Maybe different for tx/rx/ctl */
+ return (VTNET_RINGSZ);
+}
+
+static void
+pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
+{
+ struct vring_hqueue *hq;
+
+ assert(ring < VTNET_MAXQ);
+
+ hq = &sc->vsc_hq[ring];
+
+ /*
+ * Reset all soft state
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static void
+pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+{
+
+ if (value == 0) {
+ DPRINTF(("vtnet: device reset requested !\n"));
+ pci_vtnet_ring_reset(sc, VTNET_RXQ);
+ pci_vtnet_ring_reset(sc, VTNET_TXQ);
+ sc->vsc_rx_ready = 0;
+ }
+
+ sc->vsc_status = value;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ char pad[60];
+
+ if (sc->vsc_tapfd == -1)
+ return;
+
+ /*
+ * If the length is < 60, pad out to that and add the
+ * extra zero'd segment to the iov. It is guaranteed that
+ * there is always an extra iov available by the caller.
+ */
+ if (len < 60) {
+ memset(pad, 0, 60 - len);
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = 60 - len;
+ iovcnt++;
+ }
+ (void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ struct vring_hqueue *hq;
+ struct virtio_net_rxhdr *vrx;
+ uint8_t *buf;
+ int i;
+ int len;
+ int ndescs;
+ int didx, uidx, aidx; /* descriptor, avail and used index */
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+ assert(sc->vsc_tapfd != -1);
+
+ /*
+ * But, will be called when the rx ring hasn't yet
+ * been set up.
+ */
+ if (sc->vsc_rx_ready == 0) {
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ /*
+ * Calculate the number of available rx buffers
+ */
+ hq = &sc->vsc_hq[VTNET_RXQ];
+
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0) {
+ /*
+ * Need to wait for host notification to read
+ */
+ if (sc->vsc_rxpend == 0) {
+ WPRINTF(("vtnet: no rx descriptors !\n"));
+ sc->vsc_rxpend = 1;
+ }
+
+ /*
+ * Drop the packet and try later
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ aidx = hq->hq_cur_aidx;
+ uidx = *hq->hq_used_idx;
+ for (i = 0; i < ndescs; i++) {
+ /*
+ * 'aidx' indexes into the an array of descriptor indexes
+ */
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+ buf = (uint8_t *)(vrx + 1);
+
+ len = read(sc->vsc_tapfd, buf,
+ vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+ if (len < 0 && errno == EWOULDBLOCK) {
+ break;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+ vrx->vrh_bufs = 1;
+
+ /*
+ * Write this descriptor into the used ring
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+ uidx++;
+ aidx++;
+ }
+
+ /*
+ * Update the used pointer, and signal an interrupt if allowed
+ */
+ *hq->hq_used_idx = uidx;
+ hq->hq_cur_aidx = aidx;
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ pci_vtnet_tap_rx(sc);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+}
+
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+ /*
+ * A qnotify means that the rx process can now begin
+ */
+ if (sc->vsc_rx_ready == 0) {
+ sc->vsc_rx_ready = 1;
+ }
+
+ /*
+ * If the rx queue was empty, attempt to receive a
+ * packet that was previously blocked due to no rx bufs
+ * available
+ */
+ if (sc->vsc_rxpend) {
+ WPRINTF(("vtnet: rx resumed\n\r"));
+ sc->vsc_rxpend = 0;
+ pci_vtnet_tap_rx(sc);
+ }
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTNET_MAXSEGS + 1];
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ int i;
+ int plen;
+ int tlen;
+ int uidx, aidx, didx;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Run through the chain of descriptors, ignoring the
+ * first header descriptor. However, include the header
+ * length in the total length that will be put into the
+ * used queue.
+ */
+ tlen = vd->vd_len;
+ vd = &hq->hq_dtable[vd->vd_next];
+
+ for (i = 0, plen = 0;
+ i < VTNET_MAXSEGS;
+ i++, vd = &hq->hq_dtable[vd->vd_next]) {
+ iov[i].iov_base = paddr_guest2host(vd->vd_addr);
+ iov[i].iov_len = vd->vd_len;
+ plen += vd->vd_len;
+ tlen += vd->vd_len;
+
+ if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ }
+ assert(i < VTNET_MAXSEGS);
+
+ DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
+ pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+
+ /*
+ * Return this chain back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = tlen;
+ hq->hq_cur_aidx = aidx + 1;
+ *hq->hq_used_idx = uidx + 1;
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtnet_proctx(sc, hq);
+}
+
+static void
+pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+
+static void
+pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+ int qnum = sc->vsc_curq;
+
+ assert(qnum < VTNET_MAXQ);
+
+ sc->vsc_pfn[qnum] = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vsc_hq[qnum];
+ hq->hq_size = pci_vtnet_qsize(qnum);
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+ char nstr[80];
+ struct pci_vtnet_softc *sc;
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ sc = malloc(sizeof(struct pci_vtnet_softc));
+ memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+ pi->pi_arg = sc;
+ sc->vsc_pi = pi;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /*
+ * Attempt to open the tap device
+ */
+ sc->vsc_tapfd = -1;
+ if (opts != NULL) {
+ char tbuf[80];
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, opts, sizeof(tbuf));
+
+ sc->vsc_tapfd = open(tbuf, O_RDWR);
+ if (sc->vsc_tapfd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ } else {
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+
+ sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+ EVF_READ,
+ pci_vtnet_tap_callback,
+ sc);
+ if (sc->vsc_mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+ }
+ }
+
+ /*
+ * The MAC address is the standard NetApp OUI of 00-a0-98,
+ * followed by an MD5 of the vm name. The slot/func number is
+ * prepended to this for slots other than 1:0, so that
+ * a bootloader can netboot from the equivalent of slot 1.
+ */
+ if (pi->pi_slot == 1 && pi->pi_func == 0) {
+ strncpy(nstr, vmname, sizeof(nstr));
+ } else {
+ snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+ pi->pi_func, vmname);
+ }
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->vsc_macaddr[0] = 0x00;
+ sc->vsc_macaddr[1] = 0xa0;
+ sc->vsc_macaddr[2] = 0x98;
+ sc->vsc_macaddr[3] = digest[0];
+ sc->vsc_macaddr[4] = digest[1];
+ sc->vsc_macaddr[5] = digest[2];
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
+
+ return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ */
+static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
+ pci_vtnet_ping_rxq,
+ pci_vtnet_ping_txq,
+ pci_vtnet_ping_ctlq
+};
+
+static void
+pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vsc_features = value & VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtnet_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ sc->vsc_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ (*pci_vtnet_qnotify[value])(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtnet_update_status(sc, value);
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
+ if (size == 1) {
+ *(uint8_t *) ptr = value;
+ } else if (size == 2) {
+ *(uint16_t *) ptr = value;
+ } else {
+ *(uint32_t *) ptr = value;
+ }
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTNET_R_CFG6:
+ case VTNET_R_CFG7:
+ DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+uint64_t
+pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint64_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ assert(size == 2);
+ value = pci_vtnet_qsize(sc->vsc_curq);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vsc_isr;
+ sc->vsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ case VTNET_R_CFG6:
+ assert(size != 4);
+ value = 0x01; /* XXX link always up */
+ break;
+ case VTNET_R_CFG7:
+ assert(size == 1);
+ value = 0; /* XXX link status in LSB */
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_barwrite = pci_vtnet_write,
+ .pe_barread = pci_vtnet_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c
new file mode 100644
index 0000000..c96596a
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.c
@@ -0,0 +1,198 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <machine/clock.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "pit_8254.h"
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+#define PIT_8254_FREQ 1193182
+static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
+
+struct counter {
+ struct timeval tv; /* uptime when counter was loaded */
+ uint16_t initial; /* initial counter value */
+ uint8_t cr[2];
+ uint8_t ol[2];
+ int crbyte;
+ int olbyte;
+};
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+latch(struct counter *c)
+{
+ struct timeval tv2;
+ uint16_t lval;
+ uint64_t delta_nsecs, delta_ticks;
+
+ /* cannot latch a new value until the old one has been consumed */
+ if (c->olbyte != 0)
+ return;
+
+ if (c->initial == 0 || c->initial == 1) {
+ /*
+ * XXX the program that runs the VM can be stopped and
+ * restarted at any time. This means that state that was
+ * created by the guest is destroyed between invocations
+ * of the program.
+ *
+ * If the counter's initial value is not programmed we
+ * assume a value that would be set to generate 'guest_hz'
+ * interrupts per second.
+ */
+ c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
+ gettimeofday(&c->tv, NULL);
+ }
+
+ (void)gettimeofday(&tv2, NULL);
+ timevalsub(&tv2, &c->tv);
+ delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
+ delta_ticks = delta_nsecs / nsecs_per_tick;
+
+ lval = c->initial - delta_ticks % c->initial;
+ c->olbyte = 2;
+ c->ol[1] = lval; /* LSB */
+ c->ol[0] = lval >> 8; /* MSB */
+}
+
+static int
+pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int sel, rw, mode;
+ uint8_t val;
+ struct counter *c;
+
+ static struct counter counter[3];
+
+ if (bytes != 1)
+ return (-1);
+
+ val = *eax;
+
+ if (port == TIMER_MODE) {
+ assert(in == 0);
+ sel = val & TIMER_SEL_MASK;
+ rw = val & TIMER_RW_MASK;
+ mode = val & TIMER_MODE_MASK;
+
+ if (sel == TIMER_SEL_READBACK)
+ return (-1);
+ if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+ return (-1);
+
+ if (rw != TIMER_LATCH) {
+ /*
+ * Counter mode is not affected when issuing a
+ * latch command.
+ */
+ if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
+ return (-1);
+ }
+
+ c = &counter[sel >> 6];
+ if (rw == TIMER_LATCH)
+ latch(c);
+ else
+ c->olbyte = 0; /* reset latch after reprogramming */
+
+ return (0);
+ }
+
+ /* counter ports */
+ assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
+ c = &counter[port - TIMER_CNTR0];
+
+ if (in) {
+ /*
+ * XXX
+ * The spec says that once the output latch is completely
+ * read it should revert to "following" the counter. We don't
+ * do this because it is hard and any reasonable OS should
+ * always latch the counter before trying to read it.
+ */
+ if (c->olbyte == 0)
+ c->olbyte = 2;
+ *eax = c->ol[--c->olbyte];
+ } else {
+ c->cr[c->crbyte++] = *eax;
+ if (c->crbyte == 2) {
+ c->crbyte = 0;
+ c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+ if (c->initial == 0)
+ c->initial = 0xffff;
+ gettimeofday(&c->tv, NULL);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);
diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h
new file mode 100644
index 0000000..61bd15d
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PIT_8254_H_
+#define _PIT_8254_H_
+
+/*
+ * Borrowed from amd64/include/timerreg.h because in that file it is
+ * conditionally compiled for #ifdef _KERNEL only.
+ */
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _PIT_8254_H_ */
diff --git a/usr.sbin/bhyve/pmtmr.c b/usr.sbin/bhyve/pmtmr.c
new file mode 100644
index 0000000..78d14eb
--- /dev/null
+++ b/usr.sbin/bhyve/pmtmr.c
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <machine/cpufunc.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "inout.h"
+
+/*
+ * The ACPI Power Management timer is a free-running 24- or 32-bit
+ * timer with a frequency of 3.579545MHz
+ *
+ * This implementation will be 32-bits
+ */
+
+#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
+
+#define PMTMR_FREQ 3579545 /* 3.579545MHz */
+
+static pthread_mutex_t pmtmr_mtx;
+static uint64_t pmtmr_tscf;
+static uint64_t pmtmr_old;
+static uint64_t pmtmr_tsc_old;
+
+static uint32_t
+pmtmr_val(void)
+{
+ uint64_t pmtmr_tsc_new;
+ uint64_t pmtmr_new;
+ static int inited = 0;
+
+ if (!inited) {
+ size_t len;
+ uint32_t tmpf;
+
+ inited = 1;
+ pthread_mutex_init(&pmtmr_mtx, NULL);
+ len = sizeof(tmpf);
+ sysctlbyname("machdep.tsc_freq", &tmpf, &len,
+ NULL, 0);
+ pmtmr_tscf = tmpf;
+ pmtmr_tsc_old = rdtsc();
+ pmtmr_old = pmtmr_tsc_old / pmtmr_tscf * PMTMR_FREQ;
+ return (pmtmr_old);
+ }
+
+ pthread_mutex_lock(&pmtmr_mtx);
+ pmtmr_tsc_new = rdtsc();
+ pmtmr_new = (pmtmr_tsc_new - pmtmr_tsc_old) * PMTMR_FREQ / pmtmr_tscf +
+ pmtmr_old;
+ pmtmr_old = pmtmr_new;
+ pmtmr_tsc_old = pmtmr_tsc_new;
+ pthread_mutex_unlock(&pmtmr_mtx);
+
+ return (pmtmr_new);
+}
+
+static int
+pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 4)
+ return (-1);
+
+ *eax = pmtmr_val();
+
+ return (0);
+}
+
+INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler);
+
diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c
new file mode 100644
index 0000000..092a551
--- /dev/null
+++ b/usr.sbin/bhyve/post.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 1)
+ return (-1);
+
+ *eax = 0xff; /* return some garbage */
+ return (0);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
new file mode 100644
index 0000000..f8b894e
--- /dev/null
+++ b/usr.sbin/bhyve/rtc.c
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_MIN 0x02
+#define RTC_HRS 0x04
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e
+
+#define RTC_RSTCODE 0x0f
+
+#define RTC_EQUIPMENT 0x14
+
+static int addr;
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b, rstcode;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+
+ if (bytes != 1)
+ return (-1);
+
+ switch (*eax) {
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ case RTC_STATUSA:
+ case RTC_STATUSB:
+ case RTC_INTR:
+ case RTC_STATUSD:
+ case RTC_DIAG:
+ case RTC_RSTCODE:
+ case RTC_EQUIPMENT:
+ break;
+ default:
+ return (-1);
+ }
+
+ addr = *eax;
+ return (0);
+}
+
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int hour;
+ time_t t;
+ struct timeval cur, delta;
+
+ static struct timeval last;
+ static struct tm tm;
+
+ if (bytes != 1)
+ return (-1);
+
+ gettimeofday(&cur, NULL);
+
+ /*
+ * Increment the cached time only once per second so we can guarantee
+ * that the guest has at least one second to read the hour:min:sec
+ * separately and still get a coherent view of the time.
+ */
+ delta = cur;
+ timevalsub(&delta, &last);
+ if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+ t = cur.tv_sec;
+ localtime_r(&t, &tm);
+ last = cur;
+ }
+
+ if (in) {
+ switch (addr) {
+ case RTC_SEC:
+ *eax = rtcout(tm.tm_sec);
+ return (0);
+ case RTC_MIN:
+ *eax = rtcout(tm.tm_min);
+ return (0);
+ case RTC_HRS:
+ if (status_b & RTCSB_24HR)
+ hour = tm.tm_hour;
+ else
+ hour = (tm.tm_hour % 12) + 1;
+
+ *eax = rtcout(hour);
+
+ /*
+ * If we are representing time in the 12-hour format
+ * then set the MSB to indicate PM.
+ */
+ if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+ *eax |= 0x80;
+
+ return (0);
+ case RTC_WDAY:
+ *eax = rtcout(tm.tm_wday + 1);
+ return (0);
+ case RTC_DAY:
+ *eax = rtcout(tm.tm_mday);
+ return (0);
+ case RTC_MONTH:
+ *eax = rtcout(tm.tm_mon + 1);
+ return (0);
+ case RTC_YEAR:
+ *eax = rtcout(tm.tm_year % 100);
+ return (0);
+ case RTC_CENTURY:
+ *eax = rtcout(tm.tm_year / 100);
+ break;
+ case RTC_STATUSA:
+ *eax = status_a;
+ return (0);
+ case RTC_INTR:
+ *eax = 0;
+ return (0);
+ case RTC_STATUSD:
+ *eax = RTCSD_PWR;
+ return (0);
+ case RTC_DIAG:
+ *eax = 0;
+ return (0);
+ case RTC_RSTCODE:
+ *eax = rstcode;
+ return (0);
+ case RTC_EQUIPMENT:
+ *eax = 0;
+ return (0);
+ default:
+ return (-1);
+ }
+ }
+
+ switch (addr) {
+ case RTC_STATUSA:
+ status_a = *eax & ~RTCSA_TUP;
+ break;
+ case RTC_STATUSB:
+ /* XXX not implemented yet XXX */
+ if (*eax & RTCSB_PINTR)
+ return (-1);
+ status_b = *eax;
+ break;
+ case RTC_RSTCODE:
+ rstcode = *eax;
+ break;
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ /*
+ * Ignore writes to the time of day registers
+ */
+ break;
+ default:
+ return (-1);
+ }
+ return (0);
+}
+
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c
new file mode 100644
index 0000000..2632aed
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "spinup_ap.h"
+
+static void
+spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
+{
+ int vector, error;
+ uint16_t cs;
+ uint64_t desc_base;
+ uint32_t desc_limit, desc_access;
+
+ vector = *rip >> PAGE_SHIFT;
+ *rip = 0;
+
+ /*
+ * Update the %cs and %rip of the guest so that it starts
+ * executing real mode code at at 'vector << 12'.
+ */
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+ assert(error == 0);
+
+ error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+ &desc_limit, &desc_access);
+ assert(error == 0);
+
+ desc_base = vector << PAGE_SHIFT;
+ error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ assert(error == 0);
+
+ cs = (vector << PAGE_SHIFT) >> 4;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+ assert(error == 0);
+}
+
+int
+spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
+{
+ int error;
+
+ assert(newcpu != 0);
+ assert(newcpu < guest_ncpus);
+
+ error = vcpu_reset(ctx, newcpu);
+ assert(error == 0);
+
+ /* Set up capabilities */
+ if (fbsdrun_vmexit_on_hlt()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_disable_x2apic())
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_DISABLED);
+ else
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_ENABLED);
+ assert(error == 0);
+
+ /*
+ * Enable the 'unrestricted guest' mode for 'newcpu'.
+ *
+ * Set up the processor state in power-on 16-bit mode, with the CS:IP
+ * init'd to the specified low-mem 4K page.
+ */
+ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ assert(error == 0);
+
+ spinup_ap_realmode(ctx, newcpu, &rip);
+
+ fbsdrun_addcpu(ctx, newcpu, rip);
+
+ return (newcpu);
+}
diff --git a/usr.sbin/bhyve/spinup_ap.h b/usr.sbin/bhyve/spinup_ap.h
new file mode 100644
index 0000000..2749ee9
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPINUP_AP_H_
+#define _SPINUP_AP_H_
+
+int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+
+#endif
diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c
new file mode 100644
index 0000000..640f3bf
--- /dev/null
+++ b/usr.sbin/bhyve/uart.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define COM1 0x3F8
+#define COM2 0x2F8
+
+#define REG_IIR 2
+
+static int
+com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in);
+
+ if (bytes != 1)
+ return (-1);
+
+ /*
+ * COM port is not implemented so we return 0xFF for all registers
+ */
+ *eax = 0xFF;
+
+ return (0);
+}
+
+INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
+INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
new file mode 100644
index 0000000..474e244
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+} __packed;
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+} __packed;
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+
+/*
+ * PCI config space constants
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
+
+#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
new file mode 100644
index 0000000..9c05f02
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xmsr.h"
+
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+
+ printf("Unknown WRMSR code %x, val %lx, cpu %d\n", code, val, vcpu);
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
new file mode 100644
index 0000000..8cebcea
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+
+#endif
diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile
new file mode 100644
index 0000000..9fde12c
--- /dev/null
+++ b/usr.sbin/bhyvectl/Makefile
@@ -0,0 +1,17 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyvectl
+SRCS= bhyvectl.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI}
+LDADD= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
new file mode 100644
index 0000000..d5e0503
--- /dev/null
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -0,0 +1,1524 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<name>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-all]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+ " [--get-vmcs-guest-physical-address\n"
+ " [--get-vmcs-guest-linear-address\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " [--set-pinning=<host_cpuid>]\n"
+ " [--get-pinning]\n"
+ " [--set-x2apic-state=<state>]\n"
+ " [--get-x2apic-state]\n"
+ " [--set-lowmem=<memory below 4GB in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--set-highmem=<memory above 4GB in units of MB>]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t lowmem, highmem;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_pinning, get_pinning, pincpu;
+static int set_x2apic_state, get_x2apic_state;
+enum x2apic_state x2apic_state;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static int get_all;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? "REP " : "");
+ printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+ printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+ break;
+ case VM_EXITCODE_VMX:
+ printf("\treason\t\tVMX\n");
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t0x%08x (%u)\n",
+ vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ break;
+ default:
+ printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+ break;
+ }
+}
+
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+ int error, fd, byte, bit, readable, writeable;
+ u_int msr;
+ const char *bitmap;
+
+ error = -1;
+ bitmap = MAP_FAILED;
+
+ fd = open("/dev/mem", O_RDONLY, 0);
+ if (fd < 0)
+ goto done;
+
+ bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr);
+ if (bitmap == MAP_FAILED)
+ goto done;
+
+ for (msr = 0; msr < 0x2000; msr++) {
+ byte = msr / 8;
+ bit = msr & 0x7;
+
+ /* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+
+ /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+ byte += 1024;
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n",
+ 0xc0000000 + msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+ }
+
+ error = 0;
+done:
+ if (bitmap != MAP_FAILED)
+ munmap((void *)bitmap, PAGE_SIZE);
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+
+ return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
+}
+
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+
+ return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
+}
+
+enum {
+ VMNAME = 1000, /* avoid collision with return values from getopt */
+ VCPU,
+ SET_LOWMEM,
+ SET_HIGHMEM,
+ SET_EFER,
+ SET_CR0,
+ SET_CR3,
+ SET_CR4,
+ SET_DR7,
+ SET_RSP,
+ SET_RIP,
+ SET_RAX,
+ SET_RFLAGS,
+ DESC_BASE,
+ DESC_LIMIT,
+ DESC_ACCESS,
+ SET_CS,
+ SET_DS,
+ SET_ES,
+ SET_FS,
+ SET_GS,
+ SET_SS,
+ SET_TR,
+ SET_LDTR,
+ SET_PINNING,
+ SET_X2APIC_STATE,
+ SET_VMCS_EXCEPTION_BITMAP,
+ SET_VMCS_ENTRY_INTERRUPTION_INFO,
+ SET_CAP,
+ CAPNAME,
+};
+
+int
+main(int argc, char *argv[])
+{
+ char *vmname;
+ int error, ch, vcpu;
+ vm_paddr_t gpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, addr, u64;
+ struct vmctx *ctx;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-lowmem", REQ_ARG, 0, SET_LOWMEM },
+ { "set-highmem",REQ_ARG, 0, SET_HIGHMEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-pinning",REQ_ARG, 0, SET_PINNING },
+ { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-pinning",NO_ARG, &get_pinning, 1 },
+ { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 },
+ { "get-all", NO_ARG, &get_all, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_LOWMEM:
+ lowmem = atoi(optarg) * MB;
+ lowmem = roundup(lowmem, 2 * MB);
+ break;
+ case SET_HIGHMEM:
+ highmem = atoi(optarg) * MB;
+ highmem = roundup(highmem, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_PINNING:
+ pincpu = strtol(optarg, NULL, 0);
+ set_pinning = 1;
+ break;
+ case SET_X2APIC_STATE:
+ x2apic_state = strtol(optarg, NULL, 0);
+ set_x2apic_state = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && lowmem)
+ error = vm_setup_memory(ctx, 0, lowmem, NULL);
+
+ if (!error && highmem)
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_pinning)
+ error = vm_set_pinning(ctx, vcpu, pincpu);
+
+ if (!error && set_x2apic_state)
+ error = vm_set_x2apic_state(ctx, vcpu, x2apic_state);
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && (get_lowmem || get_all)) {
+ gpa = 0;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_highmem || get_all)) {
+ gpa = 4 * GB;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_efer || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && (get_cr0 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_cr3 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_cr4 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_dr7 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && (get_rsp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_rip || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_rax || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && (get_rbx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && (get_rcx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && (get_rdx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && (get_rsi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && (get_rdi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && (get_rbp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && (get_r8 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && (get_r9 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && (get_r10 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && (get_r11 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && (get_r12 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && (get_r13 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && (get_r14 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && (get_r15 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && (get_rflags || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+ if (!error && (get_stats || get_all)) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-32s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+
+ if (!error && (get_desc_ds || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_es || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_fs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ss || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_cs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_tr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ldtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gdtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_desc_idtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_cs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && (get_ds || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && (get_es || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && (get_fs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && (get_gs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && (get_ss || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && (get_tr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && (get_ldtr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && (get_pinning || get_all)) {
+ error = vm_get_pinning(ctx, vcpu, &pincpu);
+ if (error == 0) {
+ if (pincpu < 0)
+ printf("pincpu[%d]\tunpinned\n", vcpu);
+ else
+ printf("pincpu[%d]\t%d\n", vcpu, pincpu);
+ }
+ }
+
+ if (!error && (get_x2apic_state || get_all)) {
+ error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state);
+ if (error == 0)
+ printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state);
+ }
+
+ if (!error && (get_pinbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls2 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_vmcs_gla || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_gpa || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_entry_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_eptp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && (get_exception_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_io_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_tsc_offset || get_all)) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
+ }
+
+ if (!error && (get_cr0_mask || get_all)) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
+ }
+
+ if (!error && (get_cr0_shadow || get_all)) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
+ }
+
+ if (!error && (get_cr4_mask || get_all)) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
+ }
+
+ if (!error && (get_cr4_shadow || get_all)) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
+ }
+
+ if (!error && (get_cr3_targets || get_all)) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08lx\n",
+ vcpu, target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && (get_apic_access_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_virtual_apic_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_tpr_threshold || get_all)) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold);
+ }
+
+ if (!error && (get_msr_bitmap_address || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_msr_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && (get_vpid || get_all)) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
+ }
+
+ if (!error && (get_ple_window || get_all)) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window);
+ }
+
+ if (!error && (get_ple_gap || get_all)) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap);
+ }
+
+ if (!error && (get_inst_err || get_all)) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08lx\n",
+ vcpu, insterr);
+ }
+ }
+
+ if (!error && (get_exit_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_entry_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_host_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_guest_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_host_cr0 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_host_cr3 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_host_cr4 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_host_rip || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_host_rsp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_guest_sysenter || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_vmcs_link || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_vmcs_exit_reason || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_qualification || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_exit_interruption_error || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_interruptibility || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && (getcap || get_all)) {
+ int captype, val, getcaptype;
+
+ if (getcap && capname)
+ getcaptype = vm_capability_name2type(capname);
+ else
+ getcaptype = -1;
+
+ for (captype = 0; captype < VM_CAP_MAX; captype++) {
+ if (getcaptype >= 0 && captype != getcaptype)
+ continue;
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n",
+ vm_capability_type2name(captype),
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ printf("Capability \"%s\" is not available\n",
+ vm_capability_type2name(captype));
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!error && run) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ error = vm_run(ctx, vcpu, rip, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ vm_destroy(ctx);
+
+ exit(error);
+}
diff --git a/usr.sbin/bhyveload/Makefile b/usr.sbin/bhyveload/Makefile
new file mode 100644
index 0000000..7b00818
--- /dev/null
+++ b/usr.sbin/bhyveload/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+PROG= bhyveload
+SRCS= bhyveload.c
+MAN= bhyveload.8
+
+DPADD+= ${LIBVMMAPI}
+LDADD+= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8
new file mode 100644
index 0000000..2918c4c
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.8
@@ -0,0 +1,130 @@
+.\"
+.\" Copyright (c) 2012 NetApp Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 7, 2012
+.Dt BHYVELOAD 8
+.Os
+.Sh NAME
+.Nm bhyveload
+.Nd load a
+.Fx
+guest inside a bhyve virtual machine
+.Sh SYNOPSIS
+.Nm
+.Op Fl m Ar lowmem
+.Op Fl M Ar highmem
+.Op Fl d Ar disk-path
+.Op Fl h Ar host-path
+.Ar vmname
+.Sh DESCRIPTION
+.Nm
+is used to load a
+.Fx
+guest inside a
+.Xr bhyve 4
+virtual machine.
+.Pp
+.Nm
+is based on
+.Xr loader 8
+and will present an interface identical to
+.Fx
+loader on the user's terminal.
+.Pp
+The virtual machine is identified as
+.Ar vmname
+and will be created if it does not already exist.
+.Sh OPTIONS
+The following options are available:
+.Bl -tag -width indent
+.It Fl m Ar lowmem
+.Ar lowmem
+is the amount of memory allocated below 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar lowmem
+is 256MB.
+.It Fl M Ar highmem
+.Ar highmem
+is the amount of memory allocated above 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar highmem
+is 0MB.
+.It Fl d Ar disk-path
+The
+.Ar disk-path
+is the pathname of the guest's boot disk image.
+.It Fl h Ar host-path
+The
+.Ar host-path
+is the directory at the top of the guest's boot filesystem.
+
+.Sh EXAMPLES
+To create a virtual machine named
+.Ar freebsd-vm
+that boots off the ISO image
+.Pa /freebsd/release.iso
+and has 1GB memory allocated to it:
+
+.Dl "bhyveload -m 256 -M 768 -d /freebsd/release.iso freebsd-vm
+
+In the example above the 1GB allocation is split in two segments:
+.Bl -dash -compact
+.It
+256MB below the 4GB boundary (0MB - 256MB)
+.It
+768MB above the 4GB boundary (4096MB - 4864MB)
+.El
+
+.Sh SEE ALSO
+.Xr bhyve 4 ,
+.Xr bhyve 8 ,
+.Xr loader 8 ,
+.Xr vmm 4
+
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 10.0 ,
+and was developed at NetApp Inc.
+
+.Sh AUTHORS
+.Nm
+was developed by
+.An -nosplit
+.An "Neel Natu" Aq neel@FreeBSD.org
+at NetApp Inc with a lot of help from
+.An Doug Rabson Aq dfr@FreeBSD.org
+
+.Sh BUGS
+.Nm
+can load only
+.Fx
+as a guest.
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
new file mode 100644
index 0000000..ef12d9f
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -0,0 +1,652 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * Copyright (c) 2011 Google, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#include "userboot.h"
+
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+#define BSP 0
+
+static char *host_base = "/";
+static struct termios term, oldterm;
+static int disk_fd = -1;
+
+static char *vmname, *progname, *membase;
+static uint64_t lowmem, highmem;
+static struct vmctx *ctx;
+
+static uint64_t gdtbase, cr3, rsp;
+
+static void cb_exit(void *arg, int v);
+
+/*
+ * Console i/o callbacks
+ */
+
+static void
+cb_putc(void *arg, int ch)
+{
+ char c = ch;
+
+ write(1, &c, 1);
+}
+
+static int
+cb_getc(void *arg)
+{
+ char c;
+
+ if (read(0, &c, 1) == 1)
+ return (c);
+ return (-1);
+}
+
+static int
+cb_poll(void *arg)
+{
+ int n;
+
+ if (ioctl(0, FIONREAD, &n) >= 0)
+ return (n > 0);
+ return (0);
+}
+
+/*
+ * Host filesystem i/o callbacks
+ */
+
+struct cb_file {
+ int cf_isdir;
+ size_t cf_size;
+ struct stat cf_stat;
+ union {
+ int fd;
+ DIR *dir;
+ } cf_u;
+};
+
+static int
+cb_open(void *arg, const char *filename, void **hp)
+{
+ struct stat st;
+ struct cb_file *cf;
+ char path[PATH_MAX];
+
+ if (!host_base)
+ return (ENOENT);
+
+ strlcpy(path, host_base, PATH_MAX);
+ if (path[strlen(path) - 1] == '/')
+ path[strlen(path) - 1] = 0;
+ strlcat(path, filename, PATH_MAX);
+ cf = malloc(sizeof(struct cb_file));
+ if (stat(path, &cf->cf_stat) < 0) {
+ free(cf);
+ return (errno);
+ }
+
+ cf->cf_size = st.st_size;
+ if (S_ISDIR(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 1;
+ cf->cf_u.dir = opendir(path);
+ if (!cf->cf_u.dir)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+ if (S_ISREG(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 0;
+ cf->cf_u.fd = open(path, O_RDONLY);
+ if (cf->cf_u.fd < 0)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+
+out:
+ free(cf);
+ return (EINVAL);
+}
+
+static int
+cb_close(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ closedir(cf->cf_u.dir);
+ else
+ close(cf->cf_u.fd);
+ free(cf);
+
+ return (0);
+}
+
+static int
+cb_isdir(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ return (cf->cf_isdir);
+}
+
+static int
+cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
+{
+ struct cb_file *cf = h;
+ ssize_t sz;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ sz = read(cf->cf_u.fd, buf, size);
+ if (sz < 0)
+ return (EINVAL);
+ *resid = size - sz;
+ return (0);
+}
+
+static int
+cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
+ size_t *namelen_return, char *name)
+{
+ struct cb_file *cf = h;
+ struct dirent *dp;
+
+ if (!cf->cf_isdir)
+ return (EINVAL);
+
+ dp = readdir(cf->cf_u.dir);
+ if (!dp)
+ return (ENOENT);
+
+ /*
+ * Note: d_namlen is in the range 0..255 and therefore less
+ * than PATH_MAX so we don't need to test before copying.
+ */
+ *fileno_return = dp->d_fileno;
+ *type_return = dp->d_type;
+ *namelen_return = dp->d_namlen;
+ memcpy(name, dp->d_name, dp->d_namlen);
+ name[dp->d_namlen] = 0;
+
+ return (0);
+}
+
+static int
+cb_seek(void *arg, void *h, uint64_t offset, int whence)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ if (lseek(cf->cf_u.fd, offset, whence) < 0)
+ return (errno);
+ return (0);
+}
+
+static int
+cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size)
+{
+ struct cb_file *cf = h;
+
+ *mode = cf->cf_stat.st_mode;
+ *uid = cf->cf_stat.st_uid;
+ *gid = cf->cf_stat.st_gid;
+ *size = cf->cf_stat.st_size;
+ return (0);
+}
+
+/*
+ * Disk image i/o callbacks
+ */
+
+static int
+cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size,
+ size_t *resid)
+{
+ ssize_t n;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EIO);
+ n = pread(disk_fd, to, size, from);
+ if (n < 0)
+ return (errno);
+ *resid = size - n;
+ return (0);
+}
+
+static int
+cb_diskioctl(void *arg, int unit, u_long cmd, void *data)
+{
+ struct stat sb;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EBADF);
+
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(u_int *)data = 512;
+ break;
+ case DIOCGMEDIASIZE:
+ if (fstat(disk_fd, &sb) == 0)
+ *(off_t *)data = sb.st_size;
+ else
+ return (ENOTTY);
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+/*
+ * Guest virtual machine i/o callbacks
+ */
+static int
+cb_copyin(void *arg, const void *from, uint64_t to, size_t size)
+{
+
+ to &= 0x7fffffff;
+ if (to > lowmem)
+ return (EFAULT);
+ if (to + size > lowmem)
+ size = lowmem - to;
+
+ memcpy(&membase[to], from, size);
+
+ return (0);
+}
+
+static int
+cb_copyout(void *arg, uint64_t from, void *to, size_t size)
+{
+
+ from &= 0x7fffffff;
+ if (from > lowmem)
+ return (EFAULT);
+ if (from + size > lowmem)
+ size = lowmem - from;
+
+ memcpy(to, &membase[from], size);
+
+ return (0);
+}
+
+static void
+cb_setreg(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case 4:
+ vmreg = VM_REG_GUEST_RSP;
+ rsp = v;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+ printf("test_setreg(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_register");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setmsr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case MSR_EFER:
+ vmreg = VM_REG_GUEST_EFER;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+ printf("test_setmsr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_msr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setcr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case 0:
+ vmreg = VM_REG_GUEST_CR0;
+ break;
+ case 3:
+ vmreg = VM_REG_GUEST_CR3;
+ cr3 = v;
+ break;
+ case 4:
+ vmreg = VM_REG_GUEST_CR4;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+ printf("test_setcr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_cr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setgdt(void *arg, uint64_t base, size_t size)
+{
+ int error;
+
+ error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0);
+ if (error != 0) {
+ perror("vm_set_desc(gdt)");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ gdtbase = base;
+}
+
+static void
+cb_exec(void *arg, uint64_t rip)
+{
+ int error;
+
+ error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, rsp);
+ if (error) {
+ perror("vm_setup_freebsd_registers");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ cb_exit(NULL, 0);
+}
+
+/*
+ * Misc
+ */
+
+static void
+cb_delay(void *arg, int usec)
+{
+
+ usleep(usec);
+}
+
+static void
+cb_exit(void *arg, int v)
+{
+
+ tcsetattr(0, TCSAFLUSH, &oldterm);
+ exit(v);
+}
+
+static void
+cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
+{
+
+ *ret_lowmem = lowmem;
+ *ret_highmem = highmem;
+}
+
+static const char *
+cb_getenv(void *arg, int num)
+{
+ int max;
+
+ static const char * var[] = {
+ "smbios.bios.vendor=BHYVE",
+ "boot_serial=1",
+ NULL
+ };
+
+ max = sizeof(var) / sizeof(var[0]);
+
+ if (num < max)
+ return (var[num]);
+ else
+ return (NULL);
+}
+
+static struct loader_callbacks cb = {
+ .getc = cb_getc,
+ .putc = cb_putc,
+ .poll = cb_poll,
+
+ .open = cb_open,
+ .close = cb_close,
+ .isdir = cb_isdir,
+ .read = cb_read,
+ .readdir = cb_readdir,
+ .seek = cb_seek,
+ .stat = cb_stat,
+
+ .diskread = cb_diskread,
+ .diskioctl = cb_diskioctl,
+
+ .copyin = cb_copyin,
+ .copyout = cb_copyout,
+ .setreg = cb_setreg,
+ .setmsr = cb_setmsr,
+ .setcr = cb_setcr,
+ .setgdt = cb_setgdt,
+ .exec = cb_exec,
+
+ .delay = cb_delay,
+ .exit = cb_exit,
+ .getmem = cb_getmem,
+
+ .getenv = cb_getenv,
+};
+
+static void
+usage(void)
+{
+
+ printf("usage: %s [-d <disk image path>] [-h <host filesystem path>] "
+ "[-m <lowmem>][-M <highmem>] "
+ "<vmname>\n", progname);
+ exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+ void *h;
+ void (*func)(struct loader_callbacks *, void *, int, int);
+ int opt, error;
+ char *disk_image;
+
+ progname = argv[0];
+
+ lowmem = 128 * MB;
+ highmem = 0;
+ disk_image = NULL;
+
+ while ((opt = getopt(argc, argv, "d:h:m:M:")) != -1) {
+ switch (opt) {
+ case 'd':
+ disk_image = optarg;
+ break;
+
+ case 'h':
+ host_base = optarg;
+ break;
+
+ case 'm':
+ lowmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case 'M':
+ highmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case '?':
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage();
+
+ vmname = argv[0];
+
+ error = vm_create(vmname);
+ if (error != 0 && errno != EEXIST) {
+ perror("vm_create");
+ exit(1);
+
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ error = vm_setup_memory(ctx, 0, lowmem, &membase);
+ if (error) {
+ perror("vm_setup_memory(lowmem)");
+ exit(1);
+ }
+
+ if (highmem != 0) {
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+ if (error) {
+ perror("vm_setup_memory(highmem)");
+ exit(1);
+ }
+ }
+
+ tcgetattr(0, &term);
+ oldterm = term;
+ term.c_lflag &= ~(ICANON|ECHO);
+ term.c_iflag &= ~ICRNL;
+ tcsetattr(0, TCSAFLUSH, &term);
+ h = dlopen("/boot/userboot.so", RTLD_LOCAL);
+ if (!h) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+ func = dlsym(h, "loader_main");
+ if (!func) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+
+ if (disk_image) {
+ disk_fd = open(disk_image, O_RDONLY);
+ }
+ func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0);
+}
OpenPOWER on IntegriCloud