summary refs log tree commit diff stats
path: root/usr.sbin
diff options
context:
space:
mode:
author		grehan <grehan@FreeBSD.org>	2011-05-13 04:54:01 +0000
committer	grehan <grehan@FreeBSD.org>	2011-05-13 04:54:01 +0000
commit		d45b7f14ae6fa78882fa9ec3be976733ca4767b4 (patch)
tree		4af898a91c7d67e7068687610ebc68f1cbdf3b2e /usr.sbin
parent		1430f46faf0f3eb24ffcd28a3248a565a48236ac (diff)
download	FreeBSD-src-d45b7f14ae6fa78882fa9ec3be976733ca4767b4.zip
download	FreeBSD-src-d45b7f14ae6fa78882fa9ec3be976733ca4767b4.tar.gz
Import of bhyve hypervisor and utilities, part 1.
vmm.ko - kernel module for VT-x, VT-d and hypervisor control
bhyve - user-space sequencer and i/o emulation
vmmctl - dump of hypervisor register state
libvmm - front-end to vmm.ko chardev interface

bhyve was designed and implemented by Neel Natu.

Thanks to the following folk from NetApp who helped to make this available:
Joe CaraDonna
Peter Snyder
Jeff Heller
Sandeep Mann
Steve Miller
Brian Pawlowski
Diffstat (limited to 'usr.sbin')
-rw-r--r--	usr.sbin/Makefile	4
-rw-r--r--	usr.sbin/bhyve/Makefile	18
-rw-r--r--	usr.sbin/bhyve/atpic.c	68
-rw-r--r--	usr.sbin/bhyve/consport.c	121
-rw-r--r--	usr.sbin/bhyve/dbgport.c	124
-rw-r--r--	usr.sbin/bhyve/dbgport.h	36
-rw-r--r--	usr.sbin/bhyve/elcr.c	65
-rw-r--r--	usr.sbin/bhyve/fbsdrun.c	650
-rw-r--r--	usr.sbin/bhyve/fbsdrun.h	53
-rw-r--r--	usr.sbin/bhyve/inout.c	98
-rw-r--r--	usr.sbin/bhyve/inout.h	64
-rw-r--r--	usr.sbin/bhyve/mevent.c	419
-rw-r--r--	usr.sbin/bhyve/mevent.h	49
-rw-r--r--	usr.sbin/bhyve/mevent_test.c	180
-rw-r--r--	usr.sbin/bhyve/pci_emul.c	976
-rw-r--r--	usr.sbin/bhyve/pci_emul.h	171
-rw-r--r--	usr.sbin/bhyve/pci_hostbridge.c	52
-rw-r--r--	usr.sbin/bhyve/pci_passthru.c	508
-rw-r--r--	usr.sbin/bhyve/pci_virtio_block.c	502
-rw-r--r--	usr.sbin/bhyve/pci_virtio_net.c	739
-rw-r--r--	usr.sbin/bhyve/pit_8254.c	196
-rw-r--r--	usr.sbin/bhyve/pit_8254.h	45
-rw-r--r--	usr.sbin/bhyve/post.c	51
-rw-r--r--	usr.sbin/bhyve/rtc.c	268
-rw-r--r--	usr.sbin/bhyve/uart.c	60
-rw-r--r--	usr.sbin/bhyve/virtio.h	85
-rw-r--r--	usr.sbin/bhyve/xmsr.c	261
-rw-r--r--	usr.sbin/bhyve/xmsr.h	34
-rw-r--r--	usr.sbin/vmmctl/Makefile	15
-rwxr-xr-x	usr.sbin/vmmctl/sample.sh	75
-rw-r--r--	usr.sbin/vmmctl/vmmctl.c	1485
31 files changed, 7472 insertions, 0 deletions
diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile
index 44f20a4..fc527b7 100644
--- a/usr.sbin/Makefile
+++ b/usr.sbin/Makefile
@@ -19,6 +19,7 @@ SUBDIR= ${_ac} \
${_auditd} \
${_auditreduce} \
${_authpf} \
+ ${_bhyve} \
${_bluetooth} \
${_boot0cfg} \
${_boot98cfg} \
@@ -194,6 +195,7 @@ SUBDIR= ${_ac} \
${_usbdevs} \
${_usbconfig} \
${_vidcontrol} \
+ ${_vmmctl} \
vipw \
wake \
watch \
@@ -477,6 +479,7 @@ _boot98cfg= boot98cfg
_acpi= acpi
.endif
_asf= asf
+_bhyve= bhyve
_boot0cfg= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
_btxld= btxld
@@ -494,6 +497,7 @@ _ndiscvt= ndiscvt
.endif
_sicontrol= sicontrol
_spkrtest= spkrtest
+_vmmctl= vmmctl
_zzz= zzz
.endif
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
new file mode 100644
index 0000000..71df082
--- /dev/null
+++ b/usr.sbin/bhyve/Makefile
@@ -0,0 +1,18 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyve
+
+SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
+SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
+SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
+LDADD= -lvmmapi -lmd -lpthread
+
+CFLAGS+= -I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c
new file mode 100644
index 0000000..a9fb084
--- /dev/null
+++ b/usr.sbin/bhyve/atpic.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * FreeBSD only writes to the 8259 interrupt controllers to put them in a
+ * shutdown state.
+ *
+ * So, we just ignore the writes.
+ */
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+#define ICU_IMR_OFFSET 1
+
+static int
+atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in)
+ return (-1);
+
+ /* Pretend all writes to the 8259 are alright */
+ return (0);
+}
+
+INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c
new file mode 100644
index 0000000..34f94a6
--- /dev/null
+++ b/usr.sbin/bhyve/consport.c
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+
+#define BVM_CONSOLE_PORT 0x220
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler);
diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c
new file mode 100644
index 0000000..be919e1
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+
+#define BVM_DBG_PORT 0x224
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+void
+init_dbgport(int sport)
+{
+ conn_fd = -1;
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+}
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ char ch;
+ int nwritten, nread, printonce;
+
+ if (bytes != 4)
+ return (-1);
+
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept(listen_fd, NULL, NULL);
+ if (conn_fd >= 0)
+ fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+ else if (errno != EINTR)
+ perror("accept");
+ }
+
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = ch;
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler);
diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h
new file mode 100644
index 0000000..8c7dab7
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+#define DEFAULT_GDB_PORT 6466
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c
new file mode 100644
index 0000000..2417ae1
--- /dev/null
+++ b/usr.sbin/bhyve/elcr.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "inout.h"
+
+/*
+ * EISA interrupt Level Control Register.
+ *
+ * This is a 16-bit register with one bit for each of the IRQ0 through IRQ15.
+ * A level triggered irq is indicated by setting the corresponding bit to '1'.
+ */
+#define ELCR_PORT 0x4d0
+
+static uint8_t elcr[2] = { 0x00, 0x00 };
+
+static int
+elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int idx;
+
+ if (bytes != 1)
+ return (-1);
+
+ idx = port - ELCR_PORT;
+
+ if (in)
+ *eax = elcr[idx];
+ else
+ elcr[idx] = *eax;
+
+ return (0);
+}
+INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
+INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);
diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c
new file mode 100644
index 0000000..ddbe709b
--- /dev/null
+++ b/usr.sbin/bhyve/fbsdrun.c
@@ -0,0 +1,650 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "mevent.h"
+#include "pci_emul.h"
+#include "xmsr.h"
+
+#define DEFAULT_GUEST_HZ 100
+#define DEFAULT_GUEST_TSLICE 200
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_RESET 4 /* guest machine has reset */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+int guest_tslice = DEFAULT_GUEST_TSLICE;
+int guest_hz = DEFAULT_GUEST_HZ;
+char *vmname;
+
+u_long lomem_sz;
+u_long himem_sz;
+
+int guest_ncpus;
+
+static int pincpu = -1;
+static int guest_vcpu_mux;
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
+
+static int foundcpus;
+
+static char *lomem_addr;
+static char *himem_addr;
+
+static char *progname;
+static const int BSP = 0;
+
+static int cpumask;
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct fbsdstats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+ fprintf(stderr,
+ "Usage: %s [-hBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
+ "[-n <pci>][-m lowmem][-M highmem] <vm>\n"
+ " -g: gdb port (default is %d and 0 means don't open)\n"
+ " -c: # cpus (default 1)\n"
+ " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
+ " -B: inject breakpoint exception on vm entry\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -P: vmexit from the guest on pause\n"
+ " -h: help\n"
+ " -z: guest hz (default is %d)\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -n: <slot,name> PCI slot naming\n"
+ " -m: lowmem in MB\n"
+ " -M: highmem in MB\n"
+ " -x: mux vcpus to 1 hcpu\n"
+ " -t: mux vcpu timeslice hz (default %d)\n",
+ progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
+ DEFAULT_GUEST_TSLICE);
+ exit(code);
+}
+
+void *
+paddr_guest2host(uintptr_t gaddr)
+{
+ if (lomem_sz == 0)
+ return (NULL);
+
+ if (gaddr < lomem_sz) {
+ return ((void *)(lomem_addr + gaddr));
+ } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
+ return ((void *)(himem_addr + gaddr - 4*GB));
+ } else
+ return (NULL);
+}
+
+void
+fbsdrun_add_oemtbl(void *tbl, int tblsz)
+{
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_muxed(void)
+{
+
+ return (guest_vcpu_mux);
+}
+
+void *
+fbsdrun_start_thread(void *param)
+{
+ int vcpu;
+ struct mt_vmm_info *mtp = param;
+
+ vcpu = mtp->mt_vcpu;
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error;
+
+ if (cpumask & (1 << vcpu)) {
+ printf("addcpu: attempting to add existing cpu %d\n", vcpu);
+ exit(1);
+ }
+
+ cpumask |= 1 << vcpu;
+ foundcpus++;
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[vcpu].rip = rip;
+ vmexit[vcpu].inst_length = 0;
+
+ if (vcpu == BSP || !guest_vcpu_mux){
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
+ }
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+ /*
+ * Get the next available CPU. Assumes they arrive
+ * in ascending order with no gaps.
+ */
+ return ((curcpu + 1) % foundcpus);
+}
+
+int
+vmexit_catch_reset(void)
+{
+ stats.io_reset++;
+ return (VMEXIT_RESET);
+}
+
+int
+vmexit_catch_inout(void)
+{
+ return (VMEXIT_ABORT);
+}
+
+int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if PG_DEBUG /* put all types of debug here */
+ if (eax == 0) {
+ pause_noswitch = 1;
+ } else if (eax == 1) {
+ pause_noswitch = 0;
+ } else {
+ pause_noswitch = 0;
+ if (eax == 5) {
+ vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
+ }
+ }
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out;
+ uint32_t eax;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ eax = vme->u.inout.eax;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* We don't deal with these */
+ if (vme->u.inout.string || vme->u.inout.rep)
+ return (VMEXIT_ABORT);
+
+ /* Special case of guest reset */
+ if (out && port == 0x64 && (uint8_t)eax == 0xFE)
+ return (vmexit_catch_reset());
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT)
+ return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+
+ error = emulate_inout(ctx, vcpu, in, port, bytes, &eax);
+ if (error == 0 && in)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+
+ if (error == 0)
+ return (VMEXIT_CONTINUE);
+ else {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ return (vmexit_catch_inout());
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ printf("vm exit[%d]\n", *pvcpu);
+ printf("\treason\t\tVMX\n");
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
+
+ return (VMEXIT_ABORT);
+}
+
+static int bogus_noswitch = 1;
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_bogus++;
+
+ if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
+ return (VMEXIT_RESTART);
+ } else {
+ stats.vmexit_bogus_switch++;
+ vmexit->inst_length = 0;
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ }
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_hlt++;
+ if (fbsdrun_muxed()) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int pause_noswitch;
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_pause++;
+
+ if (fbsdrun_muxed() && !pause_noswitch) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_RESTART);
+}
+
+static void
+sigalrm(int sig)
+{
+ return;
+}
+
+static void
+setup_timeslice(void)
+{
+ struct sigaction sa;
+ struct itimerval itv;
+ int error;
+
+ /*
+ * Setup a realtime timer to generate a SIGALRM at a
+ * frequency of 'guest_tslice' ticks per second.
+ */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = sigalrm;
+
+ error = sigaction(SIGALRM, &sa, NULL);
+ assert(error == 0);
+
+ itv.it_interval.tv_sec = 0;
+ itv.it_interval.tv_usec = 1000000 / guest_tslice;
+ itv.it_value.tv_sec = 0;
+ itv.it_value.tv_usec = 1000000 / guest_tslice;
+
+ error = setitimer(ITIMER_REAL, &itv, NULL);
+ assert(error == 0);
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error, rc, prevcpu;
+
+ if (guest_vcpu_mux)
+ setup_timeslice();
+
+ if (pincpu >= 0) {
+ error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
+ assert(error == 0);
+ }
+
+ while (1) {
+ error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ if (error != 0)
+ break;
+
+ prevcpu = vcpu;
+ rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
+ &vcpu);
+ switch (rc) {
+ case VMEXIT_SWITCH:
+ assert(guest_vcpu_mux);
+ if (vcpu == -1) {
+ stats.cpu_switch_rotate++;
+ vcpu = fbsdrun_get_next_cpu(prevcpu);
+ } else {
+ stats.cpu_switch_direct++;
+ }
+ /* fall through */
+ case VMEXIT_CONTINUE:
+ rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+ break;
+ case VMEXIT_RESTART:
+ rip = vmexit[vcpu].rip;
+ break;
+ case VMEXIT_RESET:
+ exit(0);
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, inject_bkpt, tmp, err;
+ struct vmctx *ctx;
+ uint64_t rip;
+
+ inject_bkpt = 0;
+ progname = basename(argv[0]);
+ gdb_port = DEFAULT_GDB_PORT;
+ guest_ncpus = 1;
+
+ while ((c = getopt(argc, argv, "hBHPxp:g:c:z:s:n:m:M:")) != -1) {
+ switch (c) {
+ case 'B':
+ inject_bkpt = 1;
+ break;
+ case 'x':
+ guest_vcpu_mux = 1;
+ break;
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+ case 'z':
+ guest_hz = atoi(optarg);
+ break;
+ case 't':
+ guest_tslice = atoi(optarg);
+ break;
+ case 's':
+ pci_parse_slot(optarg);
+ break;
+ case 'n':
+ pci_parse_name(optarg);
+ break;
+ case 'm':
+ lomem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'M':
+ himem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ /* No need to mux if guest is uni-processor */
+ if (guest_ncpus <= 1)
+ guest_vcpu_mux = 0;
+
+ /* vmexit on hlt if guest is muxed */
+ if (guest_vcpu_mux) {
+ guest_vmexit_on_hlt = 1;
+ guest_vmexit_on_pause = 1;
+ }
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ printf("VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ printf("SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (lomem_sz != 0) {
+ lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
+ if (lomem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ } else if (himem_sz != 0) {
+ himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
+ if (himem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ himem_sz = 0;
+ }
+ }
+ }
+
+ init_inout();
+ init_pci(ctx);
+
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ if (inject_bkpt) {
+ error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
+ assert(error == 0);
+ }
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size);
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+ mevent_dispatch();
+
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/fbsdrun.h b/usr.sbin/bhyve/fbsdrun.h
new file mode 100644
index 0000000..8106122
--- /dev/null
+++ b/usr.sbin/bhyve/fbsdrun.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+struct vmctx;
+extern int guest_hz;
+extern int guest_tslice;
+extern int guest_ncpus;
+extern char *vmname;
+
+extern u_long lomem_sz, himem_sz;
+
+void *paddr_guest2host(uintptr_t);
+
+void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
+void fbsdrun_add_oemtbl(void *tbl, int tblsz);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+#endif
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
new file mode 100644
index 0000000..84445b1
--- /dev/null
+++ b/usr.sbin/bhyve/inout.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
+/*
+ * Handle an in/out vmexit: look up the handler registered for 'port'
+ * and invoke it if the registered direction flags permit the access.
+ *
+ * Returns the handler's result, or -1 if no handler is registered for
+ * the port or the access direction is not allowed.
+ */
+int
+emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax)
+{
+	int flags;
+	inout_func_t handler;
+	void *arg;
+
+	/* assumes the caller passes a 16-bit i/o port -- TODO confirm */
+	assert(port < MAX_IOPORTS);
+
+	if ((handler = inout_handlers[port].handler) == NULL)
+		return (-1);
+
+	flags = inout_handlers[port].flags;
+	arg = inout_handlers[port].arg;
+
+	/* Dispatch only if the direction matches the registered flags. */
+	if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
+		return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
+	else
+		return (-1);
+}
+
+/*
+ * Register every statically-declared i/o port handler, i.e. all
+ * INOUT_PORT() entries collected in the 'inout_port_set' linker set.
+ *
+ * Reuses register_inout() so static and dynamic registration stay
+ * consistent: the previous open-coded copy forced the handler 'arg'
+ * to NULL, silently dropping any argument present in a static entry.
+ * (INOUT_PORT() zero-initializes 'arg', so behavior is unchanged for
+ * all entries created through the macro.)
+ */
+void
+init_inout(void)
+{
+	struct inout_port **iopp;
+
+	SET_FOREACH(iopp, inout_port_set)
+		register_inout(*iopp);
+}
+
+/*
+ * Register a handler for the single i/o port described by 'iop'.
+ * The fields of *iop are copied into the handler table; note that the
+ * 'name' string itself is not duplicated and must remain valid for the
+ * lifetime of the registration.  Always returns 0.
+ */
+int
+register_inout(struct inout_port *iop)
+{
+	assert(iop->port < MAX_IOPORTS);
+	inout_handlers[iop->port].name = iop->name;
+	inout_handlers[iop->port].flags = iop->flags;
+	inout_handlers[iop->port].handler = iop->handler;
+	inout_handlers[iop->port].arg = iop->arg;
+
+	return (0);
+}
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
new file mode 100644
index 0000000..7b8a4a6
--- /dev/null
+++ b/usr.sbin/bhyve/inout.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT 0x3
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ (flags), \
+ (handler) \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
+ uint32_t *eax);
+int register_inout(struct inout_port *iop);
+
+#endif /* _INOUT_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
new file mode 100644
index 0000000..0d3b287
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.c
@@ -0,0 +1,419 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, and having events be persistent by default.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ENABLE 1
+#define MEV_DISABLE 2
+#define MEV_DEL_PENDING 3
+
+static pthread_t mevent_tid;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+	void (*me_func)(int, enum ev_type, void *);	/* user callback */
+	int me_fd;		/* file descriptor being monitored */
+	enum ev_type me_type;	/* EVF_READ or EVF_WRITE */
+	void *me_param;		/* opaque argument passed to me_func */
+	int me_cq;		/* 1: on change_head, 0: on global_head */
+	int me_state;		/* MEV_ENABLE/MEV_DISABLE/MEV_DEL_PENDING */
+	int me_closefd;		/* close me_fd rather than kevent-delete */
+	LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+/* Acquire the lock protecting the global and change lists. */
+static void
+mevent_qlock(void)
+{
+	pthread_mutex_lock(&mevent_lmutex);
+}
+
+/* Release the list lock taken by mevent_qlock(). */
+static void
+mevent_qunlock(void)
+{
+	pthread_mutex_unlock(&mevent_lmutex);
+}
+
+/*
+ * Callback for the internal notification pipe: drain all pending
+ * wakeup bytes so the pipe does not stay readable.
+ *
+ * NOTE(review): the loop repeats only while a completely full buffer
+ * was returned, and the comment below assumes the fd is non-blocking;
+ * mevent_dispatch() never actually sets O_NONBLOCK on the pipe --
+ * confirm.
+ */
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+	char buf[MEVENT_MAX];
+	int status;
+
+	/*
+	 * Drain the pipe read side. The fd is non-blocking so this is
+	 * safe to do.
+	 */
+	do {
+		status = read(fd, buf, sizeof(buf));
+	} while (status == MEVENT_MAX);
+}
+
+/*
+ * Wake the i/o thread out of its blocking kevent() call by writing a
+ * byte to the notification pipe.  The byte's value is irrelevant (only
+ * the write itself matters), so 'c' is deliberately left uninitialized.
+ *
+ * NOTE(review): the 'mevent_pipefd[1] != 0' test uses fd 0 as an
+ * "uninitialized" sentinel, which collides with 0 being a valid file
+ * descriptor -- confirm the pipe can never legitimately be fd 0 here.
+ */
+static void
+mevent_notify(void)
+{
+	char c;
+
+	/*
+	 * If calling from outside the i/o thread, write a byte on the
+	 * pipe to force the i/o thread to exit the blocking kevent call.
+	 */
+	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+		write(mevent_pipefd[1], &c, 1);
+	}
+}
+
+/*
+ * Map an mevent type to the corresponding kqueue filter.
+ * Returns 0 for any type with no kqueue equivalent.
+ */
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+	int filter;
+
+	switch (mevp->me_type) {
+	case EVF_READ:
+		filter = EVFILT_READ;
+		break;
+	case EVF_WRITE:
+		filter = EVFILT_WRITE;
+		break;
+	default:
+		filter = 0;
+		break;
+	}
+
+	return (filter);
+}
+
+/*
+ * Map an mevent state to kevent action flags for the changelist.
+ *
+ * Fix: the original switch had no default case, so an unexpected
+ * me_state left 'ret' uninitialized -- undefined behavior when the
+ * value was returned.  Unknown states now yield no flags (0).
+ */
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+	int ret;
+
+	switch (mevp->me_state) {
+	case MEV_ENABLE:
+		ret = EV_ADD;
+		break;
+	case MEV_DISABLE:
+		ret = EV_DISABLE;
+		break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		/* Unknown state: emit no flags rather than garbage. */
+		ret = 0;
+		break;
+	}
+
+	return (ret);
+}
+
+/* Filter-specific kevent flags; none are used yet. */
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+	/* XXX nothing yet, perhaps EV_EOF for reads ? */
+	return (0);
+}
+
+/*
+ * Translate all queued changes (change_head) into a kevent changelist.
+ * Entries flagged me_closefd have their fd closed instead, which
+ * removes the kernel event implicitly.  Deleted entries are freed;
+ * all others move back to the global list.  Returns the number of
+ * kevents written to 'kev', which the caller must size at MEVENT_MAX.
+ *
+ * NOTE(review): the 'mfd' kqueue descriptor is accepted but unused --
+ * presumably kept for interface symmetry; confirm.
+ */
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+	struct mevent *mevp, *tmpp;
+	int i;
+
+	i = 0;
+
+	mevent_qlock();
+
+	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+		if (mevp->me_closefd) {
+			/*
+			 * A close of the file descriptor will remove the
+			 * event
+			 */
+			close(mevp->me_fd);
+		} else {
+			kev[i].ident = mevp->me_fd;
+			kev[i].filter = mevent_kq_filter(mevp);
+			kev[i].flags = mevent_kq_flags(mevp);
+			kev[i].fflags = mevent_kq_fflags(mevp);
+			kev[i].data = 0;
+			kev[i].udata = mevp;
+			i++;
+		}
+
+		mevp->me_cq = 0;
+		LIST_REMOVE(mevp, me_list);
+
+		/* A deleted entry is never relinked; free it now. */
+		if (mevp->me_state == MEV_DEL_PENDING) {
+			free(mevp);
+		} else {
+			LIST_INSERT_HEAD(&global_head, mevp, me_list);
+		}
+
+		/* Guard against overflowing the caller's array. */
+		assert(i < MEVENT_MAX);
+	}
+
+	mevent_qunlock();
+
+	return (i);
+}
+
+/*
+ * Invoke the user callback for each event reported by kevent().
+ * The mevent pointer was stashed in kev udata by mevent_build().
+ */
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+	struct mevent *mevp;
+	int i;
+
+	for (i = 0; i < numev; i++) {
+		mevp = kev[i].udata;
+
+		/* XXX check for EV_ERROR ? */
+
+		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+	}
+}
+
+/*
+ * Create a persistent event for 'fd'/'type' and queue it for addition
+ * to the kqueue.  'func' is invoked with (fd, type, param) each time
+ * the event fires.
+ *
+ * Returns the new mevent, or NULL on invalid arguments, if an event
+ * with the same fd/type already exists on either list, or on
+ * allocation failure.
+ */
+struct mevent *
+mevent_add(int fd, enum ev_type type,
+	void (*func)(int, enum ev_type, void *), void *param)
+{
+	struct mevent *lp, *mevp;
+
+	if (fd < 0 || func == NULL) {
+		return (NULL);
+	}
+
+	mevp = NULL;
+
+	mevent_qlock();
+
+	/*
+	 * Verify that the fd/type tuple is not present in any list
+	 */
+	LIST_FOREACH(lp, &global_head, me_list) {
+		if (lp->me_fd == fd && lp->me_type == type) {
+			goto exit;
+		}
+	}
+
+	LIST_FOREACH(lp, &change_head, me_list) {
+		if (lp->me_fd == fd && lp->me_type == type) {
+			goto exit;
+		}
+	}
+
+	/*
+	 * Allocate an entry, populate it, and add it to the change list.
+	 */
+	mevp = malloc(sizeof(struct mevent));
+	if (mevp == NULL) {
+		goto exit;
+	}
+
+	memset(mevp, 0, sizeof(struct mevent));
+	mevp->me_fd = fd;
+	mevp->me_type = type;
+	mevp->me_func = func;
+	mevp->me_param = param;
+
+	LIST_INSERT_HEAD(&change_head, mevp, me_list);
+	mevp->me_cq = 1;
+	mevp->me_state = MEV_ENABLE;
+	/* Kick the i/o thread so the change is picked up promptly. */
+	mevent_notify();
+
+exit:
+	mevent_qunlock();
+
+	return (mevp);
+}
+
+/*
+ * Transition an event to 'newstate' (MEV_ENABLE or MEV_DISABLE) and
+ * queue it on the change list.  Returns 0 on success, EINVAL if the
+ * event is already pending deletion.
+ *
+ * NOTE(review): me_state is examined before the list lock is taken,
+ * so concurrent updates to the same event may race -- confirm callers
+ * serialize per-event updates.
+ */
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+	/*
+	 * It's not possible to enable/disable a deleted event
+	 */
+	if (evp->me_state == MEV_DEL_PENDING)
+		return (EINVAL);
+
+	/*
+	 * No update needed if state isn't changing
+	 */
+	if (evp->me_state == newstate)
+		return (0);
+
+	mevent_qlock();
+
+	evp->me_state = newstate;
+
+	/*
+	 * Place the entry onto the changed list if not already there.
+	 */
+	if (evp->me_cq == 0) {
+		evp->me_cq = 1;
+		LIST_REMOVE(evp, me_list);
+		LIST_INSERT_HEAD(&change_head, evp, me_list);
+		mevent_notify();
+	}
+
+	mevent_qunlock();
+
+	return (0);
+}
+
+/* Re-enable a previously disabled event.  Returns 0 or EINVAL. */
+int
+mevent_enable(struct mevent *evp)
+{
+
+	return (mevent_update(evp, MEV_ENABLE));
+}
+
+/* Disable an event without deleting it.  Returns 0 or EINVAL. */
+int
+mevent_disable(struct mevent *evp)
+{
+
+	return (mevent_update(evp, MEV_DISABLE));
+}
+
+/*
+ * Mark an event for deletion and queue it on the change list; the
+ * mevent itself is freed later by mevent_build().  If 'closefd' is
+ * set, the file descriptor is closed instead of being removed from
+ * the kqueue explicitly.  Always returns 0.
+ */
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+	mevent_qlock();
+
+	/*
+	 * Place the entry onto the changed list if not already there, and
+	 * mark as to be deleted.
+	 */
+	if (evp->me_cq == 0) {
+		evp->me_cq = 1;
+		LIST_REMOVE(evp, me_list);
+		LIST_INSERT_HEAD(&change_head, evp, me_list);
+		mevent_notify();
+	}
+	evp->me_state = MEV_DEL_PENDING;
+
+	if (closefd)
+		evp->me_closefd = 1;
+
+	mevent_qunlock();
+
+	return (0);
+}
+
+/* Delete an event; the caller retains ownership of the fd. */
+int
+mevent_delete(struct mevent *evp)
+{
+
+	return (mevent_delete_event(evp, 0));
+}
+
+/* Delete an event and close its file descriptor. */
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+	return (mevent_delete_event(evp, 1));
+}
+
+/*
+ * Main event loop: convert queued changes into kevent changelists,
+ * block in kevent() awaiting events, and dispatch callbacks.
+ * Intended to be called once from the i/o thread; never returns.
+ *
+ * Fixes: exit(1) rather than exit(0) on pipe() failure (a failure
+ * path must not report success), and accept fd 0 from kqueue() since
+ * 0 is a valid descriptor.
+ */
+void
+mevent_dispatch(void)
+{
+	struct kevent changelist[MEVENT_MAX];
+	struct kevent eventlist[MEVENT_MAX];
+	struct mevent *pipev;
+	int mfd;
+	int numev;
+	int ret;
+
+	mevent_tid = pthread_self();
+
+	mfd = kqueue();
+	assert(mfd >= 0);
+
+	/*
+	 * Open the pipe that will be used for other threads to force
+	 * the blocking kqueue call to exit by writing to it.
+	 *
+	 * XXX TODO: the read side is never actually marked non-blocking
+	 * (no fcntl(F_SETFL, O_NONBLOCK)), which mevent_pipe_read()
+	 * relies on to drain safely.
+	 */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		exit(1);
+	}
+
+	/*
+	 * Add internal event handler for the pipe write fd
+	 */
+	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+	assert(pipev != NULL);
+
+	for (;;) {
+		/*
+		 * Build changelist if required.
+		 * XXX the changelist can be put into the blocking call
+		 * to eliminate the extra syscall. Currently better for
+		 * debug.
+		 */
+		numev = mevent_build(mfd, changelist);
+		if (numev) {
+			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+			if (ret == -1) {
+				perror("Error return from kevent change");
+			}
+		}
+
+		/*
+		 * Block awaiting events
+		 */
+		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+		if (ret == -1) {
+			perror("Error return from kevent monitor");
+		}
+
+		/*
+		 * Handle reported events
+		 */
+		mevent_handle(eventlist, ret);
+	}
+}
diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h
new file mode 100644
index 0000000..32a9d74
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c
new file mode 100644
index 0000000..c72a497
--- /dev/null
+++ b/usr.sbin/bhyve/mevent_test.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Test program for the micro event library. Set up a simple TCP echo
+ * service.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+#define MEVENT_ECHO
+
+#ifdef MEVENT_ECHO
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+/*
+ * Event callback for an echo connection: signal the echoer thread
+ * waiting on the connection's condvar that data is ready.
+ */
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+	struct esync *sync = param;
+
+	pthread_mutex_lock(&sync->e_mt);
+	pthread_cond_signal(&sync->e_cond);
+	pthread_mutex_unlock(&sync->e_mt);
+}
+
+/*
+ * Per-connection echo thread: register a read event on the socket,
+ * then wait for the callback's signal, echoing each chunk back to the
+ * peer and to stdout until EOF or error.
+ *
+ * Fix: the function returns void * but fell off the end without a
+ * return statement -- undefined behavior if the result is ever used
+ * (e.g. by pthread_join()).
+ */
+static void *
+echoer(void *param)
+{
+	struct esync sync;
+	struct mevent *mev;
+	char buf[128];
+	int fd = (int)(uintptr_t) param;
+	int len;
+
+	pthread_mutex_init(&sync.e_mt, NULL);
+	pthread_cond_init(&sync.e_cond, NULL);
+
+	pthread_mutex_lock(&sync.e_mt);
+
+	mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+	if (mev == NULL) {
+		printf("Could not allocate echoer event\n");
+		exit(1);
+	}
+
+	while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
+		len = read(fd, buf, sizeof(buf));
+		if (len > 0) {
+			write(fd, buf, len);
+			write(0, buf, len);
+		} else {
+			break;
+		}
+	}
+
+	mevent_delete_close(mev);
+
+	pthread_mutex_unlock(&sync.e_mt);
+	pthread_mutex_destroy(&sync.e_mt);
+	pthread_cond_destroy(&sync.e_cond);
+
+	return (NULL);
+}
+
+#else
+
+/*
+ * Simple non-mevent echoer used when MEVENT_ECHO is not defined:
+ * copy the socket to stdout until EOF.
+ *
+ * Fix: add the missing return statement (void * function previously
+ * fell off the end -- undefined behavior if the result is used).
+ */
+static void *
+echoer(void *param)
+{
+	char buf[128];
+	int fd = (int)(uintptr_t) param;
+	int len;
+
+	while ((len = read(fd, buf, sizeof(buf))) > 0) {
+		write(1, buf, len);
+	}
+
+	return (NULL);
+}
+#endif /* MEVENT_ECHO */
+
+/*
+ * Event callback for the listen socket: wake the acceptor thread so
+ * it can call accept().
+ */
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+	pthread_mutex_lock(&accept_mutex);
+	pthread_cond_signal(&accept_condvar);
+	pthread_mutex_unlock(&accept_mutex);
+}
+
+/*
+ * Acceptor thread: bind/listen on TEST_PORT, register a read event on
+ * the listen socket, and spawn an echoer thread for each incoming
+ * connection.
+ *
+ * Fix: the function returns void * but could fall off the end (when
+ * pthread_cond_wait() fails) without a return -- undefined behavior.
+ */
+static void *
+acceptor(void *param)
+{
+	struct sockaddr_in sin;
+	pthread_t tid;
+	int news;
+	int s;
+
+	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+		perror("socket");
+		exit(1);
+	}
+
+	sin.sin_len = sizeof(sin);
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(TEST_PORT);
+
+	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+		perror("bind");
+		exit(1);
+	}
+
+	if (listen(s, 1) < 0) {
+		perror("listen");
+		exit(1);
+	}
+
+	(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
+
+	pthread_mutex_lock(&accept_mutex);
+
+	while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+		news = accept(s, NULL, NULL);
+		if (news < 0) {
+			perror("accept error");
+		} else {
+			printf("incoming connection, spawning thread\n");
+			pthread_create(&tid, NULL, echoer,
+			    (void *)(uintptr_t)news);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Entry point: start the acceptor thread and run the event loop.
+ *
+ * Fix: declare main with an explicit return type -- implicit int is
+ * invalid since C99.
+ */
+int
+main(void)
+{
+	pthread_t tid;
+
+	pthread_create(&tid, NULL, acceptor, NULL);
+
+	mevent_dispatch();
+
+	/* Not reached: mevent_dispatch() never returns. */
+	return (0);
+}
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 0000000..273c6a2
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,976 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "pci_emul.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXSLOTS 32
+
+static struct slotinfo {
+ char *si_name;
+ char *si_param;
+ struct pci_devinst *si_devi;
+ int si_titled;
+ int si_pslot;
+ char si_prefix;
+ char si_suffix;
+} pci_slotinfo[MAXSLOTS];
+
+/*
+ * NetApp specific:
+ * struct used to build an in-core OEM table to supply device names
+ * to driver instances
+ */
+static struct mptable_pci_devnames {
+#define MPT_HDR_BASE 0
+#define MPT_HDR_NAME 2
+ uint16_t md_hdrtype;
+ uint16_t md_entries;
+ uint16_t md_cksum;
+ uint16_t md_pad;
+#define MPT_NTAP_SIG \
+ ((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N'))
+ uint32_t md_sig;
+ uint32_t md_rsvd;
+ struct mptable_pci_slotinfo {
+ uint16_t mds_type;
+ uint16_t mds_phys_slot;
+ uint8_t mds_bus;
+ uint8_t mds_slot;
+ uint8_t mds_func;
+ uint8_t mds_pad;
+ uint16_t mds_vid;
+ uint16_t mds_did;
+ uint8_t mds_suffix[4];
+ uint8_t mds_prefix[4];
+ uint32_t mds_rsvd[3];
+ } md_slotinfo[MAXSLOTS];
+} pci_devnames;
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_MEMBASE32 (lomem_sz)
+#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static int pci_emul_devices;
+static int devname_elems;
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <slot>,<emul>[,<config>]
+ *
+ * slot is 0..31
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3,dummy
+ */
+/*
+ * Report a malformed slot option.  Note this consumes (frees) the
+ * caller's strdup'ed copy of the option string.
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+	printf("Invalid PCI slot info field \"%s\"\n", aopt);
+	free(aopt);
+}
+
+/*
+ * Parse a slot option of the form <slot>,<emul>[,<config>] and record
+ * it in pci_slotinfo[].  On success the strdup'ed copy is retained
+ * (si_name/si_param point into it); on a malformed option the copy is
+ * freed by pci_parse_slot_usage().
+ *
+ * Fix: removed the dead store 'snum = 255;' immediately overwritten
+ * by atoi().
+ */
+void
+pci_parse_slot(char *opt)
+{
+	char *slot, *emul, *config;
+	char *str, *cpy;
+	int snum;
+
+	str = cpy = strdup(opt);
+	config = NULL;
+
+	slot = strsep(&str, ",");
+	emul = strsep(&str, ",");
+	if (str != NULL) {
+		config = strsep(&str, ",");
+	}
+
+	if (emul == NULL) {
+		pci_parse_slot_usage(cpy);
+		return;
+	}
+
+	/*
+	 * NOTE(review): atoi() returns 0 for non-numeric input, silently
+	 * selecting slot 0 -- confirm whether stricter parsing is wanted.
+	 */
+	snum = atoi(slot);
+	if (snum < 0 || snum >= MAXSLOTS) {
+		pci_parse_slot_usage(cpy);
+	} else {
+		pci_slotinfo[snum].si_name = emul;
+		pci_slotinfo[snum].si_param = config;
+	}
+}
+
+
+/*
+ *
+ * PCI MPTable names are of the form:
+ *
+ * <slot>,[prefix]<digit><suffix>
+ *
+ * .. with <prefix> an alphabetic char, <digit> a 1 or 2-digit string,
+ * and <suffix> a single char.
+ *
+ * Examples:
+ * 1,e0c
+ * 4,e0P
+ * 6,43a
+ * 7,0f
+ * 10,1
+ * 12,e0M
+ * 2,12a
+ *
+ * Note that this is NetApp-specific, but is ignored on other o/s's.
+ */
+/*
+ * Report a malformed slot-name option.  Unlike pci_parse_slot_usage()
+ * this does not free its argument.
+ */
+static void
+pci_parse_name_usage(char *aopt)
+{
+	printf("Invalid PCI slot name field \"%s\"\n", aopt);
+}
+
+/*
+ * Parse a NetApp-specific MPTable name option of the form
+ * <slot>,[prefix]<digit>[<digit>]<suffix> and record the naming
+ * details in pci_slotinfo[].  Malformed options are reported via
+ * pci_parse_name_usage() and ignored.
+ *
+ * Fix: removed the dead store 'snum = 255;' immediately overwritten
+ * by atoi().
+ *
+ * NOTE(review): isnumber() is a BSD libc extension (equivalent to
+ * isdigit() for single-byte input) -- confirm portability needs.
+ */
+void
+pci_parse_name(char *opt)
+{
+	char csnum[4];
+	char *namestr;
+	char *slotend;
+	char prefix, suffix;
+	int i;
+	int pslot;
+	int snum;
+
+	pslot = -1;
+	prefix = suffix = 0;
+	slotend = strchr(opt, ',');
+
+	/*
+	 * A comma must be present, and can't be the first character
+	 * or no slot would be present. Also, the slot number can't be
+	 * more than 2 characters.
+	 */
+	if (slotend == NULL || slotend == opt || (slotend - opt > 2)) {
+		pci_parse_name_usage(opt);
+		return;
+	}
+
+	/* At most 2 digits are copied, so csnum[4] cannot overflow. */
+	for (i = 0; i < (slotend - opt); i++) {
+		csnum[i] = opt[i];
+	}
+	csnum[i] = '\0';
+
+	snum = atoi(csnum);
+	if (snum < 0 || snum >= MAXSLOTS) {
+		pci_parse_name_usage(opt);
+		return;
+	}
+
+	namestr = slotend + 1;
+
+	if (strlen(namestr) > 3) {
+		pci_parse_name_usage(opt);
+		return;
+	}
+
+	/* Optional single alphabetic prefix. */
+	if (isalpha(*namestr)) {
+		prefix = *namestr++;
+	}
+
+	if (!isdigit(*namestr)) {
+		pci_parse_name_usage(opt);
+	} else {
+		/* 1- or 2-digit physical slot number. */
+		pslot = *namestr++ - '0';
+		if (isnumber(*namestr)) {
+			pslot = 10*pslot + *namestr++ - '0';
+
+		}
+		/* Mandatory single alphabetic suffix, then end of string. */
+		if (isalpha(*namestr) && *(namestr + 1) == 0) {
+			suffix = *namestr;
+			pci_slotinfo[snum].si_titled = 1;
+			pci_slotinfo[snum].si_pslot = pslot;
+			pci_slotinfo[snum].si_prefix = prefix;
+			pci_slotinfo[snum].si_suffix = suffix;
+
+		} else {
+			pci_parse_name_usage(opt);
+		}
+	}
+}
+
+/*
+ * If a name was supplied for this slot (si_titled), populate the next
+ * free entry of the in-core OEM device-name table from the slot's
+ * device instance and naming fields.  One entry per slot at most, so
+ * devname_elems stays within MAXSLOTS.
+ */
+static void
+pci_add_mptable_name(struct slotinfo *si)
+{
+	struct mptable_pci_slotinfo *ms;
+
+	/*
+	 * If naming information has been supplied for this slot, populate
+	 * the next available mptable OEM entry
+	 */
+	if (si->si_titled) {
+		ms = &pci_devnames.md_slotinfo[devname_elems];
+
+		ms->mds_type = MPT_HDR_NAME;
+		ms->mds_phys_slot = si->si_pslot;
+		ms->mds_bus = si->si_devi->pi_bus;
+		ms->mds_slot = si->si_devi->pi_slot;
+		ms->mds_func = si->si_devi->pi_func;
+		ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR);
+		ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE);
+		ms->mds_suffix[0] = si->si_suffix;
+		ms->mds_prefix[0] = si->si_prefix;
+
+		devname_elems++;
+	}
+}
+
+/*
+ * Finalize the OEM device-name table header and hand the populated
+ * portion (header plus devname_elems entries) to fbsdrun for
+ * inclusion in the guest MPTable.  No-op if no names were supplied.
+ */
+static void
+pci_finish_mptable_names(void)
+{
+	int size;
+
+	if (devname_elems) {
+		pci_devnames.md_hdrtype = MPT_HDR_BASE;
+		pci_devnames.md_entries = devname_elems;
+		pci_devnames.md_cksum = 0; /* XXX */
+		pci_devnames.md_sig = MPT_NTAP_SIG;
+
+		/* Size of the header plus only the populated entries. */
+		size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] -
+		    (uintptr_t)&pci_devnames;
+
+		fbsdrun_add_oemtbl(&pci_devnames, size);
+	}
+}
+
+/*
+ * in/out handler installed for every port of an emulated device's I/O
+ * BARs.  Finds the BAR containing [port, port+bytes) and forwards the
+ * access to the device model's pe_ior/pe_iow callback.  Returns 0 on
+ * dispatch, -1 if the access does not fall within any I/O BAR.
+ */
+static int
+pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	struct pci_devinst *pdi = arg;
+	struct pci_devemu *pe = pdi->pi_d;
+	int offset, i;
+
+	for (i = 0; i <= PCI_BARMAX; i++) {
+		if (pdi->pi_bar[i].type == PCIBAR_IO &&
+		    port >= pdi->pi_bar[i].addr &&
+		    port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+			offset = port - pdi->pi_bar[i].addr;
+			if (in)
+				*eax = (*pe->pe_ior)(pdi, i, offset, bytes);
+			else
+				(*pe->pe_iow)(pdi, i, offset, bytes, *eax);
+			return (0);
+		}
+	}
+	return (-1);
+}
+
+/*
+ * Carve a naturally-aligned region of 'size' bytes out of the space
+ * tracked by *baseptr, without exceeding 'limit'.  On success the
+ * allocated address is stored in *addr, *baseptr is advanced past it,
+ * and 0 is returned; -1 is returned if the region would not fit.
+ */
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+	uint64_t *addr)
+{
+	uint64_t aligned;
+
+	assert((size & (size - 1)) == 0);	/* must be a power of 2 */
+
+	aligned = roundup2(*baseptr, size);
+
+	if (aligned + size > limit)
+		return (-1);
+
+	*addr = aligned;
+	*baseptr = aligned + size;
+	return (0);
+}
+
+/*
+ * Allocate guest address space for BAR 'idx' of device 'pdi', program
+ * the BAR register(s) in config space, and -- for I/O BARs -- register
+ * an in/out handler for every port in the range.  'size' is rounded up
+ * to a power of 2.  Returns 0 on success, -1 if no space is available.
+ *
+ * NOTE(review): a PCIBAR_MEM64 request of <= 32MB falls through to the
+ * 32-bit allocator with 32-bit lobits, yet 'type' remains PCIBAR_MEM64
+ * so the following BAR is still consumed as MEMHI64 -- confirm this is
+ * the intended encoding.
+ */
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+	enum pcibar_type type, uint64_t size)
+{
+	int i, error;
+	uint64_t *baseptr, limit, addr, mask, lobits, bar;
+	struct inout_port iop;
+
+	assert(idx >= 0 && idx <= PCI_BARMAX);
+
+	if ((size & (size - 1)) != 0)
+		size = 1UL << flsl(size);	/* round up to a power of 2 */
+
+	switch (type) {
+	case PCIBAR_NONE:
+		baseptr = NULL;
+		addr = mask = lobits = 0;
+		break;
+	case PCIBAR_IO:
+		baseptr = &pci_emul_iobase;
+		limit = PCI_EMUL_IOLIMIT;
+		mask = PCIM_BAR_IO_BASE;
+		lobits = PCIM_BAR_IO_SPACE;
+		break;
+	case PCIBAR_MEM64:
+		/*
+		 * XXX
+		 * Some drivers do not work well if the 64-bit BAR is allocated
+		 * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless then allocation size is larger than some arbitrary
+		 * number (32MB currently).
+		 */
+		if (size > 32 * 1024 * 1024) {
+			/*
+			 * XXX special case for device requiring peer-peer DMA
+			 */
+			if (size == 0x100000000UL)
+				baseptr = &hostbase;
+			else
+				baseptr = &pci_emul_membase64;
+			limit = PCI_EMUL_MEMLIMIT64;
+			mask = PCIM_BAR_MEM_BASE;
+			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+				 PCIM_BAR_MEM_PREFETCH;
+			break;
+		}
+		/* fallthrough */
+	case PCIBAR_MEM32:
+		baseptr = &pci_emul_membase32;
+		limit = PCI_EMUL_MEMLIMIT32;
+		mask = PCIM_BAR_MEM_BASE;
+		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+		break;
+	default:
+		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
+		assert(0);
+	}
+
+	if (baseptr != NULL) {
+		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+		if (error != 0)
+			return (error);
+	}
+
+	pdi->pi_bar[idx].type = type;
+	pdi->pi_bar[idx].addr = addr;
+	pdi->pi_bar[idx].size = size;
+
+	/* Initialize the BAR register in config space */
+	bar = (addr & mask) | lobits;
+	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+	/* A 64-bit BAR consumes the next BAR slot for the high dword. */
+	if (type == PCIBAR_MEM64) {
+		assert(idx + 1 <= PCI_BARMAX);
+		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+	}
+
+	/* add a handler to intercept accesses to the I/O bar */
+	if (type == PCIBAR_IO) {
+		iop.name = pdi->pi_name;
+		iop.flags = IOPORT_F_INOUT;
+		iop.handler = pci_emul_handler;
+		iop.arg = pdi;
+
+		/* One registration per port in the BAR's range. */
+		for (i = 0; i < size; i++) {
+			iop.port = addr + i;
+			register_inout(&iop);
+		}
+	}
+
+	return (0);
+}
+
+#define	CAP_START_OFFSET	0x40
+/*
+ * Append capability 'capdata' (of 'caplen' bytes) to the capability
+ * list in the config space of 'pi'.  The list is terminated by a
+ * dword of PCIY_RESERVED (0) which serves as the end marker.  The
+ * first capability added also sets PCIM_STATUS_CAPPRESENT and the
+ * capability pointer register.  Returns 0 on success, -1 if there is
+ * not enough room left in config space.
+ */
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+	int i, capoff, capid, reallen;
+	uint16_t sts;
+
+	static u_char endofcap[4] = {
+		PCIY_RESERVED, 0, 0, 0
+	};
+
+	assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
+
+	reallen = roundup2(caplen, 4);		/* dword aligned */
+
+	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+	if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+		/* First capability: start the list */
+		capoff = CAP_START_OFFSET;
+		pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+		pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+	} else {
+		/* Walk to the end-of-list marker */
+		capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+		while (1) {
+			assert((capoff & 0x3) == 0);
+			capid = pci_get_cfgdata8(pi, capoff);
+			if (capid == PCIY_RESERVED)
+				break;
+			capoff = pci_get_cfgdata8(pi, capoff + 1);
+		}
+	}
+
+	/* Check if we have enough space */
+	if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
+		return (-1);
+
+	/* Copy the capability */
+	for (i = 0; i < caplen; i++)
+		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+	/* Set the next capability pointer */
+	pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
+
+	/* Copy of the reserved capability which serves as the end marker */
+	for (i = 0; i < sizeof(endofcap); i++)
+		pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
+
+	return (0);
+}
+
+/*
+ * Look up a registered device emulation by name.  Returns NULL when
+ * no emulation with that name exists in the linker set.
+ */
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+	struct pci_devemu **iter;
+
+	SET_FOREACH(iter, pci_devemu_set) {
+		if (strcmp((*iter)->pe_emu, name) == 0)
+			return (*iter);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Create an instance of the emulated device 'pde' in PCI slot 'slot',
+ * passing 'params' to the emulation's init routine.  On success the
+ * instance is recorded in pci_slotinfo[]; on any failure the slot is
+ * simply left empty.
+ */
+static void
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params)
+{
+	struct pci_devinst *pdi;
+
+	/*
+	 * calloc() both zeroes the instance and lets us detect allocation
+	 * failure; the original unchecked malloc() would fault on OOM.
+	 */
+	pdi = calloc(1, sizeof(struct pci_devinst));
+	if (pdi == NULL)
+		return;
+
+	pdi->pi_vmctx = ctx;
+	pdi->pi_bus = 0;
+	pdi->pi_slot = slot;
+	pdi->pi_func = 0;
+	pdi->pi_d = pde;
+	snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+	/* Disable legacy interrupts */
+	pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+	pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+	pci_set_cfgdata8(pdi, PCIR_COMMAND,
+		    PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+	if ((*pde->pe_init)(ctx, pdi, params) != 0) {
+		free(pdi);
+	} else {
+		pci_emul_devices++;
+		pci_slotinfo[slot].si_devi = pdi;
+	}
+}
+
+/*
+ * Fill in an MSI capability structure advertising 'msgnum' messages
+ * (a power of 2 between 1 and 32) with 64-bit address support.
+ * 'nextptr' becomes the next-capability pointer field.
+ */
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+	int mmc;
+
+	CTASSERT(sizeof(struct msicap) == 14);
+
+	/* Number of msi messages must be a power of 2 between 1 and 32 */
+	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+	mmc = ffs(msgnum) - 1;		/* log2 of msgnum */
+
+	bzero(msicap, sizeof(struct msicap));
+	msicap->capid = PCIY_MSI;
+	msicap->nextptr = nextptr;
+	msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
+
+/*
+ * Build an MSI capability advertising 'msgnum' messages and append it
+ * to the capability list of 'pi'.  Returns 0 on success.
+ */
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+	struct msicap cap;
+
+	pci_populate_msicap(&cap, msgnum, 0);
+	return (pci_emul_add_capability(pi, (u_char *)&cap, sizeof(cap)));
+}
+
+/*
+ * Emulate a guest write within the MSI capability at 'capoff'.
+ * Read-only bits of the message control register are preserved, and
+ * the cached MSI state in pi->pi_msi (target cpu, vector, message
+ * count) is refreshed from the address/data registers whenever the
+ * message control register is written.  The (possibly modified) value
+ * is finally stored into config space.
+ */
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+		int bytes, uint32_t val)
+{
+	uint16_t msgctrl, rwmask, msgdata, mme;
+	uint32_t addrlo;
+
+	/*
+	 * If guest is writing to the message control register make sure
+	 * we do not overwrite read-only fields.
+	 */
+	if ((offset - capoff) == 2 && bytes == 2) {
+		rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+		msgctrl = pci_get_cfgdata16(pi, offset);
+		msgctrl &= ~rwmask;
+		msgctrl |= val & rwmask;
+		val = msgctrl;
+
+		/* message data follows a 64- or 32-bit address field */
+		addrlo = pci_get_cfgdata32(pi, capoff + 4);
+		if (msgctrl & PCIM_MSICTRL_64BIT)
+			msgdata = pci_get_cfgdata16(pi, capoff + 12);
+		else
+			msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+		/*
+		 * XXX check delivery mode, destination mode etc
+		 */
+		mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+		pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+		if (pi->pi_msi.enabled) {
+			/* APIC ID lives in bits 19:12 of the address */
+			pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
+			pi->pi_msi.vector = msgdata & 0xff;
+			pi->pi_msi.msgnum = 1 << (mme >> 4);
+		} else {
+			pi->pi_msi.cpu = 0;
+			pi->pi_msi.vector = 0;
+			pi->pi_msi.msgnum = 0;
+		}
+	}
+
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+/*
+ * This function assumes that 'coff' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly
+ */
+ if (offset == capoff || offset == capoff + 1)
+ return;
+
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Return 1 if config-space offset 'offset' lies within the capability
+ * list of device 'pi' (including the end-of-list marker), 0 otherwise.
+ */
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+	int found;
+	uint16_t sts;
+	uint8_t capid, lastoff;
+
+	found = 0;
+	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+	if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+		/* walk to the PCIY_RESERVED end-of-list marker */
+		lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+		while (1) {
+			assert((lastoff & 0x3) == 0);
+			capid = pci_get_cfgdata8(pi, lastoff);
+			if (capid == PCIY_RESERVED)
+				break;
+			lastoff = pci_get_cfgdata8(pi, lastoff + 1);
+		}
+		if (offset >= CAP_START_OFFSET && offset <= lastoff)
+			found = 1;
+	}
+	return (found);
+}
+
+/*
+ * Instantiate all emulated PCI devices that were requested on the
+ * command line (recorded in pci_slotinfo[] by the option parser) and
+ * register their MP table names.  Also resets the guest I/O and MMIO
+ * allocation cursors to their initial bases.
+ */
+void
+init_pci(struct vmctx *ctx)
+{
+	struct pci_devemu *pde;
+	struct slotinfo *si;
+	int i;
+
+	pci_emul_iobase = PCI_EMUL_IOBASE;
+	pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+	pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+	si = pci_slotinfo;
+
+	for (i = 0; i < MAXSLOTS; i++, si++) {
+		if (si->si_name != NULL) {
+			/* unknown emulation names are silently skipped */
+			pde = pci_emul_finddev(si->si_name);
+			if (pde != NULL) {
+				pci_emul_init(ctx, pde, i, si->si_param);
+				pci_add_mptable_name(si);
+			}
+		}
+	}
+	pci_finish_mptable_names();
+}
+
+/* Return non-zero when the guest has enabled MSI for this device. */
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+	return (pi->pi_msi.enabled);
+}
+
+/* Number of MSI messages the guest has enabled, or 0 when MSI is off. */
+int
+pci_msi_msgnum(struct pci_devinst *pi)
+{
+	return (pi->pi_msi.enabled ? pi->pi_msi.msgnum : 0);
+}
+
+/*
+ * Deliver MSI message 'msg' from device 'pi' to the guest by injecting
+ * an interrupt at vector base + msg on the configured cpu.  The
+ * interrupt is silently dropped when MSI is disabled or 'msg' is not
+ * below the enabled message count.
+ */
+void
+pci_generate_msi(struct pci_devinst *pi, int msg)
+{
+
+	if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
+		vm_lapic_irq(pi->pi_vmctx,
+			     pi->pi_msi.cpu,
+			     pi->pi_msi.vector + msg);
+	}
+}
+
+/* CONF1 selection latched by the last write to the address port */
+static int cfgbus, cfgslot, cfgfunc, cfgoff;
+
+/*
+ * Emulate a write to the CONF1 address port (0xcf8): latch the
+ * bus/slot/function/register selection used by subsequent accesses to
+ * the config data port.  Only 4-byte writes are accepted.
+ * NOTE(review): the CONF1 enable bit (bit 31) is not checked here —
+ * confirm whether that is intentional.
+ */
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		 uint32_t *eax, void *arg)
+{
+	uint32_t x;
+
+	assert(!in);
+
+	if (bytes != 4)
+		return (-1);
+
+	x = *eax;
+	cfgoff = x & PCI_REGMAX;
+	cfgfunc = (x >> 8) & PCI_FUNCMAX;
+	cfgslot = (x >> 11) & PCI_SLOTMAX;
+	cfgbus = (x >> 16) & PCI_BUSMAX;
+
+	return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
+
+/*
+ * Emulate accesses to the CONF1 data port (0xcfc-0xcff) for the device
+ * previously selected via the address port.  Reads of absent devices
+ * (or any function other than 0) return all-ones.  Device emulations
+ * may override the default config handling via pe_cfgread/pe_cfgwrite.
+ * BAR registers and capability registers get special write handling;
+ * everything else is stored directly into the shadow config space.
+ */
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		 uint32_t *eax, void *arg)
+{
+	struct pci_devinst *pi;
+	struct pci_devemu *pe;
+	int coff, idx;
+	uint64_t mask, bar;
+
+	assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+	/* sub-dword accesses are offset by the port within 0xcfc-0xcff */
+	pi = pci_slotinfo[cfgslot].si_devi;
+	coff = cfgoff + (port - CONF1_DATA_PORT);
+
+#if 0
+	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
+		in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
+#endif
+
+	if (pi == NULL || cfgfunc != 0) {
+		if (in)
+			*eax = 0xffffffff;
+		return (0);
+	}
+
+	pe = pi->pi_d;
+
+	/*
+	 * Config read
+	 */
+	if (in) {
+		/* Let the device emulation override the default handler */
+		if (pe->pe_cfgread != NULL &&
+		    (*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0)
+			return (0);
+
+		if (bytes == 1)
+			*eax = pci_get_cfgdata8(pi, coff);
+		else if (bytes == 2)
+			*eax = pci_get_cfgdata16(pi, coff);
+		else
+			*eax = pci_get_cfgdata32(pi, coff);
+	} else {
+		/* Let the device emulation override the default handler */
+		if (pe->pe_cfgwrite != NULL &&
+		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+			return (0);
+
+		/*
+		 * Special handling for write to BAR registers
+		 */
+		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+			/*
+			 * Ignore writes to BAR registers that are not
+			 * 4-byte aligned.
+			 */
+			if (bytes != 4 || (coff & 0x3) != 0)
+				return (0);
+			idx = (coff - PCIR_BAR(0)) / 4;
+			/*
+			 * Mask the written value by the BAR size so that
+			 * the guest's all-ones size probe reads back the
+			 * size, per the standard BAR sizing protocol.
+			 */
+			switch (pi->pi_bar[idx].type) {
+			case PCIBAR_NONE:
+				bar = 0;
+				break;
+			case PCIBAR_IO:
+				mask = ~(pi->pi_bar[idx].size - 1);
+				mask &= PCIM_BAR_IO_BASE;
+				bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
+				break;
+			case PCIBAR_MEM32:
+				mask = ~(pi->pi_bar[idx].size - 1);
+				mask &= PCIM_BAR_MEM_BASE;
+				bar = *eax & mask;
+				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+				break;
+			case PCIBAR_MEM64:
+				mask = ~(pi->pi_bar[idx].size - 1);
+				mask &= PCIM_BAR_MEM_BASE;
+				bar = *eax & mask;
+				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+				       PCIM_BAR_MEM_PREFETCH;
+				break;
+			case PCIBAR_MEMHI64:
+				/* high dword: size mask comes from the low BAR */
+				mask = ~(pi->pi_bar[idx - 1].size - 1);
+				mask &= PCIM_BAR_MEM_BASE;
+				bar = ((uint64_t)*eax << 32) & mask;
+				bar = bar >> 32;
+				break;
+			default:
+				assert(0);
+			}
+			pci_set_cfgdata32(pi, coff, bar);
+		} else if (pci_emul_iscap(pi, coff)) {
+			pci_emul_capwrite(pi, coff, bytes, *eax);
+		} else {
+			CFGWRITE(pi, coff, *eax, bytes);
+		}
+	}
+
+	return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+/*
+ * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
+ */
+static int
+pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		     uint32_t *eax, void *arg)
+{
+	/* registered with IOPORT_F_OUT, so reads should never arrive */
+	assert(in == 0);
+	return (0);
+}
+INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
+INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DREGSZ 20
+struct pci_emul_dsoftc {
+ uint8_t regs[DREGSZ];
+};
+
+#define PCI_EMUL_MSGS 4
+
+/*
+ * Initialize the dummy test device: a small register file behind an
+ * I/O BAR plus an MSI capability.  Returns 0 on success, non-zero on
+ * allocation failure (the caller then discards the instance).
+ */
+int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int error;
+	struct pci_emul_dsoftc *sc;
+
+	/* check the allocation; the original unchecked malloc() would
+	 * fault in the memset below on OOM */
+	sc = calloc(1, sizeof(struct pci_emul_dsoftc));
+	if (sc == NULL)
+		return (-1);
+
+	pi->pi_arg = sc;
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+	error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ);
+	assert(error == 0);
+
+	error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS);
+	assert(error == 0);
+
+	return (0);
+}
+
+/*
+ * I/O write handler for the dummy test device: store the value into
+ * the register file and optionally fire MSI interrupts.
+ */
+void
+pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size,
+	      uint32_t value)
+{
+	int i;
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+	if (offset + size > DREGSZ) {
+		printf("diow: too large, offset %d size %d\n", offset, size);
+		return;
+	}
+
+	/* NOTE(review): 16/32-bit stores assume 'offset' is suitably
+	 * aligned within regs[] — confirm callers guarantee this */
+	if (size == 1) {
+		sc->regs[offset] = value & 0xff;
+	} else if (size == 2) {
+		*(uint16_t *)&sc->regs[offset] = value & 0xffff;
+	} else {
+		*(uint32_t *)&sc->regs[offset] = value;
+	}
+
+	/*
+	 * Special magic value to generate an interrupt
+	 */
+	if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+		pci_generate_msi(pi, value % pci_msi_msgnum(pi));
+
+	/* another magic value: fire every enabled message */
+	if (value == 0xabcdef) {
+		for (i = 0; i < pci_msi_msgnum(pi); i++)
+			pci_generate_msi(pi, i);
+	}
+}
+
+/*
+ * I/O read handler for the dummy test device: fetch 'size' bytes from
+ * the register file at 'offset'.  Out-of-range reads return 0.
+ */
+uint32_t
+pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+	uint32_t value;
+
+	if (offset + size > DREGSZ) {
+		printf("dior: too large, offset %d size %d\n", offset, size);
+		return (0);
+	}
+
+	/* NOTE(review): 16/32-bit loads assume aligned 'offset' — confirm */
+	if (size == 1) {
+		value = sc->regs[offset];
+	} else if (size == 2) {
+		value = *(uint16_t *) &sc->regs[offset];
+	} else {
+		value = *(uint32_t *) &sc->regs[offset];
+	}
+
+	return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_iow = pci_emul_diow,
+ .pe_ior = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
new file mode 100644
index 0000000..f5f8e22
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+#define PCIY_RESERVED 0x00
+
+struct vmctx;
+struct pci_devinst;
+
+/*
+ * A device emulation registers one of these (via PCI_EMUL_SET) to be
+ * discoverable by name at init time.  Optional callbacks may be NULL.
+ */
+struct pci_devemu {
+	char      *pe_emu;		/* Name of device emulation */
+
+	/* instance creation */
+	int       (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts);
+
+	/* config space read/write callbacks */
+	int	(*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+			       struct pci_devinst *pi, int offset,
+			       int bytes, uint32_t val);
+	int	(*pe_cfgread)(struct vmctx *ctx, int vcpu,
+			      struct pci_devinst *pi, int offset,
+			      int bytes, uint32_t *retval);
+
+	/* I/O space read/write callbacks */
+	void      (*pe_iow)(struct pci_devinst *pi, int baridx,
+			    int offset, int size, uint32_t value);
+	uint32_t  (*pe_ior)(struct pci_devinst *pi, int baridx,
+			    int offset, int size);
+};
+#define PCI_EMUL_SET(x)   DATA_SET(pci_devemu_set, x);
+
+enum pcibar_type {
+	PCIBAR_NONE,
+	PCIBAR_IO,
+	PCIBAR_MEM32,
+	PCIBAR_MEM64,
+	PCIBAR_MEMHI64		/* upper 32 bits of a 64-bit BAR */
+};
+
+struct pcibar {
+	enum pcibar_type	type;		/* io or memory */
+	uint64_t		size;
+	uint64_t		addr;
+};
+
+#define PI_NAMESZ	40
+
+/* Per-instance state of an emulated PCI device. */
+struct pci_devinst {
+	struct pci_devemu *pi_d;
+	struct vmctx *pi_vmctx;
+	uint8_t	  pi_bus, pi_slot, pi_func;
+	char	  pi_name[PI_NAMESZ];
+	uint16_t  pi_iobase;
+	int	  pi_bar_getsize;
+
+	/* Cached MSI state, refreshed by msicap_cfgwrite() */
+	struct {
+		int	enabled;
+		int	cpu;
+		int	vector;
+		int	msgnum;
+	} pi_msi;
+
+	void      *pi_arg;		/* devemu-private data */
+
+	u_char	  pi_cfgdata[PCI_REGMAX + 1];
+	struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+/* MSI capability layout (64-bit address variant), per the PCI spec */
+struct msicap {
+	uint8_t		capid;
+	uint8_t		nextptr;
+	uint16_t	msgctrl;
+	uint32_t	addrlo;
+	uint32_t	addrhi;
+	uint16_t	msgdata;
+} __packed;
+
+void init_pci(struct vmctx *ctx);
+void pci_parse_slot(char *opt);
+void pci_parse_name(char *opt);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+
+/*
+ * Shadow config-space accessors.  Offsets are asserted to be within
+ * bounds and naturally aligned for the access width.
+ */
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+	assert(offset <= PCI_REGMAX);
+	*(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+	*(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+	*(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+	assert(offset <= PCI_REGMAX);
+	return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+	return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+	return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000..c77762d
--- /dev/null
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+/*
+ * Minimal host bridge emulation: only sets identifying config-space
+ * fields; no BARs, capabilities or I/O handlers.
+ */
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+	/* config space */
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275);	/* NetApp */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275);	/* NetApp */
+	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+	return (0);
+}
+
+struct pci_devemu pci_de_hostbridge = {
+	.pe_emu = "hostbridge",
+	.pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
new file mode 100644
index 0000000..1c417fd
--- /dev/null
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -0,0 +1,508 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <dev/io/iodev.h>
+#include <machine/iodev.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include "pci_emul.h"
+
+#ifndef _PATH_DEVPCI
+#define _PATH_DEVPCI "/dev/pci"
+#endif
+
+#ifndef _PATH_DEVIO
+#define _PATH_DEVIO "/dev/io"
+#endif
+
+#define LEGACY_SUPPORT 1
+
+static int pcifd = -1;
+static int iofd = -1;
+
+/* Per-device state for a passed-through PCI device. */
+struct passthru_softc {
+	struct pci_devinst *psc_pi;
+	struct pcibar psc_bar[PCI_BARMAX + 1];	/* host-side BAR info */
+	struct {
+		int		capoff;		/* MSI capability offset, 0 if none */
+		int		msgctrl;	/* cached message control register */
+		int		emulated;	/* 1 if we synthesized the capability */
+	} psc_msi;
+	struct pcisel psc_sel;			/* host bus/slot/func selector */
+};
+
+/*
+ * Length in bytes of an MSI capability with the given message control
+ * register value: 10 bytes minimum, plus 4 for a 64-bit address field.
+ */
+static int
+msi_caplen(int msgctrl)
+{
+	int len;
+
+	len = 10;		/* minimum length of msi capability */
+
+	if (msgctrl & PCIM_MSICTRL_64BIT)
+		len += 4;
+
+#if 0
+	/*
+	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
+	 * We'll let the guest manipulate them directly.
+	 */
+	if (msgctrl & PCIM_MSICTRL_VECTOR)
+		len += 10;
+#endif
+
+	return (len);
+}
+
+/*
+ * Read 'width' bytes from host config register 'reg' of the device
+ * selected by 'sel', via the /dev/pci ioctl interface.  Returns 0 on
+ * ioctl failure, which is indistinguishable from a register reading 0.
+ */
+static uint32_t
+read_config(const struct pcisel *sel, long reg, int width)
+{
+	struct pci_io pi;
+
+	bzero(&pi, sizeof(pi));
+	pi.pi_sel = *sel;
+	pi.pi_reg = reg;
+	pi.pi_width = width;
+
+	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
+		return (0);				/* XXX */
+	else
+		return (pi.pi_data);
+}
+
+/*
+ * Write 'width' bytes of 'data' to host config register 'reg' of the
+ * device selected by 'sel'.  Errors from the ioctl are ignored.
+ */
+static void
+write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
+{
+	struct pci_io pi;
+
+	bzero(&pi, sizeof(pi));
+	pi.pi_sel = *sel;
+	pi.pi_reg = reg;
+	pi.pi_width = width;
+	pi.pi_data = data;
+
+	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
+}
+
+#ifdef LEGACY_SUPPORT
+/*
+ * Synthesize an MSI capability for a device that lacks one and place
+ * it in the last 16 bytes of the emulated config space, linking it to
+ * 'nextptr'.  Returns the capability's config-space offset.
+ */
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+	int capoff, i;
+	struct msicap msicap;
+	u_char *capdata;
+
+	pci_populate_msicap(&msicap, msgnum, nextptr);
+
+	/*
+	 * XXX
+	 * Copy the msi capability structure in the last 16 bytes of the
+	 * config space. This is wrong because it could shadow something
+	 * useful to the device.
+	 */
+	capoff = 256 - roundup(sizeof(msicap), 4);
+	capdata = (u_char *)&msicap;
+	for (i = 0; i < sizeof(msicap); i++)
+		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+	return (capoff);
+}
+#endif /* LEGACY_SUPPORT */
+
+/*
+ * Locate the device's MSI capability on the host, cache its offset and
+ * message control value, and mirror the capability into the emulated
+ * config space.  If the device has a capability list but no MSI
+ * capability, synthesize one (LEGACY_SUPPORT).  Returns -1 when no MSI
+ * capability could be found or created ("MSI or bust").
+ */
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+	int ptr, cap, sts, caplen;
+	uint32_t u32;
+	struct pcisel sel;
+	struct pci_devinst *pi;
+
+	pi = sc->psc_pi;
+	sel = sc->psc_sel;
+
+	/*
+	 * Parse the capabilities and cache the location of the MSI
+	 * capability.
+	 */
+	sts = read_config(&sel, PCIR_STATUS, 2);
+	if (sts & PCIM_STATUS_CAPPRESENT) {
+		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
+		while (ptr != 0 && ptr != 0xff) {
+			cap = read_config(&sel, ptr + PCICAP_ID, 1);
+			if (cap == PCIY_MSI) {
+				/*
+				 * Copy the MSI capability into the config
+				 * space of the emulated pci device
+				 */
+				sc->psc_msi.capoff = ptr;
+				sc->psc_msi.msgctrl = read_config(&sel,
+								  ptr + 2, 2);
+				sc->psc_msi.emulated = 0;
+				caplen = msi_caplen(sc->psc_msi.msgctrl);
+				while (caplen > 0) {
+					u32 = read_config(&sel, ptr, 4);
+					pci_set_cfgdata32(pi, ptr, u32);
+					caplen -= 4;
+					ptr += 4;
+				}
+				break;
+			}
+			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
+		}
+	}
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * If the passthrough device does not support MSI then craft a
+	 * MSI capability for it. We link the new MSI capability at the
+	 * head of the list of capabilities.
+	 */
+	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+		int origptr, msiptr;
+		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
+		msiptr = passthru_add_msicap(pi, 1, origptr);
+		sc->psc_msi.capoff = msiptr;
+		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+		sc->psc_msi.emulated = 1;
+		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+	}
+#endif
+
+	if (sc->psc_msi.capoff == 0)	/* MSI or bust */
+		return (-1);
+	else
+		return (0);
+}
+
+/*
+ * Enumerate the host device's BARs, cache their type/size/address in
+ * sc->psc_bar[], allocate matching guest BARs, and map MMIO BARs into
+ * the guest physical address space.  Returns 0 on success, -1 on any
+ * allocation or mapping failure.
+ */
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+	int i, error;
+	struct pci_devinst *pi;
+	struct pci_bar_io bar;
+	enum pcibar_type bartype;
+	uint64_t base;
+
+	pi = sc->psc_pi;
+
+	/*
+	 * Initialize BAR registers
+	 */
+	for (i = 0; i <= PCI_BARMAX; i++) {
+		bzero(&bar, sizeof(bar));
+		bar.pbi_sel = sc->psc_sel;
+		bar.pbi_reg = PCIR_BAR(i);
+
+		/* unimplemented BARs simply fail the ioctl; skip them */
+		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
+			continue;
+
+		if (PCI_BAR_IO(bar.pbi_base)) {
+			bartype = PCIBAR_IO;
+			base = bar.pbi_base & PCIM_BAR_IO_BASE;
+		} else {
+			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
+			case PCIM_BAR_MEM_64:
+				bartype = PCIBAR_MEM64;
+				break;
+			default:
+				bartype = PCIBAR_MEM32;
+				break;
+			}
+			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
+		}
+
+		/* Cache information about the "real" BAR */
+		sc->psc_bar[i].type = bartype;
+		sc->psc_bar[i].size = bar.pbi_length;
+		sc->psc_bar[i].addr = base;
+
+		/* Allocate the BAR in the guest I/O or MMIO space */
+		error = pci_emul_alloc_bar(pi, i, base, bartype,
+					   bar.pbi_length);
+		if (error)
+			return (-1);
+
+		/*
+		 * Map the physical MMIO space in the guest MMIO space
+		 */
+		if (bartype != PCIBAR_IO) {
+			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+			if (error)
+				return (-1);
+		}
+
+		/*
+		 * 64-bit BAR takes up two slots so skip the next one.
+		 */
+		if (bartype == PCIBAR_MEM64) {
+			i++;
+			assert(i <= PCI_BARMAX);
+			/* size/addr of the high half stay zero (unused) */
+			sc->psc_bar[i].type = PCIBAR_MEMHI64;
+		}
+	}
+	return (0);
+}
+
+/*
+ * Record the host bus/slot/func selector for the device and initialize
+ * the emulated config space (BARs, then MSI).  Returns 0 on success,
+ * non-zero on failure.
+ */
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+	int error;
+	struct passthru_softc *sc;
+
+	error = 1;
+	sc = pi->pi_arg;
+
+	bzero(&sc->psc_sel, sizeof(struct pcisel));
+	sc->psc_sel.pc_bus = bus;
+	sc->psc_sel.pc_dev = slot;
+	sc->psc_sel.pc_func = func;
+
+	if (cfginitbar(ctx, sc) != 0)
+		goto done;
+
+	if (cfginitmsi(sc) != 0)
+		goto done;
+
+	error = 0;				/* success */
+done:
+	return (error);
+}
+
+/*
+ * Initialize a passthru device instance.  'opts' selects the host
+ * device as "bus/slot/func".  On success the device is assigned to the
+ * guest and its config space initialized; on failure all partially
+ * acquired resources are released.  Returns 0 on success.
+ */
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int bus, slot, func, error, assigned;
+	struct passthru_softc *sc;
+
+	sc = NULL;
+	error = 1;
+	assigned = 0;
+
+	/*
+	 * Parse the selector first: the original code jumped to 'done'
+	 * before bus/slot/func were initialized and then passed those
+	 * garbage values to vm_unassign_pptdev().
+	 */
+	if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+		return (error);
+
+	if (pcifd < 0) {
+		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+		if (pcifd < 0)
+			goto done;
+	}
+
+	if (iofd < 0) {
+		iofd = open(_PATH_DEVIO, O_RDWR, 0);
+		if (iofd < 0)
+			goto done;
+	}
+
+	if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+		goto done;
+	assigned = 1;
+
+	/* calloc: zeroed softc and a checked allocation */
+	sc = calloc(1, sizeof(struct passthru_softc));
+	if (sc == NULL)
+		goto done;
+
+	pi->pi_arg = sc;
+	sc->psc_pi = pi;
+
+	/* initialize config space */
+	if (cfginit(ctx, pi, bus, slot, func) != 0)
+		goto done;
+
+	error = 0;		/* success */
+done:
+	if (error) {
+		free(sc);
+		/* only undo the assignment if it actually succeeded */
+		if (assigned)
+			vm_unassign_pptdev(ctx, bus, slot, func);
+	}
+	return (error);
+}
+
+/* True when config offset 'coff' falls within the Type 0 BAR registers. */
+static int
+bar_access(int coff)
+{
+	return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1));
+}
+
+/*
+ * True when config offset 'coff' falls within the device's MSI
+ * capability; always false when no MSI capability is present.
+ */
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+	int start, len;
+
+	start = sc->psc_msi.capoff;
+	if (start == 0)
+		return (0);
+
+	len = msi_caplen(sc->psc_msi.msgctrl);
+	return (coff >= start && coff < start + len);
+}
+
+/*
+ * Config-read override for passthru devices.  Returns -1 to fall back
+ * to the emulated (shadow) config space for BARs, the MSI capability,
+ * and the emulated capability pointer; all other registers are read
+ * from the real device.
+ */
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
+		 int bytes, uint32_t *rv)
+{
+	struct passthru_softc *sc;
+
+	sc = pi->pi_arg;
+
+	/*
+	 * PCI BARs and MSI capability is emulated.
+	 */
+	if (bar_access(coff) || msicap_access(sc, coff))
+		return (-1);
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
+	 * natively.
+	 */
+	if (sc->psc_msi.emulated) {
+		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+			return (-1);
+	}
+#endif
+
+	/* Everything else just read from the device's config space */
+	*rv = read_config(&sc->psc_sel, coff, bytes);
+
+	return (0);
+}
+
+/*
+ * Config-write override for passthru devices.  BAR writes fall back to
+ * the emulation (-1); MSI capability writes update the emulated
+ * capability and program the hypervisor's MSI routing; everything else
+ * is written through to the real device.
+ */
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
+		  int bytes, uint32_t val)
+{
+	int error;
+	struct passthru_softc *sc;
+
+	sc = pi->pi_arg;
+
+	/*
+	 * PCI BARs are emulated
+	 */
+	if (bar_access(coff))
+		return (-1);
+
+	/*
+	 * MSI capability is emulated
+	 */
+	if (msicap_access(sc, coff)) {
+		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+		error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
+			sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
+			pi->pi_msi.vector, pi->pi_msi.msgnum);
+		if (error != 0) {
+			/* NOTE(review): prints errno although the message
+			 * says "returned error" — confirm intent */
+			printf("vm_setup_msi returned error %d\r\n", errno);
+			exit(1);
+		}
+		return (0);
+	}
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * If this device does not support MSI natively then we cannot let
+	 * the guest disable legacy interrupts from the device. It is the
+	 * legacy interrupt that is triggering the virtual MSI to the guest.
+	 */
+	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+		if (coff == PCIR_COMMAND && bytes == 2)
+			val &= ~PCIM_CMD_INTxDIS;
+	}
+#endif
+
+	write_config(&sc->psc_sel, coff, bytes, val);
+
+	return (0);
+}
+
+/*
+ * Forward a guest I/O write to the corresponding host port of the
+ * device's real BAR, via the /dev/io interface.
+ */
+static void
+passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size,
+	     uint32_t value)
+{
+	struct passthru_softc *sc;
+	struct iodev_pio_req pio;
+
+	sc = pi->pi_arg;
+
+	bzero(&pio, sizeof(struct iodev_pio_req));
+	pio.access = IODEV_PIO_WRITE;
+	pio.port = sc->psc_bar[baridx].addr + offset;
+	pio.width = size;
+	pio.val = value;
+
+	/* errors are ignored: there is no way to report them to the guest */
+	(void)ioctl(iofd, IODEV_PIO, &pio);
+}
+
+/*
+ * Forward a guest I/O read to the corresponding host port of the
+ * device's real BAR.  On ioctl failure the (zero-initialized) value
+ * is returned.
+ */
+static uint32_t
+passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+	struct passthru_softc *sc;
+	struct iodev_pio_req pio;
+
+	sc = pi->pi_arg;
+
+	bzero(&pio, sizeof(struct iodev_pio_req));
+	pio.access = IODEV_PIO_READ;
+	pio.port = sc->psc_bar[baridx].addr + offset;
+	pio.width = size;
+	pio.val = 0;
+
+	(void)ioctl(iofd, IODEV_PIO, &pio);
+
+	return (pio.val);
+}
+
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_iow = passthru_iow,
+ .pe_ior = passthru_ior,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000..b86e21d
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -0,0 +1,502 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "fbsdrun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ	64
+
+/* Size of the device-specific config area (struct vtblk_config) */
+#define VTBLK_CFGSZ	28
+
+/*
+ * Register layout: parenthesize the expansions so the macros are
+ * safe inside any expression at the use site.
+ */
+#define VTBLK_R_CFG		VTCFG_R_CFG0
+#define VTBLK_R_CFG_END		(VTBLK_R_CFG + VTBLK_CFGSZ - 1)
+#define VTBLK_R_MAX		VTBLK_R_CFG_END
+
+#define VTBLK_REGSZ		(VTBLK_R_MAX + 1)
+
+#define VTBLK_MAXSEGS	32
+
+/* Request status values written to the guest's status byte */
+#define VTBLK_S_OK	0
+#define VTBLK_S_IOERR	1
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS      \
+  ( 0x00000004 |	/* host maximum request segments */ \
+    0x10000000 )	/* supports indirect descriptors */
+
+/*
+ * Host-side shadow of a single guest virtqueue.  The pointers are
+ * host-virtual addresses into the guest memory region holding the
+ * descriptor table and the avail/used rings.
+ */
+struct vring_hqueue {
+	/* Internal state */
+	uint16_t	hq_size;
+	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */
+
+	/* Host-context pointers to the queue */
+	struct virtio_desc *hq_dtable;
+	uint16_t	*hq_avail_flags;
+	uint16_t	*hq_avail_idx;		/* monotonically increasing */
+	uint16_t	*hq_avail_ring;
+
+	uint16_t	*hq_used_flags;
+	uint16_t	*hq_used_idx;		/* monotonically increasing */
+	struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Config space
+ *
+ * Layout is dictated by the virtio-block spec; the CTASSERT below
+ * pins the packed size to VTBLK_CFGSZ so register decoding stays
+ * in sync with the structure.
+ */
+struct vtblk_config {
+	uint64_t	vbc_capacity;		/* in 512-byte sectors */
+	uint32_t	vbc_size_max;
+	uint32_t	vbc_seg_max;
+	uint16_t	vbc_geom_c;
+	uint8_t		vbc_geom_h;
+	uint8_t		vbc_geom_s;
+	uint32_t	vbc_blk_size;
+	uint32_t	vbc_sectors_max;
+} __packed;
+CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
+
+/*
+ * Fixed-size block header
+ *
+ * First (read-only) segment of every guest block request.
+ */
+struct virtio_blk_hdr {
+#define	VBH_OP_READ	0
+#define	VBH_OP_WRITE	1
+	uint32_t       	vbh_type;
+	uint32_t	vbh_ioprio;
+	uint64_t	vbh_sector;		/* in 512-byte units */
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+	struct pci_devinst *vbsc_pi;	/* back-pointer to PCI instance */
+	int		vbsc_fd;	/* backing file descriptor */
+	int		vbsc_status;	/* virtio device status register */
+	int		vbsc_isr;	/* interrupt status; cleared on read */
+	int		vbsc_lastq;	/* last queue selected via QSEL */
+	uint32_t	vbsc_features;	/* guest-negotiated feature bits */
+	uint64_t	vbsc_pfn;	/* guest-physical ring base address */
+	struct vring_hqueue vbsc_q;	/* the single request queue */
+	struct vtblk_config vbsc_cfg;	/* device config space contents */
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+	uint16_t avail, cur;
+	int ndesc;
+
+	avail = *hq->hq_avail_idx;
+	cur = hq->hq_cur_aidx;
+
+	/*
+	 * Unsigned 16-bit subtraction yields the ring distance
+	 * directly, wrapping exactly as the explicit comparison did.
+	 */
+	ndesc = (uint16_t)(avail - cur);
+
+	assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+	return (ndesc);
+}
+
+/*
+ * Record a guest write to the virtio status register.  A write of
+ * zero is the guest's request for a device reset.
+ */
+static void
+pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+{
+	if (value == 0)
+		DPRINTF(("vtblk: device reset requested !\n"));
+
+	sc->vbsc_status = value;
+}
+
+/*
+ * Process one block request from the avail ring and return its
+ * descriptor on the used ring.
+ *
+ * The guest must post each request as a single indirect descriptor
+ * whose table contains, in order: the read-only fixed header, one or
+ * more data segments, and a writable status byte.  I/O is performed
+ * synchronously against the backing file with preadv/pwritev.
+ */
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+{
+	struct iovec iov[VTBLK_MAXSEGS];
+	struct virtio_blk_hdr *vbh;
+	struct virtio_desc *vd, *vid;
+	struct virtio_used *vu;
+	uint8_t *status;
+	int i;
+	int err;
+	int iolen;
+	int nsegs;
+	int uidx, aidx, didx;
+	int writeop;
+	off_t offset;
+
+	uidx = *hq->hq_used_idx;
+	aidx = hq->hq_cur_aidx;
+	didx = hq->hq_avail_ring[aidx % hq->hq_size];
+	assert(didx >= 0 && didx < hq->hq_size);
+
+	vd = &hq->hq_dtable[didx];
+
+	/*
+	 * Verify that the descriptor is indirect, and obtain
+	 * the pointer to the indirect descriptor.
+	 * There has to be space for at least 3 descriptors
+	 * in the indirect descriptor array: the block header,
+	 * 1 or more data descriptors, and a status byte.
+	 */
+	assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
+
+	nsegs = vd->vd_len / sizeof(struct virtio_desc);
+	assert(nsegs >= 3);
+	assert(nsegs < VTBLK_MAXSEGS + 2);
+
+	vid = paddr_guest2host(vd->vd_addr);
+	assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+	/*
+	 * The first descriptor will be the read-only fixed header
+	 */
+	vbh = paddr_guest2host(vid[0].vd_addr);
+	assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
+	assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
+	assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
+
+	writeop = (vbh->vbh_type == VBH_OP_WRITE);
+
+	/* vbh_sector is in 512-byte units (DEV_BSIZE) */
+	offset = vbh->vbh_sector * DEV_BSIZE;
+
+	/*
+	 * Build up the iovec based on the guest's data descriptors
+	 */
+	for (i = 1, iolen = 0; i < nsegs - 1; i++) {
+		iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
+		iov[i-1].iov_len = vid[i].vd_len;
+		iolen += vid[i].vd_len;
+
+		assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
+		assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+		/*
+		 * - write op implies read-only descriptor,
+		 * - read op implies write-only descriptor,
+		 * therefore test the inverse of the descriptor bit
+		 * to the op.
+		 */
+		assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
+		    writeop);
+	}
+
+	/* Lastly, get the address of the status byte */
+	status = paddr_guest2host(vid[nsegs - 1].vd_addr);
+	assert(vid[nsegs - 1].vd_len == 1);
+	assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
+	assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
+
+	DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", 
+		 writeop ? "write" : "read", iolen, nsegs - 2, offset));
+
+	if (writeop){
+		err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
+	} else {
+		err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
+	}
+
+	/* Report success/failure to the guest via the status byte */
+	*status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
+
+	/*
+	 * Return the single indirect descriptor back to the host
+	 */
+	vu = &hq->hq_used_ring[uidx % hq->hq_size];
+	vu->vu_idx = didx;
+	vu->vu_tlen = 1;
+	hq->hq_cur_aidx++;
+	*hq->hq_used_idx += 1;
+}
+
+/*
+ * Queue-notify handler: drain every request currently posted in the
+ * avail ring, then raise an interrupt unless the guest suppressed
+ * notifications or one is already pending.
+ */
+static void
+pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+{
+	struct vring_hqueue *hq = &sc->vbsc_q;
+	int ndescs;
+
+	ndescs = hq_num_avail(hq);
+	if (ndescs == 0)
+		return;
+
+	while (ndescs-- > 0)
+		pci_vtblk_proc(sc, hq);
+
+	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
+	    sc->vbsc_isr == 0) {
+		sc->vbsc_isr = 1;
+		pci_generate_msi(sc->vbsc_pi, 0);
+	}
+}
+
+/*
+ * Called when the guest writes the ring's page frame number to the
+ * PFN register.  Computes host pointers to the descriptor table and
+ * the avail/used rings from the standard legacy-virtio vring layout:
+ * descriptors, then avail ring, then (page-aligned) used ring.
+ */
+static void
+pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
+{
+	struct vring_hqueue *hq;
+
+	sc->vbsc_pfn = pfn << VRING_PFN;
+	
+	/*
+	 * Set up host pointers to the various parts of the
+	 * queue
+	 */
+	hq = &sc->vbsc_q;
+	hq->hq_size = VTBLK_RINGSZ;
+
+	hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+	hq->hq_avail_flags =  (uint16_t *)(hq->hq_dtable + hq->hq_size);
+	hq->hq_avail_idx = hq->hq_avail_flags + 1;
+	hq->hq_avail_ring = hq->hq_avail_flags + 2;
+	/* used ring starts at the next VRING_ALIGN boundary */
+	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+						 VRING_ALIGN);
+	hq->hq_used_idx = hq->hq_used_flags + 1;
+	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+	/*
+	 * Initialize queue indexes
+	 */
+	hq->hq_cur_aidx = 0;
+}
+
+/*
+ * Device instance initialization: open the backing file named by
+ * 'opts', size the virtual disk from it, and set up PCI config
+ * space, the i/o BAR and the MSI capability.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct stat sbuf;
+	struct pci_vtblk_softc *sc;
+	int fd;
+
+	if (opts == NULL) {
+		printf("virtio-block: backing device required\n");
+		return (1);
+	}
+
+	/*
+	 * Access to guest memory is required. Fail if
+	 * memory not mapped
+	 */
+	if (paddr_guest2host(0) == NULL)
+		return (1);
+
+	/*
+	 * The supplied backing file has to exist
+	 */
+	fd = open(opts, O_RDWR);
+	if (fd < 0) {
+		perror("Could not open backing file");
+		return (1);
+	}
+
+	if (fstat(fd, &sbuf) < 0) {
+		perror("Could not stat backing file");
+		close(fd);
+		return (1);
+	}
+
+	/*
+	 * calloc zero-fills and, unlike the previous unchecked
+	 * malloc+memset, the result is tested before use.
+	 */
+	sc = calloc(1, sizeof(struct pci_vtblk_softc));
+	if (sc == NULL) {
+		perror("Could not allocate softc");
+		close(fd);
+		return (1);
+	}
+
+	pi->pi_arg = sc;
+	sc->vbsc_pi = pi;
+	sc->vbsc_fd = fd;
+
+	/* setup virtio block config space */
+	sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE;
+	sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+	sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE;
+	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
+	sc->vbsc_cfg.vbc_geom_c = 0;	/* no geometry */
+	sc->vbsc_cfg.vbc_geom_h = 0;
+	sc->vbsc_cfg.vbc_geom_s = 0;
+	sc->vbsc_cfg.vbc_sectors_max = 0;
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+	pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ);
+	pci_emul_add_msicap(pi, 1);
+
+	return (0);
+}
+
+/*
+ * Guest write to the device's i/o BAR.  Dispatches on the virtio
+ * register offset; out-of-range accesses are logged and dropped.
+ */
+static void
+pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size,
+		uint32_t value)
+{
+	struct pci_vtblk_softc *sc = pi->pi_arg;
+
+	if (offset + size > VTBLK_REGSZ) {
+		DPRINTF(("vtblk_write: 2big, offset %d size %d\n",
+			 offset, size));
+		return;
+	}
+
+	switch (offset) {
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		/* only allow feature bits the host advertised */
+		sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		pci_vtblk_ring_init(sc, value);
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		sc->vbsc_lastq = value;
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		/* only one queue exists, so the index must be 0 */
+		assert(value == 0);
+		pci_vtblk_qnotify(sc);
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		pci_vtblk_update_status(sc, value);
+		break;
+	case VTCFG_R_HOSTCAP:
+	case VTCFG_R_QNUM:
+	case VTCFG_R_ISR:
+	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+		DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+		break;
+	default:
+		DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset));
+		value = 0;
+		break;
+	}
+}
+
+/*
+ * Guest read from the device's i/o BAR.  Returns the value of the
+ * selected virtio register; reading ISR clears it, and the config
+ * area is served byte-wise out of vbsc_cfg.
+ */
+uint32_t
+pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+	struct pci_vtblk_softc *sc = pi->pi_arg;
+	uint32_t value;
+
+	if (offset + size > VTBLK_REGSZ) {
+		DPRINTF(("vtblk_read: 2big, offset %d size %d\n",
+			 offset, size));
+		return (0);
+	}
+
+	switch (offset) {
+	case VTCFG_R_HOSTCAP:
+		assert(size == 4);
+		value = VTBLK_S_HOSTCAPS;
+		break;
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		value = sc->vbsc_features; /* XXX never read ? */
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		value = sc->vbsc_pfn >> VRING_PFN;
+		break;
+	case VTCFG_R_QNUM:
+		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		value = sc->vbsc_lastq; /* XXX never read ? */
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		value = 0; /* XXX never read ? */
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		value = sc->vbsc_status;
+		break;
+	case VTCFG_R_ISR:
+		assert(size == 1);
+		value = sc->vbsc_isr;
+		sc->vbsc_isr = 0;     /* a read clears this flag */
+		break;
+	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+		assert(size == 1);
+		/* byte-wise access into the packed config structure */
+		value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG);
+		break;
+	default:
+		DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset));
+		value = 0;
+		break;
+	}
+
+	return (value);
+}
+
+/*
+ * Emulation registration: makes "virtio-blk" available as a device
+ * type on the bhyve command line via the pci_devemu linker set.
+ */
+struct pci_devemu pci_de_vblk = {
+	.pe_emu = "virtio-blk",
+	.pe_init = pci_vtblk_init,
+	.pe_iow = pci_vtblk_write,
+	.pe_ior = pci_vtblk_read,
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000..5db1eb7
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -0,0 +1,739 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+
+#include "fbsdrun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ	256
+
+#define VTNET_MAXSEGS	32
+
+/*
+ * PCI config-space register offsets
+ */
+#define VTNET_R_CFG0	   20
+#define VTNET_R_CFG1	   21
+#define VTNET_R_CFG2	   22
+#define VTNET_R_CFG3	   23
+#define VTNET_R_CFG4	   24
+#define VTNET_R_CFG5	   25
+#define VTNET_R_CFG6	   26
+#define VTNET_R_CFG7	   27
+#define VTNET_R_MAX	   27
+
+/* Parenthesized so the macro is safe inside any expression */
+#define VTNET_REGSZ	(VTNET_R_MAX + 1)
+
+/*
+ * Host capabilities
+ */
+#define VTNET_S_HOSTCAPS      \
+  ( 0x00000020 |	/* host supplies MAC */ \
+    0x00008000 |	/* host can merge Rx buffers */ \
+    0x00010000 )	/* config status available */
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ	0
+#define VTNET_TXQ	1
+#define VTNET_CTLQ	2
+
+#define VTNET_MAXQ	3
+
+/*
+ * Host-side shadow of a single guest virtqueue.  The pointers are
+ * host-virtual addresses into the guest memory region holding the
+ * descriptor table and the avail/used rings.
+ */
+struct vring_hqueue {
+	/* Internal state */
+	uint16_t	hq_size;
+	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */
+
+	/* Host-context pointers to the queue */
+	struct virtio_desc *hq_dtable;
+	uint16_t	*hq_avail_flags;
+	uint16_t	*hq_avail_idx;		/* monotonically increasing */
+	uint16_t	*hq_avail_ring;
+
+	uint16_t	*hq_used_flags;
+	uint16_t	*hq_used_idx;		/* monotonically increasing */
+	struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Fixed network header size
+ *
+ * Prepended to every received frame handed to the guest; layout is
+ * the legacy virtio-net (mergeable-rxbuf) header.
+ */
+struct virtio_net_rxhdr {
+	uint8_t		vrh_flags;
+	uint8_t		vrh_gso_type;
+	uint16_t	vrh_hdr_len;
+	uint16_t	vrh_gso_size;
+	uint16_t	vrh_csum_start;
+	uint16_t	vrh_csum_offset;
+	uint16_t	vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+	struct pci_devinst *vsc_pi;	/* back-pointer to PCI instance */
+	pthread_mutex_t vsc_mtx;	/* serializes vcpu vs. mevent paths */
+	struct mevent	*vsc_mevp;	/* tap read-readiness event */
+
+	int		vsc_curq;	/* queue selected via QSEL */
+	int		vsc_status;	/* virtio device status register */
+	int		vsc_isr;	/* interrupt status; cleared on read */
+	int		vsc_tapfd;	/* tap device fd, or -1 if none */
+	int		vsc_rx_ready;	/* set after first rx qnotify */
+	int		vsc_rxpend;	/* rx stalled awaiting guest bufs */
+
+	uint32_t	vsc_features;	/* guest-negotiated feature bits */
+	uint8_t		vsc_macaddr[6];
+
+	uint64_t	vsc_pfn[VTNET_MAXQ];	/* ring base addresses */
+	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+	uint16_t avail, cur;
+	int ndesc;
+
+	avail = *hq->hq_avail_idx;
+	cur = hq->hq_cur_aidx;
+
+	/*
+	 * Unsigned 16-bit subtraction yields the ring distance
+	 * directly, wrapping exactly as the explicit comparison did.
+	 */
+	ndesc = (uint16_t)(avail - cur);
+
+	assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+	return (ndesc);
+}
+
+/*
+ * Return the ring size for a queue index.  The control queue is
+ * not implemented, so its size is reported as zero; rx and tx
+ * share a single fixed size.
+ */
+static uint16_t
+pci_vtnet_qsize(int qnum)
+{
+	return (qnum == VTNET_CTLQ ? 0 : VTNET_RINGSZ);
+}
+
+/*
+ * Record a guest write to the virtio status register.  A write of
+ * zero is the guest's request for a device reset.
+ */
+static void
+pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+{
+	if (value == 0)
+		DPRINTF(("vtnet: device reset requested !\n"));
+
+	sc->vsc_status = value;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ *
+ * 'len' is the payload byte count already gathered into 'iov'.
+ * The write result is deliberately ignored (best-effort tx).
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+		 int len)
+{
+	char pad[60];
+
+	if (sc->vsc_tapfd == -1)
+		return;
+
+	/*
+	 * If the length is < 60, pad out to that and add the
+	 * extra zero'd segment to the iov. It is guaranteed that
+	 * there is always an extra iov available by the caller.
+	 */
+	if (len < 60) {
+		memset(pad, 0, 60 - len);
+		iov[iovcnt].iov_base = pad;
+		iov[iovcnt].iov_len = 60 - len;
+		iovcnt++;
+	}
+	(void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+	struct virtio_desc *vd;
+	struct virtio_used *vu;
+	struct vring_hqueue *hq;
+	struct virtio_net_rxhdr *vrx;
+	uint8_t *buf;
+	int i;
+	int len;
+	int ndescs;
+	int didx, uidx, aidx;	/* descriptor, avail and used index */
+
+	/*
+	 * Should never be called without a valid tap fd
+	 */
+	assert(sc->vsc_tapfd != -1);
+
+	/*
+	 * But, will be called when the rx ring hasn't yet
+	 * been set up.
+	 */
+	if (sc->vsc_rx_ready == 0) {
+		/*
+		 * Drop the packet and try later.
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		return;
+	}
+
+	/*
+	 * Calculate the number of available rx buffers
+	 */
+	hq = &sc->vsc_hq[VTNET_RXQ];
+
+	ndescs = hq_num_avail(hq);
+
+	if (ndescs == 0) {
+		/*
+		 * Need to wait for host notification to read
+		 */
+		if (sc->vsc_rxpend == 0) {
+			WPRINTF(("vtnet: no rx descriptors !\n"));
+			sc->vsc_rxpend = 1;
+		}
+
+		/*
+		 * Drop the packet and try later
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		return;
+	}
+
+	aidx = hq->hq_cur_aidx;
+	uidx = *hq->hq_used_idx;
+	for (i = 0; i < ndescs; i++) {
+		/*
+		 * 'aidx' indexes into the an array of descriptor indexes
+		 */
+		didx = hq->hq_avail_ring[aidx % hq->hq_size];
+		assert(didx >= 0 && didx < hq->hq_size);
+
+		vd = &hq->hq_dtable[didx];
+
+		/*
+		 * Get a pointer to the rx header, and use the
+		 * data immediately following it for the packet buffer.
+		 */
+		vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+		buf = (uint8_t *)(vrx + 1);
+
+		len = read(sc->vsc_tapfd, buf,
+			   vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+		/*
+		 * NOTE(review): only EWOULDBLOCK stops the loop; a read
+		 * failure with any other errno falls through and posts a
+		 * used entry with a negative 'len' — confirm intended.
+		 */
+		if (len < 0 && errno == EWOULDBLOCK) {
+			break;
+		}
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers, which is always 1 without TSO
+		 * support.
+		 */
+		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+		vrx->vrh_bufs = 1;
+
+		/*
+		 * Write this descriptor into the used ring
+		 */
+		vu = &hq->hq_used_ring[uidx % hq->hq_size];
+		vu->vu_idx = didx;
+		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+		uidx++;
+		aidx++;
+	}
+
+	/*
+	 * Update the used pointer, and signal an interrupt if allowed
+	 */
+	*hq->hq_used_idx = uidx;
+	hq->hq_cur_aidx = aidx;
+
+	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+		sc->vsc_isr |= 1;
+		pci_generate_msi(sc->vsc_pi, 0);
+	}
+}
+
+/*
+ * mevent callback for tap read-readiness: serialize against the
+ * vcpu register-access path before pulling frames off the tap.
+ */
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+	pci_vtnet_tap_rx(sc);
+	pthread_mutex_unlock(&sc->vsc_mtx);
+
+}
+
+/*
+ * rx queue-notify handler.  The first notification marks the ring
+ * as ready; subsequent ones un-stall reception if a frame was
+ * previously dropped for lack of guest buffers.
+ */
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+	/* A qnotify means that the rx process can now begin */
+	sc->vsc_rx_ready = 1;
+
+	/* Retry a receive that was blocked on empty-ring */
+	if (sc->vsc_rxpend != 0) {
+		WPRINTF(("vtnet: rx resumed\n\r"));
+		sc->vsc_rxpend = 0;
+		pci_vtnet_tap_rx(sc);
+	}
+}
+
+/*
+ * Transmit one packet chain from the tx avail ring: gather the data
+ * descriptors (skipping the leading virtio-net header descriptor)
+ * into an iovec, hand it to the tap, and return the chain on the
+ * used ring, raising an interrupt if permitted.
+ */
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+	struct iovec iov[VTNET_MAXSEGS + 1];
+	struct virtio_desc *vd;
+	struct virtio_used *vu;
+	int i;
+	int plen;
+	int tlen;
+	int uidx, aidx, didx;
+
+	uidx = *hq->hq_used_idx;
+	aidx = hq->hq_cur_aidx;
+	didx = hq->hq_avail_ring[aidx % hq->hq_size];
+	assert(didx >= 0 && didx < hq->hq_size);
+
+	vd = &hq->hq_dtable[didx];
+
+	/*
+	 * Run through the chain of descriptors, ignoring the
+	 * first header descriptor. However, include the header
+	 * length in the total length that will be put into the
+	 * used queue.
+	 */
+	tlen = vd->vd_len;
+	vd = &hq->hq_dtable[vd->vd_next];
+
+	for (i = 0, plen = 0;
+	     i < VTNET_MAXSEGS;
+	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
+		iov[i].iov_base = paddr_guest2host(vd->vd_addr);
+		iov[i].iov_len = vd->vd_len;
+		plen += vd->vd_len;
+		tlen += vd->vd_len;
+
+		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
+			break;
+	}
+	assert(i < VTNET_MAXSEGS);
+
+	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
+	/* iov has one spare slot for the short-frame pad segment */
+	pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+
+	/*
+	 * Return this chain back to the host
+	 */
+	vu = &hq->hq_used_ring[uidx % hq->hq_size];
+	vu->vu_idx = didx;
+	vu->vu_tlen = tlen;
+	hq->hq_cur_aidx = aidx + 1;
+	*hq->hq_used_idx = uidx + 1;
+
+	/*
+	 * Generate an interrupt if able
+	 */
+	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+		sc->vsc_isr |= 1;
+		pci_generate_msi(sc->vsc_pi, 0);
+	}
+}
+
+/*
+ * tx queue-notify handler: transmit every chain currently posted
+ * in the tx avail ring.
+ */
+static void
+pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+{
+	struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
+	int ndescs;
+
+	for (ndescs = hq_num_avail(hq); ndescs > 0; ndescs--)
+		pci_vtnet_proctx(sc, hq);
+}
+
+/*
+ * Control queue-notify handler: the control queue is not
+ * implemented, so the notification is only logged.
+ */
+static void
+pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
+{
+
+	DPRINTF(("vtnet: control qnotify!\n\r"));	
+}
+
+/*
+ * Called when the guest writes a ring's page frame number to the
+ * PFN register for the currently selected queue.  Computes host
+ * pointers from the standard legacy-virtio vring layout:
+ * descriptors, then avail ring, then (page-aligned) used ring.
+ */
+static void
+pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+{
+	struct vring_hqueue *hq;
+	int qnum = sc->vsc_curq;
+
+	assert(qnum < VTNET_MAXQ);
+
+	sc->vsc_pfn[qnum] = pfn << VRING_PFN;
+
+	/*
+	 * Set up host pointers to the various parts of the
+	 * queue
+	 */
+	hq = &sc->vsc_hq[qnum];
+	hq->hq_size = pci_vtnet_qsize(qnum);
+
+	hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+	hq->hq_avail_flags =  (uint16_t *)(hq->hq_dtable + hq->hq_size);
+	hq->hq_avail_idx = hq->hq_avail_flags + 1;
+	hq->hq_avail_ring = hq->hq_avail_flags + 2;
+	/* used ring starts at the next VRING_ALIGN boundary */
+	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+						 VRING_ALIGN);
+	hq->hq_used_idx = hq->hq_used_flags + 1;
+	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+	/*
+	 * Initialize queue indexes
+	 */
+	hq->hq_cur_aidx = 0;
+}
+
+/*
+ * Device instance initialization: open the tap backend named by
+ * 'opts' (if any), derive a stable MAC address from the VM name,
+ * and set up PCI config space, the i/o BAR and MSI capability.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	MD5_CTX mdctx;
+	unsigned char digest[16];
+	char nstr[80];
+	struct pci_vtnet_softc *sc;
+
+	/*
+	 * Access to guest memory is required. Fail if
+	 * memory not mapped
+	 */
+	if (paddr_guest2host(0) == NULL)
+		return (1);
+
+	/*
+	 * calloc zero-fills and, unlike the previous unchecked
+	 * malloc+memset, the result is tested before use.
+	 */
+	sc = calloc(1, sizeof(struct pci_vtnet_softc));
+	if (sc == NULL) {
+		WPRINTF(("vtnet: could not allocate softc\n"));
+		return (1);
+	}
+
+	pi->pi_arg = sc;
+	sc->vsc_pi = pi;
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	/*
+	 * Attempt to open the tap device
+	 */
+	sc->vsc_tapfd = -1;
+	if (opts != NULL) {
+		char tbuf[80];
+
+		/*
+		 * snprintf always NUL-terminates and cannot overflow,
+		 * unlike the previous strcpy/strncat pair whose bound
+		 * was off by one (it did not reserve the terminator).
+		 */
+		snprintf(tbuf, sizeof(tbuf), "/dev/%s", opts);
+
+		sc->vsc_tapfd = open(tbuf, O_RDWR);
+		if (sc->vsc_tapfd == -1) {
+			WPRINTF(("open of tap device %s failed\n", tbuf));
+		} else {
+			/*
+			 * Set non-blocking and register for read
+			 * notifications with the event loop
+			 */
+			int opt = 1;
+			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+				WPRINTF(("tap device O_NONBLOCK failed\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+
+			sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+						  EVF_READ,
+						  pci_vtnet_tap_callback,
+						  sc);
+			if (sc->vsc_mevp == NULL) {
+				WPRINTF(("Could not register event\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+		}
+	}
+
+	/*
+	 * The MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the vm name. The slot number is
+	 * prepended to this for slots other than 1, so that
+	 * CFE can netboot from the equivalent of slot 1.
+	 */
+	if (pi->pi_slot == 1) {
+		/* snprintf guarantees termination; strncpy did not */
+		snprintf(nstr, sizeof(nstr), "%s", vmname);
+	} else {
+		snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname);
+	}
+
+	MD5Init(&mdctx);
+	MD5Update(&mdctx, nstr, strlen(nstr));
+	MD5Final(digest, &mdctx);
+
+	sc->vsc_macaddr[0] = 0x00;
+	sc->vsc_macaddr[1] = 0xa0;
+	sc->vsc_macaddr[2] = 0x98;
+	sc->vsc_macaddr[3] = digest[0];
+	sc->vsc_macaddr[4] = digest[1];
+	sc->vsc_macaddr[5] = digest[2];
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+	pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ);
+	pci_emul_add_msicap(pi, 1);
+
+	return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ *
+ * Indexed by queue number (VTNET_RXQ/VTNET_TXQ/VTNET_CTLQ).
+ */
+static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
+	pci_vtnet_ping_rxq,
+	pci_vtnet_ping_txq,
+	pci_vtnet_ping_ctlq
+};
+
+/*
+ * Guest write to the device's i/o BAR.  Dispatches on the virtio
+ * register offset under the softc mutex; out-of-range accesses
+ * are logged and dropped.
+ */
+static void
+pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size,
+		uint32_t value)
+{
+	struct pci_vtnet_softc *sc = pi->pi_arg;
+
+	if (offset + size > VTNET_REGSZ) {
+		DPRINTF(("vtnet_write: 2big, offset %d size %d\n",
+			 offset, size));
+		return;
+	}
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	switch (offset) {
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		/* only allow feature bits the host advertised */
+		sc->vsc_features = value & VTNET_S_HOSTCAPS;
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		pci_vtnet_ring_init(sc, value);
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		assert(value < VTNET_MAXQ);
+		sc->vsc_curq = value;
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		assert(value < VTNET_MAXQ);
+		(*pci_vtnet_qnotify[value])(sc);
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		pci_vtnet_update_status(sc, value);
+		break;
+	case VTNET_R_CFG0:
+	case VTNET_R_CFG1:
+	case VTNET_R_CFG2:
+	case VTNET_R_CFG3:
+	case VTNET_R_CFG4:
+	case VTNET_R_CFG5:
+		/*
+		 * The driver is allowed to change the MAC address
+		 */
+		assert(size == 1);
+		sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
+		break;
+	case VTCFG_R_HOSTCAP:
+	case VTCFG_R_QNUM:
+	case VTCFG_R_ISR:
+	case VTNET_R_CFG6:
+	case VTNET_R_CFG7:
+		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+		break;
+	default:
+		DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset));
+		value = 0;
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+/*
+ * Guest read from the device's i/o BAR.  Returns the selected
+ * virtio register under the softc mutex; reading ISR clears it,
+ * and CFG0-5 expose the MAC address byte-wise.
+ */
+uint32_t
+pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+	struct pci_vtnet_softc *sc = pi->pi_arg;
+	uint32_t value;
+
+	if (offset + size > VTNET_REGSZ) {
+		DPRINTF(("vtnet_read: 2big, offset %d size %d\n",
+			 offset, size));
+		return (0);
+	}
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+
+	switch (offset) {
+	case VTCFG_R_HOSTCAP:
+		assert(size == 4);
+		value = VTNET_S_HOSTCAPS;
+		break;
+	case VTCFG_R_GUESTCAP:
+		assert(size == 4);
+		value = sc->vsc_features; /* XXX never read ? */
+		break;
+	case VTCFG_R_PFN:
+		assert(size == 4);
+		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+		break;
+	case VTCFG_R_QNUM:
+		assert(size == 2);
+		value = pci_vtnet_qsize(sc->vsc_curq);
+		break;
+	case VTCFG_R_QSEL:
+		assert(size == 2);
+		value = sc->vsc_curq;  /* XXX never read ? */
+		break;
+	case VTCFG_R_QNOTIFY:
+		assert(size == 2);
+		value = sc->vsc_curq;  /* XXX never read ? */
+		break;
+	case VTCFG_R_STATUS:
+		assert(size == 1);
+		value = sc->vsc_status;
+		break;
+	case VTCFG_R_ISR:
+		assert(size == 1);
+		value = sc->vsc_isr;
+		sc->vsc_isr = 0;     /* a read clears this flag */
+		break;
+	case VTNET_R_CFG0:
+	case VTNET_R_CFG1:
+	case VTNET_R_CFG2:
+	case VTNET_R_CFG3:
+	case VTNET_R_CFG4:
+	case VTNET_R_CFG5:
+		assert(size == 1);
+		value = sc->vsc_macaddr[offset - VTNET_R_CFG0];
+		break;
+	case VTNET_R_CFG6:
+		assert(size == 1);
+		value = 0x01;	/* XXX link always up */
+		break;
+	case VTNET_R_CFG7:
+		assert(size == 1);
+		value = 0;	/* link status is in the LSB */
+		break;
+	default:
+		DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset));
+		value = 0;
+		break;
+	}
+
+	pthread_mutex_unlock(&sc->vsc_mtx);
+
+	return (value);
+}
+
+/*
+ * Emulation registration: makes "virtio-net" available as a device
+ * type on the bhyve command line via the pci_devemu linker set.
+ */
+struct pci_devemu pci_de_vnet = {
+	.pe_emu = 	"virtio-net",
+	.pe_init =	pci_vtnet_init,
+	.pe_iow =	pci_vtnet_write,
+	.pe_ior =	pci_vtnet_read,
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c
new file mode 100644
index 0000000..b510161
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.c
@@ -0,0 +1,196 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <machine/clock.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "pit_8254.h"
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+#define PIT_8254_FREQ 1193182
+static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
+
+struct counter {
+	struct timeval	tv;		/* uptime when counter was loaded */
+	uint16_t	initial;	/* initial counter value */
+	uint8_t		cr[2];		/* count register: [0]=LSB, [1]=MSB */
+	uint8_t		ol[2];		/* output latch: [1]=LSB, [0]=MSB */
+	int		crbyte;		/* index of next count byte to load */
+	int		olbyte;		/* latched bytes not yet read (0-2) */
+};
+
+/*
+ * Normalize a timeval so that 0 <= tv_usec < 1000000, carrying into or
+ * borrowing from tv_sec.  One step of adjustment, same as the kernel
+ * helper of the same name.
+ */
+static void
+timevalfix(struct timeval *tp)
+{
+
+	if (tp->tv_usec >= 1000000) {
+		tp->tv_sec++;
+		tp->tv_usec -= 1000000;
+	} else if (tp->tv_usec < 0) {
+		tp->tv_sec--;
+		tp->tv_usec += 1000000;
+	}
+}
+
+/* tp -= delta; the result is normalized via timevalfix(). */
+static void
+timevalsub(struct timeval *tp, const struct timeval *delta)
+{
+
+	tp->tv_sec = tp->tv_sec - delta->tv_sec;
+	tp->tv_usec = tp->tv_usec - delta->tv_usec;
+	timevalfix(tp);
+}
+
+/*
+ * Latch the current value of a down-counter into its output latch.
+ *
+ * The counter is not decremented in real time; instead its value is
+ * derived from the elapsed host time since it was (re)loaded, modulo
+ * the initial count.  No-op while a previously latched value has not
+ * been fully consumed.
+ */
+static void
+latch(struct counter *c)
+{
+	struct timeval tv2;
+	uint16_t lval;
+	uint64_t delta_nsecs, delta_ticks;
+
+	/* cannot latch a new value until the old one has been consumed */
+	if (c->olbyte != 0)
+		return;
+
+	if (c->initial == 0 || c->initial == 1) {
+		/*
+		 * XXX the program that runs the VM can be stopped and
+		 * restarted at any time. This means that state that was
+		 * created by the guest is destroyed between invocations
+		 * of the program.
+		 *
+		 * If the counter's initial value is not programmed we
+		 * assume a value that would be set to generate 'guest_hz'
+		 * interrupts per second.
+		 */
+		c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
+		gettimeofday(&c->tv, NULL);
+	}
+
+	(void)gettimeofday(&tv2, NULL);
+	timevalsub(&tv2, &c->tv);
+	/*
+	 * NOTE(review): tv_sec * 1000000000 relies on 64-bit integer
+	 * arithmetic and would overflow on an ILP32 host -- confirm
+	 * this is amd64-only.
+	 */
+	delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
+	delta_ticks = delta_nsecs / nsecs_per_tick;
+
+	/* remaining count = initial - (elapsed ticks mod initial) */
+	lval = c->initial - delta_ticks % c->initial;
+	c->olbyte = 2;
+	c->ol[1] = lval;		/* LSB */
+	c->ol[0] = lval >> 8;		/* MSB */
+}
+
+/*
+ * I/O handler for the 8254 PIT mode/command port and the three counter
+ * ports.  Only the latch command and 16-bit (LSB then MSB) access are
+ * supported, and only rate-generator and square-wave counter modes.
+ * Returns 0 on success, -1 for unsupported accesses.
+ */
+static int
+pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	int sel, rw, mode;
+	uint8_t val;
+	struct counter *c;
+
+	/* counter state persists across invocations */
+	static struct counter counter[3];
+
+	if (bytes != 1)
+		return (-1);
+
+	val = *eax;
+
+	if (port == TIMER_MODE) {
+		assert(in == 0);	/* mode port is write-only */
+		sel = val & TIMER_SEL_MASK;
+		rw = val & TIMER_RW_MASK;
+		mode = val & TIMER_MODE_MASK;
+
+		/* the read-back command is not emulated */
+		if (sel == TIMER_SEL_READBACK)
+			return (-1);
+		if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+			return (-1);
+
+		if (rw != TIMER_LATCH) {
+			/*
+			 * Counter mode is not affected when issuing a
+			 * latch command.
+			 */
+			if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
+				return (-1);
+		}
+
+		/* the counter number is in the top 2 bits of the command */
+		c = &counter[sel >> 6];
+		if (rw == TIMER_LATCH)
+			latch(c);
+		else
+			c->olbyte = 0;	/* reset latch after reprogramming */
+
+		return (0);
+	}
+
+	/* counter ports */
+	assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
+	c = &counter[port - TIMER_CNTR0];
+
+	if (in) {
+		/*
+		 * XXX
+		 * The spec says that once the output latch is completely
+		 * read it should revert to "following" the counter. We don't
+		 * do this because it is hard and any reasonable OS should
+		 * always latch the counter before trying to read it.
+		 */
+		if (c->olbyte == 0)
+			c->olbyte = 2;
+		*eax = c->ol[--c->olbyte];
+	} else {
+		/* count register is loaded LSB first, then MSB */
+		c->cr[c->crbyte++] = *eax;
+		if (c->crbyte == 2) {
+			c->crbyte = 0;
+			c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+			/* remember when the counter was (re)loaded */
+			gettimeofday(&c->tv, NULL);
+		}
+	}
+
+	return (0);
+}
+
+/* mode/command port (write-only) plus the three counter ports */
+INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);
diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h
new file mode 100644
index 0000000..61bd15d
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PIT_8254_H_
+#define _PIT_8254_H_
+
+/*
+ * Borrowed from amd64/include/timerreg.h because in that file it is
+ * conditionally compiled for #ifdef _KERNEL only.
+ */
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _PIT_8254_H_ */
diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c
new file mode 100644
index 0000000..092a551
--- /dev/null
+++ b/usr.sbin/bhyve/post.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * Reads of the POST diagnostic port (0x84) are not emulated; they
+ * simply return a dummy value.
+ */
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	assert(in == 1);
+
+	if (bytes == 1) {
+		*eax = 0xff;	/* return some garbage */
+		return (0);
+	}
+
+	return (-1);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
new file mode 100644
index 0000000..a6f44e0
--- /dev/null
+++ b/usr.sbin/bhyve/rtc.c
@@ -0,0 +1,268 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_MIN 0x02
+#define RTC_HRS 0x04
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e
+
+#define RTC_RSTCODE 0x0f
+
+static int addr;
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b, rstcode;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
+
+/*
+ * Normalize a timeval so that 0 <= tv_usec < 1000000.
+ * NOTE(review): duplicated from pit_8254.c -- consider a shared helper.
+ */
+static void
+timevalfix(struct timeval *t1)
+{
+
+	if (t1->tv_usec < 0) {
+		t1->tv_sec--;
+		t1->tv_usec += 1000000;
+	}
+	if (t1->tv_usec >= 1000000) {
+		t1->tv_sec++;
+		t1->tv_usec -= 1000000;
+	}
+}
+
+/* t1 -= t2, with the result normalized via timevalfix(). */
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+	t1->tv_sec -= t2->tv_sec;
+	t1->tv_usec -= t2->tv_usec;
+	timevalfix(t1);
+}
+
+/*
+ * Handler for writes to the RTC address port (0x70): validate and store
+ * the register index used by subsequent data-port accesses.
+ */
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	assert(in == 0);	/* the address port is write-only */
+
+	if (bytes != 1)
+		return (-1);
+
+	/* accept only registers emulated by rtc_data_handler() */
+	switch (*eax) {
+	case RTC_SEC:
+	case RTC_MIN:
+	case RTC_HRS:
+	case RTC_WDAY:
+	case RTC_DAY:
+	case RTC_MONTH:
+	case RTC_YEAR:
+	case RTC_CENTURY:
+	case RTC_STATUSA:
+	case RTC_STATUSB:
+	case RTC_INTR:
+	case RTC_STATUSD:
+	case RTC_DIAG:
+	case RTC_RSTCODE:
+		break;
+	default:
+		return (-1);
+	}
+
+	addr = *eax;
+	return (0);
+}
+
+/*
+ * Handler for the RTC data port (0x71).
+ *
+ * The register accessed is the one selected by the last write to the
+ * address port.  Time-of-day reads are sourced from the host clock; the
+ * cached 'tm' advances at most once per second so the guest has a full
+ * second to read hh:mm:ss coherently.  Writes to the time-of-day
+ * registers are ignored; status A/B and the reset code are stored.
+ * Returns 0 on success, -1 for unsupported accesses.
+ */
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	int hour;
+	time_t t;
+	struct timeval cur, delta;
+
+	static struct timeval last;
+	static struct tm tm;
+
+	if (bytes != 1)
+		return (-1);
+
+	gettimeofday(&cur, NULL);
+
+	/*
+	 * Increment the cached time only once per second so we can guarantee
+	 * that the guest has at least one second to read the hour:min:sec
+	 * separately and still get a coherent view of the time.
+	 */
+	delta = cur;
+	timevalsub(&delta, &last);
+	if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+		t = cur.tv_sec;
+		localtime_r(&t, &tm);
+		last = cur;
+	}
+
+	if (in) {
+		switch (addr) {
+		case RTC_SEC:
+			*eax = rtcout(tm.tm_sec);
+			return (0);
+		case RTC_MIN:
+			*eax = rtcout(tm.tm_min);
+			return (0);
+		case RTC_HRS:
+			if (status_b & RTCSB_24HR)
+				hour = tm.tm_hour;
+			else
+				hour = (tm.tm_hour % 12) + 1;
+			
+			*eax = rtcout(hour);
+
+			/*
+			 * If we are representing time in the 12-hour format
+			 * then set the MSB to indicate PM.
+			 */
+			if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+				*eax |= 0x80;
+
+			return (0);
+		case RTC_WDAY:
+			/* the RTC counts weekdays 1-7, struct tm 0-6 */
+			*eax = rtcout(tm.tm_wday + 1);
+			return (0);
+		case RTC_DAY:
+			*eax = rtcout(tm.tm_mday);
+			return (0);
+		case RTC_MONTH:
+			*eax = rtcout(tm.tm_mon + 1);
+			return (0);
+		case RTC_YEAR:
+			*eax = rtcout(tm.tm_year % 100);
+			return (0);
+		case RTC_CENTURY:
+			/*
+			 * tm_year is years since 1900 so the century is
+			 * derived from the full year (e.g. 0x20 for 2011),
+			 * and the read returns directly like the other
+			 * cases instead of falling into the write switch.
+			 */
+			*eax = rtcout((tm.tm_year + 1900) / 100);
+			return (0);
+		case RTC_STATUSA:
+			*eax = status_a;
+			return (0);
+		case RTC_INTR:
+			*eax = 0;
+			return (0);
+		case RTC_STATUSD:
+			*eax = RTCSD_PWR;
+			return (0);
+		case RTC_DIAG:
+			*eax = 0;
+			return (0);
+		case RTC_RSTCODE:
+			*eax = rstcode;
+			return (0);
+		default:
+			return (-1);
+		}
+	}
+
+	switch (addr) {
+	case RTC_STATUSA:
+		status_a = *eax & ~RTCSA_TUP;	/* the TUP bit is read-only */
+		break;
+	case RTC_STATUSB:
+		/* XXX not implemented yet XXX */
+		if (*eax & RTCSB_PINTR)
+			return (-1);
+		status_b = *eax;
+		break;
+	case RTC_RSTCODE:
+		rstcode = *eax;
+		break;
+	case RTC_SEC:
+	case RTC_MIN:
+	case RTC_HRS:
+	case RTC_WDAY:
+	case RTC_DAY:
+	case RTC_MONTH:
+	case RTC_YEAR:
+	case RTC_CENTURY:
+		/*
+		 * Ignore writes to the time of day registers
+		 */
+		break;
+	default:
+		return (-1);
+	}
+	return (0);
+}
+
+/* 0x70 = address port (write-only), 0x71 = data port */
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c
new file mode 100644
index 0000000..640f3bf
--- /dev/null
+++ b/usr.sbin/bhyve/uart.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define COM1 0x3F8
+#define COM2 0x2F8
+
+#define REG_IIR 2
+
+/*
+ * The COM ports are not emulated: reads of the interrupt-identification
+ * register on COM1/COM2 always return 0xFF ("no UART present").
+ */
+static int
+com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	uint32_t *eax, void *arg)
+{
+	assert(in);	/* only reads are registered for these ports */
+
+	if (bytes == 1) {
+		/*
+		 * COM port is not implemented so we return 0xFF for all
+		 * registers
+		 */
+		*eax = 0xFF;
+		return (0);
+	}
+
+	return (-1);
+}
+
+INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
+INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
new file mode 100644
index 0000000..474e244
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+/* One entry in the vring descriptor table. */
+struct virtio_desc {
+	uint64_t	vd_addr;	/* guest-physical buffer address */
+	uint32_t	vd_len;		/* buffer length in bytes */
+	uint16_t	vd_flags;	/* VRING_DESC_F_* */
+	uint16_t	vd_next;	/* next index if F_NEXT is set */
+} __packed;
+
+/* One entry in the used ring: a completed descriptor chain. */
+struct virtio_used {
+	uint32_t	vu_idx;		/* head index of the consumed chain */
+	uint32_t	vu_tlen;	/* total bytes written to the chain */
+} __packed;
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+
+/*
+ * PCI config space constants
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
+
+#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
new file mode 100644
index 0000000..931b7d7
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.c
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/apicreg.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "xmsr.h"
+
+/*
+ * Trampoline for hypervisor direct 64-bit jump.  Byte offsets within
+ * the structure (they must match what the guest lays down):
+ *
+ *  0  - signature for guest->host verification
+ *  8  - kernel virtual address of trampoline
+ * 16  - instruction virtual address
+ * 24  - stack pointer virtual address
+ * 32  - CR3, physical address of kernel page table
+ * 40  - 24-byte area for null/code/data GDT entries
+ */
+#define MP_V64T_SIG	0xcafebabecafebabeULL
+struct mp_v64tramp {
+	uint64_t	mt_sig;		/* MP_V64T_SIG */
+	uint64_t	mt_virt;	/* KVA of the trampoline */
+	uint64_t	mt_eip;		/* entry point */
+	uint64_t	mt_rsp;		/* initial stack pointer */
+	uint64_t	mt_cr3;		/* kernel page table (physical) */
+	uint64_t	mt_gdtr[3];	/* null/code/data GDT entries */
+};
+
+/*
+ * CPU 0 is considered to be the BSP and is set to the RUNNING state.
+ * All other CPUs are set up in the INIT state.
+ */
+#define BSP	0
+enum cpu_bstate {
+	CPU_S_INIT,		/* powered on, waiting for an INIT IPI */
+	CPU_S_SIPI,		/* INIT seen, waiting for the SIPI */
+	CPU_S_RUNNING		/* handed over to the sequencer */
+} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING };
+
+static void spinup_ap(struct vmctx *, int, int, uint64_t *);
+static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *);
+
+/*
+ * Emulate a guest WRMSR.
+ *
+ * Only writes to the x2apic ICR (MSR 0x830) are handled; anything else
+ * is fatal.  An INIT IPI moves the target vcpu to the wait-for-SIPI
+ * state; a SIPI brings the AP up and hands it to the sequencer via
+ * fbsdrun_addcpu().  Returns the vcpu id that should run next: the
+ * SIPI target, otherwise the current vcpu.
+ */
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+	int dest;
+	int mode;
+	int vec;
+	int error, retval;
+	uint64_t rip;
+
+	retval = vcpu;
+
+	/*
+	 * The only MSR value handled is the x2apic CR register
+	 */
+	if (code != 0x830) {
+		printf("Unknown WRMSR code %x, val %lx, cpu %d\n",
+		    code, val, vcpu);
+		exit(1);
+	}
+
+	/*
+	 * The value written to the MSR will generate an IPI to
+	 * a set of CPUs. If this is a SIPI, create the initial
+	 * state for the CPU and switch to it. Otherwise, inject
+	 * an interrupt for the destination CPU(s), and request
+	 * a switch to the next available one by returning -1
+	 */
+	dest = val >> 32;
+	vec = val & APIC_VECTOR_MASK;
+	mode = val & APIC_DELMODE_MASK;
+
+	switch (mode) {
+	case APIC_DELMODE_INIT:
+		/* the BSP can never be the target of an INIT */
+		assert(dest != 0);
+		assert(dest < guest_ncpus);
+
+		/*
+		 * Ignore legacy de-assert INITs in x2apic mode
+		 */
+		if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
+			break;
+		}
+		assert(cpu_b[dest] == CPU_S_INIT);
+
+		/*
+		 * Move CPU to wait-for-SIPI state
+		 */
+		error = vcpu_reset(ctx, dest);
+		assert(error == 0);
+
+		cpu_b[dest] = CPU_S_SIPI;
+		break;
+
+	case APIC_DELMODE_STARTUP:
+		assert(dest != 0);
+		assert(dest < guest_ncpus);
+		/*
+		 * Ignore SIPIs in any state other than wait-for-SIPI
+		 */
+		if (cpu_b[dest] != CPU_S_SIPI) {
+			break;
+		}
+
+		/*
+		 * Bring up the AP and signal the main loop that it is
+		 * available and to switch to it.
+		 */
+		spinup_ap(ctx, dest, vec, &rip);
+		cpu_b[dest] = CPU_S_RUNNING;
+		fbsdrun_addcpu(ctx, dest, rip);
+		retval = dest;
+		break;
+
+	default:
+		printf("APIC delivery mode %lx not supported!\n",
+		    val & APIC_DELMODE_MASK);
+		exit(1);
+	}
+
+	return (retval);
+}
+
+/*
+ * There are 2 startup modes possible here:
+ * - if the CPU supports 'unrestricted guest' mode, the spinup can
+ *   set up the processor state in power-on 16-bit mode, with the CS:IP
+ *   init'd to the specified low-mem 4K page.
+ * - if the guest has requested a 64-bit trampoline in the low-mem 4K
+ *   page by placing in the specified signature, set up the register
+ *   state using register state in the signature. Note that this
+ *   requires accessing guest physical memory to read the signature
+ *   while 'unrestricted mode' does not.
+ */
+static void
+spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip)
+{
+	int error;
+	uint16_t cs;
+	uint64_t desc_base;
+	uint32_t desc_limit, desc_access;
+
+	/* mirror the vmexit capabilities configured for the other vcpus */
+	if (fbsdrun_vmexit_on_hlt()) {
+		error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+		assert(error == 0);
+	}
+
+	if (fbsdrun_vmexit_on_pause()) {
+		error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+		assert(error == 0);
+	}
+
+	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+	if (error) {
+		/*
+		 * If the guest does not support real-mode execution then
+		 * we will bring up the AP directly in 64-bit mode.
+		 */
+		spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip);
+	} else {
+		/*
+		 * Update the %cs and %rip of the guest so that it starts
+		 * executing real mode code at 'vector << 12'.
+		 */
+		*rip = 0;
+		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+		assert(error == 0);
+
+		/* keep the existing limit/access; change only the base */
+		error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+		    &desc_limit, &desc_access);
+		assert(error == 0);
+
+		desc_base = vector << PAGE_SHIFT;
+		error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+		    desc_base, desc_limit, desc_access);
+		assert(error == 0);
+
+		/* real-mode selector consistent with the descriptor base */
+		cs = (vector << PAGE_SHIFT) >> 4;
+		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+		assert(error == 0);
+	}
+}
+
+/*
+ * Start an AP directly in 64-bit mode using the trampoline structure
+ * the guest placed at guest-physical address 'gaddr' (see struct
+ * mp_v64tramp).  On success '*rip' holds the trampoline entry point.
+ * Any failure to set up the machine state is fatal.
+ */
+static void
+spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr,
+	uint64_t *rip)
+{
+	struct mp_v64tramp *mvt;
+	char *errstr;
+	int error;
+	uint64_t gdtbase;
+
+	mvt = paddr_guest2host(gaddr);
+
+	/* guest->host handshake: the signature must be in place */
+	assert(mvt->mt_sig == MP_V64T_SIG);
+
+	/*
+	 * Set up the 3-entry GDT using memory supplied in the
+	 * guest's trampoline structure.
+	 */
+	vm_setup_freebsd_gdt(mvt->mt_gdtr);
+
+#define CHECK_ERROR(msg) \
+	if (error != 0) { \
+		errstr = msg; \
+		goto err_exit; \
+	}
+
+	/* entry point */
+	*rip = mvt->mt_eip;
+
+	/* Get the guest virtual address of the GDT */
+	gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr);
+
+	error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip,
+	    mvt->mt_cr3, gdtbase, mvt->mt_rsp);
+	CHECK_ERROR("vm_setup_freebsd_registers");
+
+	return;
+err_exit:
+	/* terminate the error message with a newline before exiting */
+	printf("spinup_ap_direct64: machine state error: %s\n", errstr);
+	exit(1);
+}
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
new file mode 100644
index 0000000..8cebcea
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+/*
+ * Emulate a guest WRMSR of 'val' to MSR 'code' on 'vcpu'.  Returns the
+ * vcpu id that should be run next.
+ */
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+
+#endif
diff --git a/usr.sbin/vmmctl/Makefile b/usr.sbin/vmmctl/Makefile
new file mode 100644
index 0000000..1f529b5
--- /dev/null
+++ b/usr.sbin/vmmctl/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+# vmmctl(8): dump and set hypervisor register state via libvmmapi.
+PROG=	vmmctl
+SRCS=	vmmctl.c
+
+NO_MAN=
+
+DPADD=	${LIBVMMAPI}
+LDADD=	-lvmmapi
+
+# for the vmcs field definitions under sys/amd64/vmm/intel
+CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/vmmctl/sample.sh b/usr.sbin/vmmctl/sample.sh
new file mode 100755
index 0000000..f38d0da
--- /dev/null
+++ b/usr.sbin/vmmctl/sample.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+
+# $FreeBSD$
+
+# Smoke test for vmmctl: create a scratch VM, exercise the register
+# get/set interface, then destroy the VM.
+VMMCTL="sudo ./vmmctl"
+VMNAME=sample
+
+${VMMCTL} --vm=${VMNAME} --create
+${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256
+${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem
+
+# enable protected mode and paging
+CR0_PE=$((1 << 0))
+CR0_PG=$((1 << 31))
+CR0=$(($CR0_PE | $CR0_PG))
+${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0
+
+# XXX this is bogus the value of %cr3 should come from the loader
+CR3=0
+${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3
+
+CR4_PAE=$((1 << 5))
+CR4=$((${CR4_PAE}))
+${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4
+
+DR7=0x00000400	# Table 9-1 from Intel Architecture Manual 3A
+${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7
+
+#
+# XXX the values of rsp and rip are bogus and should come from the loader.
+#
+RSP=0xa5a5a5a5
+RIP=0x0000bfbfbfbf0000
+RFLAGS=0x2
+${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp
+${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip
+${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags
+
+# Set "hidden" state of %cs descriptor to indicate long mode code segment.
+#
+# Note that this should match the contents of the entry pointed to by the
+# segment selector in the GDTR.
+#
+${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs
+
+# Set "hidden" state of all data descriptors to indicate a usable segment.
+# The only useful fields are the "Present" and "Descriptor Type" bits.
+${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds
+${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es
+${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs
+${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs
+${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss
+
+#
+# Set the code segment selector to point to entry at offset 8 in the GDTR.
+#
+${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs
+
+# Set all the remaining data segment selectors to point to entry at offset
+# 16 in the GDTR.
+${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds
+${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es
+${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs
+${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs
+${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss
+
+# XXX the value of the GDTR should come from the loader.
+# Set the GDTR
+GDTR_BASE=0xffff0000
+GDTR_LIMIT=0x10
+${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr
+
+# pin the vcpu to host cpu 0, then clear the pinning again (-1)
+${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning
+${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning
+
+${VMMCTL} --vm=${VMNAME} --destroy
diff --git a/usr.sbin/vmmctl/vmmctl.c b/usr.sbin/vmmctl/vmmctl.c
new file mode 100644
index 0000000..678f98b
--- /dev/null
+++ b/usr.sbin/vmmctl/vmmctl.c
@@ -0,0 +1,1485 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+/*
+ * Print the list of supported command line options to stderr and
+ * exit with a non-zero status.  Called when option parsing fails or
+ * when the mandatory --vm option is missing.
+ *
+ * Note: every long option accepted by getopt_long() in main() must
+ * have a matching entry here.
+ */
+static void
+usage(void)
+{
+
+	(void)fprintf(stderr,
+	    "Usage: %s --vm=<name>\n"
+	    " [--cpu=<vcpu_number>]\n"
+	    " [--create]\n"
+	    " [--destroy]\n"
+	    " [--get-stats]\n"
+	    " [--set-desc-ds]\n"
+	    " [--get-desc-ds]\n"
+	    " [--set-desc-es]\n"
+	    " [--get-desc-es]\n"
+	    " [--set-desc-gs]\n"
+	    " [--get-desc-gs]\n"
+	    " [--set-desc-fs]\n"
+	    " [--get-desc-fs]\n"
+	    " [--set-desc-cs]\n"
+	    " [--get-desc-cs]\n"
+	    " [--set-desc-ss]\n"
+	    " [--get-desc-ss]\n"
+	    " [--set-desc-tr]\n"
+	    " [--get-desc-tr]\n"
+	    " [--set-desc-ldtr]\n"
+	    " [--get-desc-ldtr]\n"
+	    " [--set-desc-gdtr]\n"
+	    " [--get-desc-gdtr]\n"
+	    " [--set-desc-idtr]\n"
+	    " [--get-desc-idtr]\n"
+	    " [--run]\n"
+	    " [--capname=<capname>]\n"
+	    " [--getcap]\n"
+	    " [--setcap=<0|1>]\n"
+	    " [--desc-base=<BASE>]\n"
+	    " [--desc-limit=<LIMIT>]\n"
+	    " [--desc-access=<ACCESS>]\n"
+	    " [--set-cr0=<CR0>]\n"
+	    " [--get-cr0]\n"
+	    " [--set-cr3=<CR3>]\n"
+	    " [--get-cr3]\n"
+	    " [--set-cr4=<CR4>]\n"
+	    " [--get-cr4]\n"
+	    " [--set-efer=<EFER>]\n"
+	    " [--get-efer]\n"
+	    " [--set-dr7=<DR7>]\n"
+	    " [--get-dr7]\n"
+	    " [--set-rsp=<RSP>]\n"
+	    " [--get-rsp]\n"
+	    " [--set-rip=<RIP>]\n"
+	    " [--get-rip]\n"
+	    " [--get-rax]\n"
+	    " [--set-rax=<RAX>]\n"
+	    " [--get-rbx]\n"
+	    " [--get-rcx]\n"
+	    " [--get-rdx]\n"
+	    " [--get-rsi]\n"
+	    " [--get-rdi]\n"
+	    " [--get-rbp]\n"
+	    " [--get-r8]\n"
+	    " [--get-r9]\n"
+	    " [--get-r10]\n"
+	    " [--get-r11]\n"
+	    " [--get-r12]\n"
+	    " [--get-r13]\n"
+	    " [--get-r14]\n"
+	    " [--get-r15]\n"
+	    " [--set-rflags=<RFLAGS>]\n"
+	    " [--get-rflags]\n"
+	    " [--set-cs]\n"
+	    " [--get-cs]\n"
+	    " [--set-ds]\n"
+	    " [--get-ds]\n"
+	    " [--set-es]\n"
+	    " [--get-es]\n"
+	    " [--set-fs]\n"
+	    " [--get-fs]\n"
+	    " [--set-gs]\n"
+	    " [--get-gs]\n"
+	    " [--set-ss]\n"
+	    " [--get-ss]\n"
+	    " [--get-tr]\n"
+	    " [--get-ldtr]\n"
+	    " [--get-vmcs-pinbased-ctls]\n"
+	    " [--get-vmcs-procbased-ctls]\n"
+	    " [--get-vmcs-procbased-ctls2]\n"
+	    " [--get-vmcs-entry-interruption-info]\n"
+	    " [--set-vmcs-entry-interruption-info=<info>]\n"
+	    " [--get-vmcs-eptp]\n"
+	    " [--get-vmcs-guest-physical-address]\n"
+	    " [--get-vmcs-guest-linear-address]\n"
+	    " [--set-vmcs-exception-bitmap]\n"
+	    " [--get-vmcs-exception-bitmap]\n"
+	    " [--get-vmcs-io-bitmap-address]\n"
+	    " [--get-vmcs-tsc-offset]\n"
+	    " [--get-vmcs-guest-pat]\n"
+	    " [--get-vmcs-host-pat]\n"
+	    " [--get-vmcs-host-cr0]\n"
+	    " [--get-vmcs-host-cr3]\n"
+	    " [--get-vmcs-host-cr4]\n"
+	    " [--get-vmcs-host-rip]\n"
+	    " [--get-vmcs-host-rsp]\n"
+	    " [--get-vmcs-cr0-mask]\n"
+	    " [--get-vmcs-cr0-shadow]\n"
+	    " [--get-vmcs-cr4-mask]\n"
+	    " [--get-vmcs-cr4-shadow]\n"
+	    " [--get-vmcs-cr3-targets]\n"
+	    " [--get-vmcs-apic-access-address]\n"
+	    " [--get-vmcs-virtual-apic-address]\n"
+	    " [--get-vmcs-tpr-threshold]\n"
+	    " [--get-vmcs-msr-bitmap]\n"
+	    " [--get-vmcs-msr-bitmap-address]\n"
+	    " [--get-vmcs-vpid]\n"
+	    " [--get-vmcs-ple-gap]\n"
+	    " [--get-vmcs-ple-window]\n"
+	    " [--get-vmcs-instruction-error]\n"
+	    " [--get-vmcs-exit-ctls]\n"
+	    " [--get-vmcs-entry-ctls]\n"
+	    " [--get-vmcs-guest-sysenter]\n"
+	    " [--get-vmcs-link]\n"
+	    " [--get-vmcs-exit-reason]\n"
+	    " [--get-vmcs-exit-qualification]\n"
+	    " [--get-vmcs-exit-interruption-info]\n"
+	    " [--get-vmcs-exit-interruption-error]\n"
+	    " [--get-vmcs-interruptibility]\n"
+	    " [--set-pinning=<host_cpuid>]\n"
+	    " [--get-pinning]\n"
+	    " [--set-lowmem=<memory below 4GB in units of MB>]\n"
+	    " [--get-lowmem]\n"
+	    " [--set-highmem=<memory above 4GB in units of MB>]\n"
+	    " [--get-highmem]\n",
+	    progname);
+	exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t lowmem, highmem;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_pinning, get_pinning, pincpu;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+/*
+ * Pretty-print the state captured in 'vmexit' for the given vcpu.
+ * Only the INOUT and VMX exit reasons are decoded in detail; any
+ * other exitcode is reported verbatim.
+ */
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+	const char *dir, *strflag, *repflag;
+
+	printf("vm exit[%d]\n", vcpu);
+	printf("\trip\t\t0x%016lx\n", vmexit->rip);
+	printf("\tinst_length\t%d\n", vmexit->inst_length);
+
+	if (vmexit->exitcode == VM_EXITCODE_INOUT) {
+		dir = vmexit->u.inout.in ? "IN" : "OUT";
+		strflag = vmexit->u.inout.string ? "STRING " : "";
+		repflag = vmexit->u.inout.rep ? "REP " : "";
+		printf("\treason\t\tINOUT\n");
+		printf("\tdirection\t%s\n", dir);
+		printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+		printf("\tflags\t\t%s%s\n", strflag, repflag);
+		printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+		printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+	} else if (vmexit->exitcode == VM_EXITCODE_VMX) {
+		printf("\treason\t\tVMX\n");
+		printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+		printf("\texit_reason\t0x%08x (%u)\n",
+		    vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+		printf("\tqualification\t0x%016lx\n",
+		    vmexit->u.vmx.exit_qualification);
+	} else {
+		printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+	}
+}
+
+/*
+ * Dump the set of MSRs that the guest may access directly, as encoded
+ * in the 4KB VMCS MSR bitmap located at host physical address 'addr'.
+ *
+ * The bitmap page is laid out in four 1KB quarters:
+ *   offset 0    - read bitmap for MSRs  0x00000000 - 0x00001FFF
+ *   offset 1024 - read bitmap for MSRs  0xC0000000 - 0xC0001FFF
+ *   offset 2048 - write bitmap for MSRs 0x00000000 - 0x00001FFF
+ *   offset 3072 - write bitmap for MSRs 0xC0000000 - 0xC0001FFF
+ * A clear bit means the access does not cause a VM-exit.
+ *
+ * Returns 0 on success, -1 if /dev/mem could not be opened or mapped.
+ */
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+	int error, fd, byte, bit, readable, writeable;
+	u_int msr;
+	const char *bitmap;
+
+	error = -1;
+	bitmap = MAP_FAILED;
+
+	fd = open("/dev/mem", O_RDONLY, 0);
+	if (fd < 0)
+		goto done;
+
+	/*
+	 * mmap(2) requires a mapping type in 'flags'; passing 0 here is
+	 * invalid and fails with EINVAL.  MAP_SHARED is the appropriate
+	 * choice for a read-only mapping of /dev/mem.
+	 */
+	bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
+	if (bitmap == MAP_FAILED)
+		goto done;
+
+	for (msr = 0; msr < 0x2000; msr++) {
+		byte = msr / 8;
+		bit = msr & 0x7;
+
+		/* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+		readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+		writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+		if (readable || writeable) {
+			printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+				readable ? 'R' : '-',
+				writeable ? 'W' : '-');
+		}
+
+		/* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+		byte += 1024;
+		readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+		writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+		if (readable || writeable) {
+			printf("msr 0x%08x[%d]\t\t%c%c\n",
+				0xc0000000 + msr, vcpu,
+				readable ? 'R' : '-',
+				writeable ? 'W' : '-');
+		}
+	}
+
+	error = 0;
+done:
+	if (bitmap != MAP_FAILED)
+		munmap((void *)bitmap, PAGE_SIZE);
+	if (fd >= 0)
+		close(fd);
+	return (error);
+}
+
+/*
+ * Read an arbitrary VMCS field for vcpu 'vcpu' through the generic
+ * register accessor.  VMCS_IDENT() maps the raw VMCS field encoding
+ * into the pseudo-register namespace understood by vmm(4).
+ */
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+	int ident;
+
+	ident = VMCS_IDENT(field);
+	return (vm_get_register(ctx, vcpu, ident, ret_val));
+}
+
+/*
+ * Write an arbitrary VMCS field for vcpu 'vcpu' through the generic
+ * register accessor.  VMCS_IDENT() maps the raw VMCS field encoding
+ * into the pseudo-register namespace understood by vmm(4).
+ */
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+	int ident;
+
+	ident = VMCS_IDENT(field);
+	return (vm_set_register(ctx, vcpu, ident, val));
+}
+
+/*
+ * Identifiers for long options that take an argument.  getopt_long(3)
+ * returns one of these from the 'val' field of the matching entry in
+ * the opts[] table in main().  Long options without an argument are
+ * instead handled via flag pointers and make getopt_long() return 0.
+ *
+ * The numbering starts at 1000 so these values can never collide with
+ * the single-character return values of getopt.
+ */
+enum {
+	VMNAME = 1000,	/* avoid collision with return values from getopt */
+	VCPU,
+	SET_LOWMEM,
+	SET_HIGHMEM,
+	SET_EFER,
+	SET_CR0,
+	SET_CR3,
+	SET_CR4,
+	SET_DR7,
+	SET_RSP,
+	SET_RIP,
+	SET_RAX,
+	SET_RFLAGS,
+	DESC_BASE,
+	DESC_LIMIT,
+	DESC_ACCESS,
+	SET_CS,
+	SET_DS,
+	SET_ES,
+	SET_FS,
+	SET_GS,
+	SET_SS,
+	SET_TR,
+	SET_LDTR,
+	SET_PINNING,
+	SET_VMCS_EXCEPTION_BITMAP,
+	SET_VMCS_ENTRY_INTERRUPTION_INFO,
+	SET_CAP,
+	CAPNAME,
+};
+
+int
+main(int argc, char *argv[])
+{
+ char *vmname;
+ int error, ch, vcpu;
+ vm_paddr_t hpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, tsc_off, addr, u64;
+ struct vmctx *ctx;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-lowmem", REQ_ARG, 0, SET_LOWMEM },
+ { "set-highmem",REQ_ARG, 0, SET_HIGHMEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-pinning",REQ_ARG, 0, SET_PINNING },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-pinning",NO_ARG, &get_pinning, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_LOWMEM:
+ lowmem = atoi(optarg) * MB;
+ lowmem = roundup(lowmem, 2 * MB);
+ break;
+ case SET_HIGHMEM:
+ highmem = atoi(optarg) * MB;
+ highmem = roundup(highmem, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_PINNING:
+ pincpu = strtol(optarg, NULL, 0);
+ set_pinning = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && lowmem)
+ error = vm_setup_memory(ctx, 0, lowmem, NULL);
+
+ if (!error && highmem)
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_pinning)
+ error = vm_set_pinning(ctx, vcpu, pincpu);
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && get_lowmem) {
+ error = vm_get_memory_seg(ctx, 0, &hpa, &len);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld\n", hpa, len);
+ }
+
+ if (!error && get_highmem) {
+ error = vm_get_memory_seg(ctx, 4 * GB, &hpa, &len);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld\n", hpa, len);
+ }
+
+ if (!error && get_efer) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && get_cr0) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && get_cr3) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && get_cr4) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && get_dr7) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && get_rsp) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && get_rip) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_rax) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && get_rbx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && get_rcx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && get_rdx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && get_rsi) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && get_rdi) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && get_rbp) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && get_r8) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && get_r9) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && get_r10) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && get_r11) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && get_r12) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && get_r13) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && get_r14) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && get_r15) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && get_rflags) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+ if (!error && get_stats) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-32s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+
+ if (!error && get_desc_ds) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_es) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_fs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_gs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_ss) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_cs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_tr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_ldtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_gdtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && get_desc_idtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && get_cs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && get_ds) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && get_es) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && get_fs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && get_gs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && get_ss) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && get_tr) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && get_ldtr) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && get_pinning) {
+ error = vm_get_pinning(ctx, vcpu, &pincpu);
+ if (error == 0) {
+ if (pincpu < 0)
+ printf("pincpu[%d]\tunpinned\n", vcpu);
+ else
+ printf("pincpu[%d]\t%d\n", vcpu, pincpu);
+ }
+ }
+
+ if (!error && get_pinbased_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_procbased_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_procbased_ctls2) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_vmcs_gla) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_gpa) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_entry_interruption_info) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08x\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && get_eptp) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && get_exception_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08x\n", vcpu, bm);
+ }
+
+ if (!error && get_io_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08x\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08x\n", vcpu, bm);
+ }
+
+ if (!error && get_tsc_offset) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", tscoff);
+ }
+
+ if (!error && get_cr0_mask) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", cr0mask);
+ }
+
+ if (!error && get_cr0_shadow) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", cr0shadow);
+ }
+
+ if (!error && get_cr4_mask) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", cr4mask);
+ }
+
+ if (!error && get_cr4_shadow) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", cr4shadow);
+ }
+
+ if (!error && get_cr3_targets) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08x\n",
+ vcpu, target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && get_apic_access_addr) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_virtual_apic_addr) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_tpr_threshold) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08x\n", vcpu, threshold);
+ }
+
+ if (!error && get_msr_bitmap_address) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_msr_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && get_vpid) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04x\n", vcpu, vpid);
+ }
+
+ if (!error && get_ple_window) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08x\n", vcpu, window);
+ }
+
+ if (!error && get_ple_gap) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08x\n", vcpu, gap);
+ }
+
+ if (!error && get_inst_err) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08x\n",
+ vcpu, insterr);
+ }
+ }
+
+ if (!error && get_exit_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_entry_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_host_pat) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && get_guest_pat) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && get_host_cr0) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && get_host_cr3) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && get_host_cr4) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && get_host_rip) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_host_rsp) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && get_guest_sysenter) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08x\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_vmcs_link) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_vmcs_exit_reason) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_exit_qualification) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && get_vmcs_exit_interruption_info) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08x\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && get_vmcs_exit_interruption_error) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08x\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && get_vmcs_interruptibility) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08x\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && getcap) {
+ int captype, val;
+ captype = vm_capability_name2type(capname);
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n", capname,
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+ }
+
+ if (!error && run) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ error = vm_run(ctx, vcpu, rip, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ vm_destroy(ctx);
+
+ exit(error);
+}
OpenPOWER on IntegriCloud