Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/Makefile  39
-rw-r--r--  sys/amd64/acpica/acpi_machdep.c  385
-rw-r--r--  sys/amd64/acpica/acpi_wakecode.S  282
-rw-r--r--  sys/amd64/amd64/amd64_mem.c  755
-rw-r--r--  sys/amd64/amd64/apic_vector.S  326
-rw-r--r--  sys/amd64/amd64/atomic.c  49
-rw-r--r--  sys/amd64/amd64/atpic_vector.S  73
-rw-r--r--  sys/amd64/amd64/autoconf.c  132
-rw-r--r--  sys/amd64/amd64/bios.c  95
-rw-r--r--  sys/amd64/amd64/bpf_jit_machdep.c  653
-rw-r--r--  sys/amd64/amd64/bpf_jit_machdep.h  482
-rw-r--r--  sys/amd64/amd64/cpu_switch.S  541
-rw-r--r--  sys/amd64/amd64/db_disasm.c  1637
-rw-r--r--  sys/amd64/amd64/db_interface.c  149
-rw-r--r--  sys/amd64/amd64/db_trace.c  701
-rw-r--r--  sys/amd64/amd64/elf_machdep.c  278
-rw-r--r--  sys/amd64/amd64/exception.S  878
-rw-r--r--  sys/amd64/amd64/fpu.c  1012
-rw-r--r--  sys/amd64/amd64/gdb_machdep.c  117
-rw-r--r--  sys/amd64/amd64/genassym.c  252
-rw-r--r--  sys/amd64/amd64/identcpu.c  688
-rw-r--r--  sys/amd64/amd64/in_cksum.c  241
-rw-r--r--  sys/amd64/amd64/initcpu.c  217
-rw-r--r--  sys/amd64/amd64/io.c  59
-rw-r--r--  sys/amd64/amd64/locore.S  88
-rw-r--r--  sys/amd64/amd64/machdep.c  2556
-rw-r--r--  sys/amd64/amd64/mem.c  216
-rw-r--r--  sys/amd64/amd64/minidump_machdep.c  479
-rw-r--r--  sys/amd64/amd64/mp_machdep.c  1488
-rw-r--r--  sys/amd64/amd64/mp_watchdog.c  211
-rw-r--r--  sys/amd64/amd64/mpboot.S  236
-rw-r--r--  sys/amd64/amd64/pmap.c  5538
-rw-r--r--  sys/amd64/amd64/prof_machdep.c  391
-rw-r--r--  sys/amd64/amd64/ptrace_machdep.c  152
-rw-r--r--  sys/amd64/amd64/sigtramp.S  56
-rw-r--r--  sys/amd64/amd64/stack_machdep.c  87
-rw-r--r--  sys/amd64/amd64/support.S  732
-rw-r--r--  sys/amd64/amd64/sys_machdep.c  753
-rw-r--r--  sys/amd64/amd64/trap.c  1006
-rw-r--r--  sys/amd64/amd64/uio_machdep.c  119
-rw-r--r--  sys/amd64/amd64/uma_machdep.c  84
-rw-r--r--  sys/amd64/amd64/vm_machdep.c  750
-rw-r--r--  sys/amd64/compile/.cvsignore  1
-rw-r--r--  sys/amd64/conf/.cvsignore  1
-rw-r--r--  sys/amd64/conf/DEFAULTS  24
-rw-r--r--  sys/amd64/conf/GENERIC  340
-rw-r--r--  sys/amd64/conf/GENERIC.hints  33
-rw-r--r--  sys/amd64/conf/Makefile  5
-rw-r--r--  sys/amd64/conf/NOTES  626
-rw-r--r--  sys/amd64/conf/XENHVM  22
-rw-r--r--  sys/amd64/ia32/ia32_exception.S  75
-rw-r--r--  sys/amd64/ia32/ia32_misc.c  82
-rw-r--r--  sys/amd64/ia32/ia32_reg.c  235
-rw-r--r--  sys/amd64/ia32/ia32_signal.c  1006
-rw-r--r--  sys/amd64/ia32/ia32_sigtramp.S  161
-rw-r--r--  sys/amd64/ia32/ia32_syscall.c  255
-rw-r--r--  sys/amd64/include/_align.h  6
-rw-r--r--  sys/amd64/include/_bus.h  46
-rw-r--r--  sys/amd64/include/_inttypes.h  6
-rw-r--r--  sys/amd64/include/_limits.h  6
-rw-r--r--  sys/amd64/include/_stdint.h  6
-rw-r--r--  sys/amd64/include/_types.h  6
-rw-r--r--  sys/amd64/include/acpica_machdep.h  84
-rw-r--r--  sys/amd64/include/apicvar.h  232
-rw-r--r--  sys/amd64/include/apm_bios.h  6
-rw-r--r--  sys/amd64/include/asm.h  91
-rw-r--r--  sys/amd64/include/asmacros.h  204
-rw-r--r--  sys/amd64/include/atomic.h  483
-rw-r--r--  sys/amd64/include/bus.h  6
-rw-r--r--  sys/amd64/include/bus_dma.h  34
-rw-r--r--  sys/amd64/include/clock.h  43
-rw-r--r--  sys/amd64/include/counter.h  51
-rw-r--r--  sys/amd64/include/cpu.h  78
-rw-r--r--  sys/amd64/include/cpufunc.h  791
-rw-r--r--  sys/amd64/include/cputypes.h  59
-rw-r--r--  sys/amd64/include/db_machdep.h  94
-rw-r--r--  sys/amd64/include/elf.h  6
-rw-r--r--  sys/amd64/include/endian.h  6
-rw-r--r--  sys/amd64/include/exec.h  38
-rw-r--r--  sys/amd64/include/float.h  6
-rw-r--r--  sys/amd64/include/floatingpoint.h  43
-rw-r--r--  sys/amd64/include/fpu.h  89
-rw-r--r--  sys/amd64/include/frame.h  6
-rw-r--r--  sys/amd64/include/gdb_machdep.h  52
-rw-r--r--  sys/amd64/include/ieeefp.h  308
-rw-r--r--  sys/amd64/include/in_cksum.h  84
-rw-r--r--  sys/amd64/include/intr_machdep.h  174
-rw-r--r--  sys/amd64/include/iodev.h  46
-rw-r--r--  sys/amd64/include/kdb.h  59
-rw-r--r--  sys/amd64/include/limits.h  44
-rw-r--r--  sys/amd64/include/md_var.h  121
-rw-r--r--  sys/amd64/include/memdev.h  40
-rw-r--r--  sys/amd64/include/metadata.h  35
-rw-r--r--  sys/amd64/include/minidump.h  46
-rw-r--r--  sys/amd64/include/mp_watchdog.h  34
-rw-r--r--  sys/amd64/include/nexusvar.h  45
-rw-r--r--  sys/amd64/include/npx.h  6
-rw-r--r--  sys/amd64/include/param.h  142
-rw-r--r--  sys/amd64/include/pc/bios.h  111
-rw-r--r--  sys/amd64/include/pc/display.h  45
-rw-r--r--  sys/amd64/include/pcb.h  149
-rw-r--r--  sys/amd64/include/pci_cfgreg.h  6
-rw-r--r--  sys/amd64/include/pcpu.h  261
-rw-r--r--  sys/amd64/include/pmap.h  337
-rw-r--r--  sys/amd64/include/pmc_mdep.h  143
-rw-r--r--  sys/amd64/include/ppireg.h  49
-rw-r--r--  sys/amd64/include/proc.h  91
-rw-r--r--  sys/amd64/include/profile.h  201
-rw-r--r--  sys/amd64/include/psl.h  6
-rw-r--r--  sys/amd64/include/ptrace.h  6
-rw-r--r--  sys/amd64/include/reg.h  6
-rw-r--r--  sys/amd64/include/reloc.h  49
-rw-r--r--  sys/amd64/include/resource.h  44
-rw-r--r--  sys/amd64/include/runq.h  46
-rw-r--r--  sys/amd64/include/segments.h  106
-rw-r--r--  sys/amd64/include/setjmp.h  6
-rw-r--r--  sys/amd64/include/sf_buf.h  58
-rw-r--r--  sys/amd64/include/sigframe.h  6
-rw-r--r--  sys/amd64/include/signal.h  6
-rw-r--r--  sys/amd64/include/smp.h  82
-rw-r--r--  sys/amd64/include/specialreg.h  6
-rw-r--r--  sys/amd64/include/stack.h  44
-rw-r--r--  sys/amd64/include/stdarg.h  6
-rw-r--r--  sys/amd64/include/sysarch.h  6
-rw-r--r--  sys/amd64/include/timerreg.h  54
-rw-r--r--  sys/amd64/include/trap.h  6
-rw-r--r--  sys/amd64/include/tss.h  70
-rw-r--r--  sys/amd64/include/ucontext.h  6
-rw-r--r--  sys/amd64/include/varargs.h  89
-rw-r--r--  sys/amd64/include/vdso.h  6
-rw-r--r--  sys/amd64/include/vm.h  45
-rw-r--r--  sys/amd64/include/vmm.h  291
-rw-r--r--  sys/amd64/include/vmm_dev.h  204
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  126
-rw-r--r--  sys/amd64/include/vmparam.h  217
-rw-r--r--  sys/amd64/include/xen/hypercall.h  415
-rw-r--r--  sys/amd64/include/xen/synch_bitops.h  129
-rw-r--r--  sys/amd64/include/xen/xen-os.h  296
-rw-r--r--  sys/amd64/include/xen/xenfunc.h  82
-rw-r--r--  sys/amd64/include/xen/xenpmap.h  227
-rw-r--r--  sys/amd64/include/xen/xenvar.h  120
-rw-r--r--  sys/amd64/linux32/Makefile  17
-rw-r--r--  sys/amd64/linux32/linux.h  788
-rw-r--r--  sys/amd64/linux32/linux32_dummy.c  176
-rw-r--r--  sys/amd64/linux32/linux32_genassym.c  14
-rw-r--r--  sys/amd64/linux32/linux32_ipc64.h  145
-rw-r--r--  sys/amd64/linux32/linux32_locore.s  38
-rw-r--r--  sys/amd64/linux32/linux32_machdep.c  1067
-rw-r--r--  sys/amd64/linux32/linux32_proto.h  1682
-rw-r--r--  sys/amd64/linux32/linux32_support.s  124
-rw-r--r--  sys/amd64/linux32/linux32_syscall.h  324
-rw-r--r--  sys/amd64/linux32/linux32_syscalls.c  360
-rw-r--r--  sys/amd64/linux32/linux32_sysent.c  371
-rw-r--r--  sys/amd64/linux32/linux32_systrace_args.c  6667
-rw-r--r--  sys/amd64/linux32/linux32_sysvec.c  1205
-rw-r--r--  sys/amd64/linux32/syscalls.conf  11
-rw-r--r--  sys/amd64/linux32/syscalls.master  561
-rw-r--r--  sys/amd64/pci/pci_cfgreg.c  370
-rw-r--r--  sys/amd64/vmm/amd/amdv.c  265
-rw-r--r--  sys/amd64/vmm/intel/ept.c  392
-rw-r--r--  sys/amd64/vmm/intel/ept.h  43
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c  551
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h  338
-rw-r--r--  sys/amd64/vmm/intel/vmx.c  1867
-rw-r--r--  sys/amd64/vmm/intel/vmx.h  120
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h  92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h  218
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c  89
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c  172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h  78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S  246
-rw-r--r--  sys/amd64/vmm/intel/vtd.c  677
-rw-r--r--  sys/amd64/vmm/io/iommu.c  277
-rw-r--r--  sys/amd64/vmm/io/iommu.h  75
-rw-r--r--  sys/amd64/vmm/io/ppt.c  594
-rw-r--r--  sys/amd64/vmm/io/ppt.h  41
-rw-r--r--  sys/amd64/vmm/io/vdev.c  270
-rw-r--r--  sys/amd64/vmm/io/vdev.h  84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c  907
-rw-r--r--  sys/amd64/vmm/io/vlapic.h  111
-rw-r--r--  sys/amd64/vmm/vmm.c  992
-rw-r--r--  sys/amd64/vmm/vmm_dev.c  526
-rw-r--r--  sys/amd64/vmm/vmm_host.c  124
-rw-r--r--  sys/amd64/vmm/vmm_host.h  75
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c  867
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c  93
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h  39
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h  51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c  201
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h  71
-rw-r--r--  sys/amd64/vmm/vmm_mem.c  135
-rw-r--r--  sys/amd64/vmm/vmm_mem.h  37
-rw-r--r--  sys/amd64/vmm/vmm_msr.c  254
-rw-r--r--  sys/amd64/vmm/vmm_msr.h  43
-rw-r--r--  sys/amd64/vmm/vmm_stat.c  130
-rw-r--r--  sys/amd64/vmm/vmm_stat.h  105
-rw-r--r--  sys/amd64/vmm/vmm_support.S  42
-rw-r--r--  sys/amd64/vmm/vmm_util.c  111
-rw-r--r--  sys/amd64/vmm/vmm_util.h  40
-rw-r--r--  sys/amd64/vmm/x86.c  219
-rw-r--r--  sys/amd64/vmm/x86.h  64
201 files changed, 61516 insertions, 0 deletions
diff --git a/sys/amd64/Makefile b/sys/amd64/Makefile
new file mode 100644
index 0000000..3925e74
--- /dev/null
+++ b/sys/amd64/Makefile
@@ -0,0 +1,39 @@
+# $FreeBSD$
+# @(#)Makefile 8.1 (Berkeley) 6/11/93
+
+# Makefile for amd64 links, tags file
+
+# SYS is normally set in Make.tags.inc
+SYS=/sys
+
+TAGDIR= amd64
+
+.include "../kern/Make.tags.inc"
+
+all:
+ @echo "make links or tags only"
+
+# Directories in which to place amd64 tags links
+DAMD64= acpica amd64 ia32 include isa linux32 pci
+
+links::
+ -for i in ${COMMDIR1}; do \
+ (cd $$i && { rm -f tags; ln -s ../${TAGDIR}/tags tags; }) done
+ -for i in ${COMMDIR2}; do \
+ (cd $$i && { rm -f tags; ln -s ../../${TAGDIR}/tags tags; }) done
+ -for i in ${DAMD64}; do \
+ (cd $$i && { rm -f tags; ln -s ../tags tags; }) done
+
+SAMD64= ${SYS}/amd64/acpica/*.[ch] \
+ ${SYS}/amd64/amd64/*.[ch] ${SYS}/amd64/ia32/*.[ch] \
+ ${SYS}/amd64/include/*.[ch] ${SYS}/amd64/isa/*.[ch] \
+ ${SYS}/amd64/linux32/*.[ch] ${SYS}/amd64/pci/*.[ch]
+AAMD64= ${SYS}/amd64/amd64/*.S
+
+tags::
+ -ctags -wdt ${COMM} ${SAMD64}
+ egrep "^ENTRY(.*)|^ALTENTRY(.*)" ${AAMD64} | \
+ sed "s;\([^:]*\):\([^(]*\)(\([^, )]*\)\(.*\);\3 \1 /^\2(\3\4$$/;" \
+ >> tags
+ sort -o tags tags
+ chmod 444 tags
diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c
new file mode 100644
index 0000000..e5dd4c3
--- /dev/null
+++ b/sys/amd64/acpica/acpi_machdep.c
@@ -0,0 +1,385 @@
+/*-
+ * Copyright (c) 2001 Mitsuru IWASAKI
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <contrib/dev/acpica/include/actables.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include <machine/nexusvar.h>
+
+int acpi_resume_beep;
+TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep);
+SYSCTL_INT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep,
+ 0, "Beep the PC speaker when resuming");
+
+int acpi_reset_video;
+TUNABLE_INT("hw.acpi.reset_video", &acpi_reset_video);
+
+static int intr_model = ACPI_INTR_PIC;
+
+int
+acpi_machdep_init(device_t dev)
+{
+ struct acpi_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ acpi_apm_init(sc);
+
+ if (intr_model != ACPI_INTR_PIC)
+ acpi_SetIntrModel(intr_model);
+
+ SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx,
+ SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO,
+ "reset_video", CTLFLAG_RW, &acpi_reset_video, 0,
+ "Call the VESA reset BIOS vector on the resume path");
+
+ return (0);
+}
+
+void
+acpi_SetDefaultIntrModel(int model)
+{
+
+ intr_model = model;
+}
+
+int
+acpi_machdep_quirks(int *quirks)
+{
+
+ return (0);
+}
+
+void
+acpi_cpu_c1()
+{
+
+ __asm __volatile("sti; hlt");
+}
+
+/*
+ * Support for mapping ACPI tables during early boot. Currently this
+ * uses the crashdump map to map each table. However, the crashdump
+ * map is created in pmap_bootstrap() right after the direct map, so
+ * we should be able to just use pmap_mapbios() here instead.
+ *
+ * This makes the following assumptions about how we use this KVA:
+ * pages 0 and 1 are used to map in the header of each table found via
+ * the RSDT or XSDT and pages 2 to n are used to map in the RSDT or
+ * XSDT. This has to use 2 pages for the table headers in case a
+ * header spans a page boundary.
+ *
+ * XXX: We don't ensure the table fits in the available address space
+ * in the crashdump map.
+ */
+
+/*
+ * Map some memory using the crashdump map. 'offset' is an offset in
+ * pages into the crashdump map to use for the start of the mapping.
+ */
+static void *
+table_map(vm_paddr_t pa, int offset, vm_offset_t length)
+{
+ vm_offset_t va, off;
+ void *data;
+
+ off = pa & PAGE_MASK;
+ length = round_page(length + off);
+ pa = pa & PG_FRAME;
+ va = (vm_offset_t)pmap_kenter_temporary(pa, offset) +
+ (offset * PAGE_SIZE);
+ data = (void *)(va + off);
+ length -= PAGE_SIZE;
+ while (length > 0) {
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ length -= PAGE_SIZE;
+ pmap_kenter(va, pa);
+ invlpg(va);
+ }
+ return (data);
+}
+
+/* Unmap memory previously mapped with table_map(). */
+static void
+table_unmap(void *data, vm_offset_t length)
+{
+ vm_offset_t va, off;
+
+ va = (vm_offset_t)data;
+ off = va & PAGE_MASK;
+ length = round_page(length + off);
+ va &= ~PAGE_MASK;
+ while (length > 0) {
+ pmap_kremove(va);
+ invlpg(va);
+ va += PAGE_SIZE;
+ length -= PAGE_SIZE;
+ }
+}
+
+/*
+ * Map a table at a given offset into the crashdump map. It first
+ * maps the header to determine the table length and then maps the
+ * entire table.
+ */
+static void *
+map_table(vm_paddr_t pa, int offset, const char *sig)
+{
+ ACPI_TABLE_HEADER *header;
+ vm_offset_t length;
+ void *table;
+
+ header = table_map(pa, offset, sizeof(ACPI_TABLE_HEADER));
+ if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) {
+ table_unmap(header, sizeof(ACPI_TABLE_HEADER));
+ return (NULL);
+ }
+ length = header->Length;
+ table_unmap(header, sizeof(ACPI_TABLE_HEADER));
+ table = table_map(pa, offset, length);
+ if (ACPI_FAILURE(AcpiTbChecksum(table, length))) {
+ if (bootverbose)
+ printf("ACPI: Failed checksum for table %s\n", sig);
+#if (ACPI_CHECKSUM_ABORT)
+ table_unmap(table, length);
+ return (NULL);
+#endif
+ }
+ return (table);
+}
+
+/*
+ * See if a given ACPI table is the requested table.  Returns 1 if the
+ * table's signature matches or 0 on failure.
+ */
+static int
+probe_table(vm_paddr_t address, const char *sig)
+{
+ ACPI_TABLE_HEADER *table;
+
+ table = table_map(address, 0, sizeof(ACPI_TABLE_HEADER));
+ if (table == NULL) {
+ if (bootverbose)
+ printf("ACPI: Failed to map table at 0x%jx\n",
+ (uintmax_t)address);
+ return (0);
+ }
+ if (bootverbose)
+ printf("Table '%.4s' at 0x%jx\n", table->Signature,
+ (uintmax_t)address);
+
+ if (strncmp(table->Signature, sig, ACPI_NAME_SIZE) != 0) {
+ table_unmap(table, sizeof(ACPI_TABLE_HEADER));
+ return (0);
+ }
+ table_unmap(table, sizeof(ACPI_TABLE_HEADER));
+ return (1);
+}
+
+/*
+ * Try to map a table at a given physical address previously returned
+ * by acpi_find_table().
+ */
+void *
+acpi_map_table(vm_paddr_t pa, const char *sig)
+{
+
+ return (map_table(pa, 0, sig));
+}
+
+/* Unmap a table previously mapped via acpi_map_table(). */
+void
+acpi_unmap_table(void *table)
+{
+ ACPI_TABLE_HEADER *header;
+
+ header = (ACPI_TABLE_HEADER *)table;
+ table_unmap(table, header->Length);
+}
+
+/*
+ * Return the physical address of the requested table or zero if one
+ * is not found.
+ */
+vm_paddr_t
+acpi_find_table(const char *sig)
+{
+ ACPI_PHYSICAL_ADDRESS rsdp_ptr;
+ ACPI_TABLE_RSDP *rsdp;
+ ACPI_TABLE_RSDT *rsdt;
+ ACPI_TABLE_XSDT *xsdt;
+ ACPI_TABLE_HEADER *table;
+ vm_paddr_t addr;
+ int i, count;
+
+ if (resource_disabled("acpi", 0))
+ return (0);
+
+ /*
+ * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn
+ * calls pmap_mapbios() to find the RSDP, we assume that we can use
+ * pmap_mapbios() to map the RSDP.
+ */
+ if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0)
+ return (0);
+ rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP));
+ if (rsdp == NULL) {
+ if (bootverbose)
+ printf("ACPI: Failed to map RSDP\n");
+ return (0);
+ }
+
+ /*
+ * For ACPI >= 2.0, use the XSDT if it is available.
+ * Otherwise, use the RSDT. We map the XSDT or RSDT at page 2
+ * in the crashdump area. Pages 0 and 1 are used to map in the
+ * headers of candidate ACPI tables.
+ */
+ addr = 0;
+ if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) {
+ /*
+ * AcpiOsGetRootPointer only verifies the checksum for
+ * the version 1.0 portion of the RSDP. Version 2.0 has
+ * an additional checksum that we verify first.
+ */
+ if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) {
+ if (bootverbose)
+ printf("ACPI: RSDP failed extended checksum\n");
+ return (0);
+ }
+ xsdt = map_table(rsdp->XsdtPhysicalAddress, 2, ACPI_SIG_XSDT);
+ if (xsdt == NULL) {
+ if (bootverbose)
+ printf("ACPI: Failed to map XSDT\n");
+ return (0);
+ }
+ count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) /
+ sizeof(UINT64);
+ for (i = 0; i < count; i++)
+ if (probe_table(xsdt->TableOffsetEntry[i], sig)) {
+ addr = xsdt->TableOffsetEntry[i];
+ break;
+ }
+ acpi_unmap_table(xsdt);
+ } else {
+ rsdt = map_table(rsdp->RsdtPhysicalAddress, 2, ACPI_SIG_RSDT);
+ if (rsdt == NULL) {
+ if (bootverbose)
+ printf("ACPI: Failed to map RSDT\n");
+ return (0);
+ }
+ count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) /
+ sizeof(UINT32);
+ for (i = 0; i < count; i++)
+ if (probe_table(rsdt->TableOffsetEntry[i], sig)) {
+ addr = rsdt->TableOffsetEntry[i];
+ break;
+ }
+ acpi_unmap_table(rsdt);
+ }
+ pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP));
+ if (addr == 0) {
+ if (bootverbose)
+ printf("ACPI: No %s table found\n", sig);
+ return (0);
+ }
+ if (bootverbose)
+ printf("%s: Found table at 0x%jx\n", sig, (uintmax_t)addr);
+
+ /*
+ * Verify that we can map the full table and that its checksum is
+ * correct, etc.
+ */
+ table = map_table(addr, 0, sig);
+ if (table == NULL)
+ return (0);
+ acpi_unmap_table(table);
+
+ return (addr);
+}
+
+/*
+ * ACPI nexus(4) driver.
+ */
+static int
+nexus_acpi_probe(device_t dev)
+{
+ int error;
+
+ error = acpi_identify();
+ if (error)
+ return (error);
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+nexus_acpi_attach(device_t dev)
+{
+ device_t acpi_dev;
+ int error;
+
+ nexus_init_resources();
+ bus_generic_probe(dev);
+ acpi_dev = BUS_ADD_CHILD(dev, 10, "acpi", 0);
+ if (acpi_dev == NULL)
+ panic("failed to add acpi0 device");
+
+ error = bus_generic_attach(dev);
+ if (error == 0)
+ acpi_install_wakeup_handler(device_get_softc(acpi_dev));
+
+ return (error);
+}
+
+static device_method_t nexus_acpi_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, nexus_acpi_probe),
+ DEVMETHOD(device_attach, nexus_acpi_attach),
+
+ { 0, 0 }
+};
+
+DEFINE_CLASS_1(nexus, nexus_acpi_driver, nexus_acpi_methods, 1, nexus_driver);
+static devclass_t nexus_devclass;
+
+DRIVER_MODULE(nexus_acpi, root, nexus_acpi_driver, nexus_devclass, 0, 0);
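
Aside: the page arithmetic in table_map() above is the part that is easiest to
get wrong.  The following stand-alone C sketch (hypothetical pa/length values,
4 KB pages as on amd64, not part of this commit) reproduces the offset and
rounding computation to show how many pages end up mapped:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE	4096UL
    #define PAGE_MASK	(PAGE_SIZE - 1)
    #define round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)

    int
    main(void)
    {
    	uint64_t pa = 0x7fe30024;	/* hypothetical table address */
    	uint64_t length = 8200;		/* hypothetical table length */
    	uint64_t off, maplen;

    	off = pa & PAGE_MASK;			/* 0x24: offset into first page */
    	maplen = round_page(length + off);	/* 8236 -> 12288 bytes */
    	pa &= ~PAGE_MASK;			/* page-align the mapping base */

    	printf("map %ju pages at 0x%jx, table data at +0x%jx\n",
    	    (uintmax_t)(maplen / PAGE_SIZE), (uintmax_t)pa, (uintmax_t)off);
    	return (0);
    }

For this example the 36-byte offset into the first page pushes an 8200-byte
table from two pages to three, which is exactly why map_table() maps headers
with two pages to spare.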
diff --git a/sys/amd64/acpica/acpi_wakecode.S b/sys/amd64/acpica/acpi_wakecode.S
new file mode 100644
index 0000000..c4b0dcd
--- /dev/null
+++ b/sys/amd64/acpica/acpi_wakecode.S
@@ -0,0 +1,282 @@
+/*-
+ * Copyright (c) 2001 Takanori Watanabe <takawata@jp.freebsd.org>
+ * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki@jp.freebsd.org>
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2008-2012 Jung-uk Kim <jkim@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/ppireg.h>
+#include <machine/specialreg.h>
+#include <machine/timerreg.h>
+
+#include "assym.s"
+
+/*
+ * Resume entry point for real mode.
+ *
+ * If XFirmwareWakingVector is zero and FirmwareWakingVector is non-zero
+ * in FACS, the BIOS enters here in real mode after POST with CS set to
+ * (FirmwareWakingVector >> 4) and IP set to (FirmwareWakingVector & 0xf).
+ * Depending on the previous sleep state, we may need to initialize more
+ * of the system (i.e., S3 suspend-to-RAM vs. S4 suspend-to-disk).
+ *
+ * Note: If XFirmwareWakingVector is non-zero, it should disable address
+ * translation/paging and interrupts, load all segment registers with
+ * a flat 4 GB address space, and set EFLAGS.IF to zero. Currently
+ * this mode is not supported by this code.
+ */
+
+ .data /* So we can modify it */
+
+ ALIGN_TEXT
+ .code16
+wakeup_start:
+ /*
+ * Set up segment registers for real mode, a small stack for
+ * any calls we make, and clear any flags.
+ */
+ cli /* make sure no interrupts */
+ mov %cs, %ax /* copy %cs to %ds. Remember these */
+ mov %ax, %ds /* are offsets rather than selectors */
+ mov %ax, %ss
+ movw $PAGE_SIZE, %sp
+ xorw %ax, %ax
+ pushw %ax
+ popfw
+
+ /* To debug resume hangs, beep the speaker if the user requested it. */
+ testb $~0, resume_beep - wakeup_start
+ jz 1f
+ movb $0, resume_beep - wakeup_start
+
+ /* Set PIC timer2 to beep. */
+ movb $(TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT), %al
+ outb %al, $TIMER_MODE
+
+ /* Turn on speaker. */
+ inb $IO_PPI, %al
+ orb $PIT_SPKR, %al
+ outb %al, $IO_PPI
+
+ /* Set frequency. */
+ movw $0x4c0, %ax
+ outb %al, $TIMER_CNTR2
+ shrw $8, %ax
+ outb %al, $TIMER_CNTR2
+1:
+
+ /* Re-initialize video BIOS if the reset_video tunable is set. */
+ testb $~0, reset_video - wakeup_start
+ jz 1f
+ movb $0, reset_video - wakeup_start
+ lcall $0xc000, $3
+
+ /* When we reach here, int 0x10 should be ready. Hide cursor. */
+ movb $0x01, %ah
+ movb $0x20, %ch
+ int $0x10
+
+ /* Re-start in case the previous BIOS call clobbers them. */
+ jmp wakeup_start
+1:
+
+ /*
+ * Find the relocation base and patch the GDT descriptor and ljmp targets.
+ */
+ xorl %ebx, %ebx
+ mov %cs, %bx
+ sall $4, %ebx /* %ebx is now our relocation base */
+
+ /*
+ * Load the descriptor table pointer. We'll need it when running
+ * in 16-bit protected mode.
+ */
+ lgdtl bootgdtdesc - wakeup_start
+
+ /* Enable protected mode */
+ movl $CR0_PE, %eax
+ mov %eax, %cr0
+
+ /*
+ * Now execute a far jump to turn on protected mode. This
+ * causes the segment registers to turn into selectors and causes
+ * %cs to be loaded from the gdt.
+ *
+ * The following instruction is:
+ * ljmpl $bootcode32 - bootgdt, $wakeup_32 - wakeup_start
+ * but gas cannot assemble that. And besides, we patch the targets
+ * in early startup and its a little clearer what we are patching.
+ */
+wakeup_sw32:
+ .byte 0x66 /* size override to 32 bits */
+ .byte 0xea /* opcode for far jump */
+ .long wakeup_32 - wakeup_start /* offset in segment */
+ .word bootcode32 - bootgdt /* index in gdt for 32 bit code */
+
+ /*
+ * At this point, we are running in 32 bit legacy protected mode.
+ */
+ ALIGN_TEXT
+ .code32
+wakeup_32:
+
+ mov $bootdata32 - bootgdt, %eax
+ mov %ax, %ds
+
+ /* Turn on the PAE and PSE bits for when paging is enabled */
+ mov %cr4, %eax
+ orl $(CR4_PAE | CR4_PSE), %eax
+ mov %eax, %cr4
+
+ /*
+ * Enable EFER.LME so that we get long mode when all the prereqs are
+ * in place. In this case, it turns on when CR0_PG is finally enabled.
+ * Pick up a few other EFER bits that we'll need while we're here.
+ */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ orl $EFER_LME | EFER_SCE, %eax
+ wrmsr
+
+ /*
+ * Point to the embedded page tables for startup. Note that this
+ * only gets accessed after we're actually in 64 bit mode, however
+ * we can only set the bottom 32 bits of %cr3 in this state. This
+ * means we are required to use a temporary page table that is below
+ * the 4GB limit. %ebx is still our relocation base. We could just
+ * subtract 3 * PAGE_SIZE, but that would be too easy.
+ */
+ leal wakeup_pagetables - wakeup_start(%ebx), %eax
+ movl (%eax), %eax
+ mov %eax, %cr3
+
+ /*
+ * Finally, switch to long mode by enabling paging. We have
+ * to be very careful here because all the segmentation disappears
+ * out from underneath us. The spec says we can depend on the
+ * subsequent pipelined branch to execute, but *only if* everything
+ * is still identity mapped. If any mappings change, the pipeline
+ * will flush.
+ */
+ mov %cr0, %eax
+ orl $CR0_PG, %eax
+ mov %eax, %cr0
+
+ /*
+ * At this point paging is enabled, and we are in "compatibility" mode.
+ * We do another far jump to reload %cs with the 64 bit selector.
+ * %cr3 points to a 4-level page table page.
+ * We cannot yet jump all the way to the kernel because we can only
+ * specify a 32 bit linear address. So, yet another trampoline.
+ *
+ * The following instruction is:
+ * ljmp $bootcode64 - bootgdt, $wakeup_64 - wakeup_start
+ * but gas cannot assemble that. And besides, we patch the targets
+ * in early startup and it's a little clearer what we are patching.
+ */
+wakeup_sw64:
+ .byte 0xea /* opcode for far jump */
+ .long wakeup_64 - wakeup_start /* offset in segment */
+ .word bootcode64 - bootgdt /* index in gdt for 64 bit code */
+
+ /*
+ * Yeehar! We're running in 64-bit mode! We can mostly ignore our
+ * segment registers, and get on with it.
+ * Note that we are running at the correct virtual address, but with
+ * a 1:1 1GB mirrored mapping over the entire address space. We had better
+ * switch to a real %cr3 promptly so that we can get to the direct map
+ * space. Remember that jmp is relative and that we've been relocated,
+ * so use an indirect jump.
+ */
+ ALIGN_TEXT
+ .code64
+wakeup_64:
+ mov $bootdata64 - bootgdt, %eax
+ mov %ax, %ds
+
+ /* Restore arguments. */
+ movq wakeup_pcb - wakeup_start(%rbx), %rdi
+ movq wakeup_ret - wakeup_start(%rbx), %rax
+
+ /* Restore GDT. */
+ lgdt wakeup_gdt - wakeup_start(%rbx)
+
+ /* Jump to return address. */
+ jmp *%rax
+
+ .data
+
+resume_beep:
+ .byte 0
+reset_video:
+ .byte 0
+
+ ALIGN_DATA
+bootgdt:
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+
+bootcode64:
+ .long 0x0000ffff
+ .long 0x00af9b00
+
+bootdata64:
+ .long 0x0000ffff
+ .long 0x00af9300
+
+bootcode32:
+ .long 0x0000ffff
+ .long 0x00cf9b00
+
+bootdata32:
+ .long 0x0000ffff
+ .long 0x00cf9300
+bootgdtend:
+
+wakeup_pagetables:
+ .long 0
+
+bootgdtdesc:
+ .word bootgdtend - bootgdt /* Length */
+ .long bootgdt - wakeup_start /* Offset plus %ds << 4 */
+
+ ALIGN_DATA
+wakeup_pcb:
+ .quad 0
+wakeup_ret:
+ .quad 0
+wakeup_gdt:
+ .word 0
+ .quad 0
+dummy:
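
Aside: the bare dwords in bootgdt above are ordinary x86 segment descriptors.
A small C sketch (field layout per the Intel/AMD manuals, not part of this
commit) that decodes the high dword of the bootcode64 and bootcode32 entries,
showing why 0x00af9b00 selects a 64-bit ring-0 code segment while 0x00cf9b00
selects a 32-bit one:

    #include <stdio.h>
    #include <stdint.h>

    static void
    decode(const char *name, uint32_t hi)
    {

    	printf("%s: type=%#x S=%u DPL=%u P=%u L=%u D=%u G=%u\n", name,
    	    (hi >> 8) & 0xf,	/* type: 0xb = execute/read code, accessed */
    	    (hi >> 12) & 1,	/* S: 1 = code/data segment */
    	    (hi >> 13) & 3,	/* DPL: privilege level */
    	    (hi >> 15) & 1,	/* P: segment present */
    	    (hi >> 21) & 1,	/* L: 1 = 64-bit code segment */
    	    (hi >> 22) & 1,	/* D: 1 = 32-bit default operand size */
    	    (hi >> 23) & 1);	/* G: 1 = limit in 4 KB units */
    }

    int
    main(void)
    {

    	decode("bootcode64", 0x00af9b00);	/* L=1, D=0: long mode */
    	decode("bootcode32", 0x00cf9b00);	/* L=0, D=1: legacy 32-bit */
    	return (0);
    }

The two entries differ only in the flags nibble (0xa vs. 0xc), i.e. in the L
and D bits; the present bit, DPL 0, and execute/read code type are identical.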
diff --git a/sys/amd64/amd64/amd64_mem.c b/sys/amd64/amd64/amd64_mem.c
new file mode 100644
index 0000000..e77a96f
--- /dev/null
+++ b/sys/amd64/amd64/amd64_mem.c
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+/*
+ * amd64 memory range operations
+ *
+ * This code will probably be impenetrable without reference to the
+ * Intel Pentium Pro documentation or the x86-64 programmer's manual, vol. 2.
+ */
+
+static char *mem_owner_bios = "BIOS";
+
+#define MR686_FIXMTRR (1<<0)
+
+#define mrwithin(mr, a) \
+ (((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len)))
+#define mroverlap(mra, mrb) \
+ (mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base))
+
+#define mrvalid(base, len) \
+ ((!(base & ((1 << 12) - 1))) && /* base is multiple of 4k */ \
+ ((len) >= (1 << 12)) && /* length is >= 4k */ \
+ powerof2((len)) && /* ... and power of two */ \
+ !((base) & ((len) - 1))) /* range is not discontinuous */
+
+#define mrcopyflags(curr, new) \
+ (((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK))
+
+static int mtrrs_disabled;
+TUNABLE_INT("machdep.disable_mtrrs", &mtrrs_disabled);
+SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN,
+ &mtrrs_disabled, 0, "Disable amd64 MTRRs.");
+
+static void amd64_mrinit(struct mem_range_softc *sc);
+static int amd64_mrset(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+static void amd64_mrAPinit(struct mem_range_softc *sc);
+static void amd64_mrreinit(struct mem_range_softc *sc);
+
+static struct mem_range_ops amd64_mrops = {
+ amd64_mrinit,
+ amd64_mrset,
+ amd64_mrAPinit,
+ amd64_mrreinit
+};
+
+/* XXX for AP startup hook */
+static u_int64_t mtrrcap, mtrrdef;
+
+/* The bitmask for the PhysBase and PhysMask fields of the variable MTRRs. */
+static u_int64_t mtrr_physmask;
+
+static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd);
+static void amd64_mrfetch(struct mem_range_softc *sc);
+static int amd64_mtrrtype(int flags);
+static int amd64_mrt2mtrr(int flags, int oldval);
+static int amd64_mtrrconflict(int flag1, int flag2);
+static void amd64_mrstore(struct mem_range_softc *sc);
+static void amd64_mrstoreone(void *arg);
+static struct mem_range_desc *amd64_mtrrfixsearch(struct mem_range_softc *sc,
+ u_int64_t addr);
+static int amd64_mrsetlow(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+static int amd64_mrsetvariable(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+
+/* amd64 MTRR type to memory range type conversion */
+static int amd64_mtrrtomrt[] = {
+ MDF_UNCACHEABLE,
+ MDF_WRITECOMBINE,
+ MDF_UNKNOWN,
+ MDF_UNKNOWN,
+ MDF_WRITETHROUGH,
+ MDF_WRITEPROTECT,
+ MDF_WRITEBACK
+};
+
+#define MTRRTOMRTLEN (sizeof(amd64_mtrrtomrt) / sizeof(amd64_mtrrtomrt[0]))
+
+static int
+amd64_mtrr2mrt(int val)
+{
+
+ if (val < 0 || val >= MTRRTOMRTLEN)
+ return (MDF_UNKNOWN);
+ return (amd64_mtrrtomrt[val]);
+}
+
+/*
+ * amd64 MTRR conflicts. Writeback and uncacheable may overlap.
+ */
+static int
+amd64_mtrrconflict(int flag1, int flag2)
+{
+
+ flag1 &= MDF_ATTRMASK;
+ flag2 &= MDF_ATTRMASK;
+ if ((flag1 & MDF_UNKNOWN) || (flag2 & MDF_UNKNOWN))
+ return (1);
+ if (flag1 == flag2 ||
+ (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) ||
+ (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE))
+ return (0);
+ return (1);
+}
+
+/*
+ * Look for an exactly-matching range.
+ */
+static struct mem_range_desc *
+mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd)
+{
+ struct mem_range_desc *cand;
+ int i;
+
+ for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++)
+ if ((cand->mr_base == mrd->mr_base) &&
+ (cand->mr_len == mrd->mr_len))
+ return (cand);
+ return (NULL);
+}
+
+/*
+ * Fetch the current mtrr settings from the current CPU (assumed to
+ * all be in sync in the SMP case). Note that if we are here, we
+ * assume that MTRRs are enabled, and we may or may not have fixed
+ * MTRRs.
+ */
+static void
+amd64_mrfetch(struct mem_range_softc *sc)
+{
+ struct mem_range_desc *mrd;
+ u_int64_t msrv;
+ int i, j, msr;
+
+ mrd = sc->mr_desc;
+
+ /* Get fixed-range MTRRs. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ msr = MSR_MTRR64kBase;
+ for (i = 0; i < (MTRR_N64K / 8); i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ amd64_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ msr = MSR_MTRR16kBase;
+ for (i = 0; i < (MTRR_N16K / 8); i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ amd64_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ msr = MSR_MTRR4kBase;
+ for (i = 0; i < (MTRR_N4K / 8); i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ amd64_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ }
+
+ /* Get remainder which must be variable MTRRs. */
+ msr = MSR_MTRRVarBase;
+ for (; (mrd - sc->mr_desc) < sc->mr_ndesc; msr += 2, mrd++) {
+ msrv = rdmsr(msr);
+ mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) |
+ amd64_mtrr2mrt(msrv & MTRR_PHYSBASE_TYPE);
+ mrd->mr_base = msrv & mtrr_physmask;
+ msrv = rdmsr(msr + 1);
+ mrd->mr_flags = (msrv & MTRR_PHYSMASK_VALID) ?
+ (mrd->mr_flags | MDF_ACTIVE) :
+ (mrd->mr_flags & ~MDF_ACTIVE);
+
+ /* Compute the range from the mask. Ick. */
+ mrd->mr_len = (~(msrv & mtrr_physmask) &
+ (mtrr_physmask | 0xfffL)) + 1;
+ if (!mrvalid(mrd->mr_base, mrd->mr_len))
+ mrd->mr_flags |= MDF_BOGUS;
+
+ /* If unclaimed and active, must be the BIOS. */
+ if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0))
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ }
+}
+
+/*
+ * Return the MTRR memory type matching a region's flags
+ */
+static int
+amd64_mtrrtype(int flags)
+{
+ int i;
+
+ flags &= MDF_ATTRMASK;
+
+ for (i = 0; i < MTRRTOMRTLEN; i++) {
+ if (amd64_mtrrtomrt[i] == MDF_UNKNOWN)
+ continue;
+ if (flags == amd64_mtrrtomrt[i])
+ return (i);
+ }
+ return (-1);
+}
+
+static int
+amd64_mrt2mtrr(int flags, int oldval)
+{
+ int val;
+
+ if ((val = amd64_mtrrtype(flags)) == -1)
+ return (oldval & 0xff);
+ return (val & 0xff);
+}
+
+/*
+ * Update running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * XXX Must be called with interrupts enabled.
+ */
+static void
+amd64_mrstore(struct mem_range_softc *sc)
+{
+#ifdef SMP
+ /*
+ * We should use ipi_all_but_self() to call other CPUs into a
+ * locking gate, then call a target function to do this work.
+ * The "proper" solution involves a generalised locking gate
+ * implementation, not ready yet.
+ */
+ smp_rendezvous(NULL, amd64_mrstoreone, NULL, sc);
+#else
+ disable_intr(); /* disable interrupts */
+ amd64_mrstoreone(sc);
+ enable_intr();
+#endif
+}
+
+/*
+ * Update the current CPU's MTRRs with those represented in the
+ * descriptor list. Note that we do this wholesale rather than just
+ * stuffing one entry; this is simpler (but slower, of course).
+ */
+static void
+amd64_mrstoreone(void *arg)
+{
+ struct mem_range_softc *sc = arg;
+ struct mem_range_desc *mrd;
+ u_int64_t omsrv, msrv;
+ int i, j, msr;
+ u_long cr0, cr4;
+
+ mrd = sc->mr_desc;
+
+ critical_enter();
+
+ /* Disable PGE. */
+ cr4 = rcr4();
+ load_cr4(cr4 & ~CR4_PGE);
+
+ /* Disable caches (CD = 1, NW = 0). */
+ cr0 = rcr0();
+ load_cr0((cr0 & ~CR0_NW) | CR0_CD);
+
+ /* Flushes caches and TLBs. */
+ wbinvd();
+ invltlb();
+
+ /* Disable MTRRs (E = 0). */
+ wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~MTRR_DEF_ENABLE);
+
+ /* Set fixed-range MTRRs. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ msr = MSR_MTRR64kBase;
+ for (i = 0; i < (MTRR_N64K / 8); i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= amd64_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ msr = MSR_MTRR16kBase;
+ for (i = 0; i < (MTRR_N16K / 8); i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= amd64_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ msr = MSR_MTRR4kBase;
+ for (i = 0; i < (MTRR_N4K / 8); i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= amd64_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ }
+
+ /* Set remainder which must be variable MTRRs. */
+ msr = MSR_MTRRVarBase;
+ for (; (mrd - sc->mr_desc) < sc->mr_ndesc; msr += 2, mrd++) {
+ /* base/type register */
+ omsrv = rdmsr(msr);
+ if (mrd->mr_flags & MDF_ACTIVE) {
+ msrv = mrd->mr_base & mtrr_physmask;
+ msrv |= amd64_mrt2mtrr(mrd->mr_flags, omsrv);
+ } else {
+ msrv = 0;
+ }
+ wrmsr(msr, msrv);
+
+ /* mask/active register */
+ if (mrd->mr_flags & MDF_ACTIVE) {
+ msrv = MTRR_PHYSMASK_VALID |
+ (~(mrd->mr_len - 1) & mtrr_physmask);
+ } else {
+ msrv = 0;
+ }
+ wrmsr(msr + 1, msrv);
+ }
+
+ /* Flush caches and TLBs. */
+ wbinvd();
+ invltlb();
+
+ /* Enable MTRRs. */
+ wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) | MTRR_DEF_ENABLE);
+
+ /* Restore caches and PGE. */
+ load_cr0(cr0);
+ load_cr4(cr4);
+
+ critical_exit();
+}
+
+/*
+ * Hunt for the fixed MTRR referencing (addr)
+ */
+static struct mem_range_desc *
+amd64_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr)
+{
+ struct mem_range_desc *mrd;
+ int i;
+
+ for (i = 0, mrd = sc->mr_desc; i < (MTRR_N64K + MTRR_N16K + MTRR_N4K);
+ i++, mrd++)
+ if ((addr >= mrd->mr_base) &&
+ (addr < (mrd->mr_base + mrd->mr_len)))
+ return (mrd);
+ return (NULL);
+}
+
+/*
+ * Try to satisfy the given range request by manipulating the fixed
+ * MTRRs that cover low memory.
+ *
+ * Note that we try to be generous here; we'll bloat the range out to
+ * the next higher/lower boundary to avoid the consumer having to know
+ * too much about the mechanisms here.
+ *
+ * XXX note that this will have to be updated when we start supporting
+ * "busy" ranges.
+ */
+static int
+amd64_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+ struct mem_range_desc *first_md, *last_md, *curr_md;
+
+ /* Range check. */
+ if (((first_md = amd64_mtrrfixsearch(sc, mrd->mr_base)) == NULL) ||
+ ((last_md = amd64_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1)) == NULL))
+ return (EINVAL);
+
+ /* Check that we aren't doing something risky. */
+ if (!(mrd->mr_flags & MDF_FORCE))
+ for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+ if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN)
+ return (EACCES);
+ }
+
+ /* Set flags, clear set-by-firmware flag. */
+ for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+ curr_md->mr_flags = mrcopyflags(curr_md->mr_flags &
+ ~MDF_FIRMWARE, mrd->mr_flags);
+ bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner));
+ }
+
+ return (0);
+}
+
+/*
+ * Modify/add a variable MTRR to satisfy the request.
+ *
+ * XXX needs to be updated to properly support "busy" ranges.
+ */
+static int
+amd64_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd,
+ int *arg)
+{
+ struct mem_range_desc *curr_md, *free_md;
+ int i;
+
+ /*
+ * Scan the currently active variable descriptors, look for
+ * one we exactly match (straight takeover) and for possible
+ * accidental overlaps.
+ *
+ * Keep track of the first empty variable descriptor in case
+ * we can't perform a takeover.
+ */
+ i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+ curr_md = sc->mr_desc + i;
+ free_md = NULL;
+ for (; i < sc->mr_ndesc; i++, curr_md++) {
+ if (curr_md->mr_flags & MDF_ACTIVE) {
+ /* Exact match? */
+ if ((curr_md->mr_base == mrd->mr_base) &&
+ (curr_md->mr_len == mrd->mr_len)) {
+
+ /* Whoops, owned by someone. */
+ if (curr_md->mr_flags & MDF_BUSY)
+ return (EBUSY);
+
+ /* Check that we aren't doing something risky */
+ if (!(mrd->mr_flags & MDF_FORCE) &&
+ ((curr_md->mr_flags & MDF_ATTRMASK) ==
+ MDF_UNKNOWN))
+ return (EACCES);
+
+ /* Ok, just hijack this entry. */
+ free_md = curr_md;
+ break;
+ }
+
+ /* Non-exact overlap? */
+ if (mroverlap(curr_md, mrd)) {
+ /* Between conflicting region types? */
+ if (amd64_mtrrconflict(curr_md->mr_flags,
+ mrd->mr_flags))
+ return (EINVAL);
+ }
+ } else if (free_md == NULL) {
+ free_md = curr_md;
+ }
+ }
+
+ /* Got somewhere to put it? */
+ if (free_md == NULL)
+ return (ENOSPC);
+
+ /* Set up new descriptor. */
+ free_md->mr_base = mrd->mr_base;
+ free_md->mr_len = mrd->mr_len;
+ free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags);
+ bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner));
+ return (0);
+}
+
+/*
+ * Handle requests to set memory range attributes by manipulating MTRRs.
+ */
+static int
+amd64_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+ struct mem_range_desc *targ;
+ int error, i;
+
+ switch (*arg) {
+ case MEMRANGE_SET_UPDATE:
+ /*
+ * Make sure that what's being asked for is even
+ * possible at all.
+ */
+ if (!mrvalid(mrd->mr_base, mrd->mr_len) ||
+ amd64_mtrrtype(mrd->mr_flags) == -1)
+ return (EINVAL);
+
+#define FIXTOP ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000))
+
+ /* Are the "low memory" conditions applicable? */
+ if ((sc->mr_cap & MR686_FIXMTRR) &&
+ ((mrd->mr_base + mrd->mr_len) <= FIXTOP)) {
+ if ((error = amd64_mrsetlow(sc, mrd, arg)) != 0)
+ return (error);
+ } else {
+ /* It's time to play with variable MTRRs. */
+ if ((error = amd64_mrsetvariable(sc, mrd, arg)) != 0)
+ return (error);
+ }
+ break;
+
+ case MEMRANGE_SET_REMOVE:
+ if ((targ = mem_range_match(sc, mrd)) == NULL)
+ return (ENOENT);
+ if (targ->mr_flags & MDF_FIXACTIVE)
+ return (EPERM);
+ if (targ->mr_flags & MDF_BUSY)
+ return (EBUSY);
+ targ->mr_flags &= ~MDF_ACTIVE;
+ targ->mr_owner[0] = 0;
+ break;
+
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ /*
+ * Ensure that the direct map region does not contain any mappings
+ * that span MTRRs of different types. However, the fixed MTRRs can
+ * be ignored, because a large page mapping the first 1 MB of physical
+ * memory is a special case that the processor handles. The entire
+ * TLB will be invalidated by amd64_mrstore(), so pmap_demote_DMAP()
+ * needn't do it.
+ */
+ i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+ mrd = sc->mr_desc + i;
+ for (; i < sc->mr_ndesc; i++, mrd++) {
+ if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE)
+ pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, FALSE);
+ }
+
+ /* Update the hardware. */
+ amd64_mrstore(sc);
+
+ /* Refetch to see where we're at. */
+ amd64_mrfetch(sc);
+ return (0);
+}
+
+/*
+ * Work out how many ranges we support, initialise storage for them,
+ * and fetch the initial settings.
+ */
+static void
+amd64_mrinit(struct mem_range_softc *sc)
+{
+ struct mem_range_desc *mrd;
+ u_int regs[4];
+ int i, nmdesc = 0, pabits;
+
+ mtrrcap = rdmsr(MSR_MTRRcap);
+ mtrrdef = rdmsr(MSR_MTRRdefType);
+
+ /* For now, bail out if MTRRs are not enabled. */
+ if (!(mtrrdef & MTRR_DEF_ENABLE)) {
+ if (bootverbose)
+ printf("CPU supports MTRRs but not enabled\n");
+ return;
+ }
+ nmdesc = mtrrcap & MTRR_CAP_VCNT;
+
+ /*
+ * Determine the size of the PhysMask and PhysBase fields in
+ * the variable range MTRRs. If the extended CPUID 0x80000008
+ * is present, use that to figure out how many physical
+ * address bits the CPU supports. Otherwise, default to 36
+ * address bits.
+ */
+ if (cpu_exthigh >= 0x80000008) {
+ do_cpuid(0x80000008, regs);
+ pabits = regs[0] & 0xff;
+ } else
+ pabits = 36;
+ mtrr_physmask = ((1UL << pabits) - 1) & ~0xfffUL;
+
+ /* If fixed MTRRs supported and enabled. */
+ if ((mtrrcap & MTRR_CAP_FIXED) && (mtrrdef & MTRR_DEF_FIXED_ENABLE)) {
+ sc->mr_cap = MR686_FIXMTRR;
+ nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K;
+ }
+
+ sc->mr_desc = malloc(nmdesc * sizeof(struct mem_range_desc), M_MEMDESC,
+ M_WAITOK | M_ZERO);
+ sc->mr_ndesc = nmdesc;
+
+ mrd = sc->mr_desc;
+
+ /* Populate the fixed MTRR entries' base/length. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ for (i = 0; i < MTRR_N64K; i++, mrd++) {
+ mrd->mr_base = i * 0x10000;
+ mrd->mr_len = 0x10000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ for (i = 0; i < MTRR_N16K; i++, mrd++) {
+ mrd->mr_base = i * 0x4000 + 0x80000;
+ mrd->mr_len = 0x4000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ for (i = 0; i < MTRR_N4K; i++, mrd++) {
+ mrd->mr_base = i * 0x1000 + 0xc0000;
+ mrd->mr_len = 0x1000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ }
+
+ /*
+ * Get current settings, anything set now is considered to
+ * have been set by the firmware. (XXX has something already
+ * played here?)
+ */
+ amd64_mrfetch(sc);
+ mrd = sc->mr_desc;
+ for (i = 0; i < sc->mr_ndesc; i++, mrd++) {
+ if (mrd->mr_flags & MDF_ACTIVE)
+ mrd->mr_flags |= MDF_FIRMWARE;
+ }
+
+ /*
+ * Ensure that the direct map region does not contain any mappings
+ * that span MTRRs of different types. However, the fixed MTRRs can
+ * be ignored, because a large page mapping the first 1 MB of physical
+ * memory is a special case that the processor handles. Invalidate
+ * any old TLB entries that might hold inconsistent memory type
+ * information.
+ */
+ i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+ mrd = sc->mr_desc + i;
+ for (; i < sc->mr_ndesc; i++, mrd++) {
+ if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE)
+ pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, TRUE);
+ }
+}
+
+/*
+ * Initialise MTRRs on an AP after the BSP has run the init code.
+ */
+static void
+amd64_mrAPinit(struct mem_range_softc *sc)
+{
+
+ amd64_mrstoreone(sc);
+ wrmsr(MSR_MTRRdefType, mtrrdef);
+}
+
+/*
+ * Re-initialise running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * XXX Must be called with interrupts enabled.
+ */
+static void
+amd64_mrreinit(struct mem_range_softc *sc)
+{
+#ifdef SMP
+ /*
+ * We should use ipi_all_but_self() to call other CPUs into a
+ * locking gate, then call a target function to do this work.
+ * The "proper" solution involves a generalised locking gate
+ * implementation, not ready yet.
+ */
+ smp_rendezvous(NULL, (void *)amd64_mrAPinit, NULL, sc);
+#else
+ disable_intr(); /* disable interrupts */
+ amd64_mrAPinit(sc);
+ enable_intr();
+#endif
+}
+
+static void
+amd64_mem_drvinit(void *unused)
+{
+
+ if (mtrrs_disabled)
+ return;
+ if (!(cpu_feature & CPUID_MTRR))
+ return;
+ if ((cpu_id & 0xf00) != 0x600 && (cpu_id & 0xf00) != 0xf00)
+ return;
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_INTEL:
+ case CPU_VENDOR_AMD:
+ case CPU_VENDOR_CENTAUR:
+ break;
+ default:
+ return;
+ }
+ mem_range_softc.mr_op = &amd64_mrops;
+}
+SYSINIT(amd64memdev, SI_SUB_DRIVERS, SI_ORDER_FIRST, amd64_mem_drvinit, NULL);
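
Aside: the "Compute the range from the mask. Ick." line in amd64_mrfetch()
and its inverse in amd64_mrstoreone() are easier to follow with concrete
numbers.  A minimal userland sketch (a 36-bit physical address width and a
made-up 128 MB range are assumptions, not values from this commit):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	int pabits = 36;		/* assumed physical address width */
    	uint64_t physmask, msrv, len, decoded;

    	physmask = ((1ULL << pabits) - 1) & ~0xfffULL;
    	len = 128 * 1024 * 1024;	/* example region size: 128 MB */

    	/* Encode: amd64_mrstoreone() writes ~(len - 1) into PhysMask. */
    	msrv = ~(len - 1) & physmask;

    	/* Decode: amd64_mrfetch() recovers the length from the mask. */
    	decoded = (~(msrv & physmask) & (physmask | 0xfffULL)) + 1;

    	printf("physmask=%#jx mask=%#jx len=%#jx\n",
    	    (uintmax_t)physmask, (uintmax_t)msrv, (uintmax_t)decoded);
    	return (0);
    }

Because a variable MTRR range must be a power-of-two size on a size-aligned
base, ~(len - 1) masked to the supported address bits round-trips exactly
back to len, which is what mrvalid() enforces.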
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
new file mode 100644
index 0000000..6465247
--- /dev/null
+++ b/sys/amd64/amd64/apic_vector.S
@@ -0,0 +1,326 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: vector.s, 386BSD 0.1 unknown origin
+ * $FreeBSD$
+ */
+
+/*
+ * Interrupt entry points for external interrupts triggered by I/O APICs
+ * as well as IPI handlers.
+ */
+
+#include "opt_smp.h"
+
+#include <machine/asmacros.h>
+#include <x86/apicreg.h>
+
+#include "assym.s"
+
+/*
+ * I/O Interrupt Entry Point. Rather than having one entry point for
+ * each interrupt source, we use one entry point for each 32-bit word
+ * in the ISR. The handler determines the highest bit set in the ISR,
+ * translates that into a vector, and passes the vector to the
+ * lapic_handle_intr() function.
+ */
+#define ISR_VEC(index, vec_name) \
+ .text ; \
+ SUPERALIGN_TEXT ; \
+IDTVEC(vec_name) ; \
+ PUSH_FRAME ; \
+ FAKE_MCOUNT(TF_RIP(%rsp)) ; \
+ movq lapic, %rdx ; /* pointer to local APIC */ \
+ movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \
+ bsrl %eax, %eax ; /* index of highest set bit in ISR */ \
+ jz 1f ; \
+ addl $(32 * index),%eax ; \
+ movq %rsp, %rsi ; \
+ movl %eax, %edi ; /* pass the IRQ */ \
+ call lapic_handle_intr ; \
+1: ; \
+ MEXITCOUNT ; \
+ jmp doreti
+
+/*
+ * Handle "spurious INTerrupts".
+ * Notes:
+ * This is different from the "spurious INTerrupt" generated by an
+ * 8259 PIC for missing INTs. See the APIC documentation for details.
+ * This routine should NOT do an 'EOI' cycle.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(spuriousint)
+
+ /* No EOI cycle used here */
+
+ jmp doreti_iret
+
+ ISR_VEC(1, apic_isr1)
+ ISR_VEC(2, apic_isr2)
+ ISR_VEC(3, apic_isr3)
+ ISR_VEC(4, apic_isr4)
+ ISR_VEC(5, apic_isr5)
+ ISR_VEC(6, apic_isr6)
+ ISR_VEC(7, apic_isr7)
+
+/*
+ * Local APIC periodic timer handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(timerint)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rdi
+ call lapic_handle_timer
+ MEXITCOUNT
+ jmp doreti
+
+/*
+ * Local APIC CMCI handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cmcint)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ call lapic_handle_cmc
+ MEXITCOUNT
+ jmp doreti
+
+/*
+ * Local APIC error interrupt handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(errorint)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ call lapic_handle_error
+ MEXITCOUNT
+ jmp doreti
+
+#ifdef SMP
+/*
+ * Global address space TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(invltlb)
+#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+ PUSH_FRAME
+ movl PCPU(CPUID), %eax
+#ifdef COUNT_XINVLTLB_HITS
+ incl xhits_gbl(,%rax,4)
+#endif
+#ifdef COUNT_IPIS
+ movq ipi_invltlb_counts(,%rax,8),%rax
+ incq (%rax)
+#endif
+ POP_FRAME
+#endif
+
+ pushq %rax
+
+ movq %cr3, %rax /* invalidate the TLB */
+ movq %rax, %cr3
+
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popq %rax
+ jmp doreti_iret
+
+/*
+ * Single page TLB shootdown
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(invlpg)
+#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+ PUSH_FRAME
+ movl PCPU(CPUID), %eax
+#ifdef COUNT_XINVLTLB_HITS
+ incl xhits_pg(,%rax,4)
+#endif
+#ifdef COUNT_IPIS
+ movq ipi_invlpg_counts(,%rax,8),%rax
+ incq (%rax)
+#endif
+ POP_FRAME
+#endif
+
+ pushq %rax
+
+ movq smp_tlb_addr1, %rax
+ invlpg (%rax) /* invalidate single page */
+
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popq %rax
+ jmp doreti_iret
+
+/*
+ * Page range TLB shootdown.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(invlrng)
+#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+ PUSH_FRAME
+ movl PCPU(CPUID), %eax
+#ifdef COUNT_XINVLTLB_HITS
+ incl xhits_rng(,%rax,4)
+#endif
+#ifdef COUNT_IPIS
+ movq ipi_invlrng_counts(,%rax,8),%rax
+ incq (%rax)
+#endif
+ POP_FRAME
+#endif
+
+ pushq %rax
+ pushq %rdx
+
+ movq smp_tlb_addr1, %rdx
+ movq smp_tlb_addr2, %rax
+1: invlpg (%rdx) /* invalidate single page */
+ addq $PAGE_SIZE, %rdx
+ cmpq %rax, %rdx
+ jb 1b
+
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popq %rdx
+ popq %rax
+ jmp doreti_iret
+
+/*
+ * Invalidate cache.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(invlcache)
+#ifdef COUNT_IPIS
+ PUSH_FRAME
+ movl PCPU(CPUID), %eax
+ movq ipi_invlcache_counts(,%rax,8),%rax
+ incq (%rax)
+ POP_FRAME
+#endif
+
+ pushq %rax
+
+ wbinvd
+
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+
+ lock
+ incl smp_tlb_wait
+
+ popq %rax
+ jmp doreti_iret
+
+/*
+ * Handler for IPIs sent via the per-cpu IPI bitmap.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(ipi_intr_bitmap_handler)
+ PUSH_FRAME
+
+ movq lapic, %rdx
+ movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */
+
+ FAKE_MCOUNT(TF_RIP(%rsp))
+
+ call ipi_bitmap_handler
+ MEXITCOUNT
+ jmp doreti
+
+/*
+ * Executed by a CPU when it receives an IPI_STOP from another CPU.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cpustop)
+ PUSH_FRAME
+
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+
+ call cpustop_handler
+ jmp doreti
+
+/*
+ * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cpususpend)
+ PUSH_FRAME
+
+ call cpususpend_handler
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ jmp doreti
+
+/*
+ * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
+ *
+ * - Calls the generic rendezvous action function.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(rendezvous)
+ PUSH_FRAME
+#ifdef COUNT_IPIS
+ movl PCPU(CPUID), %eax
+ movq ipi_rendezvous_counts(,%rax,8), %rax
+ incq (%rax)
+#endif
+ call smp_rendezvous_action
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ jmp doreti
+#endif /* SMP */
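
Aside: the ISR_VEC macro's bsrl-based dispatch can be restated in C.  A short
sketch (made-up ISR word, gcc-style __builtin_clz assumed, not part of this
commit) of how a set bit in in-service-register word "index" becomes the
vector handed to lapic_handle_intr():

    #include <stdio.h>
    #include <stdint.h>

    static int
    isr_to_vector(int index, uint32_t isr_word)
    {
    	int bit;

    	if (isr_word == 0)
    		return (-1);		/* nothing pending in this word */
    	/* 31 - clz is the index of the highest set bit, as bsrl computes. */
    	bit = 31 - __builtin_clz(isr_word);
    	return (32 * index + bit);
    }

    int
    main(void)
    {

    	/* e.g. apic_isr2 (index 2) with ISR bit 5 set -> vector 69 */
    	printf("vector=%d\n", isr_to_vector(2, 1U << 5));
    	return (0);
    }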
diff --git a/sys/amd64/amd64/atomic.c b/sys/amd64/amd64/atomic.c
new file mode 100644
index 0000000..1b4ff7e
--- /dev/null
+++ b/sys/amd64/amd64/atomic.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 1999 Peter Jeremy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * This file creates publicly callable functions to perform various
+ * simple arithmetic on memory which is atomic in the presence of
+ * interrupts and multiple processors.
+ */
+#include <sys/types.h>
+
+/* First, make atomic.h generate prototypes as it does for kernel modules */
+#define KLD_MODULE
+#include <machine/atomic.h>
+#undef _MACHINE_ATOMIC_H_ /* forget we included it */
+#undef KLD_MODULE
+#undef ATOMIC_ASM
+
+/* Make atomic.h generate public functions */
+#define WANT_FUNCTIONS
+#define static
+#undef __inline
+#define __inline
+
+#include <machine/atomic.h>
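+
+/*
+ * Example (hypothetical module code): with the out-of-line functions
+ * generated above, a kernel module built without the inline versions
+ * can still write, e.g.:
+ *
+ *	static u_int refs;
+ *
+ *	atomic_add_int(&refs, 1);
+ *	if (atomic_cmpset_int(&refs, 1, 0))
+ *		...;
+ *
+ * and have the calls resolve to these definitions.
+ */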
diff --git a/sys/amd64/amd64/atpic_vector.S b/sys/amd64/amd64/atpic_vector.S
new file mode 100644
index 0000000..e7dcbc3
--- /dev/null
+++ b/sys/amd64/amd64/atpic_vector.S
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: vector.s, 386BSD 0.1 unknown origin
+ * $FreeBSD$
+ */
+
+/*
+ * Interrupt entry points for external interrupts triggered by the 8259A
+ * master and slave interrupt controllers.
+ */
+
+#include <machine/asmacros.h>
+
+#include "assym.s"
+
+/*
+ * Macros for interrupt entry, call to handler, and exit.
+ */
+#define INTR(irq_num, vec_name) \
+ .text ; \
+ SUPERALIGN_TEXT ; \
+IDTVEC(vec_name) ; \
+ PUSH_FRAME ; \
+ FAKE_MCOUNT(TF_RIP(%rsp)) ; \
+ movq %rsp, %rsi ; \
+ movl $irq_num, %edi; /* pass the IRQ */ \
+ call atpic_handle_intr ; \
+ MEXITCOUNT ; \
+ jmp doreti
+
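+/*
+ * For illustration, INTR(0, atpic_intr0) below expands to roughly:
+ *
+ *	IDTVEC(atpic_intr0)
+ *		PUSH_FRAME
+ *		FAKE_MCOUNT(TF_RIP(%rsp))
+ *		movq	%rsp, %rsi	(trapframe, second argument)
+ *		movl	$0, %edi	(IRQ number, first argument)
+ *		call	atpic_handle_intr
+ *		MEXITCOUNT
+ *		jmp	doreti
+ */
+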
+ INTR(0, atpic_intr0)
+ INTR(1, atpic_intr1)
+ INTR(2, atpic_intr2)
+ INTR(3, atpic_intr3)
+ INTR(4, atpic_intr4)
+ INTR(5, atpic_intr5)
+ INTR(6, atpic_intr6)
+ INTR(7, atpic_intr7)
+ INTR(8, atpic_intr8)
+ INTR(9, atpic_intr9)
+ INTR(10, atpic_intr10)
+ INTR(11, atpic_intr11)
+ INTR(12, atpic_intr12)
+ INTR(13, atpic_intr13)
+ INTR(14, atpic_intr14)
+ INTR(15, atpic_intr15)
diff --git a/sys/amd64/amd64/autoconf.c b/sys/amd64/amd64/autoconf.c
new file mode 100644
index 0000000..ee32740
--- /dev/null
+++ b/sys/amd64/amd64/autoconf.c
@@ -0,0 +1,132 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)autoconf.c 7.1 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Set up the system to run on the current machine.
+ *
+ * configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring.  Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+#include "opt_bootp.h"
+#include "opt_isa.h"
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/reboot.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cons.h>
+
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+
+#include <machine/md_var.h>
+
+#ifdef DEV_ISA
+#include <isa/isavar.h>
+
+device_t isa_bus_device = 0;
+#endif
+
+static void configure_first(void *);
+static void configure(void *);
+static void configure_final(void *);
+
+SYSINIT(configure1, SI_SUB_CONFIGURE, SI_ORDER_FIRST, configure_first, NULL);
+/* SI_ORDER_SECOND is hookable */
+SYSINIT(configure2, SI_SUB_CONFIGURE, SI_ORDER_THIRD, configure, NULL);
+/* SI_ORDER_MIDDLE is hookable */
+SYSINIT(configure3, SI_SUB_CONFIGURE, SI_ORDER_ANY, configure_final, NULL);
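+
+/*
+ * A subsystem that needs to run between these stages can hook one of
+ * the free orders itself, e.g. (hypothetical):
+ *
+ *	SYSINIT(myhook, SI_SUB_CONFIGURE, SI_ORDER_SECOND, myhook_init,
+ *	    NULL);
+ */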
+
+/*
+ * Determine the I/O configuration for a machine.
+ */
+static void
+configure_first(void *dummy)
+{
+
+ /* nexus0 is the top of the amd64 device tree */
+ device_add_child(root_bus, "nexus", 0);
+}
+
+static void
+configure(void *dummy)
+{
+
+ /*
+ * Enable interrupts on the processor. The interrupts are still
+ * disabled in the interrupt controllers until interrupt handlers
+ * are registered.
+ */
+ enable_intr();
+
+ /* initialize new bus architecture */
+ root_bus_configure();
+
+#ifdef DEV_ISA
+ /*
+	 * Explicitly probe and attach ISA last.  The ISA bus saves
+	 * its device node at attach time for us here.
+ */
+ if (isa_bus_device)
+ isa_probe_children(isa_bus_device);
+#endif
+}
+
+static void
+configure_final(void *dummy)
+{
+
+ cninit_finish();
+ if (bootverbose)
+ printf("Device configuration finished.\n");
+ cold = 0;
+}
diff --git a/sys/amd64/amd64/bios.c b/sys/amd64/amd64/bios.c
new file mode 100644
index 0000000..c8985c1
--- /dev/null
+++ b/sys/amd64/amd64/bios.c
@@ -0,0 +1,95 @@
+/*-
+ * Copyright (c) 1997 Michael Smith
+ * Copyright (c) 1998 Jonathan Lemon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Subset of the i386 BIOS support code.  We cannot make bios16 or bios32
+ * calls, so we can leave those out.  However, searching for BIOS ROM
+ * signatures can be useful for locating tables, e.g. PowerNow! settings.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pc/bios.h>
+
+#define BIOS_START 0xe0000
+#define BIOS_SIZE 0x20000
+
+/*
+ * bios_sigsearch
+ *
+ * Search some or all of the BIOS region for a signature string.
+ *
+ * (start) Optional offset returned from this function
+ *		(for searching for multiple matches), or 0
+ * to start the search from the base of the BIOS.
+ * Note that this will be a _physical_ address in
+ * the range 0xe0000 - 0xfffff.
+ * (sig) is a pointer to the byte(s) of the signature.
+ * (siglen) number of bytes in the signature.
+ * (paralen) signature paragraph (alignment) size.
+ * (sigofs) offset of the signature within the paragraph.
+ *
+ * Returns the _physical_ address of the found signature, or 0 if the
+ * signature was not found.
+ */
+
+u_int32_t
+bios_sigsearch(u_int32_t start, u_char *sig, int siglen, int paralen, int sigofs)
+{
+ u_char *sp, *end;
+
+ /* compute the starting address */
+	if ((start >= BIOS_START) && (start <= (BIOS_START + BIOS_SIZE))) {
+		sp = (u_char *)BIOS_PADDRTOVADDR(start);
+	} else if (start == 0) {
+		sp = (u_char *)BIOS_PADDRTOVADDR(BIOS_START);
+	} else {
+		return (0);		/* bogus start address */
+ }
+
+ /* compute the end address */
+ end = (u_char *)BIOS_PADDRTOVADDR(BIOS_START + BIOS_SIZE);
+
+ /* loop searching */
+	while ((sp + sigofs + siglen) < end) {
+		/* compare here */
+		if (!bcmp(sp + sigofs, sig, siglen)) {
+			/* convert back to physical address */
+			return ((u_int32_t)(uintptr_t)BIOS_VADDRTOPADDR(sp));
+		}
+		sp += paralen;
+	}
+	return (0);
+}
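+
+/*
+ * Example (hypothetical caller): locate the SMBIOS entry point, whose
+ * "_SM_" anchor string is paragraph-aligned within the BIOS region:
+ *
+ *	u_int32_t addr;
+ *
+ *	addr = bios_sigsearch(0, "_SM_", 4, 16, 0);
+ *	if (addr != 0)
+ *		... map and parse the entry point at addr ...
+ */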
diff --git a/sys/amd64/amd64/bpf_jit_machdep.c b/sys/amd64/amd64/bpf_jit_machdep.c
new file mode 100644
index 0000000..fe861d2
--- /dev/null
+++ b/sys/amd64/amd64/bpf_jit_machdep.c
@@ -0,0 +1,653 @@
+/*-
+ * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
+ * Copyright (C) 2005-2009 Jung-uk Kim <jkim@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Politecnico di Torino nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include "opt_bpf.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/malloc.h>
+#include <net/if.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#endif
+
+#include <sys/types.h>
+
+#include <net/bpf.h>
+#include <net/bpf_jitter.h>
+
+#include <amd64/amd64/bpf_jit_machdep.h>
+
+bpf_filter_func bpf_jit_compile(struct bpf_insn *, u_int, size_t *);
+
+/*
+ * Emit routine to update the jump table.
+ */
+static void
+emit_length(bpf_bin_stream *stream, __unused u_int value, u_int len)
+{
+
+ if (stream->refs != NULL)
+ (stream->refs)[stream->bpf_pc] += len;
+ stream->cur_ip += len;
+}
+
+/*
+ * Emit routine to output the actual binary code.
+ */
+static void
+emit_code(bpf_bin_stream *stream, u_int value, u_int len)
+{
+
+ switch (len) {
+ case 1:
+ stream->ibuf[stream->cur_ip] = (u_char)value;
+ stream->cur_ip++;
+ break;
+
+ case 2:
+ *((u_short *)(stream->ibuf + stream->cur_ip)) = (u_short)value;
+ stream->cur_ip += 2;
+ break;
+
+ case 4:
+ *((u_int *)(stream->ibuf + stream->cur_ip)) = value;
+ stream->cur_ip += 4;
+ break;
+ }
+}
+
+/*
+ * Scan the filter program and find possible optimizations.
+ */
+static int
+bpf_jit_optimize(struct bpf_insn *prog, u_int nins)
+{
+ int flags;
+ u_int i;
+
+ /* Do we return immediately? */
+ if (BPF_CLASS(prog[0].code) == BPF_RET)
+ return (BPF_JIT_FRET);
+
+ for (flags = 0, i = 0; i < nins; i++) {
+ switch (prog[i].code) {
+ case BPF_LD|BPF_W|BPF_ABS:
+ case BPF_LD|BPF_H|BPF_ABS:
+ case BPF_LD|BPF_B|BPF_ABS:
+ case BPF_LD|BPF_W|BPF_IND:
+ case BPF_LD|BPF_H|BPF_IND:
+ case BPF_LD|BPF_B|BPF_IND:
+ case BPF_LDX|BPF_MSH|BPF_B:
+ flags |= BPF_JIT_FPKT;
+ break;
+ case BPF_LD|BPF_MEM:
+ case BPF_LDX|BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ flags |= BPF_JIT_FMEM;
+ break;
+ case BPF_LD|BPF_W|BPF_LEN:
+ case BPF_LDX|BPF_W|BPF_LEN:
+ flags |= BPF_JIT_FLEN;
+ break;
+ case BPF_JMP|BPF_JA:
+ case BPF_JMP|BPF_JGT|BPF_K:
+ case BPF_JMP|BPF_JGE|BPF_K:
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ case BPF_JMP|BPF_JSET|BPF_K:
+ case BPF_JMP|BPF_JGT|BPF_X:
+ case BPF_JMP|BPF_JGE|BPF_X:
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ case BPF_JMP|BPF_JSET|BPF_X:
+ flags |= BPF_JIT_FJMP;
+ break;
+ }
+ if (flags == BPF_JIT_FLAG_ALL)
+ break;
+ }
+
+ return (flags);
+}
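+
+/*
+ * For illustration, a one-instruction program "ret #-1" (accept the
+ * whole packet) is classified BPF_JIT_FRET, so bpf_jit_compile() only
+ * emits the return sequence for it.
+ */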
+
+/*
+ * Compile the filter program into native amd64 code.
+ */
+bpf_filter_func
+bpf_jit_compile(struct bpf_insn *prog, u_int nins, size_t *size)
+{
+ bpf_bin_stream stream;
+ struct bpf_insn *ins;
+ int flags, fret, fpkt, fmem, fjmp, flen;
+ u_int i, pass;
+
+ /*
+ * NOTE: Do not modify the name of this variable, as it's used by
+ * the macros to emit code.
+ */
+ emit_func emitm;
+
+ flags = bpf_jit_optimize(prog, nins);
+ fret = (flags & BPF_JIT_FRET) != 0;
+ fpkt = (flags & BPF_JIT_FPKT) != 0;
+ fmem = (flags & BPF_JIT_FMEM) != 0;
+ fjmp = (flags & BPF_JIT_FJMP) != 0;
+ flen = (flags & BPF_JIT_FLEN) != 0;
+
+ if (fret)
+ nins = 1;
+
+ memset(&stream, 0, sizeof(stream));
+
+ /* Allocate the reference table for the jumps. */
+ if (fjmp) {
+#ifdef _KERNEL
+ stream.refs = malloc((nins + 1) * sizeof(u_int), M_BPFJIT,
+ M_NOWAIT | M_ZERO);
+#else
+ stream.refs = calloc(nins + 1, sizeof(u_int));
+#endif
+ if (stream.refs == NULL)
+ return (NULL);
+ }
+
+ /*
+ * The first pass will emit the lengths of the instructions
+ * to create the reference table.
+ */
+ emitm = emit_length;
+
+ for (pass = 0; pass < 2; pass++) {
+ ins = prog;
+
+ /* Create the procedure header. */
+ if (fmem) {
+ PUSH(RBP);
+ MOVrq(RSP, RBP);
+ SUBib(BPF_MEMWORDS * sizeof(uint32_t), RSP);
+ }
+ if (flen)
+ MOVrd2(ESI, R9D);
+ if (fpkt) {
+ MOVrq2(RDI, R8);
+ MOVrd(EDX, EDI);
+ }
+
+ for (i = 0; i < nins; i++) {
+ stream.bpf_pc++;
+
+ switch (ins->code) {
+ default:
+#ifdef _KERNEL
+ return (NULL);
+#else
+ abort();
+#endif
+
+ case BPF_RET|BPF_K:
+ MOVid(ins->k, EAX);
+ if (fmem)
+ LEAVE();
+ RET();
+ break;
+
+ case BPF_RET|BPF_A:
+ if (fmem)
+ LEAVE();
+ RET();
+ break;
+
+ case BPF_LD|BPF_W|BPF_ABS:
+ MOVid(ins->k, ESI);
+ CMPrd(EDI, ESI);
+ JAb(12);
+ MOVrd(EDI, ECX);
+ SUBrd(ESI, ECX);
+ CMPid(sizeof(int32_t), ECX);
+ if (fmem) {
+ JAEb(4);
+ ZEROrd(EAX);
+ LEAVE();
+ } else {
+ JAEb(3);
+ ZEROrd(EAX);
+ }
+ RET();
+ MOVrq3(R8, RCX);
+ MOVobd(RCX, RSI, EAX);
+ BSWAP(EAX);
+ break;
+
+ case BPF_LD|BPF_H|BPF_ABS:
+ ZEROrd(EAX);
+ MOVid(ins->k, ESI);
+ CMPrd(EDI, ESI);
+ JAb(12);
+ MOVrd(EDI, ECX);
+ SUBrd(ESI, ECX);
+ CMPid(sizeof(int16_t), ECX);
+ if (fmem) {
+ JAEb(2);
+ LEAVE();
+ } else
+ JAEb(1);
+ RET();
+ MOVrq3(R8, RCX);
+ MOVobw(RCX, RSI, AX);
+ SWAP_AX();
+ break;
+
+ case BPF_LD|BPF_B|BPF_ABS:
+ ZEROrd(EAX);
+ MOVid(ins->k, ESI);
+ CMPrd(EDI, ESI);
+ if (fmem) {
+ JBb(2);
+ LEAVE();
+ } else
+ JBb(1);
+ RET();
+ MOVrq3(R8, RCX);
+ MOVobb(RCX, RSI, AL);
+ break;
+
+ case BPF_LD|BPF_W|BPF_LEN:
+ MOVrd3(R9D, EAX);
+ break;
+
+ case BPF_LDX|BPF_W|BPF_LEN:
+ MOVrd3(R9D, EDX);
+ break;
+
+ case BPF_LD|BPF_W|BPF_IND:
+ CMPrd(EDI, EDX);
+ JAb(27);
+ MOVid(ins->k, ESI);
+ MOVrd(EDI, ECX);
+ SUBrd(EDX, ECX);
+ CMPrd(ESI, ECX);
+ JBb(14);
+ ADDrd(EDX, ESI);
+ MOVrd(EDI, ECX);
+ SUBrd(ESI, ECX);
+ CMPid(sizeof(int32_t), ECX);
+ if (fmem) {
+ JAEb(4);
+ ZEROrd(EAX);
+ LEAVE();
+ } else {
+ JAEb(3);
+ ZEROrd(EAX);
+ }
+ RET();
+ MOVrq3(R8, RCX);
+ MOVobd(RCX, RSI, EAX);
+ BSWAP(EAX);
+ break;
+
+ case BPF_LD|BPF_H|BPF_IND:
+ ZEROrd(EAX);
+ CMPrd(EDI, EDX);
+ JAb(27);
+ MOVid(ins->k, ESI);
+ MOVrd(EDI, ECX);
+ SUBrd(EDX, ECX);
+ CMPrd(ESI, ECX);
+ JBb(14);
+ ADDrd(EDX, ESI);
+ MOVrd(EDI, ECX);
+ SUBrd(ESI, ECX);
+ CMPid(sizeof(int16_t), ECX);
+ if (fmem) {
+ JAEb(2);
+ LEAVE();
+ } else
+ JAEb(1);
+ RET();
+ MOVrq3(R8, RCX);
+ MOVobw(RCX, RSI, AX);
+ SWAP_AX();
+ break;
+
+ case BPF_LD|BPF_B|BPF_IND:
+ ZEROrd(EAX);
+ CMPrd(EDI, EDX);
+ JAEb(13);
+ MOVid(ins->k, ESI);
+ MOVrd(EDI, ECX);
+ SUBrd(EDX, ECX);
+ CMPrd(ESI, ECX);
+ if (fmem) {
+ JAb(2);
+ LEAVE();
+ } else
+ JAb(1);
+ RET();
+ MOVrq3(R8, RCX);
+ ADDrd(EDX, ESI);
+ MOVobb(RCX, RSI, AL);
+ break;
+
+ case BPF_LDX|BPF_MSH|BPF_B:
+ MOVid(ins->k, ESI);
+ CMPrd(EDI, ESI);
+ if (fmem) {
+ JBb(4);
+ ZEROrd(EAX);
+ LEAVE();
+ } else {
+ JBb(3);
+ ZEROrd(EAX);
+ }
+ RET();
+ ZEROrd(EDX);
+ MOVrq3(R8, RCX);
+ MOVobb(RCX, RSI, DL);
+ ANDib(0x0f, DL);
+ SHLib(2, EDX);
+ break;
+
+ case BPF_LD|BPF_IMM:
+ MOVid(ins->k, EAX);
+ break;
+
+ case BPF_LDX|BPF_IMM:
+ MOVid(ins->k, EDX);
+ break;
+
+ case BPF_LD|BPF_MEM:
+ MOVid(ins->k * sizeof(uint32_t), ESI);
+ MOVobd(RSP, RSI, EAX);
+ break;
+
+ case BPF_LDX|BPF_MEM:
+ MOVid(ins->k * sizeof(uint32_t), ESI);
+ MOVobd(RSP, RSI, EDX);
+ break;
+
+ case BPF_ST:
+ /*
+ * XXX this command and the following could
+ * be optimized if the previous instruction
+ * was already of this type
+ */
+ MOVid(ins->k * sizeof(uint32_t), ESI);
+ MOVomd(EAX, RSP, RSI);
+ break;
+
+ case BPF_STX:
+ MOVid(ins->k * sizeof(uint32_t), ESI);
+ MOVomd(EDX, RSP, RSI);
+ break;
+
+ case BPF_JMP|BPF_JA:
+ JUMP(ins->k);
+ break;
+
+ case BPF_JMP|BPF_JGT|BPF_K:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPid(ins->k, EAX);
+ JCC(JA, JBE);
+ break;
+
+ case BPF_JMP|BPF_JGE|BPF_K:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPid(ins->k, EAX);
+ JCC(JAE, JB);
+ break;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPid(ins->k, EAX);
+ JCC(JE, JNE);
+ break;
+
+ case BPF_JMP|BPF_JSET|BPF_K:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ TESTid(ins->k, EAX);
+ JCC(JNE, JE);
+ break;
+
+ case BPF_JMP|BPF_JGT|BPF_X:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPrd(EDX, EAX);
+ JCC(JA, JBE);
+ break;
+
+ case BPF_JMP|BPF_JGE|BPF_X:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPrd(EDX, EAX);
+ JCC(JAE, JB);
+ break;
+
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ CMPrd(EDX, EAX);
+ JCC(JE, JNE);
+ break;
+
+ case BPF_JMP|BPF_JSET|BPF_X:
+ if (ins->jt == ins->jf) {
+ JUMP(ins->jt);
+ break;
+ }
+ TESTrd(EDX, EAX);
+ JCC(JNE, JE);
+ break;
+
+ case BPF_ALU|BPF_ADD|BPF_X:
+ ADDrd(EDX, EAX);
+ break;
+
+ case BPF_ALU|BPF_SUB|BPF_X:
+ SUBrd(EDX, EAX);
+ break;
+
+ case BPF_ALU|BPF_MUL|BPF_X:
+ MOVrd(EDX, ECX);
+ MULrd(EDX);
+ MOVrd(ECX, EDX);
+ break;
+
+ case BPF_ALU|BPF_DIV|BPF_X:
+ TESTrd(EDX, EDX);
+ if (fmem) {
+ JNEb(4);
+ ZEROrd(EAX);
+ LEAVE();
+ } else {
+ JNEb(3);
+ ZEROrd(EAX);
+ }
+ RET();
+ MOVrd(EDX, ECX);
+ ZEROrd(EDX);
+ DIVrd(ECX);
+ MOVrd(ECX, EDX);
+ break;
+
+ case BPF_ALU|BPF_AND|BPF_X:
+ ANDrd(EDX, EAX);
+ break;
+
+ case BPF_ALU|BPF_OR|BPF_X:
+ ORrd(EDX, EAX);
+ break;
+
+ case BPF_ALU|BPF_LSH|BPF_X:
+ MOVrd(EDX, ECX);
+ SHL_CLrb(EAX);
+ break;
+
+ case BPF_ALU|BPF_RSH|BPF_X:
+ MOVrd(EDX, ECX);
+ SHR_CLrb(EAX);
+ break;
+
+ case BPF_ALU|BPF_ADD|BPF_K:
+ ADD_EAXi(ins->k);
+ break;
+
+ case BPF_ALU|BPF_SUB|BPF_K:
+ SUB_EAXi(ins->k);
+ break;
+
+ case BPF_ALU|BPF_MUL|BPF_K:
+ MOVrd(EDX, ECX);
+ MOVid(ins->k, EDX);
+ MULrd(EDX);
+ MOVrd(ECX, EDX);
+ break;
+
+ case BPF_ALU|BPF_DIV|BPF_K:
+ MOVrd(EDX, ECX);
+ ZEROrd(EDX);
+ MOVid(ins->k, ESI);
+ DIVrd(ESI);
+ MOVrd(ECX, EDX);
+ break;
+
+ case BPF_ALU|BPF_AND|BPF_K:
+ ANDid(ins->k, EAX);
+ break;
+
+ case BPF_ALU|BPF_OR|BPF_K:
+ ORid(ins->k, EAX);
+ break;
+
+ case BPF_ALU|BPF_LSH|BPF_K:
+ SHLib((ins->k) & 0xff, EAX);
+ break;
+
+ case BPF_ALU|BPF_RSH|BPF_K:
+ SHRib((ins->k) & 0xff, EAX);
+ break;
+
+ case BPF_ALU|BPF_NEG:
+ NEGd(EAX);
+ break;
+
+ case BPF_MISC|BPF_TAX:
+ MOVrd(EAX, EDX);
+ break;
+
+ case BPF_MISC|BPF_TXA:
+ MOVrd(EDX, EAX);
+ break;
+ }
+ ins++;
+ }
+
+ if (pass > 0)
+ continue;
+
+ *size = stream.cur_ip;
+#ifdef _KERNEL
+ stream.ibuf = malloc(*size, M_BPFJIT, M_NOWAIT);
+ if (stream.ibuf == NULL)
+ break;
+#else
+ stream.ibuf = mmap(NULL, *size, PROT_READ | PROT_WRITE,
+ MAP_ANON, -1, 0);
+ if (stream.ibuf == MAP_FAILED) {
+ stream.ibuf = NULL;
+ break;
+ }
+#endif
+
+ /*
+ * Modify the reference table to contain the offsets and
+ * not the lengths of the instructions.
+ */
+ if (fjmp)
+ for (i = 1; i < nins + 1; i++)
+ stream.refs[i] += stream.refs[i - 1];
+
+ /* Reset the counters. */
+ stream.cur_ip = 0;
+ stream.bpf_pc = 0;
+
+ /* The second pass creates the actual code. */
+ emitm = emit_code;
+ }
+
+ /*
+ * The reference table is needed only during compilation,
+	 * so now we can free it.
+ */
+ if (fjmp)
+#ifdef _KERNEL
+ free(stream.refs, M_BPFJIT);
+#else
+ free(stream.refs);
+#endif
+
+#ifndef _KERNEL
+ if (stream.ibuf != NULL &&
+ mprotect(stream.ibuf, *size, PROT_READ | PROT_EXEC) != 0) {
+ munmap(stream.ibuf, *size);
+ stream.ibuf = NULL;
+ }
+#endif
+
+ return ((bpf_filter_func)stream.ibuf);
+}
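+
+/*
+ * Example (hypothetical userland use): JIT an "accept everything"
+ * filter and run it on a captured packet.
+ *
+ *	struct bpf_insn prog[] = {
+ *		BPF_STMT(BPF_RET|BPF_K, (u_int)-1),
+ *	};
+ *	size_t size;
+ *	bpf_filter_func f;
+ *
+ *	f = bpf_jit_compile(prog, 1, &size);
+ *	if (f != NULL)
+ *		accepted = f(pkt, wirelen, buflen);
+ */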
diff --git a/sys/amd64/amd64/bpf_jit_machdep.h b/sys/amd64/amd64/bpf_jit_machdep.h
new file mode 100644
index 0000000..01c251f
--- /dev/null
+++ b/sys/amd64/amd64/bpf_jit_machdep.h
@@ -0,0 +1,482 @@
+/*-
+ * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
+ * Copyright (C) 2005-2009 Jung-uk Kim <jkim@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Politecnico di Torino nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BPF_JIT_MACHDEP_H_
+#define _BPF_JIT_MACHDEP_H_
+
+/*
+ * Registers
+ */
+#define RAX 0
+#define RCX 1
+#define RDX 2
+#define RBX 3
+#define RSP 4
+#define RBP 5
+#define RSI 6
+#define RDI 7
+#define R8 0
+#define R9 1
+#define R10 2
+#define R11 3
+#define R12 4
+#define R13 5
+#define R14 6
+#define R15 7
+
+#define EAX 0
+#define ECX 1
+#define EDX 2
+#define EBX 3
+#define ESP 4
+#define EBP 5
+#define ESI 6
+#define EDI 7
+#define R8D 0
+#define R9D 1
+#define R10D 2
+#define R11D 3
+#define R12D 4
+#define R13D 5
+#define R14D 6
+#define R15D 7
+
+#define AX 0
+#define CX 1
+#define DX 2
+#define BX 3
+#define SP 4
+#define BP 5
+#define SI 6
+#define DI 7
+
+#define AL 0
+#define CL 1
+#define DL 2
+#define BL 3
+
+/* Optimization flags */
+#define BPF_JIT_FRET 0x01
+#define BPF_JIT_FPKT 0x02
+#define BPF_JIT_FMEM 0x04
+#define BPF_JIT_FJMP 0x08
+#define BPF_JIT_FLEN 0x10
+
+#define BPF_JIT_FLAG_ALL \
+ (BPF_JIT_FPKT | BPF_JIT_FMEM | BPF_JIT_FJMP | BPF_JIT_FLEN)
+
+/* A stream of native binary code */
+typedef struct bpf_bin_stream {
+ /* Current native instruction pointer. */
+ int cur_ip;
+
+ /*
+ * Current BPF instruction pointer, i.e. position in
+ * the BPF program reached by the jitter.
+ */
+ int bpf_pc;
+
+ /* Instruction buffer, contains the generated native code. */
+ char *ibuf;
+
+ /* Jumps reference table. */
+ u_int *refs;
+} bpf_bin_stream;
+
+/*
+ * Prototype of the emit functions.
+ *
+ * Different emit functions are used to create the reference table and
+ * to generate the actual filtering code.  This allows the instruction
+ * macros to be simpler.
+ * The first parameter is the stream that will receive the data.
+ * The second one is a variable containing the data.
+ * The third one is the length, which can be 1, 2, or 4, since it is possible
+ * to emit a byte, a short, or a word at a time.
+ */
+typedef void (*emit_func)(bpf_bin_stream *stream, u_int value, u_int n);
+
+/*
+ * Native instruction macros
+ */
+
+/* movl i32,r32 */
+#define MOVid(i32, r32) do { \
+ emitm(&stream, (11 << 4) | (1 << 3) | (r32 & 0x7), 1); \
+ emitm(&stream, i32, 4); \
+} while (0)
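+/* For illustration, MOVid(5, EAX) emits b8 05 00 00 00 (movl $5,%eax). */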
+
+/* movq i64,r64 */
+#define MOViq(i64, r64) do { \
+ emitm(&stream, 0x48, 1); \
+ emitm(&stream, (11 << 4) | (1 << 3) | (r64 & 0x7), 1); \
+ emitm(&stream, i64, 4); \
+ emitm(&stream, (i64 >> 32), 4); \
+} while (0)
+
+/* movl sr32,dr32 */
+#define MOVrd(sr32, dr32) do { \
+ emitm(&stream, 0x89, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* movl sr32,dr32 (dr32 = %r8-15d) */
+#define MOVrd2(sr32, dr32) do { \
+ emitm(&stream, 0x8941, 2); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* movl sr32,dr32 (sr32 = %r8-15d) */
+#define MOVrd3(sr32, dr32) do { \
+ emitm(&stream, 0x8944, 2); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* movq sr64,dr64 */
+#define MOVrq(sr64, dr64) do { \
+ emitm(&stream, 0x8948, 2); \
+ emitm(&stream, \
+ (3 << 6) | ((sr64 & 0x7) << 3) | (dr64 & 0x7), 1); \
+} while (0)
+
+/* movq sr64,dr64 (dr64 = %r8-15) */
+#define MOVrq2(sr64, dr64) do { \
+ emitm(&stream, 0x8949, 2); \
+ emitm(&stream, \
+ (3 << 6) | ((sr64 & 0x7) << 3) | (dr64 & 0x7), 1); \
+} while (0)
+
+/* movq sr64,dr64 (sr64 = %r8-15) */
+#define MOVrq3(sr64, dr64) do { \
+ emitm(&stream, 0x894c, 2); \
+ emitm(&stream, \
+ (3 << 6) | ((sr64 & 0x7) << 3) | (dr64 & 0x7), 1); \
+} while (0)
+
+/* movl (sr64,or64,1),dr32 */
+#define MOVobd(sr64, or64, dr32) do { \
+ emitm(&stream, 0x8b, 1); \
+ emitm(&stream, ((dr32 & 0x7) << 3) | 4, 1); \
+ emitm(&stream, ((or64 & 0x7) << 3) | (sr64 & 0x7), 1); \
+} while (0)
+
+/* movw (sr64,or64,1),dr16 */
+#define MOVobw(sr64, or64, dr16) do { \
+ emitm(&stream, 0x8b66, 2); \
+ emitm(&stream, ((dr16 & 0x7) << 3) | 4, 1); \
+ emitm(&stream, ((or64 & 0x7) << 3) | (sr64 & 0x7), 1); \
+} while (0)
+
+/* movb (sr64,or64,1),dr8 */
+#define MOVobb(sr64, or64, dr8) do { \
+ emitm(&stream, 0x8a, 1); \
+ emitm(&stream, ((dr8 & 0x7) << 3) | 4, 1); \
+ emitm(&stream, ((or64 & 0x7) << 3) | (sr64 & 0x7), 1); \
+} while (0)
+
+/* movl sr32,(dr64,or64,1) */
+#define MOVomd(sr32, dr64, or64) do { \
+ emitm(&stream, 0x89, 1); \
+ emitm(&stream, ((sr32 & 0x7) << 3) | 4, 1); \
+ emitm(&stream, ((or64 & 0x7) << 3) | (dr64 & 0x7), 1); \
+} while (0)
+
+/* bswapl dr32 */
+#define BSWAP(dr32) do { \
+ emitm(&stream, 0xf, 1); \
+ emitm(&stream, (0x19 << 3) | dr32, 1); \
+} while (0)
+
+/* xchgb %al,%ah */
+#define SWAP_AX() do { \
+ emitm(&stream, 0xc486, 2); \
+} while (0)
+
+/* pushq r64 */
+#define PUSH(r64) do { \
+ emitm(&stream, (5 << 4) | (0 << 3) | (r64 & 0x7), 1); \
+} while (0)
+
+/* leaveq */
+#define LEAVE() do { \
+ emitm(&stream, 0xc9, 1); \
+} while (0)
+
+/* retq */
+#define RET() do { \
+ emitm(&stream, 0xc3, 1); \
+} while (0)
+
+/* addl sr32,dr32 */
+#define ADDrd(sr32, dr32) do { \
+ emitm(&stream, 0x01, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* addl i32,%eax */
+#define ADD_EAXi(i32) do { \
+ emitm(&stream, 0x05, 1); \
+ emitm(&stream, i32, 4); \
+} while (0)
+
+/* addl i8,r32 */
+#define ADDib(i8, r32) do { \
+ emitm(&stream, 0x83, 1); \
+ emitm(&stream, (24 << 3) | r32, 1); \
+ emitm(&stream, i8, 1); \
+} while (0)
+
+/* subl sr32,dr32 */
+#define SUBrd(sr32, dr32) do { \
+ emitm(&stream, 0x29, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* subl i32,%eax */
+#define SUB_EAXi(i32) do { \
+ emitm(&stream, 0x2d, 1); \
+ emitm(&stream, i32, 4); \
+} while (0)
+
+/* subq i8,r64 */
+#define SUBib(i8, r64) do { \
+ emitm(&stream, 0x8348, 2); \
+ emitm(&stream, (29 << 3) | (r64 & 0x7), 1); \
+ emitm(&stream, i8, 1); \
+} while (0)
+
+/* mull r32 */
+#define MULrd(r32) do { \
+ emitm(&stream, 0xf7, 1); \
+ emitm(&stream, (7 << 5) | (r32 & 0x7), 1); \
+} while (0)
+
+/* divl r32 */
+#define DIVrd(r32) do { \
+ emitm(&stream, 0xf7, 1); \
+ emitm(&stream, (15 << 4) | (r32 & 0x7), 1); \
+} while (0)
+
+/* andb i8,r8 */
+#define ANDib(i8, r8) do { \
+ if (r8 == AL) { \
+ emitm(&stream, 0x24, 1); \
+ } else { \
+ emitm(&stream, 0x80, 1); \
+ emitm(&stream, (7 << 5) | r8, 1); \
+ } \
+ emitm(&stream, i8, 1); \
+} while (0)
+
+/* andl i32,r32 */
+#define ANDid(i32, r32) do { \
+ if (r32 == EAX) { \
+ emitm(&stream, 0x25, 1); \
+ } else { \
+ emitm(&stream, 0x81, 1); \
+ emitm(&stream, (7 << 5) | r32, 1); \
+ } \
+ emitm(&stream, i32, 4); \
+} while (0)
+
+/* andl sr32,dr32 */
+#define ANDrd(sr32, dr32) do { \
+ emitm(&stream, 0x21, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* testl i32,r32 */
+#define TESTid(i32, r32) do { \
+ if (r32 == EAX) { \
+ emitm(&stream, 0xa9, 1); \
+ } else { \
+ emitm(&stream, 0xf7, 1); \
+ emitm(&stream, (3 << 6) | r32, 1); \
+ } \
+ emitm(&stream, i32, 4); \
+} while (0)
+
+/* testl sr32,dr32 */
+#define TESTrd(sr32, dr32) do { \
+ emitm(&stream, 0x85, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* orl sr32,dr32 */
+#define ORrd(sr32, dr32) do { \
+ emitm(&stream, 0x09, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* orl i32,r32 */
+#define ORid(i32, r32) do { \
+ if (r32 == EAX) { \
+ emitm(&stream, 0x0d, 1); \
+ } else { \
+ emitm(&stream, 0x81, 1); \
+ emitm(&stream, (25 << 3) | r32, 1); \
+ } \
+ emitm(&stream, i32, 4); \
+} while (0)
+
+/* shll i8,r32 */
+#define SHLib(i8, r32) do { \
+ emitm(&stream, 0xc1, 1); \
+ emitm(&stream, (7 << 5) | (r32 & 0x7), 1); \
+ emitm(&stream, i8, 1); \
+} while (0)
+
+/* shll %cl,dr32 */
+#define SHL_CLrb(dr32) do { \
+ emitm(&stream, 0xd3, 1); \
+ emitm(&stream, (7 << 5) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* shrl i8,r32 */
+#define SHRib(i8, r32) do { \
+ emitm(&stream, 0xc1, 1); \
+ emitm(&stream, (29 << 3) | (r32 & 0x7), 1); \
+ emitm(&stream, i8, 1); \
+} while (0)
+
+/* shrl %cl,dr32 */
+#define SHR_CLrb(dr32) do { \
+ emitm(&stream, 0xd3, 1); \
+ emitm(&stream, (29 << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* negl r32 */
+#define NEGd(r32) do { \
+ emitm(&stream, 0xf7, 1); \
+ emitm(&stream, (27 << 3) | (r32 & 0x7), 1); \
+} while (0)
+
+/* cmpl sr32,dr32 */
+#define CMPrd(sr32, dr32) do { \
+ emitm(&stream, 0x39, 1); \
+ emitm(&stream, \
+ (3 << 6) | ((sr32 & 0x7) << 3) | (dr32 & 0x7), 1); \
+} while (0)
+
+/* cmpl i32,dr32 */
+#define CMPid(i32, dr32) do { \
+ if (dr32 == EAX){ \
+ emitm(&stream, 0x3d, 1); \
+ emitm(&stream, i32, 4); \
+ } else { \
+ emitm(&stream, 0x81, 1); \
+ emitm(&stream, (0x1f << 3) | (dr32 & 0x7), 1); \
+ emitm(&stream, i32, 4); \
+ } \
+} while (0)
+
+/* jb off8 */
+#define JBb(off8) do { \
+ emitm(&stream, 0x72, 1); \
+ emitm(&stream, off8, 1); \
+} while (0)
+
+/* jae off8 */
+#define JAEb(off8) do { \
+ emitm(&stream, 0x73, 1); \
+ emitm(&stream, off8, 1); \
+} while (0)
+
+/* jne off8 */
+#define JNEb(off8) do { \
+ emitm(&stream, 0x75, 1); \
+ emitm(&stream, off8, 1); \
+} while (0)
+
+/* ja off8 */
+#define JAb(off8) do { \
+ emitm(&stream, 0x77, 1); \
+ emitm(&stream, off8, 1); \
+} while (0)
+
+/* jmp off32 */
+#define JMP(off32) do { \
+ emitm(&stream, 0xe9, 1); \
+ emitm(&stream, off32, 4); \
+} while (0)
+
+/* xorl r32,r32 */
+#define ZEROrd(r32) do { \
+ emitm(&stream, 0x31, 1); \
+ emitm(&stream, (3 << 6) | ((r32 & 0x7) << 3) | (r32 & 0x7), 1); \
+} while (0)
+
+/*
+ * Conditional long jumps
+ */
+#define JB 0x82
+#define JAE 0x83
+#define JE 0x84
+#define JNE 0x85
+#define JBE 0x86
+#define JA 0x87
+
+#define JCC(t, f) do { \
+ if (ins->jt != 0 && ins->jf != 0) { \
+ /* 5 is the size of the following jmp */ \
+ emitm(&stream, ((t) << 8) | 0x0f, 2); \
+ emitm(&stream, stream.refs[stream.bpf_pc + ins->jt] - \
+ stream.refs[stream.bpf_pc] + 5, 4); \
+ JMP(stream.refs[stream.bpf_pc + ins->jf] - \
+ stream.refs[stream.bpf_pc]); \
+ } else if (ins->jt != 0) { \
+ emitm(&stream, ((t) << 8) | 0x0f, 2); \
+ emitm(&stream, stream.refs[stream.bpf_pc + ins->jt] - \
+ stream.refs[stream.bpf_pc], 4); \
+ } else { \
+ emitm(&stream, ((f) << 8) | 0x0f, 2); \
+ emitm(&stream, stream.refs[stream.bpf_pc + ins->jf] - \
+ stream.refs[stream.bpf_pc], 4); \
+ } \
+} while (0)
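+
+/*
+ * For illustration, JCC(JE, JNE) with both branch offsets non-zero
+ * emits "0f 84 <rel32>" (je taken-branch) followed by "e9 <rel32>"
+ * (jmp false-branch); the "+ 5" above skips that 5-byte jmp.
+ */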
+
+#define JUMP(off) do { \
+ if ((off) != 0) \
+ JMP(stream.refs[stream.bpf_pc + (off)] - \
+ stream.refs[stream.bpf_pc]); \
+} while (0)
+
+#endif /* _BPF_JIT_MACHDEP_H_ */
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
new file mode 100644
index 0000000..ed1ccb5
--- /dev/null
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -0,0 +1,541 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+#include "assym.s"
+#include "opt_sched.h"
+
+/*****************************************************************************/
+/* Scheduling */
+/*****************************************************************************/
+
+ .text
+
+#ifdef SMP
+#define LK lock ;
+#else
+#define LK
+#endif
+
+#if defined(SCHED_ULE) && defined(SMP)
+#define SETLK xchgq
+#else
+#define SETLK movq
+#endif
+
+/*
+ * cpu_throw()
+ *
+ * This is the second half of cpu_switch(). It is used when the current
+ * thread is either a dummy or slated to die, and we no longer care
+ * about its state. This is only a slight optimization and is probably
+ * not worth it anymore.  Note that we still need the old thread, if it
+ * exists, so that we can clear its pm_active bit.
+ * %rdi = oldtd
+ * %rsi = newtd
+ */
+ENTRY(cpu_throw)
+ movl PCPU(CPUID),%eax
+ testq %rdi,%rdi
+ jz 1f
+ /* release bit from old pm_active */
+ movq PCPU(CURPMAP),%rdx
+ LK btrl %eax,PM_ACTIVE(%rdx) /* clear old */
+1:
+ movq TD_PCB(%rsi),%r8 /* newtd->td_pcb */
+ movq PCB_CR3(%r8),%rdx
+ movq %rdx,%cr3 /* new address space */
+ jmp swact
+END(cpu_throw)
+
+/*
+ * cpu_switch(old, new, mtx)
+ *
+ * Save the current thread state, then select the next thread to run
+ * and load its state.
+ * %rdi = oldtd
+ * %rsi = newtd
+ * %rdx = mtx
+ */
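+/*
+ * Viewed from C, the declaration is roughly:
+ *
+ *	void cpu_switch(struct thread *old, struct thread *new,
+ *	    struct mtx *mtx);
+ *
+ * so the three arguments arrive in %rdi, %rsi and %rdx per the amd64
+ * calling convention.
+ */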
+ENTRY(cpu_switch)
+ /* Switch to new thread. First, save context. */
+ movq TD_PCB(%rdi),%r8
+ orl $PCB_FULL_IRET,PCB_FLAGS(%r8)
+
+ movq (%rsp),%rax /* Hardware registers */
+ movq %r15,PCB_R15(%r8)
+ movq %r14,PCB_R14(%r8)
+ movq %r13,PCB_R13(%r8)
+ movq %r12,PCB_R12(%r8)
+ movq %rbp,PCB_RBP(%r8)
+ movq %rsp,PCB_RSP(%r8)
+ movq %rbx,PCB_RBX(%r8)
+ movq %rax,PCB_RIP(%r8)
+
+ testl $PCB_DBREGS,PCB_FLAGS(%r8)
+ jnz store_dr /* static predict not taken */
+done_store_dr:
+
+	/* Have we used the FPU, and do we need to save its state? */
+ cmpq %rdi,PCPU(FPCURTHREAD)
+ jne 3f
+ movq PCB_SAVEFPU(%r8),%r8
+ clts
+ cmpl $0,use_xsave
+ jne 1f
+ fxsave (%r8)
+ jmp 2f
+1: movq %rdx,%rcx
+ movl xsave_mask,%eax
+ movl xsave_mask+4,%edx
+ .globl ctx_switch_xsave
+ctx_switch_xsave:
+ /* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
+ xsave (%r8)
+ movq %rcx,%rdx
+2: smsw %ax
+ orb $CR0_TS,%al
+ lmsw %ax
+ xorl %eax,%eax
+ movq %rax,PCPU(FPCURTHREAD)
+3:
+
+ /* Save is done. Now fire up new thread. Leave old vmspace. */
+ movq TD_PCB(%rsi),%r8
+
+ /* switch address space */
+ movq PCB_CR3(%r8),%rcx
+ movq %cr3,%rax
+ cmpq %rcx,%rax /* Same address space? */
+ jne swinact
+ SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
+ jmp sw1
+swinact:
+ movq %rcx,%cr3 /* new address space */
+ movl PCPU(CPUID), %eax
+ /* Release bit from old pmap->pm_active */
+ movq PCPU(CURPMAP),%rcx
+ LK btrl %eax,PM_ACTIVE(%rcx) /* clear old */
+ SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
+swact:
+ /* Set bit in new pmap->pm_active */
+ movq TD_PROC(%rsi),%rdx /* newproc */
+ movq P_VMSPACE(%rdx), %rdx
+ addq $VM_PMAP,%rdx
+ LK btsl %eax,PM_ACTIVE(%rdx) /* set new */
+ movq %rdx,PCPU(CURPMAP)
+
+sw1:
+#if defined(SCHED_ULE) && defined(SMP)
+ /* Wait for the new thread to become unblocked */
+ movq $blocked_lock, %rdx
+1:
+ movq TD_LOCK(%rsi),%rcx
+ cmpq %rcx, %rdx
+ pause
+ je 1b
+#endif
+ /*
+ * At this point, we've switched address spaces and are ready
+ * to load up the rest of the next context.
+ */
+
+ /* Skip loading user fsbase/gsbase for kthreads */
+ testl $TDP_KTHREAD,TD_PFLAGS(%rsi)
+ jnz do_kthread
+
+ /*
+ * Load ldt register
+ */
+ movq TD_PROC(%rsi),%rcx
+ cmpq $0, P_MD+MD_LDT(%rcx)
+ jne do_ldt
+ xorl %eax,%eax
+ld_ldt: lldt %ax
+
+ /* Restore fs base in GDT */
+ movl PCB_FSBASE(%r8),%eax
+ movq PCPU(FS32P),%rdx
+ movw %ax,2(%rdx)
+ shrl $16,%eax
+ movb %al,4(%rdx)
+ shrl $8,%eax
+ movb %al,7(%rdx)
+
+ /* Restore gs base in GDT */
+ movl PCB_GSBASE(%r8),%eax
+ movq PCPU(GS32P),%rdx
+ movw %ax,2(%rdx)
+ shrl $16,%eax
+ movb %al,4(%rdx)
+ shrl $8,%eax
+ movb %al,7(%rdx)
+
+do_kthread:
+	/* Do we need to reload the TSS? */
+ movq PCPU(TSSP),%rax
+ movq PCB_TSSP(%r8),%rdx
+ testq %rdx,%rdx
+ cmovzq PCPU(COMMONTSSP),%rdx
+ cmpq %rax,%rdx
+ jne do_tss
+done_tss:
+ movq %r8,PCPU(RSP0)
+ movq %r8,PCPU(CURPCB)
+ /* Update the TSS_RSP0 pointer for the next interrupt */
+ movq %r8,COMMON_TSS_RSP0(%rdx)
+ movq %rsi,PCPU(CURTHREAD) /* into next thread */
+
+ /* Test if debug registers should be restored. */
+ testl $PCB_DBREGS,PCB_FLAGS(%r8)
+ jnz load_dr /* static predict not taken */
+done_load_dr:
+
+ /* Restore context. */
+ movq PCB_R15(%r8),%r15
+ movq PCB_R14(%r8),%r14
+ movq PCB_R13(%r8),%r13
+ movq PCB_R12(%r8),%r12
+ movq PCB_RBP(%r8),%rbp
+ movq PCB_RSP(%r8),%rsp
+ movq PCB_RBX(%r8),%rbx
+ movq PCB_RIP(%r8),%rax
+ movq %rax,(%rsp)
+ ret
+
+ /*
+	 * We order these strangely for several reasons:
+	 * 1: I wanted to use static branch prediction hints, but
+	 * 2: most Athlon64/Opteron CPUs don't have them.  They treat
+	 *    a forward branch as 'predict not taken'.  Intel cores have
+	 *    the 'rep' prefix to invert this.
+	 * So, to make it work on both kinds of CPU, we take this detour.
+	 * We use jumps rather than calls in order to avoid touching the stack.
+ */
+
+store_dr:
+ movq %dr7,%rax /* yes, do the save */
+ movq %dr0,%r15
+ movq %dr1,%r14
+ movq %dr2,%r13
+ movq %dr3,%r12
+ movq %dr6,%r11
+ movq %r15,PCB_DR0(%r8)
+ movq %r14,PCB_DR1(%r8)
+ movq %r13,PCB_DR2(%r8)
+ movq %r12,PCB_DR3(%r8)
+ movq %r11,PCB_DR6(%r8)
+ movq %rax,PCB_DR7(%r8)
+ andq $0x0000fc00, %rax /* disable all watchpoints */
+ movq %rax,%dr7
+ jmp done_store_dr
+
+load_dr:
+ movq %dr7,%rax
+ movq PCB_DR0(%r8),%r15
+ movq PCB_DR1(%r8),%r14
+ movq PCB_DR2(%r8),%r13
+ movq PCB_DR3(%r8),%r12
+ movq PCB_DR6(%r8),%r11
+ movq PCB_DR7(%r8),%rcx
+ movq %r15,%dr0
+ movq %r14,%dr1
+ /* Preserve reserved bits in %dr7 */
+ andq $0x0000fc00,%rax
+ andq $~0x0000fc00,%rcx
+ movq %r13,%dr2
+ movq %r12,%dr3
+ orq %rcx,%rax
+ movq %r11,%dr6
+ movq %rax,%dr7
+ jmp done_load_dr
+
+do_tss: movq %rdx,PCPU(TSSP)
+ movq %rdx,%rcx
+ movq PCPU(TSS),%rax
+ movw %cx,2(%rax)
+ shrq $16,%rcx
+ movb %cl,4(%rax)
+ shrq $8,%rcx
+ movb %cl,7(%rax)
+ shrq $8,%rcx
+ movl %ecx,8(%rax)
+ movb $0x89,5(%rax) /* unset busy */
+ movl $TSSSEL,%eax
+ ltr %ax
+ jmp done_tss
+
+do_ldt: movq PCPU(LDT),%rax
+ movq P_MD+MD_LDT_SD(%rcx),%rdx
+ movq %rdx,(%rax)
+ movq P_MD+MD_LDT_SD+8(%rcx),%rdx
+ movq %rdx,8(%rax)
+ movl $LDTSEL,%eax
+ jmp ld_ldt
+END(cpu_switch)
+
+/*
+ * savectx(pcb)
+ * Update pcb, saving current processor state.
+ */
+ENTRY(savectx)
+ /* Save caller's return address. */
+ movq (%rsp),%rax
+ movq %rax,PCB_RIP(%rdi)
+
+ movq %rbx,PCB_RBX(%rdi)
+ movq %rsp,PCB_RSP(%rdi)
+ movq %rbp,PCB_RBP(%rdi)
+ movq %r12,PCB_R12(%rdi)
+ movq %r13,PCB_R13(%rdi)
+ movq %r14,PCB_R14(%rdi)
+ movq %r15,PCB_R15(%rdi)
+
+ movq %cr0,%rsi
+ movq %rsi,PCB_CR0(%rdi)
+ movq %cr2,%rax
+ movq %rax,PCB_CR2(%rdi)
+ movq %cr3,%rax
+ movq %rax,PCB_CR3(%rdi)
+ movq %cr4,%rax
+ movq %rax,PCB_CR4(%rdi)
+
+ movq %dr0,%rax
+ movq %rax,PCB_DR0(%rdi)
+ movq %dr1,%rax
+ movq %rax,PCB_DR1(%rdi)
+ movq %dr2,%rax
+ movq %rax,PCB_DR2(%rdi)
+ movq %dr3,%rax
+ movq %rax,PCB_DR3(%rdi)
+ movq %dr6,%rax
+ movq %rax,PCB_DR6(%rdi)
+ movq %dr7,%rax
+ movq %rax,PCB_DR7(%rdi)
+
+ movl $MSR_FSBASE,%ecx
+ rdmsr
+ movl %eax,PCB_FSBASE(%rdi)
+ movl %edx,PCB_FSBASE+4(%rdi)
+ movl $MSR_GSBASE,%ecx
+ rdmsr
+ movl %eax,PCB_GSBASE(%rdi)
+ movl %edx,PCB_GSBASE+4(%rdi)
+ movl $MSR_KGSBASE,%ecx
+ rdmsr
+ movl %eax,PCB_KGSBASE(%rdi)
+ movl %edx,PCB_KGSBASE+4(%rdi)
+ movl $MSR_EFER,%ecx
+ rdmsr
+ movl %eax,PCB_EFER(%rdi)
+ movl %edx,PCB_EFER+4(%rdi)
+ movl $MSR_STAR,%ecx
+ rdmsr
+ movl %eax,PCB_STAR(%rdi)
+ movl %edx,PCB_STAR+4(%rdi)
+ movl $MSR_LSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_LSTAR(%rdi)
+ movl %edx,PCB_LSTAR+4(%rdi)
+ movl $MSR_CSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_CSTAR(%rdi)
+ movl %edx,PCB_CSTAR+4(%rdi)
+ movl $MSR_SF_MASK,%ecx
+ rdmsr
+ movl %eax,PCB_SFMASK(%rdi)
+ movl %edx,PCB_SFMASK+4(%rdi)
+ movl xsave_mask,%eax
+ movl %eax,PCB_XSMASK(%rdi)
+ movl xsave_mask+4,%eax
+ movl %eax,PCB_XSMASK+4(%rdi)
+
+ sgdt PCB_GDT(%rdi)
+ sidt PCB_IDT(%rdi)
+ sldt PCB_LDT(%rdi)
+ str PCB_TR(%rdi)
+
+2: movq %rsi,%cr0 /* The previous %cr0 is saved in %rsi. */
+
+ movl $1,%eax
+ ret
+END(savectx)
+
+/*
+ * resumectx(pcb)
+ * Resume processor state from pcb.
+ */
+ENTRY(resumectx)
+ /* Switch to KPML4phys. */
+ movq KPML4phys,%rax
+ movq %rax,%cr3
+
+ /* Force kernel segment registers. */
+ movl $KDSEL,%eax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%ss
+ movl $KUF32SEL,%eax
+ movw %ax,%fs
+ movl $KUG32SEL,%eax
+ movw %ax,%gs
+
+ movl $MSR_FSBASE,%ecx
+ movl PCB_FSBASE(%rdi),%eax
+ movl 4 + PCB_FSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_GSBASE,%ecx
+ movl PCB_GSBASE(%rdi),%eax
+ movl 4 + PCB_GSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_KGSBASE,%ecx
+ movl PCB_KGSBASE(%rdi),%eax
+ movl 4 + PCB_KGSBASE(%rdi),%edx
+ wrmsr
+
+ /* Restore EFER. */
+ movl $MSR_EFER,%ecx
+ movl PCB_EFER(%rdi),%eax
+ wrmsr
+
+ /* Restore fast syscall stuff. */
+ movl $MSR_STAR,%ecx
+ movl PCB_STAR(%rdi),%eax
+ movl 4 + PCB_STAR(%rdi),%edx
+ wrmsr
+ movl $MSR_LSTAR,%ecx
+ movl PCB_LSTAR(%rdi),%eax
+ movl 4 + PCB_LSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_CSTAR,%ecx
+ movl PCB_CSTAR(%rdi),%eax
+ movl 4 + PCB_CSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_SF_MASK,%ecx
+ movl PCB_SFMASK(%rdi),%eax
+ wrmsr
+
+ /* Restore CR0 except for FPU mode. */
+ movq PCB_CR0(%rdi),%rax
+ andq $~(CR0_EM | CR0_TS),%rax
+ movq %rax,%cr0
+
+ /* Restore CR2, CR4 and CR3. */
+ movq PCB_CR2(%rdi),%rax
+ movq %rax,%cr2
+ movq PCB_CR4(%rdi),%rax
+ movq %rax,%cr4
+ movq PCB_CR3(%rdi),%rax
+ movq %rax,%cr3
+
+ /* Restore descriptor tables. */
+ lidt PCB_IDT(%rdi)
+ lldt PCB_LDT(%rdi)
+
+#define SDT_SYSTSS 9
+#define SDT_SYSBSY 11
+
+ /* Clear "task busy" bit and reload TR. */
+ movq PCPU(TSS),%rax
+ andb $(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
+ movw PCB_TR(%rdi),%ax
+ ltr %ax
+
+#undef SDT_SYSTSS
+#undef SDT_SYSBSY
+
+ /* Restore debug registers. */
+ movq PCB_DR0(%rdi),%rax
+ movq %rax,%dr0
+ movq PCB_DR1(%rdi),%rax
+ movq %rax,%dr1
+ movq PCB_DR2(%rdi),%rax
+ movq %rax,%dr2
+ movq PCB_DR3(%rdi),%rax
+ movq %rax,%dr3
+ movq PCB_DR6(%rdi),%rax
+ movq %rax,%dr6
+ movq PCB_DR7(%rdi),%rax
+ movq %rax,%dr7
+
+ /* Restore FPU state. */
+ fninit
+ movq PCB_FPUSUSPEND(%rdi),%rbx
+ movq PCB_XSMASK(%rdi),%rax
+ testq %rax,%rax
+ jz 1f
+ movq %rax,%rdx
+ shrq $32,%rdx
+ movl $XCR0,%ecx
+ xsetbv
+ xrstor (%rbx)
+ jmp 2f
+1:
+ fxrstor (%rbx)
+2:
+
+ /* Reload CR0. */
+ movq PCB_CR0(%rdi),%rax
+ movq %rax,%cr0
+
+ /* Restore other callee saved registers. */
+ movq PCB_R15(%rdi),%r15
+ movq PCB_R14(%rdi),%r14
+ movq PCB_R13(%rdi),%r13
+ movq PCB_R12(%rdi),%r12
+ movq PCB_RBP(%rdi),%rbp
+ movq PCB_RSP(%rdi),%rsp
+ movq PCB_RBX(%rdi),%rbx
+
+ /* Restore return address. */
+ movq PCB_RIP(%rdi),%rax
+ movq %rax,(%rsp)
+
+ xorl %eax,%eax
+ ret
+END(resumectx)
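+
+/*
+ * Note the setjmp/longjmp-style pairing with savectx(): savectx()
+ * returns 1 at save time, while a resume through resumectx() unwinds
+ * to savectx()'s caller with a return value of 0.
+ */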
+
+/*
+ * Wrapper around fpusave that takes care of CR0_TS.
+ */
+ENTRY(ctx_fpusave)
+ movq %cr0,%rsi
+ clts
+ call fpusave
+ movq %rsi,%cr0
+ ret
+END(ctx_fpusave)
diff --git a/sys/amd64/amd64/db_disasm.c b/sys/amd64/amd64/db_disasm.c
new file mode 100644
index 0000000..46144e0
--- /dev/null
+++ b/sys/amd64/amd64/db_disasm.c
@@ -0,0 +1,1637 @@
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Instruction disassembler.
+ */
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <ddb/ddb.h>
+#include <ddb/db_access.h>
+#include <ddb/db_sym.h>
+
+/*
+ * Size attributes
+ */
+#define BYTE 0
+#define WORD 1
+#define LONG 2
+#define QUAD 3
+#define SNGL 4
+#define DBLR 5
+#define EXTR 6
+#define SDEP 7
+#define ADEP 8
+#define ESC 9
+#define NONE 10
+
+/*
+ * REX prefix and bits
+ */
+#define REX_B 1
+#define REX_X 2
+#define REX_R 4
+#define REX_W 8
+#define REX 0x40
+
+/*
+ * Addressing modes
+ */
+#define E 1 /* general effective address */
+#define Eind 2 /* indirect address (jump, call) */
+#define Ew 3 /* address, word size */
+#define Eb 4 /* address, byte size */
+#define R 5 /* register, in 'reg' field */
+#define Rw 6 /* word register, in 'reg' field */
+#define Rq 39 /* quad register, in 'reg' field */
+#define Ri 7 /* register in instruction */
+#define S 8 /* segment reg, in 'reg' field */
+#define Si 9 /* segment reg, in instruction */
+#define A 10 /* accumulator */
+#define BX 11 /* (bx) */
+#define CL 12 /* cl, for shifts */
+#define DX 13 /* dx, for IO */
+#define SI 14 /* si */
+#define DI 15 /* di */
+#define CR 16 /* control register */
+#define DR 17 /* debug register */
+#define TR 18 /* test register */
+#define I 19 /* immediate, unsigned */
+#define Is 20 /* immediate, signed */
+#define Ib 21 /* byte immediate, unsigned */
+#define Ibs 22 /* byte immediate, signed */
+#define Iw 23 /* word immediate, unsigned */
+#define Ilq 24 /* long/quad immediate, unsigned */
+#define O 25 /* direct address */
+#define Db 26 /* byte displacement from EIP */
+#define Dl 27 /* long displacement from EIP */
+#define o1 28 /* constant 1 */
+#define o3 29 /* constant 3 */
+#define OS 30 /* immediate offset/segment */
+#define ST 31 /* FP stack top */
+#define STI 32 /* FP stack */
+#define X 33 /* extended FP op */
+#define XA 34 /* for 'fstcw %ax' */
+#define El 35 /* address, long/quad size */
+#define Ril 36 /* long register in instruction */
+#define Iba 37 /* byte immediate, don't print if 0xa */
+#define EL 38 /* address, explicitly long size */
+
+struct inst {
+ const char * i_name; /* name */
+ short i_has_modrm; /* has regmodrm byte */
+ short i_size; /* operand size */
+ int i_mode; /* addressing modes */
+ const void * i_extra; /* pointer to extra opcode table */
+};
+
+#define op1(x) (x)
+#define op2(x,y) ((x)|((y)<<8))
+#define op3(x,y,z) ((x)|((y)<<8)|((z)<<16))
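+/* e.g. op2(E, R) packs mode E into the low byte and R into the next. */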
+
+struct finst {
+ const char * f_name; /* name for memory instruction */
+ int f_size; /* size for memory instruction */
+ int f_rrmode; /* mode for rr instruction */
+ const void * f_rrname; /* name for rr instruction
+ (or pointer to table) */
+};
+
+static const struct inst db_inst_0f388x[] = {
+/*80*/ { "", TRUE, SDEP, op2(E, Rq), "invept" },
+/*81*/ { "", TRUE, SDEP, op2(E, Rq), "invvpid" },
+/*82*/ { "", FALSE, NONE, 0, 0 },
+/*83*/ { "", FALSE, NONE, 0, 0 },
+/*84*/ { "", FALSE, NONE, 0, 0 },
+/*85*/ { "", FALSE, NONE, 0, 0 },
+/*86*/ { "", FALSE, NONE, 0, 0 },
+/*87*/ { "", FALSE, NONE, 0, 0 },
+
+/*88*/ { "", FALSE, NONE, 0, 0 },
+/*89*/ { "", FALSE, NONE, 0, 0 },
+/*8a*/ { "", FALSE, NONE, 0, 0 },
+/*8b*/ { "", FALSE, NONE, 0, 0 },
+/*8c*/ { "", FALSE, NONE, 0, 0 },
+/*8d*/ { "", FALSE, NONE, 0, 0 },
+/*8e*/ { "", FALSE, NONE, 0, 0 },
+/*8f*/ { "", FALSE, NONE, 0, 0 },
+};
+
+static const struct inst * const db_inst_0f38[] = {
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ db_inst_0f388x,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+};
+
+static const char * const db_Grp6[] = {
+ "sldt",
+ "str",
+ "lldt",
+ "ltr",
+ "verr",
+ "verw",
+ "",
+ ""
+};
+
+static const char * const db_Grp7[] = {
+ "sgdt",
+ "sidt",
+ "lgdt",
+ "lidt",
+ "smsw",
+ "",
+ "lmsw",
+ "invlpg"
+};
+
+static const char * const db_Grp8[] = {
+ "",
+ "",
+ "",
+ "",
+ "bt",
+ "bts",
+ "btr",
+ "btc"
+};
+
+static const char * const db_Grp9[] = {
+ "",
+ "cmpxchg8b",
+ "",
+ "",
+ "",
+ "",
+ "vmptrld",
+ "vmptrst"
+};
+
+static const char * const db_Grp15[] = {
+ "fxsave",
+ "fxrstor",
+ "ldmxcsr",
+ "stmxcsr",
+ "xsave",
+ "xrstor",
+ "xsaveopt",
+ "clflush"
+};
+
+static const char * const db_Grp15b[] = {
+ "",
+ "",
+ "",
+ "",
+ "",
+ "lfence",
+ "mfence",
+ "sfence"
+};
+
+static const struct inst db_inst_0f0x[] = {
+/*00*/ { "", TRUE, NONE, op1(Ew), db_Grp6 },
+/*01*/ { "", TRUE, NONE, op1(Ew), db_Grp7 },
+/*02*/ { "lar", TRUE, LONG, op2(E,R), 0 },
+/*03*/ { "lsl", TRUE, LONG, op2(E,R), 0 },
+/*04*/ { "", FALSE, NONE, 0, 0 },
+/*05*/ { "syscall",FALSE,NONE, 0, 0 },
+/*06*/ { "clts", FALSE, NONE, 0, 0 },
+/*07*/ { "sysret",FALSE, NONE, 0, 0 },
+
+/*08*/ { "invd", FALSE, NONE, 0, 0 },
+/*09*/ { "wbinvd",FALSE, NONE, 0, 0 },
+/*0a*/ { "", FALSE, NONE, 0, 0 },
+/*0b*/ { "", FALSE, NONE, 0, 0 },
+/*0c*/ { "", FALSE, NONE, 0, 0 },
+/*0d*/ { "", FALSE, NONE, 0, 0 },
+/*0e*/ { "", FALSE, NONE, 0, 0 },
+/*0f*/ { "", FALSE, NONE, 0, 0 },
+};
+
+static const struct inst db_inst_0f2x[] = {
+/*20*/ { "mov", TRUE, LONG, op2(CR,El), 0 },
+/*21*/ { "mov", TRUE, LONG, op2(DR,El), 0 },
+/*22*/ { "mov", TRUE, LONG, op2(El,CR), 0 },
+/*23*/ { "mov", TRUE, LONG, op2(El,DR), 0 },
+/*24*/ { "mov", TRUE, LONG, op2(TR,El), 0 },
+/*25*/ { "", FALSE, NONE, 0, 0 },
+/*26*/ { "mov", TRUE, LONG, op2(El,TR), 0 },
+/*27*/ { "", FALSE, NONE, 0, 0 },
+
+/*28*/ { "", FALSE, NONE, 0, 0 },
+/*29*/ { "", FALSE, NONE, 0, 0 },
+/*2a*/ { "", FALSE, NONE, 0, 0 },
+/*2b*/ { "", FALSE, NONE, 0, 0 },
+/*2c*/ { "", FALSE, NONE, 0, 0 },
+/*2d*/ { "", FALSE, NONE, 0, 0 },
+/*2e*/ { "", FALSE, NONE, 0, 0 },
+/*2f*/ { "", FALSE, NONE, 0, 0 },
+};
+
+static const struct inst db_inst_0f3x[] = {
+/*30*/ { "wrmsr", FALSE, NONE, 0, 0 },
+/*31*/ { "rdtsc", FALSE, NONE, 0, 0 },
+/*32*/ { "rdmsr", FALSE, NONE, 0, 0 },
+/*33*/ { "rdpmc", FALSE, NONE, 0, 0 },
+/*34*/ { "sysenter",FALSE,NONE, 0, 0 },
+/*35*/ { "sysexit",FALSE,NONE, 0, 0 },
+/*36*/ { "", FALSE, NONE, 0, 0 },
+/*37*/ { "getsec",FALSE, NONE, 0, 0 },
+
+/*38*/ { "", FALSE, ESC, 0, db_inst_0f38 },
+/*39*/ { "", FALSE, NONE, 0, 0 },
+/*3a*/ { "", FALSE, NONE, 0, 0 },
+/*3b*/ { "", FALSE, NONE, 0, 0 },
+/*3c*/ { "", FALSE, NONE, 0, 0 },
+/*3d*/ { "", FALSE, NONE, 0, 0 },
+/*3e*/ { "", FALSE, NONE, 0, 0 },
+/*3f*/ { "", FALSE, NONE, 0, 0 },
+};
+
+static const struct inst db_inst_0f4x[] = {
+/*40*/ { "cmovo", TRUE, NONE, op2(E, R), 0 },
+/*41*/ { "cmovno", TRUE, NONE, op2(E, R), 0 },
+/*42*/ { "cmovb", TRUE, NONE, op2(E, R), 0 },
+/*43*/ { "cmovnb", TRUE, NONE, op2(E, R), 0 },
+/*44*/ { "cmovz", TRUE, NONE, op2(E, R), 0 },
+/*45*/ { "cmovnz", TRUE, NONE, op2(E, R), 0 },
+/*46*/ { "cmovbe", TRUE, NONE, op2(E, R), 0 },
+/*47*/ { "cmovnbe",TRUE, NONE, op2(E, R), 0 },
+
+/*48*/ { "cmovs", TRUE, NONE, op2(E, R), 0 },
+/*49*/ { "cmovns", TRUE, NONE, op2(E, R), 0 },
+/*4a*/ { "cmovp", TRUE, NONE, op2(E, R), 0 },
+/*4b*/ { "cmovnp", TRUE, NONE, op2(E, R), 0 },
+/*4c*/ { "cmovl", TRUE, NONE, op2(E, R), 0 },
+/*4d*/ { "cmovnl", TRUE, NONE, op2(E, R), 0 },
+/*4e*/ { "cmovle", TRUE, NONE, op2(E, R), 0 },
+/*4f*/ { "cmovnle",TRUE, NONE, op2(E, R), 0 },
+};
+
+static const struct inst db_inst_0f7x[] = {
+/*70*/ { "", FALSE, NONE, 0, 0 },
+/*71*/ { "", FALSE, NONE, 0, 0 },
+/*72*/ { "", FALSE, NONE, 0, 0 },
+/*73*/ { "", FALSE, NONE, 0, 0 },
+/*74*/ { "", FALSE, NONE, 0, 0 },
+/*75*/ { "", FALSE, NONE, 0, 0 },
+/*76*/ { "", FALSE, NONE, 0, 0 },
+/*77*/ { "", FALSE, NONE, 0, 0 },
+
+/*78*/ { "vmread", TRUE, NONE, op2(Rq, E), 0 },
+/*79*/ { "vmwrite",TRUE, NONE, op2(E, Rq), 0 },
+/*7a*/ { "", FALSE, NONE, 0, 0 },
+/*7b*/ { "", FALSE, NONE, 0, 0 },
+/*7c*/ { "", FALSE, NONE, 0, 0 },
+/*7d*/ { "", FALSE, NONE, 0, 0 },
+/*7e*/ { "", FALSE, NONE, 0, 0 },
+/*7f*/ { "", FALSE, NONE, 0, 0 },
+};
+
+static const struct inst db_inst_0f8x[] = {
+/*80*/ { "jo", FALSE, NONE, op1(Dl), 0 },
+/*81*/ { "jno", FALSE, NONE, op1(Dl), 0 },
+/*82*/ { "jb", FALSE, NONE, op1(Dl), 0 },
+/*83*/ { "jnb", FALSE, NONE, op1(Dl), 0 },
+/*84*/ { "jz", FALSE, NONE, op1(Dl), 0 },
+/*85*/ { "jnz", FALSE, NONE, op1(Dl), 0 },
+/*86*/ { "jbe", FALSE, NONE, op1(Dl), 0 },
+/*87*/ { "jnbe", FALSE, NONE, op1(Dl), 0 },
+
+/*88*/ { "js", FALSE, NONE, op1(Dl), 0 },
+/*89*/ { "jns", FALSE, NONE, op1(Dl), 0 },
+/*8a*/ { "jp", FALSE, NONE, op1(Dl), 0 },
+/*8b*/ { "jnp", FALSE, NONE, op1(Dl), 0 },
+/*8c*/ { "jl", FALSE, NONE, op1(Dl), 0 },
+/*8d*/ { "jnl", FALSE, NONE, op1(Dl), 0 },
+/*8e*/ { "jle", FALSE, NONE, op1(Dl), 0 },
+/*8f*/ { "jnle", FALSE, NONE, op1(Dl), 0 },
+};
+
+static const struct inst db_inst_0f9x[] = {
+/*90*/ { "seto", TRUE, NONE, op1(Eb), 0 },
+/*91*/ { "setno", TRUE, NONE, op1(Eb), 0 },
+/*92*/ { "setb", TRUE, NONE, op1(Eb), 0 },
+/*93*/ { "setnb", TRUE, NONE, op1(Eb), 0 },
+/*94*/ { "setz", TRUE, NONE, op1(Eb), 0 },
+/*95*/ { "setnz", TRUE, NONE, op1(Eb), 0 },
+/*96*/ { "setbe", TRUE, NONE, op1(Eb), 0 },
+/*97*/ { "setnbe",TRUE, NONE, op1(Eb), 0 },
+
+/*98*/ { "sets", TRUE, NONE, op1(Eb), 0 },
+/*99*/ { "setns", TRUE, NONE, op1(Eb), 0 },
+/*9a*/ { "setp", TRUE, NONE, op1(Eb), 0 },
+/*9b*/ { "setnp", TRUE, NONE, op1(Eb), 0 },
+/*9c*/ { "setl", TRUE, NONE, op1(Eb), 0 },
+/*9d*/ { "setnl", TRUE, NONE, op1(Eb), 0 },
+/*9e*/ { "setle", TRUE, NONE, op1(Eb), 0 },
+/*9f*/ { "setnle",TRUE, NONE, op1(Eb), 0 },
+};
+
+static const struct inst db_inst_0fax[] = {
+/*a0*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*a1*/ { "pop", FALSE, NONE, op1(Si), 0 },
+/*a2*/ { "cpuid", FALSE, NONE, 0, 0 },
+/*a3*/ { "bt", TRUE, LONG, op2(R,E), 0 },
+/*a4*/ { "shld", TRUE, LONG, op3(Ib,R,E), 0 },
+/*a5*/ { "shld", TRUE, LONG, op3(CL,R,E), 0 },
+/*a6*/ { "", FALSE, NONE, 0, 0 },
+/*a7*/ { "", FALSE, NONE, 0, 0 },
+
+/*a8*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*a9*/ { "pop", FALSE, NONE, op1(Si), 0 },
+/*aa*/ { "rsm", FALSE, NONE, 0, 0 },
+/*ab*/ { "bts", TRUE, LONG, op2(R,E), 0 },
+/*ac*/ { "shrd", TRUE, LONG, op3(Ib,R,E), 0 },
+/*ad*/ { "shrd", TRUE, LONG, op3(CL,R,E), 0 },
+/*ae*/ { "", TRUE, LONG, op1(E), db_Grp15 },
+/*af*/ { "imul", TRUE, LONG, op2(E,R), 0 },
+};
+
+static const struct inst db_inst_0fbx[] = {
+/*b0*/ { "cmpxchg",TRUE, BYTE, op2(R, E), 0 },
+/*b0*/ { "cmpxchg",TRUE, LONG, op2(R, E), 0 },
+/*b2*/ { "lss", TRUE, LONG, op2(E, R), 0 },
+/*b3*/ { "btr", TRUE, LONG, op2(R, E), 0 },
+/*b4*/ { "lfs", TRUE, LONG, op2(E, R), 0 },
+/*b5*/ { "lgs", TRUE, LONG, op2(E, R), 0 },
+/*b6*/ { "movzb", TRUE, LONG, op2(Eb, R), 0 },
+/*b7*/ { "movzw", TRUE, LONG, op2(Ew, R), 0 },
+
+/*b8*/ { "", FALSE, NONE, 0, 0 },
+/*b9*/ { "", FALSE, NONE, 0, 0 },
+/*ba*/ { "", TRUE, LONG, op2(Ib, E), db_Grp8 },
+/*bb*/ { "btc", TRUE, LONG, op2(R, E), 0 },
+/*bc*/ { "bsf", TRUE, LONG, op2(E, R), 0 },
+/*bd*/ { "bsr", TRUE, LONG, op2(E, R), 0 },
+/*be*/ { "movsb", TRUE, LONG, op2(Eb, R), 0 },
+/*bf*/ { "movsw", TRUE, LONG, op2(Ew, R), 0 },
+};
+
+static const struct inst db_inst_0fcx[] = {
+/*c0*/ { "xadd", TRUE, BYTE, op2(R, E), 0 },
+/*c1*/ { "xadd", TRUE, LONG, op2(R, E), 0 },
+/*c2*/ { "", FALSE, NONE, 0, 0 },
+/*c3*/ { "", FALSE, NONE, 0, 0 },
+/*c4*/ { "", FALSE, NONE, 0, 0 },
+/*c5*/ { "", FALSE, NONE, 0, 0 },
+/*c6*/ { "", FALSE, NONE, 0, 0 },
+/*c7*/ { "", TRUE, NONE, op1(E), db_Grp9 },
+/*c8*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*c9*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*ca*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*cb*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*cc*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*cd*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*ce*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+/*cf*/ { "bswap", FALSE, LONG, op1(Ril), 0 },
+};
+
+static const struct inst * const db_inst_0f[] = {
+ db_inst_0f0x,
+ 0,
+ db_inst_0f2x,
+ db_inst_0f3x,
+ db_inst_0f4x,
+ 0,
+ 0,
+ db_inst_0f7x,
+ db_inst_0f8x,
+ db_inst_0f9x,
+ db_inst_0fax,
+ db_inst_0fbx,
+ db_inst_0fcx,
+ 0,
+ 0,
+ 0
+};
+
+static const char * const db_Esc92[] = {
+ "fnop", "", "", "", "", "", "", ""
+};
+static const char * const db_Esc94[] = {
+ "fchs", "fabs", "", "", "ftst", "fxam", "", ""
+};
+static const char * const db_Esc95[] = {
+ "fld1", "fldl2t","fldl2e","fldpi","fldlg2","fldln2","fldz",""
+};
+static const char * const db_Esc96[] = {
+ "f2xm1","fyl2x","fptan","fpatan","fxtract","fprem1","fdecstp",
+ "fincstp"
+};
+static const char * const db_Esc97[] = {
+ "fprem","fyl2xp1","fsqrt","fsincos","frndint","fscale","fsin","fcos"
+};
+
+static const char * const db_Esca5[] = {
+ "", "fucompp","", "", "", "", "", ""
+};
+
+static const char * const db_Escb4[] = {
+ "fneni","fndisi", "fnclex","fninit","fsetpm", "", "", ""
+};
+
+static const char * const db_Esce3[] = {
+ "", "fcompp","", "", "", "", "", ""
+};
+
+static const char * const db_Escf4[] = {
+ "fnstsw","", "", "", "", "", "", ""
+};
+
+static const struct finst db_Esc8[] = {
+/*0*/ { "fadd", SNGL, op2(STI,ST), 0 },
+/*1*/ { "fmul", SNGL, op2(STI,ST), 0 },
+/*2*/ { "fcom", SNGL, op2(STI,ST), 0 },
+/*3*/ { "fcomp", SNGL, op2(STI,ST), 0 },
+/*4*/ { "fsub", SNGL, op2(STI,ST), 0 },
+/*5*/ { "fsubr", SNGL, op2(STI,ST), 0 },
+/*6*/ { "fdiv", SNGL, op2(STI,ST), 0 },
+/*7*/ { "fdivr", SNGL, op2(STI,ST), 0 },
+};
+
+static const struct finst db_Esc9[] = {
+/*0*/ { "fld", SNGL, op1(STI), 0 },
+/*1*/ { "", NONE, op1(STI), "fxch" },
+/*2*/ { "fst", SNGL, op1(X), db_Esc92 },
+/*3*/ { "fstp", SNGL, 0, 0 },
+/*4*/ { "fldenv", NONE, op1(X), db_Esc94 },
+/*5*/ { "fldcw", NONE, op1(X), db_Esc95 },
+/*6*/ { "fnstenv",NONE, op1(X), db_Esc96 },
+/*7*/ { "fnstcw", NONE, op1(X), db_Esc97 },
+};
+
+static const struct finst db_Esca[] = {
+/*0*/ { "fiadd", LONG, 0, 0 },
+/*1*/ { "fimul", LONG, 0, 0 },
+/*2*/ { "ficom", LONG, 0, 0 },
+/*3*/ { "ficomp", LONG, 0, 0 },
+/*4*/ { "fisub", LONG, 0, 0 },
+/*5*/ { "fisubr", LONG, op1(X), db_Esca5 },
+/*6*/ { "fidiv", LONG, 0, 0 },
+/*7*/ { "fidivr", LONG, 0, 0 }
+};
+
+static const struct finst db_Escb[] = {
+/*0*/ { "fild", LONG, 0, 0 },
+/*1*/ { "", NONE, 0, 0 },
+/*2*/ { "fist", LONG, 0, 0 },
+/*3*/ { "fistp", LONG, 0, 0 },
+/*4*/ { "", WORD, op1(X), db_Escb4 },
+/*5*/ { "fld", EXTR, 0, 0 },
+/*6*/ { "", WORD, 0, 0 },
+/*7*/ { "fstp", EXTR, 0, 0 },
+};
+
+static const struct finst db_Escc[] = {
+/*0*/ { "fadd", DBLR, op2(ST,STI), 0 },
+/*1*/ { "fmul", DBLR, op2(ST,STI), 0 },
+/*2*/ { "fcom", DBLR, 0, 0 },
+/*3*/ { "fcomp", DBLR, 0, 0 },
+/*4*/ { "fsub", DBLR, op2(ST,STI), "fsubr" },
+/*5*/ { "fsubr", DBLR, op2(ST,STI), "fsub" },
+/*6*/ { "fdiv", DBLR, op2(ST,STI), "fdivr" },
+/*7*/ { "fdivr", DBLR, op2(ST,STI), "fdiv" },
+};
+
+static const struct finst db_Escd[] = {
+/*0*/ { "fld", DBLR, op1(STI), "ffree" },
+/*1*/ { "", NONE, 0, 0 },
+/*2*/ { "fst", DBLR, op1(STI), 0 },
+/*3*/ { "fstp", DBLR, op1(STI), 0 },
+/*4*/ { "frstor", NONE, op1(STI), "fucom" },
+/*5*/ { "", NONE, op1(STI), "fucomp" },
+/*6*/ { "fnsave", NONE, 0, 0 },
+/*7*/ { "fnstsw", NONE, 0, 0 },
+};
+
+static const struct finst db_Esce[] = {
+/*0*/ { "fiadd", WORD, op2(ST,STI), "faddp" },
+/*1*/ { "fimul", WORD, op2(ST,STI), "fmulp" },
+/*2*/ { "ficom", WORD, 0, 0 },
+/*3*/ { "ficomp", WORD, op1(X), db_Esce3 },
+/*4*/ { "fisub", WORD, op2(ST,STI), "fsubrp" },
+/*5*/ { "fisubr", WORD, op2(ST,STI), "fsubp" },
+/*6*/ { "fidiv", WORD, op2(ST,STI), "fdivrp" },
+/*7*/ { "fidivr", WORD, op2(ST,STI), "fdivp" },
+};
+
+static const struct finst db_Escf[] = {
+/*0*/ { "fild", WORD, 0, 0 },
+/*1*/ { "", NONE, 0, 0 },
+/*2*/ { "fist", WORD, 0, 0 },
+/*3*/ { "fistp", WORD, 0, 0 },
+/*4*/ { "fbld", NONE, op1(XA), db_Escf4 },
+/*5*/ { "fild", QUAD, 0, 0 },
+/*6*/ { "fbstp", NONE, 0, 0 },
+/*7*/ { "fistp", QUAD, 0, 0 },
+};
+
+static const struct finst * const db_Esc_inst[] = {
+ db_Esc8, db_Esc9, db_Esca, db_Escb,
+ db_Escc, db_Escd, db_Esce, db_Escf
+};
+
+static const char * const db_Grp1[] = {
+ "add",
+ "or",
+ "adc",
+ "sbb",
+ "and",
+ "sub",
+ "xor",
+ "cmp"
+};
+
+static const char * const db_Grp2[] = {
+ "rol",
+ "ror",
+ "rcl",
+ "rcr",
+ "shl",
+ "shr",
+ "shl",
+ "sar"
+};
+
+static const struct inst db_Grp3[] = {
+ { "test", TRUE, NONE, op2(I,E), 0 },
+ { "test", TRUE, NONE, op2(I,E), 0 },
+ { "not", TRUE, NONE, op1(E), 0 },
+ { "neg", TRUE, NONE, op1(E), 0 },
+ { "mul", TRUE, NONE, op2(E,A), 0 },
+ { "imul", TRUE, NONE, op2(E,A), 0 },
+ { "div", TRUE, NONE, op2(E,A), 0 },
+ { "idiv", TRUE, NONE, op2(E,A), 0 },
+};
+
+static const struct inst db_Grp4[] = {
+ { "inc", TRUE, BYTE, op1(E), 0 },
+ { "dec", TRUE, BYTE, op1(E), 0 },
+ { "", TRUE, NONE, 0, 0 },
+ { "", TRUE, NONE, 0, 0 },
+ { "", TRUE, NONE, 0, 0 },
+ { "", TRUE, NONE, 0, 0 },
+ { "", TRUE, NONE, 0, 0 },
+ { "", TRUE, NONE, 0, 0 }
+};
+
+static const struct inst db_Grp5[] = {
+ { "inc", TRUE, LONG, op1(E), 0 },
+ { "dec", TRUE, LONG, op1(E), 0 },
+ { "call", TRUE, LONG, op1(Eind),0 },
+ { "lcall", TRUE, LONG, op1(Eind),0 },
+ { "jmp", TRUE, LONG, op1(Eind),0 },
+ { "ljmp", TRUE, LONG, op1(Eind),0 },
+ { "push", TRUE, LONG, op1(E), 0 },
+ { "", TRUE, NONE, 0, 0 }
+};
+
+static const struct inst db_inst_table[256] = {
+/*00*/ { "add", TRUE, BYTE, op2(R, E), 0 },
+/*01*/ { "add", TRUE, LONG, op2(R, E), 0 },
+/*02*/ { "add", TRUE, BYTE, op2(E, R), 0 },
+/*03*/ { "add", TRUE, LONG, op2(E, R), 0 },
+/*04*/ { "add", FALSE, BYTE, op2(I, A), 0 },
+/*05*/ { "add", FALSE, LONG, op2(Is, A), 0 },
+/*06*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*07*/ { "pop", FALSE, NONE, op1(Si), 0 },
+
+/*08*/ { "or", TRUE, BYTE, op2(R, E), 0 },
+/*09*/ { "or", TRUE, LONG, op2(R, E), 0 },
+/*0a*/ { "or", TRUE, BYTE, op2(E, R), 0 },
+/*0b*/ { "or", TRUE, LONG, op2(E, R), 0 },
+/*0c*/ { "or", FALSE, BYTE, op2(I, A), 0 },
+/*0d*/ { "or", FALSE, LONG, op2(I, A), 0 },
+/*0e*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*0f*/ { "", FALSE, ESC, 0, db_inst_0f },
+
+/*10*/ { "adc", TRUE, BYTE, op2(R, E), 0 },
+/*11*/ { "adc", TRUE, LONG, op2(R, E), 0 },
+/*12*/ { "adc", TRUE, BYTE, op2(E, R), 0 },
+/*13*/ { "adc", TRUE, LONG, op2(E, R), 0 },
+/*14*/ { "adc", FALSE, BYTE, op2(I, A), 0 },
+/*15*/ { "adc", FALSE, LONG, op2(Is, A), 0 },
+/*16*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*17*/ { "pop", FALSE, NONE, op1(Si), 0 },
+
+/*18*/ { "sbb", TRUE, BYTE, op2(R, E), 0 },
+/*19*/ { "sbb", TRUE, LONG, op2(R, E), 0 },
+/*1a*/ { "sbb", TRUE, BYTE, op2(E, R), 0 },
+/*1b*/ { "sbb", TRUE, LONG, op2(E, R), 0 },
+/*1c*/ { "sbb", FALSE, BYTE, op2(I, A), 0 },
+/*1d*/ { "sbb", FALSE, LONG, op2(Is, A), 0 },
+/*1e*/ { "push", FALSE, NONE, op1(Si), 0 },
+/*1f*/ { "pop", FALSE, NONE, op1(Si), 0 },
+
+/*20*/ { "and", TRUE, BYTE, op2(R, E), 0 },
+/*21*/ { "and", TRUE, LONG, op2(R, E), 0 },
+/*22*/ { "and", TRUE, BYTE, op2(E, R), 0 },
+/*23*/ { "and", TRUE, LONG, op2(E, R), 0 },
+/*24*/ { "and", FALSE, BYTE, op2(I, A), 0 },
+/*25*/ { "and", FALSE, LONG, op2(I, A), 0 },
+/*26*/ { "", FALSE, NONE, 0, 0 },
+/*27*/ { "daa", FALSE, NONE, 0, 0 },
+
+/*28*/ { "sub", TRUE, BYTE, op2(R, E), 0 },
+/*29*/ { "sub", TRUE, LONG, op2(R, E), 0 },
+/*2a*/ { "sub", TRUE, BYTE, op2(E, R), 0 },
+/*2b*/ { "sub", TRUE, LONG, op2(E, R), 0 },
+/*2c*/ { "sub", FALSE, BYTE, op2(I, A), 0 },
+/*2d*/ { "sub", FALSE, LONG, op2(Is, A), 0 },
+/*2e*/ { "", FALSE, NONE, 0, 0 },
+/*2f*/ { "das", FALSE, NONE, 0, 0 },
+
+/*30*/ { "xor", TRUE, BYTE, op2(R, E), 0 },
+/*31*/ { "xor", TRUE, LONG, op2(R, E), 0 },
+/*32*/ { "xor", TRUE, BYTE, op2(E, R), 0 },
+/*33*/ { "xor", TRUE, LONG, op2(E, R), 0 },
+/*34*/ { "xor", FALSE, BYTE, op2(I, A), 0 },
+/*35*/ { "xor", FALSE, LONG, op2(I, A), 0 },
+/*36*/ { "", FALSE, NONE, 0, 0 },
+/*37*/ { "aaa", FALSE, NONE, 0, 0 },
+
+/*38*/ { "cmp", TRUE, BYTE, op2(R, E), 0 },
+/*39*/ { "cmp", TRUE, LONG, op2(R, E), 0 },
+/*3a*/ { "cmp", TRUE, BYTE, op2(E, R), 0 },
+/*3b*/ { "cmp", TRUE, LONG, op2(E, R), 0 },
+/*3c*/ { "cmp", FALSE, BYTE, op2(I, A), 0 },
+/*3d*/ { "cmp", FALSE, LONG, op2(Is, A), 0 },
+/*3e*/ { "", FALSE, NONE, 0, 0 },
+/*3f*/ { "aas", FALSE, NONE, 0, 0 },
+
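+/*
+ * In long mode db_disasm() consumes 0x40-0x4f as REX prefixes before
+ * the table lookup, so these rows serve only as placeholders.
+ */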
+/*40*/ { "rex", FALSE, NONE, 0, 0 },
+/*41*/ { "rex.b", FALSE, NONE, 0, 0 },
+/*42*/ { "rex.x", FALSE, NONE, 0, 0 },
+/*43*/ { "rex.xb", FALSE, NONE, 0, 0 },
+/*44*/ { "rex.r", FALSE, NONE, 0, 0 },
+/*45*/ { "rex.rb", FALSE, NONE, 0, 0 },
+/*46*/ { "rex.rx", FALSE, NONE, 0, 0 },
+/*47*/ { "rex.rxb", FALSE, NONE, 0, 0 },
+
+/*48*/ { "rex.w", FALSE, NONE, 0, 0 },
+/*49*/ { "rex.wb", FALSE, NONE, 0, 0 },
+/*4a*/ { "rex.wx", FALSE, NONE, 0, 0 },
+/*4b*/ { "rex.wxb", FALSE, NONE, 0, 0 },
+/*4c*/ { "rex.wr", FALSE, NONE, 0, 0 },
+/*4d*/ { "rex.wrb", FALSE, NONE, 0, 0 },
+/*4e*/ { "rex.wrx", FALSE, NONE, 0, 0 },
+/*4f*/ { "rex.wrxb", FALSE, NONE, 0, 0 },
+
+/*50*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*51*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*52*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*53*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*54*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*55*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*56*/ { "push", FALSE, LONG, op1(Ri), 0 },
+/*57*/ { "push", FALSE, LONG, op1(Ri), 0 },
+
+/*58*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*59*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5a*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5b*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5c*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5d*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5e*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+/*5f*/ { "pop", FALSE, LONG, op1(Ri), 0 },
+
+/*60*/ { "pusha", FALSE, LONG, 0, 0 },
+/*61*/ { "popa", FALSE, LONG, 0, 0 },
+/*62*/ { "bound", TRUE, LONG, op2(E, R), 0 },
+/*63*/ { "movslq", TRUE, NONE, op2(EL,R), 0 },
+
+/*64*/ { "", FALSE, NONE, 0, 0 },
+/*65*/ { "", FALSE, NONE, 0, 0 },
+/*66*/ { "", FALSE, NONE, 0, 0 },
+/*67*/ { "", FALSE, NONE, 0, 0 },
+
+/*68*/ { "push", FALSE, LONG, op1(I), 0 },
+/*69*/ { "imul", TRUE, LONG, op3(I,E,R), 0 },
+/*6a*/ { "push", FALSE, LONG, op1(Ibs), 0 },
+/*6b*/ { "imul", TRUE, LONG, op3(Ibs,E,R),0 },
+/*6c*/ { "ins", FALSE, BYTE, op2(DX, DI), 0 },
+/*6d*/ { "ins", FALSE, LONG, op2(DX, DI), 0 },
+/*6e*/ { "outs", FALSE, BYTE, op2(SI, DX), 0 },
+/*6f*/ { "outs", FALSE, LONG, op2(SI, DX), 0 },
+
+/*70*/ { "jo", FALSE, NONE, op1(Db), 0 },
+/*71*/ { "jno", FALSE, NONE, op1(Db), 0 },
+/*72*/ { "jb", FALSE, NONE, op1(Db), 0 },
+/*73*/ { "jnb", FALSE, NONE, op1(Db), 0 },
+/*74*/ { "jz", FALSE, NONE, op1(Db), 0 },
+/*75*/ { "jnz", FALSE, NONE, op1(Db), 0 },
+/*76*/ { "jbe", FALSE, NONE, op1(Db), 0 },
+/*77*/ { "jnbe", FALSE, NONE, op1(Db), 0 },
+
+/*78*/ { "js", FALSE, NONE, op1(Db), 0 },
+/*79*/ { "jns", FALSE, NONE, op1(Db), 0 },
+/*7a*/ { "jp", FALSE, NONE, op1(Db), 0 },
+/*7b*/ { "jnp", FALSE, NONE, op1(Db), 0 },
+/*7c*/ { "jl", FALSE, NONE, op1(Db), 0 },
+/*7d*/ { "jnl", FALSE, NONE, op1(Db), 0 },
+/*7e*/ { "jle", FALSE, NONE, op1(Db), 0 },
+/*7f*/ { "jnle", FALSE, NONE, op1(Db), 0 },
+
+/*80*/ { "", TRUE, BYTE, op2(I, E), db_Grp1 },
+/*81*/ { "", TRUE, LONG, op2(I, E), db_Grp1 },
+/*82*/ { "", TRUE, BYTE, op2(I, E), db_Grp1 },
+/*83*/ { "", TRUE, LONG, op2(Ibs,E), db_Grp1 },
+/*84*/ { "test", TRUE, BYTE, op2(R, E), 0 },
+/*85*/ { "test", TRUE, LONG, op2(R, E), 0 },
+/*86*/ { "xchg", TRUE, BYTE, op2(R, E), 0 },
+/*87*/ { "xchg", TRUE, LONG, op2(R, E), 0 },
+
+/*88*/ { "mov", TRUE, BYTE, op2(R, E), 0 },
+/*89*/ { "mov", TRUE, LONG, op2(R, E), 0 },
+/*8a*/ { "mov", TRUE, BYTE, op2(E, R), 0 },
+/*8b*/ { "mov", TRUE, LONG, op2(E, R), 0 },
+/*8c*/ { "mov", TRUE, NONE, op2(S, Ew), 0 },
+/*8d*/ { "lea", TRUE, LONG, op2(E, R), 0 },
+/*8e*/ { "mov", TRUE, NONE, op2(Ew, S), 0 },
+/*8f*/ { "pop", TRUE, LONG, op1(E), 0 },
+
+/*90*/ { "nop", FALSE, NONE, 0, 0 },
+/*91*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*92*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*93*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*94*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*95*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*96*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+/*97*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 },
+
+/*98*/ { "cwde", FALSE, SDEP, 0, "cbw" },
+/*99*/ { "cdq", FALSE, SDEP, 0, "cwd" },
+/*9a*/ { "lcall", FALSE, NONE, op1(OS), 0 },
+/*9b*/ { "wait", FALSE, NONE, 0, 0 },
+/*9c*/ { "pushf", FALSE, LONG, 0, 0 },
+/*9d*/ { "popf", FALSE, LONG, 0, 0 },
+/*9e*/ { "sahf", FALSE, NONE, 0, 0 },
+/*9f*/ { "lahf", FALSE, NONE, 0, 0 },
+
+/*a0*/ { "mov", FALSE, BYTE, op2(O, A), 0 },
+/*a1*/ { "mov", FALSE, LONG, op2(O, A), 0 },
+/*a2*/ { "mov", FALSE, BYTE, op2(A, O), 0 },
+/*a3*/ { "mov", FALSE, LONG, op2(A, O), 0 },
+/*a4*/ { "movs", FALSE, BYTE, op2(SI,DI), 0 },
+/*a5*/ { "movs", FALSE, LONG, op2(SI,DI), 0 },
+/*a6*/ { "cmps", FALSE, BYTE, op2(SI,DI), 0 },
+/*a7*/ { "cmps", FALSE, LONG, op2(SI,DI), 0 },
+
+/*a8*/ { "test", FALSE, BYTE, op2(I, A), 0 },
+/*a9*/ { "test", FALSE, LONG, op2(I, A), 0 },
+/*aa*/ { "stos", FALSE, BYTE, op1(DI), 0 },
+/*ab*/ { "stos", FALSE, LONG, op1(DI), 0 },
+/*ac*/ { "lods", FALSE, BYTE, op1(SI), 0 },
+/*ad*/ { "lods", FALSE, LONG, op1(SI), 0 },
+/*ae*/ { "scas", FALSE, BYTE, op1(SI), 0 },
+/*af*/ { "scas", FALSE, LONG, op1(SI), 0 },
+
+/*b0*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b1*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b2*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b3*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b4*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b5*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b6*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+/*b7*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 },
+
+/*b8*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*b9*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*ba*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*bb*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*bc*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*bd*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*be*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+/*bf*/ { "mov", FALSE, LONG, op2(Ilq, Ri), 0 },
+
+/*c0*/ { "", TRUE, BYTE, op2(Ib, E), db_Grp2 },
+/*c1*/ { "", TRUE, LONG, op2(Ib, E), db_Grp2 },
+/*c2*/ { "ret", FALSE, NONE, op1(Iw), 0 },
+/*c3*/ { "ret", FALSE, NONE, 0, 0 },
+/*c4*/ { "les", TRUE, LONG, op2(E, R), 0 },
+/*c5*/ { "lds", TRUE, LONG, op2(E, R), 0 },
+/*c6*/ { "mov", TRUE, BYTE, op2(I, E), 0 },
+/*c7*/ { "mov", TRUE, LONG, op2(I, E), 0 },
+
+/*c8*/ { "enter", FALSE, NONE, op2(Iw, Ib), 0 },
+/*c9*/ { "leave", FALSE, NONE, 0, 0 },
+/*ca*/ { "lret", FALSE, NONE, op1(Iw), 0 },
+/*cb*/ { "lret", FALSE, NONE, 0, 0 },
+/*cc*/ { "int", FALSE, NONE, op1(o3), 0 },
+/*cd*/ { "int", FALSE, NONE, op1(Ib), 0 },
+/*ce*/ { "into", FALSE, NONE, 0, 0 },
+/*cf*/ { "iret", FALSE, NONE, 0, 0 },
+
+/*d0*/ { "", TRUE, BYTE, op2(o1, E), db_Grp2 },
+/*d1*/ { "", TRUE, LONG, op2(o1, E), db_Grp2 },
+/*d2*/ { "", TRUE, BYTE, op2(CL, E), db_Grp2 },
+/*d3*/ { "", TRUE, LONG, op2(CL, E), db_Grp2 },
+/*d4*/ { "aam", FALSE, NONE, op1(Iba), 0 },
+/*d5*/ { "aad", FALSE, NONE, op1(Iba), 0 },
+/*d6*/ { ".byte\t0xd6", FALSE, NONE, 0, 0 },
+/*d7*/ { "xlat", FALSE, BYTE, op1(BX), 0 },
+
+/*d8*/ { "", TRUE, NONE, 0, db_Esc8 },
+/*d9*/ { "", TRUE, NONE, 0, db_Esc9 },
+/*da*/ { "", TRUE, NONE, 0, db_Esca },
+/*db*/ { "", TRUE, NONE, 0, db_Escb },
+/*dc*/ { "", TRUE, NONE, 0, db_Escc },
+/*dd*/ { "", TRUE, NONE, 0, db_Escd },
+/*de*/ { "", TRUE, NONE, 0, db_Esce },
+/*df*/ { "", TRUE, NONE, 0, db_Escf },
+
+/*e0*/ { "loopne",FALSE, NONE, op1(Db), 0 },
+/*e1*/ { "loope", FALSE, NONE, op1(Db), 0 },
+/*e2*/ { "loop", FALSE, NONE, op1(Db), 0 },
+/*e3*/ { "jrcxz", FALSE, ADEP, op1(Db), "jecxz" },
+/*e4*/ { "in", FALSE, BYTE, op2(Ib, A), 0 },
+/*e5*/ { "in", FALSE, LONG, op2(Ib, A) , 0 },
+/*e6*/ { "out", FALSE, BYTE, op2(A, Ib), 0 },
+/*e7*/ { "out", FALSE, LONG, op2(A, Ib) , 0 },
+
+/*e8*/ { "call", FALSE, NONE, op1(Dl), 0 },
+/*e9*/ { "jmp", FALSE, NONE, op1(Dl), 0 },
+/*ea*/ { "ljmp", FALSE, NONE, op1(OS), 0 },
+/*eb*/ { "jmp", FALSE, NONE, op1(Db), 0 },
+/*ec*/ { "in", FALSE, BYTE, op2(DX, A), 0 },
+/*ed*/ { "in", FALSE, LONG, op2(DX, A) , 0 },
+/*ee*/ { "out", FALSE, BYTE, op2(A, DX), 0 },
+/*ef*/ { "out", FALSE, LONG, op2(A, DX) , 0 },
+
+/*f0*/ { "", FALSE, NONE, 0, 0 },
+/*f1*/ { ".byte\t0xf1", FALSE, NONE, 0, 0 },
+/*f2*/ { "", FALSE, NONE, 0, 0 },
+/*f3*/ { "", FALSE, NONE, 0, 0 },
+/*f4*/ { "hlt", FALSE, NONE, 0, 0 },
+/*f5*/ { "cmc", FALSE, NONE, 0, 0 },
+/*f6*/ { "", TRUE, BYTE, 0, db_Grp3 },
+/*f7*/ { "", TRUE, LONG, 0, db_Grp3 },
+
+/*f8*/ { "clc", FALSE, NONE, 0, 0 },
+/*f9*/ { "stc", FALSE, NONE, 0, 0 },
+/*fa*/ { "cli", FALSE, NONE, 0, 0 },
+/*fb*/ { "sti", FALSE, NONE, 0, 0 },
+/*fc*/ { "cld", FALSE, NONE, 0, 0 },
+/*fd*/ { "std", FALSE, NONE, 0, 0 },
+/*fe*/ { "", TRUE, NONE, 0, db_Grp4 },
+/*ff*/ { "", TRUE, NONE, 0, db_Grp5 },
+};
+
+static const struct inst db_bad_inst =
+	{ "???",   FALSE, NONE,  0,	      0 };
+
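+/*
+ * REX.R, REX.X and REX.B supply the fourth bit of reg, index and base;
+ * f_mod() and sib_ss() ignore their rex argument and take it only so
+ * all of the accessors share one interface.
+ */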
+#define f_mod(rex, byte) ((byte)>>6)
+#define f_reg(rex, byte) ((((byte)>>3)&0x7) | (rex & REX_R ? 0x8 : 0x0))
+#define f_rm(rex, byte) (((byte)&0x7) | (rex & REX_B ? 0x8 : 0x0))
+
+#define sib_ss(rex, byte) ((byte)>>6)
+#define sib_index(rex, byte) ((((byte)>>3)&0x7) | (rex & REX_X ? 0x8 : 0x0))
+#define sib_base(rex, byte) (((byte)&0x7) | (rex & REX_B ? 0x8 : 0x0))
+
+struct i_addr {
+ int is_reg; /* if reg, reg number is in 'disp' */
+ int disp;
+ const char * base;
+ const char * index;
+ int ss;
+};
+
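+/*
+ * Register names, indexed by [rex present][size][reg].  With any REX
+ * prefix the encodings for %ah-%bh name %spl-%dil instead.
+ */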
+static const char * const db_reg[2][4][16] = {
+
+ {{"%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh",
+ "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" },
+ { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di",
+ "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" },
+ { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+ "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" },
+ { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" }},
+
+ {{"%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil",
+ "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" },
+ { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di",
+ "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" },
+ { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+ "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" },
+ { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" }}
+};
+
+static const char * const db_seg_reg[8] = {
+ "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", ""
+};
+
+/*
+ * lengths for size attributes
+ */
+static const int db_lengths[] = {
+ 1, /* BYTE */
+ 2, /* WORD */
+ 4, /* LONG */
+ 8, /* QUAD */
+ 4, /* SNGL */
+ 8, /* DBLR */
+ 10, /* EXTR */
+};
+
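+/*
+ * NB: get_value_inc() expands to two statements, so it must be braced
+ * when used as the body of an if or a loop.
+ */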
+#define get_value_inc(result, loc, size, is_signed) \
+ result = db_get_value((loc), (size), (is_signed)); \
+ (loc) += (size);
+
+static db_addr_t
+ db_disasm_esc(db_addr_t loc, int inst, int rex, int short_addr,
+ int size, const char *seg);
+static void db_print_address(const char *seg, int size, int rex,
+ struct i_addr *addrp);
+static db_addr_t
+ db_read_address(db_addr_t loc, int short_addr, int rex, int regmodrm,
+ struct i_addr *addrp);
+
+/*
+ * Read address at location and return updated location.
+ */
+static db_addr_t
+db_read_address(db_addr_t loc, int short_addr, int rex, int regmodrm,
+    struct i_addr *addrp)		/* out */
+{
+ int mod, rm, sib, index, disp, size, have_sib;
+
+ mod = f_mod(rex, regmodrm);
+ rm = f_rm(rex, regmodrm);
+
+ if (mod == 3) {
+ addrp->is_reg = TRUE;
+ addrp->disp = rm;
+ return (loc);
+ }
+ addrp->is_reg = FALSE;
+ addrp->index = 0;
+
+ if (short_addr)
+ size = LONG;
+ else
+ size = QUAD;
+
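+	/* Low three bits of rm == 4 mean a SIB byte follows. */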
+ if ((rm & 0x7) == 4) {
+ get_value_inc(sib, loc, 1, FALSE);
+ rm = sib_base(rex, sib);
+ index = sib_index(rex, sib);
+ if (index != 4)
+ addrp->index = db_reg[1][size][index];
+ addrp->ss = sib_ss(rex, sib);
+ have_sib = 1;
+ } else
+ have_sib = 0;
+
+ switch (mod) {
+ case 0:
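+		/*
+		 * mod 0 with rm 5: a disp32 follows; it is %rip-relative
+		 * unless a SIB byte already supplied the base.
+		 */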
+ if (rm == 5) {
+ get_value_inc(addrp->disp, loc, 4, FALSE);
+ if (have_sib)
+ addrp->base = 0;
+ else if (short_addr)
+ addrp->base = "%eip";
+ else
+ addrp->base = "%rip";
+ } else {
+ addrp->disp = 0;
+ addrp->base = db_reg[1][size][rm];
+ }
+ break;
+
+ case 1:
+ get_value_inc(disp, loc, 1, TRUE);
+ addrp->disp = disp;
+ addrp->base = db_reg[1][size][rm];
+ break;
+
+ case 2:
+ get_value_inc(disp, loc, 4, FALSE);
+ addrp->disp = disp;
+ addrp->base = db_reg[1][size][rm];
+ break;
+ }
+ return (loc);
+}
+
+static void
+db_print_address(const char *seg, int size, int rex, struct i_addr *addrp)
+{
+ if (addrp->is_reg) {
+		db_printf("%s", db_reg[rex != 0 ? 1 : 0][(size == LONG &&
+		    (rex & REX_W)) ? QUAD : size][addrp->disp]);
+ return;
+ }
+
+ if (seg) {
+ db_printf("%s:", seg);
+ }
+
+ if (addrp->disp != 0 || (addrp->base == 0 && addrp->index == 0))
+ db_printsym((db_addr_t)addrp->disp, DB_STGY_ANY);
+ if (addrp->base != 0 || addrp->index != 0) {
+ db_printf("(");
+ if (addrp->base)
+ db_printf("%s", addrp->base);
+ if (addrp->index)
+ db_printf(",%s,%d", addrp->index, 1<<addrp->ss);
+ db_printf(")");
+ }
+}
+
+/*
+ * Disassemble floating-point ("escape") instruction
+ * and return updated location.
+ */
+static db_addr_t
+db_disasm_esc(db_addr_t loc, int inst, int rex, int short_addr, int size,
+    const char *seg)
+{
+ int regmodrm;
+ const struct finst * fp;
+ int mod;
+ struct i_addr address;
+ const char * name;
+
+ get_value_inc(regmodrm, loc, 1, FALSE);
+ fp = &db_Esc_inst[inst - 0xd8][f_reg(rex, regmodrm)];
+ mod = f_mod(rex, regmodrm);
+ if (mod != 3) {
+ if (*fp->f_name == '\0') {
+ db_printf("<bad instruction>");
+ return (loc);
+ }
+ /*
+ * Normal address modes.
+ */
+ loc = db_read_address(loc, short_addr, rex, regmodrm, &address);
+ db_printf("%s", fp->f_name);
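+		/*
+		 * Append the AT&T operand-size suffix: s/l/t for 32, 64
+		 * and 80-bit reals, s/l/q for 16, 32 and 64-bit integers.
+		 */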
+ switch(fp->f_size) {
+ case SNGL:
+ db_printf("s");
+ break;
+ case DBLR:
+ db_printf("l");
+ break;
+ case EXTR:
+ db_printf("t");
+ break;
+ case WORD:
+ db_printf("s");
+ break;
+ case LONG:
+ db_printf("l");
+ break;
+ case QUAD:
+ db_printf("q");
+ break;
+ default:
+ break;
+ }
+ db_printf("\t");
+ db_print_address(seg, BYTE, rex, &address);
+ }
+ else {
+ /*
+ * 'reg-reg' - special formats
+ */
+ switch (fp->f_rrmode) {
+		case op2(ST,STI):
+			name = (fp->f_rrname) ? fp->f_rrname : fp->f_name;
+			db_printf("%s\t%%st,%%st(%d)", name, f_rm(rex, regmodrm));
+			break;
+		case op2(STI,ST):
+			name = (fp->f_rrname) ? fp->f_rrname : fp->f_name;
+			db_printf("%s\t%%st(%d),%%st", name, f_rm(rex, regmodrm));
+			break;
+		case op1(STI):
+			name = (fp->f_rrname) ? fp->f_rrname : fp->f_name;
+			db_printf("%s\t%%st(%d)", name, f_rm(rex, regmodrm));
+ break;
+ case op1(X):
+ name = ((const char * const *)fp->f_rrname)[f_rm(rex, regmodrm)];
+ if (*name == '\0')
+ goto bad;
+ db_printf("%s", name);
+ break;
+ case op1(XA):
+ name = ((const char * const *)fp->f_rrname)[f_rm(rex, regmodrm)];
+ if (*name == '\0')
+ goto bad;
+ db_printf("%s\t%%ax", name);
+ break;
+ default:
+ bad:
+ db_printf("<bad instruction>");
+ break;
+ }
+ }
+
+ return (loc);
+}
+
+/*
+ * Disassemble instruction at 'loc'. 'altfmt' specifies an
+ * (optional) alternate format. Return address of start of
+ * next instruction.
+ */
+db_addr_t
+db_disasm(db_addr_t loc, boolean_t altfmt)
+{
+ int inst;
+ int size;
+ int short_addr;
+ const char * seg;
+ const struct inst * ip;
+ const char * i_name;
+ int i_size;
+ int i_mode;
+ int rex = 0;
+ int regmodrm = 0;
+ boolean_t first;
+ int displ;
+ int prefix;
+ int rep;
+ int imm;
+ int imm2;
+ long imm64;
+ int len;
+ struct i_addr address;
+
+ get_value_inc(inst, loc, 1, FALSE);
+ short_addr = FALSE;
+ size = LONG;
+ seg = 0;
+
+ /*
+ * Get prefixes
+ */
+ rep = FALSE;
+ prefix = TRUE;
+ do {
+ switch (inst) {
+ case 0x66: /* data16 */
+ size = WORD;
+ break;
+ case 0x67:
+ short_addr = TRUE;
+ break;
+ case 0x26:
+ seg = "%es";
+ break;
+ case 0x36:
+ seg = "%ss";
+ break;
+ case 0x2e:
+ seg = "%cs";
+ break;
+ case 0x3e:
+ seg = "%ds";
+ break;
+ case 0x64:
+ seg = "%fs";
+ break;
+ case 0x65:
+ seg = "%gs";
+ break;
+ case 0xf0:
+ db_printf("lock ");
+ break;
+ case 0xf2:
+ db_printf("repne ");
+ break;
+ case 0xf3:
+ rep = TRUE;
+ break;
+ default:
+ prefix = FALSE;
+ break;
+ }
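+		/* Bytes 0x40-0x4f are REX prefixes in long mode. */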
+ if (inst >= 0x40 && inst < 0x50) {
+ rex = inst;
+ prefix = TRUE;
+ }
+ if (prefix) {
+ get_value_inc(inst, loc, 1, FALSE);
+ }
+ } while (prefix);
+
+ if (inst >= 0xd8 && inst <= 0xdf) {
+ loc = db_disasm_esc(loc, inst, rex, short_addr, size, seg);
+ db_printf("\n");
+ return (loc);
+ }
+
+ ip = &db_inst_table[inst];
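+	/*
+	 * Escape tables are two levels deep: the high nibble of the next
+	 * opcode byte selects a 16-entry table, the low nibble the entry.
+	 */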
+ while (ip->i_size == ESC) {
+ get_value_inc(inst, loc, 1, FALSE);
+ ip = ((const struct inst * const *)ip->i_extra)[inst>>4];
+ if (ip == 0) {
+ ip = &db_bad_inst;
+ }
+ else {
+ ip = &ip[inst&0xf];
+ }
+ }
+
+ if (ip->i_has_modrm) {
+ get_value_inc(regmodrm, loc, 1, FALSE);
+ loc = db_read_address(loc, short_addr, rex, regmodrm, &address);
+ }
+
+ i_name = ip->i_name;
+ i_size = ip->i_size;
+ i_mode = ip->i_mode;
+
+ if (ip->i_extra == db_Grp1 || ip->i_extra == db_Grp2 ||
+ ip->i_extra == db_Grp6 || ip->i_extra == db_Grp7 ||
+ ip->i_extra == db_Grp8 || ip->i_extra == db_Grp9 ||
+ ip->i_extra == db_Grp15) {
+ i_name = ((const char * const *)ip->i_extra)[f_reg(rex, regmodrm)];
+ }
+ else if (ip->i_extra == db_Grp3) {
+ ip = ip->i_extra;
+ ip = &ip[f_reg(rex, regmodrm)];
+ i_name = ip->i_name;
+ i_mode = ip->i_mode;
+ }
+ else if (ip->i_extra == db_Grp4 || ip->i_extra == db_Grp5) {
+ ip = ip->i_extra;
+ ip = &ip[f_reg(rex, regmodrm)];
+ i_name = ip->i_name;
+ i_mode = ip->i_mode;
+ i_size = ip->i_size;
+ }
+
+ /* Special cases that don't fit well in the tables. */
+ if (ip->i_extra == db_Grp7 && f_mod(rex, regmodrm) == 3) {
+ switch (regmodrm) {
+ case 0xc1:
+ i_name = "vmcall";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xc2:
+ i_name = "vmlaunch";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xc3:
+ i_name = "vmresume";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xc4:
+ i_name = "vmxoff";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xc8:
+ i_name = "monitor";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xc9:
+ i_name = "mwait";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xd0:
+ i_name = "xgetbv";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xd1:
+ i_name = "xsetbv";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xf8:
+ i_name = "swapgs";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ case 0xf9:
+ i_name = "rdtscp";
+ i_size = NONE;
+ i_mode = 0;
+ break;
+ }
+ }
+ if (ip->i_extra == db_Grp15 && f_mod(rex, regmodrm) == 3) {
+ i_name = db_Grp15b[f_reg(rex, regmodrm)];
+ i_size = NONE;
+ i_mode = 0;
+ }
+
+ /* Handle instructions identified by mandatory prefixes. */
+ if (rep == TRUE) {
+ if (inst == 0x90) {
+ i_name = "pause";
+ i_size = NONE;
+ i_mode = 0;
+ rep = FALSE;
+ } else if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+ f_reg(rex, regmodrm) == 0x6) {
+ i_name = "vmxon";
+ rep = FALSE;
+ }
+ }
+ if (size == WORD) {
+ if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+ f_reg(rex, regmodrm) == 0x6) {
+ i_name = "vmclear";
+ }
+ }
+ if (rex & REX_W) {
+ if (strcmp(i_name, "cwde") == 0)
+ i_name = "cdqe";
+ else if (strcmp(i_name, "cmpxchg8b") == 0)
+ i_name = "cmpxchg16b";
+ }
+
+ if (rep == TRUE)
+ db_printf("repe "); /* XXX repe VS rep */
+
+ if (i_size == SDEP) {
+ if (size == LONG)
+ db_printf("%s", i_name);
+ else
+ db_printf("%s", (const char *)ip->i_extra);
+ } else if (i_size == ADEP) {
+ if (short_addr == FALSE)
+ db_printf("%s", i_name);
+ else
+ db_printf("%s", (const char *)ip->i_extra);
+ }
+ else {
+ db_printf("%s", i_name);
+ if ((inst >= 0x50 && inst <= 0x5f) || inst == 0x68 || inst == 0x6a) {
+ i_size = NONE;
+ db_printf("q");
+ }
+ if (i_size != NONE) {
+ if (i_size == BYTE) {
+ db_printf("b");
+ size = BYTE;
+ }
+ else if (i_size == WORD) {
+ db_printf("w");
+ size = WORD;
+ }
+ else if (size == WORD)
+ db_printf("w");
+ else {
+ if (rex & REX_W)
+ db_printf("q");
+ else
+ db_printf("l");
+ }
+ }
+ }
+ db_printf("\t");
+ for (first = TRUE;
+ i_mode != 0;
+ i_mode >>= 8, first = FALSE)
+ {
+ if (!first)
+ db_printf(",");
+
+ switch (i_mode & 0xFF) {
+
+ case E:
+ db_print_address(seg, size, rex, &address);
+ break;
+
+ case Eind:
+ db_printf("*");
+ db_print_address(seg, size, rex, &address);
+ break;
+
+ case El:
+ db_print_address(seg, (rex & REX_W) ? QUAD : LONG, rex, &address);
+ break;
+
+ case EL:
+ db_print_address(seg, LONG, 0, &address);
+ break;
+
+ case Ew:
+ db_print_address(seg, WORD, rex, &address);
+ break;
+
+ case Eb:
+ db_print_address(seg, BYTE, rex, &address);
+ break;
+
+ case R:
+			db_printf("%s", db_reg[rex != 0 ? 1 : 0][(size == LONG &&
+			    (rex & REX_W)) ? QUAD : size][f_reg(rex, regmodrm)]);
+ break;
+
+ case Rw:
+ db_printf("%s", db_reg[rex != 0 ? 1 : 0][WORD][f_reg(rex, regmodrm)]);
+ break;
+
+ case Rq:
+ db_printf("%s", db_reg[rex != 0 ? 1 : 0][QUAD][f_reg(rex, regmodrm)]);
+ break;
+
+ case Ri:
+ db_printf("%s", db_reg[0][QUAD][f_rm(rex, inst)]);
+ break;
+
+ case Ril:
+			/* bswap: REX.W selects the 64-bit register form. */
+			db_printf("%s", db_reg[rex != 0 ? 1 : 0][(rex & REX_W) ?
+			    QUAD : LONG][f_rm(rex, inst)]);
+ break;
+
+ case S:
+ db_printf("%s", db_seg_reg[f_reg(rex, regmodrm)]);
+ break;
+
+ case Si:
+ db_printf("%s", db_seg_reg[f_reg(rex, inst)]);
+ break;
+
+ case A:
+ db_printf("%s", db_reg[rex != 0 ? 1 : 0][size][0]); /* acc */
+ break;
+
+ case BX:
+ if (seg)
+ db_printf("%s:", seg);
+ db_printf("(%s)", short_addr ? "%bx" : "%ebx");
+ break;
+
+ case CL:
+ db_printf("%%cl");
+ break;
+
+ case DX:
+ db_printf("%%dx");
+ break;
+
+ case SI:
+ if (seg)
+ db_printf("%s:", seg);
+ db_printf("(%s)", short_addr ? "%si" : "%rsi");
+ break;
+
+ case DI:
+ db_printf("%%es:(%s)", short_addr ? "%di" : "%rdi");
+ break;
+
+ case CR:
+ db_printf("%%cr%d", f_reg(rex, regmodrm));
+ break;
+
+ case DR:
+ db_printf("%%dr%d", f_reg(rex, regmodrm));
+ break;
+
+ case TR:
+ db_printf("%%tr%d", f_reg(rex, regmodrm));
+ break;
+
+ case I:
+ len = db_lengths[size];
+ get_value_inc(imm, loc, len, FALSE);
+ db_printf("$%#r", imm);
+ break;
+
+ case Is:
+ len = db_lengths[(size == LONG && (rex & REX_W)) ? QUAD : size];
+ get_value_inc(imm, loc, len, FALSE);
+ db_printf("$%+#r", imm);
+ break;
+
+ case Ib:
+ get_value_inc(imm, loc, 1, FALSE);
+ db_printf("$%#r", imm);
+ break;
+
+ case Iba:
+ get_value_inc(imm, loc, 1, FALSE);
+ if (imm != 0x0a)
+ db_printf("$%#r", imm);
+ break;
+
+ case Ibs:
+ get_value_inc(imm, loc, 1, TRUE);
+ if (size == WORD)
+ imm &= 0xFFFF;
+ db_printf("$%+#r", imm);
+ break;
+
+ case Iw:
+ get_value_inc(imm, loc, 2, FALSE);
+ db_printf("$%#r", imm);
+ break;
+
+ case Ilq:
+ len = db_lengths[rex & REX_W ? QUAD : LONG];
+ get_value_inc(imm64, loc, len, FALSE);
+ db_printf("$%#lr", imm64);
+ break;
+
+ case O:
+ len = (short_addr ? 2 : 4);
+ get_value_inc(displ, loc, len, FALSE);
+ if (seg)
+			db_printf("%s:%+#r", seg, displ);
+ else
+ db_printsym((db_addr_t)displ, DB_STGY_ANY);
+ break;
+
+ case Db:
+ get_value_inc(displ, loc, 1, TRUE);
+ displ += loc;
+ if (size == WORD)
+ displ &= 0xFFFF;
+ db_printsym((db_addr_t)displ, DB_STGY_XTRN);
+ break;
+
+ case Dl:
+ len = db_lengths[(size == LONG && (rex & REX_W)) ? QUAD : size];
+ get_value_inc(displ, loc, len, FALSE);
+ displ += loc;
+ if (size == WORD)
+ displ &= 0xFFFF;
+ db_printsym((db_addr_t)displ, DB_STGY_XTRN);
+ break;
+
+ case o1:
+ db_printf("$1");
+ break;
+
+ case o3:
+ db_printf("$3");
+ break;
+
+ case OS:
+ len = db_lengths[size];
+ get_value_inc(imm, loc, len, FALSE); /* offset */
+ get_value_inc(imm2, loc, 2, FALSE); /* segment */
+ db_printf("$%#r,%#r", imm2, imm);
+ break;
+ }
+ }
+ db_printf("\n");
+ return (loc);
+}
diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c
new file mode 100644
index 0000000..f44cac4
--- /dev/null
+++ b/sys/amd64/amd64/db_interface.c
@@ -0,0 +1,149 @@
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Interface to new debugger.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/cons.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <machine/cpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <ddb/ddb.h>
+
+/*
+ * Read bytes from kernel address space for debugger.
+ */
+int
+db_read_bytes(vm_offset_t addr, size_t size, char *data)
+{
+ jmp_buf jb;
+ void *prev_jb;
+ char *src;
+ int ret;
+
+ prev_jb = kdb_jmpbuf(jb);
+ ret = setjmp(jb);
+ if (ret == 0) {
+ src = (char *)addr;
+ while (size-- > 0)
+ *data++ = *src++;
+ }
+ (void)kdb_jmpbuf(prev_jb);
+ return (ret);
+}
+
+/*
+ * Write bytes to kernel address space for debugger.
+ */
+int
+db_write_bytes(vm_offset_t addr, size_t size, char *data)
+{
+ jmp_buf jb;
+ void *prev_jb;
+ char *dst;
+ pt_entry_t *ptep0 = NULL;
+ pt_entry_t oldmap0 = 0;
+ vm_offset_t addr1;
+ pt_entry_t *ptep1 = NULL;
+ pt_entry_t oldmap1 = 0;
+ int ret;
+
+ prev_jb = kdb_jmpbuf(jb);
+ ret = setjmp(jb);
+ if (ret == 0) {
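+		/*
+		 * Kernel text is mapped read-only, so temporarily grant
+		 * write access through the page-table entries covering
+		 * the target range.
+		 */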
+ if (addr > trunc_page((vm_offset_t)btext) - size &&
+ addr < round_page((vm_offset_t)etext)) {
+
+ ptep0 = vtopte(addr);
+ oldmap0 = *ptep0;
+ *ptep0 |= PG_RW;
+
+ /*
+ * Map another page if the data crosses a page
+ * boundary.
+ */
+ if ((*ptep0 & PG_PS) == 0) {
+ addr1 = trunc_page(addr + size - 1);
+ if (trunc_page(addr) != addr1) {
+ ptep1 = vtopte(addr1);
+ oldmap1 = *ptep1;
+ *ptep1 |= PG_RW;
+ }
+ } else {
+ addr1 = trunc_2mpage(addr + size - 1);
+ if (trunc_2mpage(addr) != addr1) {
+ ptep1 = vtopte(addr1);
+ oldmap1 = *ptep1;
+ *ptep1 |= PG_RW;
+ }
+ }
+
+ invltlb();
+ }
+
+ dst = (char *)addr;
+
+ while (size-- > 0)
+ *dst++ = *data++;
+ }
+
+ (void)kdb_jmpbuf(prev_jb);
+
+ if (ptep0) {
+ *ptep0 = oldmap0;
+
+ if (ptep1)
+ *ptep1 = oldmap1;
+
+ invltlb();
+ }
+
+ return (ret);
+}
+
+void
+db_show_mdpcpu(struct pcpu *pc)
+{
+
+ db_printf("curpmap = %p\n", pc->pc_curpmap);
+ db_printf("tssp = %p\n", pc->pc_tssp);
+ db_printf("commontssp = %p\n", pc->pc_commontssp);
+ db_printf("rsp0 = 0x%lx\n", pc->pc_rsp0);
+ db_printf("gs32p = %p\n", pc->pc_gs32p);
+ db_printf("ldt = %p\n", pc->pc_ldt);
+ db_printf("tss = %p\n", pc->pc_tss);
+}
diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c
new file mode 100644
index 0000000..2c81f87
--- /dev/null
+++ b/sys/amd64/amd64/db_trace.c
@@ -0,0 +1,701 @@
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/proc.h>
+#include <sys/stack.h>
+#include <sys/sysent.h>
+
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/reg.h>
+#include <machine/stack.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <ddb/ddb.h>
+#include <ddb/db_access.h>
+#include <ddb/db_sym.h>
+#include <ddb/db_variables.h>
+
+static db_varfcn_t db_dr0;
+static db_varfcn_t db_dr1;
+static db_varfcn_t db_dr2;
+static db_varfcn_t db_dr3;
+static db_varfcn_t db_dr4;
+static db_varfcn_t db_dr5;
+static db_varfcn_t db_dr6;
+static db_varfcn_t db_dr7;
+static db_varfcn_t db_frame;
+static db_varfcn_t db_rsp;
+static db_varfcn_t db_ss;
+
+/*
+ * Machine register set.
+ */
+#define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x)
+struct db_variable db_regs[] = {
+ { "cs", DB_OFFSET(tf_cs), db_frame },
+ { "ds", DB_OFFSET(tf_ds), db_frame },
+ { "es", DB_OFFSET(tf_es), db_frame },
+ { "fs", DB_OFFSET(tf_fs), db_frame },
+ { "gs", DB_OFFSET(tf_gs), db_frame },
+ { "ss", NULL, db_ss },
+ { "rax", DB_OFFSET(tf_rax), db_frame },
+ { "rcx", DB_OFFSET(tf_rcx), db_frame },
+ { "rdx", DB_OFFSET(tf_rdx), db_frame },
+ { "rbx", DB_OFFSET(tf_rbx), db_frame },
+ { "rsp", NULL, db_rsp },
+ { "rbp", DB_OFFSET(tf_rbp), db_frame },
+ { "rsi", DB_OFFSET(tf_rsi), db_frame },
+ { "rdi", DB_OFFSET(tf_rdi), db_frame },
+ { "r8", DB_OFFSET(tf_r8), db_frame },
+ { "r9", DB_OFFSET(tf_r9), db_frame },
+ { "r10", DB_OFFSET(tf_r10), db_frame },
+ { "r11", DB_OFFSET(tf_r11), db_frame },
+ { "r12", DB_OFFSET(tf_r12), db_frame },
+ { "r13", DB_OFFSET(tf_r13), db_frame },
+ { "r14", DB_OFFSET(tf_r14), db_frame },
+ { "r15", DB_OFFSET(tf_r15), db_frame },
+ { "rip", DB_OFFSET(tf_rip), db_frame },
+ { "rflags", DB_OFFSET(tf_rflags), db_frame },
+#define DB_N_SHOW_REGS 24 /* Don't show registers after here. */
+ { "dr0", NULL, db_dr0 },
+ { "dr1", NULL, db_dr1 },
+ { "dr2", NULL, db_dr2 },
+ { "dr3", NULL, db_dr3 },
+ { "dr4", NULL, db_dr4 },
+ { "dr5", NULL, db_dr5 },
+ { "dr6", NULL, db_dr6 },
+ { "dr7", NULL, db_dr7 },
+};
+struct db_variable *db_eregs = db_regs + DB_N_SHOW_REGS;
+
+#define DB_DRX_FUNC(reg) \
+static int \
+db_ ## reg (struct db_variable *vp, db_expr_t *valuep, int op)	\
+{ \
+ if (op == DB_VAR_GET) \
+ *valuep = r ## reg (); \
+ else \
+ load_ ## reg (*valuep); \
+ return (1); \
+}
+
+DB_DRX_FUNC(dr0)
+DB_DRX_FUNC(dr1)
+DB_DRX_FUNC(dr2)
+DB_DRX_FUNC(dr3)
+DB_DRX_FUNC(dr4)
+DB_DRX_FUNC(dr5)
+DB_DRX_FUNC(dr6)
+DB_DRX_FUNC(dr7)
+
+static __inline long
+get_rsp(struct trapframe *tf)
+{
+ return ((ISPL(tf->tf_cs)) ? tf->tf_rsp :
+ (db_expr_t)tf + offsetof(struct trapframe, tf_rsp));
+}
+
+static int
+db_frame(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+ long *reg;
+
+ if (kdb_frame == NULL)
+ return (0);
+
+ reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
+ if (op == DB_VAR_GET)
+ *valuep = *reg;
+ else
+ *reg = *valuep;
+ return (1);
+}
+
+static int
+db_rsp(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ if (kdb_frame == NULL)
+ return (0);
+
+ if (op == DB_VAR_GET)
+ *valuep = get_rsp(kdb_frame);
+ else if (ISPL(kdb_frame->tf_cs))
+ kdb_frame->tf_rsp = *valuep;
+ return (1);
+}
+
+static int
+db_ss(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ if (kdb_frame == NULL)
+ return (0);
+
+ if (op == DB_VAR_GET)
+ *valuep = (ISPL(kdb_frame->tf_cs)) ? kdb_frame->tf_ss : rss();
+ else if (ISPL(kdb_frame->tf_cs))
+ kdb_frame->tf_ss = *valuep;
+ return (1);
+}
+
+#define NORMAL 0
+#define TRAP 1
+#define INTERRUPT 2
+#define SYSCALL 3
+#define TRAP_INTERRUPT 5
+
+static void db_nextframe(struct amd64_frame **, db_addr_t *, struct thread *);
+static int db_numargs(struct amd64_frame *);
+static void db_print_stack_entry(const char *, int, char **, long *, db_addr_t,
+ void *);
+static void decode_syscall(int, struct thread *);
+
+static const char * watchtype_str(int type);
+int amd64_set_watch(int watchnum, unsigned long watchaddr, int size,
+ int access, struct dbreg *d);
+int amd64_clr_watch(int watchnum, struct dbreg *d);
+
+/*
+ * Figure out how many arguments were passed into the frame at "fp".
+ */
+static int
+db_numargs(struct amd64_frame *fp)
+{
+#if 1
+ return (0); /* regparm, needs dwarf2 info */
+#else
+ long *argp;
+ int inst;
+ int args;
+
+ argp = (long *)db_get_value((long)&fp->f_retaddr, 8, FALSE);
+ /*
+ * XXX etext is wrong for LKMs. We should attempt to interpret
+ * the instruction at the return address in all cases. This
+ * may require better fault handling.
+ */
+ if (argp < (long *)btext || argp >= (long *)etext) {
+ args = 5;
+ } else {
+ inst = db_get_value((long)argp, 4, FALSE);
+ if ((inst & 0xff) == 0x59) /* popl %ecx */
+ args = 1;
+ else if ((inst & 0xffff) == 0xc483) /* addl $Ibs, %esp */
+ args = ((inst >> 16) & 0xff) / 4;
+ else
+ args = 5;
+ }
+ return (args);
+#endif
+}
+
+static void
+db_print_stack_entry(const char *name, int narg, char **argnp, long *argp,
+    db_addr_t callpc, void *frame)
+{
+ db_printf("%s(", name);
+#if 0
+ while (narg) {
+ if (argnp)
+ db_printf("%s=", *argnp++);
+ db_printf("%lr", (long)db_get_value((long)argp, 8, FALSE));
+ argp++;
+ if (--narg != 0)
+ db_printf(",");
+ }
+#endif
+ db_printf(") at ");
+ db_printsym(callpc, DB_STGY_PROC);
+ if (frame != NULL)
+ db_printf("/frame 0x%lx", (register_t)frame);
+ db_printf("\n");
+}
+
+static void
+decode_syscall(int number, struct thread *td)
+{
+ struct proc *p;
+ c_db_sym_t sym;
+ db_expr_t diff;
+ sy_call_t *f;
+ const char *symname;
+
+ db_printf(" (%d", number);
+ p = (td != NULL) ? td->td_proc : NULL;
+ if (p != NULL && 0 <= number && number < p->p_sysent->sv_size) {
+ f = p->p_sysent->sv_table[number].sy_call;
+ sym = db_search_symbol((db_addr_t)f, DB_STGY_ANY, &diff);
+ if (sym != DB_SYM_NULL && diff == 0) {
+ db_symbol_values(sym, &symname, NULL);
+ db_printf(", %s, %s", p->p_sysent->sv_name, symname);
+ }
+ }
+ db_printf(")");
+}
+
+/*
+ * Figure out the next frame up in the call stack.
+ */
+static void
+db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td)
+{
+ struct trapframe *tf;
+ int frame_type;
+ long rip, rsp, rbp;
+ db_expr_t offset;
+ c_db_sym_t sym;
+ const char *name;
+
+ rip = db_get_value((long) &(*fp)->f_retaddr, 8, FALSE);
+ rbp = db_get_value((long) &(*fp)->f_frame, 8, FALSE);
+
+ /*
+	 * Figure out the frame type.  We look at the address just before
+	 * the saved instruction pointer because the saved RIP points past
+	 * the call instruction; if the called function is marked as dead
+	 * (such as panic() at the end of dblfault_handler()), the
+	 * instruction at the saved RIP will belong to a different function
+	 * (syscall() in this example) rather than the one that actually
+	 * made the call.
+ */
+ frame_type = NORMAL;
+ sym = db_search_symbol(rip - 1, DB_STGY_ANY, &offset);
+ db_symbol_values(sym, &name, NULL);
+ if (name != NULL) {
+ if (strcmp(name, "calltrap") == 0 ||
+ strcmp(name, "fork_trampoline") == 0 ||
+ strcmp(name, "nmi_calltrap") == 0 ||
+ strcmp(name, "Xdblfault") == 0)
+ frame_type = TRAP;
+		else if (strncmp(name, "Xatpic_intr", 11) == 0 ||
+		    strncmp(name, "Xapic_isr", 9) == 0)
+			frame_type = INTERRUPT;
+ else if (strcmp(name, "Xfast_syscall") == 0)
+ frame_type = SYSCALL;
+#ifdef COMPAT_FREEBSD32
+ else if (strcmp(name, "Xint0x80_syscall") == 0)
+ frame_type = SYSCALL;
+#endif
+ /* XXX: These are interrupts with trap frames. */
+ else if (strcmp(name, "Xtimerint") == 0 ||
+ strcmp(name, "Xcpustop") == 0 ||
+ strcmp(name, "Xcpususpend") == 0 ||
+ strcmp(name, "Xrendezvous") == 0 ||
+ strcmp(name, "Xipi_intr_bitmap_handler") == 0)
+ frame_type = TRAP_INTERRUPT;
+ }
+
+ /*
+ * Normal frames need no special processing.
+ */
+ if (frame_type == NORMAL) {
+ *ip = (db_addr_t) rip;
+ *fp = (struct amd64_frame *) rbp;
+ return;
+ }
+
+ db_print_stack_entry(name, 0, 0, 0, rip, &(*fp)->f_frame);
+
+ /*
+ * Point to base of trapframe which is just above the
+ * current frame.
+ */
+ tf = (struct trapframe *)((long)*fp + 16);
+
+ if (INKERNEL((long) tf)) {
+ rsp = get_rsp(tf);
+ rip = tf->tf_rip;
+ rbp = tf->tf_rbp;
+ switch (frame_type) {
+ case TRAP:
+ db_printf("--- trap %#r", tf->tf_trapno);
+ break;
+ case SYSCALL:
+ db_printf("--- syscall");
+ decode_syscall(tf->tf_rax, td);
+ break;
+ case TRAP_INTERRUPT:
+ case INTERRUPT:
+ db_printf("--- interrupt");
+ break;
+ default:
+ panic("The moon has moved again.");
+ }
+ db_printf(", rip = %#lr, rsp = %#lr, rbp = %#lr ---\n", rip,
+ rsp, rbp);
+ }
+
+ *ip = (db_addr_t) rip;
+ *fp = (struct amd64_frame *) rbp;
+}
+
+static int
+db_backtrace(struct thread *td, struct trapframe *tf,
+ struct amd64_frame *frame, db_addr_t pc, int count)
+{
+ struct amd64_frame *actframe;
+#define MAXNARG 16
+ char *argnames[MAXNARG], **argnp = NULL;
+ const char *name;
+ long *argp;
+ db_expr_t offset;
+ c_db_sym_t sym;
+ int narg;
+ boolean_t first;
+
+ if (count == -1)
+ count = 1024;
+
+ first = TRUE;
+ while (count-- && !db_pager_quit) {
+ sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
+ db_symbol_values(sym, &name, NULL);
+
+ /*
+ * Attempt to determine a (possibly fake) frame that gives
+ * the caller's pc. It may differ from `frame' if the
+ * current function never sets up a standard frame or hasn't
+ * set one up yet or has just discarded one. The last two
+ * cases can be guessed fairly reliably for code generated
+ * by gcc. The first case is too much trouble to handle in
+ * general because the amount of junk on the stack depends
+ * on the pc (the special handling of "calltrap", etc. in
+ * db_nextframe() works because the `next' pc is special).
+ */
+ actframe = frame;
+ if (first) {
+ if (tf != NULL) {
+ int instr;
+
+ instr = db_get_value(pc, 4, FALSE);
+ if ((instr & 0xffffffff) == 0xe5894855) {
+ /* pushq %rbp; movq %rsp, %rbp */
+ actframe = (void *)(get_rsp(tf) - 8);
+ } else if ((instr & 0xffffff) == 0xe58948) {
+ /* movq %rsp, %rbp */
+ actframe = (void *)get_rsp(tf);
+ if (tf->tf_rbp == 0) {
+ /* Fake frame better. */
+ frame = actframe;
+ }
+ } else if ((instr & 0xff) == 0xc3) {
+ /* ret */
+ actframe = (void *)(get_rsp(tf) - 8);
+ } else if (offset == 0) {
+ /* Probably an assembler symbol. */
+ actframe = (void *)(get_rsp(tf) - 8);
+ }
+ } else if (strcmp(name, "fork_trampoline") == 0) {
+ /*
+ * Don't try to walk back on a stack for a
+ * process that hasn't actually been run yet.
+ */
+ db_print_stack_entry(name, 0, 0, 0, pc,
+ actframe);
+ break;
+ }
+ first = FALSE;
+ }
+
+ argp = &actframe->f_arg0;
+ narg = MAXNARG;
+ if (sym != NULL && db_sym_numargs(sym, &narg, argnames)) {
+ argnp = argnames;
+ } else {
+ narg = db_numargs(frame);
+ }
+
+ db_print_stack_entry(name, narg, argnp, argp, pc, actframe);
+
+ if (actframe != frame) {
+ /* `frame' belongs to caller. */
+ pc = (db_addr_t)
+ db_get_value((long)&actframe->f_retaddr, 8, FALSE);
+ continue;
+ }
+
+ db_nextframe(&frame, &pc, td);
+
+ if (INKERNEL((long)pc) && !INKERNEL((long)frame)) {
+ sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
+ db_symbol_values(sym, &name, NULL);
+ db_print_stack_entry(name, 0, 0, 0, pc, frame);
+ break;
+ }
+ if (!INKERNEL((long) frame)) {
+ break;
+ }
+ }
+
+ return (0);
+}
+
+void
+db_trace_self(void)
+{
+ struct amd64_frame *frame;
+ db_addr_t callpc;
+ register_t rbp;
+
+ __asm __volatile("movq %%rbp,%0" : "=r" (rbp));
+ frame = (struct amd64_frame *)rbp;
+ callpc = (db_addr_t)db_get_value((long)&frame->f_retaddr, 8, FALSE);
+ frame = frame->f_frame;
+ db_backtrace(curthread, NULL, frame, callpc, -1);
+}
+
+int
+db_trace_thread(struct thread *thr, int count)
+{
+ struct pcb *ctx;
+
+ ctx = kdb_thr_ctx(thr);
+ return (db_backtrace(thr, NULL, (struct amd64_frame *)ctx->pcb_rbp,
+ ctx->pcb_rip, count));
+}
+
+int
+amd64_set_watch(int watchnum, unsigned long watchaddr, int size, int access,
+    struct dbreg *d)
+{
+ int i, len;
+
+ if (watchnum == -1) {
+ for (i = 0; i < 4; i++)
+ if (!DBREG_DR7_ENABLED(d->dr[7], i))
+ break;
+ if (i < 4)
+ watchnum = i;
+ else
+ return (-1);
+ }
+
+ switch (access) {
+ case DBREG_DR7_EXEC:
+ size = 1; /* size must be 1 for an execution breakpoint */
+ /* fall through */
+ case DBREG_DR7_WRONLY:
+ case DBREG_DR7_RDWR:
+ break;
+ default:
+ return (-1);
+ }
+
+ /*
+ * we can watch a 1, 2, 4, or 8 byte sized location
+ */
+ switch (size) {
+ case 1:
+ len = DBREG_DR7_LEN_1;
+ break;
+ case 2:
+ len = DBREG_DR7_LEN_2;
+ break;
+ case 4:
+ len = DBREG_DR7_LEN_4;
+ break;
+ case 8:
+ len = DBREG_DR7_LEN_8;
+ break;
+ default:
+ return (-1);
+ }
+
+ /* clear the bits we are about to affect */
+ d->dr[7] &= ~DBREG_DR7_MASK(watchnum);
+
+ /* set drN register to the address, N=watchnum */
+ DBREG_DRX(d, watchnum) = watchaddr;
+
+ /* enable the watchpoint */
+ d->dr[7] |= DBREG_DR7_SET(watchnum, len, access,
+ DBREG_DR7_GLOBAL_ENABLE);
+
+ return (watchnum);
+}
+
+int
+amd64_clr_watch(int watchnum, struct dbreg *d)
+{
+
+ if (watchnum < 0 || watchnum >= 4)
+ return (-1);
+
+ d->dr[7] &= ~DBREG_DR7_MASK(watchnum);
+ DBREG_DRX(d, watchnum) = 0;
+
+ return (0);
+}
+
+int
+db_md_set_watchpoint(db_expr_t addr, db_expr_t size)
+{
+ struct dbreg d;
+ int avail, i, wsize;
+
+ fill_dbregs(NULL, &d);
+
+ avail = 0;
+	for (i = 0; i < 4; i++) {
+ if (!DBREG_DR7_ENABLED(d.dr[7], i))
+ avail++;
+ }
+
+ if (avail * 8 < size)
+ return (-1);
+
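+	/*
+	 * Cover the range with up to four watchpoints, preferring
+	 * 8-byte chunks.
+	 */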
+ for (i = 0; i < 4 && (size > 0); i++) {
+ if (!DBREG_DR7_ENABLED(d.dr[7], i)) {
+ if (size >= 8 || (avail == 1 && size > 4))
+ wsize = 8;
+ else if (size > 2)
+ wsize = 4;
+ else
+ wsize = size;
+ amd64_set_watch(i, addr, wsize,
+ DBREG_DR7_WRONLY, &d);
+ addr += wsize;
+ size -= wsize;
+ avail--;
+ }
+ }
+
+ set_dbregs(NULL, &d);
+
+	return (0);
+}
+
+int
+db_md_clr_watchpoint(db_expr_t addr, db_expr_t size)
+{
+ struct dbreg d;
+ int i;
+
+ fill_dbregs(NULL, &d);
+
+	for (i = 0; i < 4; i++) {
+ if (DBREG_DR7_ENABLED(d.dr[7], i)) {
+ if ((DBREG_DRX((&d), i) >= addr) &&
+ (DBREG_DRX((&d), i) < addr+size))
+ amd64_clr_watch(i, &d);
+
+ }
+ }
+
+ set_dbregs(NULL, &d);
+
+	return (0);
+}
+
+static const char *
+watchtype_str(int type)
+{
+	switch (type) {
+	case DBREG_DR7_EXEC:
+		return ("execute");
+	case DBREG_DR7_RDWR:
+		return ("read/write");
+	case DBREG_DR7_WRONLY:
+		return ("write");
+	default:
+		return ("invalid");
+	}
+}
+
+void
+db_md_list_watchpoints(void)
+{
+ struct dbreg d;
+ int i, len, type;
+
+ fill_dbregs(NULL, &d);
+
+ db_printf("\nhardware watchpoints:\n");
+ db_printf(" watch status type len address\n");
+ db_printf(" ----- -------- ---------- --- ------------------\n");
+ for (i = 0; i < 4; i++) {
+ if (DBREG_DR7_ENABLED(d.dr[7], i)) {
+ type = DBREG_DR7_ACCESS(d.dr[7], i);
+ len = DBREG_DR7_LEN(d.dr[7], i);
+ if (len == DBREG_DR7_LEN_8)
+ len = 8;
+ else
+ len++;
+ db_printf(" %-5d %-8s %10s %3d ",
+ i, "enabled", watchtype_str(type), len);
+ db_printsym((db_addr_t)DBREG_DRX((&d), i), DB_STGY_ANY);
+ db_printf("\n");
+ } else {
+ db_printf(" %-5d disabled\n", i);
+ }
+ }
+
+ db_printf("\ndebug register values:\n");
+ for (i = 0; i < 8; i++) {
+ db_printf(" dr%d 0x%016lx\n", i, DBREG_DRX((&d), i));
+ }
+ db_printf("\n");
+}
diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c
new file mode 100644
index 0000000..fdc4d56
--- /dev/null
+++ b/sys/amd64/amd64/elf_machdep.c
@@ -0,0 +1,278 @@
+/*-
+ * Copyright 1996-1998 John D. Polstra.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/linker.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/imgact_elf.h>
+#include <sys/syscall.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_param.h>
+
+#include <machine/elf.h>
+#include <machine/md_var.h>
+
+struct sysentvec elf64_freebsd_sysvec = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = __elfN(freebsd_fixup),
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "FreeBSD ELF64",
+ .sv_coredump = __elfN(coredump),
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS,
+ .sv_usrstack = USRSTACK,
+ .sv_psstrings = PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_LP64 | SV_SHP,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_len = PAGE_SIZE,
+ .sv_schedtail = NULL,
+};
+INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
+
+static Elf64_Brandinfo freebsd_brand_info = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/libexec/ld-elf.so.1",
+ .sysvec = &elf64_freebsd_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_freebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+};
+
+SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
+ (sysinit_cfunc_t) elf64_insert_brand_entry,
+ &freebsd_brand_info);
+
+static Elf64_Brandinfo freebsd_brand_oinfo = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/usr/libexec/ld-elf.so.1",
+ .sysvec = &elf64_freebsd_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_freebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+};
+
+SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t) elf64_insert_brand_entry,
+ &freebsd_brand_oinfo);
+
+static Elf64_Brandinfo kfreebsd_brand_info = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
+ .sysvec = &elf64_freebsd_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_kfreebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
+};
+
+SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t) elf64_insert_brand_entry,
+ &kfreebsd_brand_info);
+
+void
+elf64_dump_thread(struct thread *td __unused, void *dst __unused,
+ size_t *off __unused)
+{
+}
+
+/* Process one ELF relocation with addend. */
+static int
+elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data,
+ int type, int local, elf_lookup_fn lookup)
+{
+ Elf64_Addr *where, val;
+ Elf32_Addr *where32, val32;
+ Elf_Addr addr;
+ Elf_Addr addend;
+ Elf_Size rtype, symidx;
+ const Elf_Rel *rel;
+ const Elf_Rela *rela;
+
+ switch (type) {
+ case ELF_RELOC_REL:
+ rel = (const Elf_Rel *)data;
+ where = (Elf_Addr *) (relocbase + rel->r_offset);
+ rtype = ELF_R_TYPE(rel->r_info);
+ symidx = ELF_R_SYM(rel->r_info);
+ /* Addend is 32 bit on 32 bit relocs */
+ switch (rtype) {
+ case R_X86_64_PC32:
+ case R_X86_64_32S:
+ addend = *(Elf32_Addr *)where;
+ break;
+ default:
+ addend = *where;
+ break;
+ }
+ break;
+ case ELF_RELOC_RELA:
+ rela = (const Elf_Rela *)data;
+ where = (Elf_Addr *) (relocbase + rela->r_offset);
+ addend = rela->r_addend;
+ rtype = ELF_R_TYPE(rela->r_info);
+ symidx = ELF_R_SYM(rela->r_info);
+ break;
+ default:
+		panic("unknown reloc type %d", type);
+ }
+
+ switch (rtype) {
+
+ case R_X86_64_NONE: /* none */
+ break;
+
+ case R_X86_64_64: /* S + A */
+ addr = lookup(lf, symidx, 1);
+ val = addr + addend;
+ if (addr == 0)
+			return (-1);
+ if (*where != val)
+ *where = val;
+ break;
+
+ case R_X86_64_PC32: /* S + A - P */
+ addr = lookup(lf, symidx, 1);
+ where32 = (Elf32_Addr *)where;
+ val32 = (Elf32_Addr)(addr + addend - (Elf_Addr)where);
+ if (addr == 0)
+			return (-1);
+ if (*where32 != val32)
+ *where32 = val32;
+ break;
+
+ case R_X86_64_32S: /* S + A sign extend */
+ addr = lookup(lf, symidx, 1);
+ val32 = (Elf32_Addr)(addr + addend);
+ where32 = (Elf32_Addr *)where;
+ if (addr == 0)
+			return (-1);
+ if (*where32 != val32)
+ *where32 = val32;
+ break;
+
+ case R_X86_64_COPY: /* none */
+ /*
+ * There shouldn't be copy relocations in kernel
+ * objects.
+ */
+ printf("kldload: unexpected R_COPY relocation\n");
+		return (-1);
+
+ case R_X86_64_GLOB_DAT: /* S */
+ case R_X86_64_JMP_SLOT: /* XXX need addend + offset */
+ addr = lookup(lf, symidx, 1);
+ if (addr == 0)
+			return (-1);
+ if (*where != addr)
+ *where = addr;
+ break;
+
+ case R_X86_64_RELATIVE: /* B + A */
+ addr = relocbase + addend;
+ val = addr;
+ if (*where != val)
+ *where = val;
+ break;
+
+ default:
+ printf("kldload: unexpected relocation type %ld\n",
+ rtype);
+		return (-1);
+ }
+	return (0);
+}
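+
+/*
+ * For illustration (hypothetical numbers): an R_X86_64_RELATIVE entry
+ * with r_addend 0x1000 in a module relocated to base
+ * 0xffffffff82000000 stores 0xffffffff82001000 at
+ * relocbase + r_offset; no symbol lookup is involved.
+ */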
+
+int
+elf_reloc(linker_file_t lf, Elf_Addr relocbase, const void *data, int type,
+ elf_lookup_fn lookup)
+{
+
+ return (elf_reloc_internal(lf, relocbase, data, type, 0, lookup));
+}
+
+int
+elf_reloc_local(linker_file_t lf, Elf_Addr relocbase, const void *data,
+ int type, elf_lookup_fn lookup)
+{
+
+ return (elf_reloc_internal(lf, relocbase, data, type, 1, lookup));
+}
+
+int
+elf_cpu_load_file(linker_file_t lf __unused)
+{
+
+ return (0);
+}
+
+int
+elf_cpu_unload_file(linker_file_t lf __unused)
+{
+
+ return (0);
+}
diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S
new file mode 100644
index 0000000..89ad638
--- /dev/null
+++ b/sys/amd64/amd64/exception.S
@@ -0,0 +1,878 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * Copyright (c) 2007 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_atpic.h"
+#include "opt_compat.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+
+#include <machine/asmacros.h>
+#include <machine/psl.h>
+#include <machine/trap.h>
+#include <machine/specialreg.h>
+
+#include "assym.s"
+
+#ifdef KDTRACE_HOOKS
+ .bss
+ .globl dtrace_invop_jump_addr
+ .align 8
+ .type dtrace_invop_jump_addr,@object
+ .size dtrace_invop_jump_addr,8
+dtrace_invop_jump_addr:
+ .zero 8
+ .globl dtrace_invop_calltrap_addr
+ .align 8
+ .type dtrace_invop_calltrap_addr,@object
+ .size dtrace_invop_calltrap_addr,8
+dtrace_invop_calltrap_addr:
+ .zero 8
+#endif
+ .text
+#ifdef HWPMC_HOOKS
+ ENTRY(start_exceptions)
+#endif
+
+/*****************************************************************************/
+/* Trap handling */
+/*****************************************************************************/
+/*
+ * Trap and fault vector routines.
+ *
+ * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes
+ * state on the stack but also disables interrupts. This is important for
+ * us for the use of the swapgs instruction. We cannot be interrupted
+ * until the GS.base value is correct. For most traps, we automatically
+ * then enable interrupts if the interrupted context had them enabled.
+ * This is equivalent to the i386 port's use of SDT_SYS386TGT.
+ *
+ * The cpu will push a certain amount of state onto the kernel stack for
+ * the current process. See amd64/include/frame.h.
+ * This includes the current RFLAGS (status register, which includes
+ * the interrupt disable state prior to the trap), the code segment
+ * register, and the return instruction pointer.  The cpu will also
+ * push an 'error' code for certain traps.  We push a dummy error code
+ * for those traps where the cpu doesn't, in order to maintain a
+ * consistent frame.  We also push a contrived 'trap number'.
+ *
+ * The cpu does not push the general registers, we must do that, and we
+ * must restore them prior to calling 'iret'. The cpu adjusts the %cs and
+ * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
+ * must load them with appropriate values for supervisor mode operation.
+ */
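+
+/*
+ * Schematically, from higher to lower addresses, the completed frame
+ * therefore holds the hardware-pushed %ss, %rsp, %rflags, %cs, %rip
+ * and (for some traps) an error code, followed by the software-pushed
+ * trap number, fault address, segment registers and general
+ * registers; the TF_* offsets used below index into this layout.
+ */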
+
+MCOUNT_LABEL(user)
+MCOUNT_LABEL(btrap)
+
+/* Traps that we leave interrupts disabled for.. */
+#define TRAP_NOEN(a) \
+ subq $TF_RIP,%rsp; \
+ movl $(a),TF_TRAPNO(%rsp) ; \
+ movq $0,TF_ADDR(%rsp) ; \
+ movq $0,TF_ERR(%rsp) ; \
+ jmp alltraps_noen
+IDTVEC(dbg)
+ TRAP_NOEN(T_TRCTRAP)
+IDTVEC(bpt)
+ TRAP_NOEN(T_BPTFLT)
+#ifdef KDTRACE_HOOKS
+IDTVEC(dtrace_ret)
+ TRAP_NOEN(T_DTRACE_RET)
+#endif
+
+/* Regular traps; The cpu does not supply tf_err for these. */
+#define TRAP(a) \
+ subq $TF_RIP,%rsp; \
+ movl $(a),TF_TRAPNO(%rsp) ; \
+ movq $0,TF_ADDR(%rsp) ; \
+ movq $0,TF_ERR(%rsp) ; \
+ jmp alltraps
+IDTVEC(div)
+ TRAP(T_DIVIDE)
+IDTVEC(ofl)
+ TRAP(T_OFLOW)
+IDTVEC(bnd)
+ TRAP(T_BOUND)
+IDTVEC(ill)
+ TRAP(T_PRIVINFLT)
+IDTVEC(dna)
+ TRAP(T_DNA)
+IDTVEC(fpusegm)
+ TRAP(T_FPOPFLT)
+IDTVEC(mchk)
+ TRAP(T_MCHK)
+IDTVEC(rsvd)
+ TRAP(T_RESERVED)
+IDTVEC(fpu)
+ TRAP(T_ARITHTRAP)
+IDTVEC(xmm)
+ TRAP(T_XMMFLT)
+
+/* This group of traps have tf_err already pushed by the cpu */
+#define TRAP_ERR(a) \
+ subq $TF_ERR,%rsp; \
+ movl $(a),TF_TRAPNO(%rsp) ; \
+ movq $0,TF_ADDR(%rsp) ; \
+ jmp alltraps
+IDTVEC(tss)
+ TRAP_ERR(T_TSSFLT)
+IDTVEC(missing)
+ TRAP_ERR(T_SEGNPFLT)
+IDTVEC(stk)
+ TRAP_ERR(T_STKFLT)
+IDTVEC(align)
+ TRAP_ERR(T_ALIGNFLT)
+
+ /*
+ * alltraps entry point. Use swapgs if this is the first time in the
+ * kernel from userland. Reenable interrupts if they were enabled
+ * before the trap. This approximates SDT_SYS386TGT on the i386 port.
+ */
+ SUPERALIGN_TEXT
+ .globl alltraps
+ .type alltraps,@function
+alltraps:
+ movq %rdi,TF_RDI(%rsp)
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz alltraps_testi /* already running with kernel GS.base */
+ swapgs
+ movq PCPU(CURPCB),%rdi
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+alltraps_testi:
+ testl $PSL_I,TF_RFLAGS(%rsp)
+ jz alltraps_pushregs_no_rdi
+ sti
+alltraps_pushregs_no_rdi:
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ FAKE_MCOUNT(TF_RIP(%rsp))
+#ifdef KDTRACE_HOOKS
+ /*
+ * DTrace Function Boundary Trace (fbt) probes are triggered
+ * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
+ * interrupt. For all other trap types, just handle them in
+ * the usual way.
+ */
+ cmpl $T_BPTFLT,TF_TRAPNO(%rsp)
+ jne calltrap
+
+ /* Check if there is no DTrace hook registered. */
+ cmpq $0,dtrace_invop_jump_addr
+ je calltrap
+
+ /*
+ * Set our jump address for the jump back in the event that
+ * the breakpoint wasn't caused by DTrace at all.
+ */
+ movq $calltrap,dtrace_invop_calltrap_addr(%rip)
+
+ /* Jump to the code hooked in by DTrace. */
+ movq dtrace_invop_jump_addr,%rax
+ jmpq *dtrace_invop_jump_addr
+#endif
+ .globl calltrap
+ .type calltrap,@function
+calltrap:
+ movq %rsp,%rdi
+ call trap
+ MEXITCOUNT
+ jmp doreti /* Handle any pending ASTs */
+
+ /*
+ * alltraps_noen entry point. Unlike alltraps above, we want to
+ * leave the interrupts disabled. This corresponds to
+ * SDT_SYS386IGT on the i386 port.
+ */
+ SUPERALIGN_TEXT
+ .globl alltraps_noen
+ .type alltraps_noen,@function
+alltraps_noen:
+ movq %rdi,TF_RDI(%rsp)
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* already running with kernel GS.base */
+ swapgs
+ movq PCPU(CURPCB),%rdi
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
+1: movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ jmp alltraps_pushregs_no_rdi
+
+IDTVEC(dblfault)
+ subq $TF_ERR,%rsp
+ movl $T_DOUBLEFLT,TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* already running with kernel GS.base */
+ swapgs
+1:
+ movq %rsp,%rdi
+ call dblfault_handler
+2:
+ hlt
+ jmp 2b
+
+IDTVEC(page)
+ subq $TF_ERR,%rsp
+ movl $T_PAGEFLT,TF_TRAPNO(%rsp)
+ movq %rdi,TF_RDI(%rsp) /* free up a GP register */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* already running with kernel GS.base */
+ swapgs
+ movq PCPU(CURPCB),%rdi
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
+1: movq %cr2,%rdi /* preserve %cr2 before .. */
+ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ testl $PSL_I,TF_RFLAGS(%rsp)
+ jz alltraps_pushregs_no_rdi
+ sti
+ jmp alltraps_pushregs_no_rdi
+
+ /*
+ * We have to special-case this one. If we get a trap in doreti() at
+	 * the iretq stage, we'll reenter with the wrong gs state.  We'll
+	 * have to do a special swapgs in this case even when coming from
+	 * the kernel.
+ * XXX linux has a trap handler for their equivalent of load_gs().
+ */
+IDTVEC(prot)
+ subq $TF_ERR,%rsp
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq %rdi,TF_RDI(%rsp) /* free up a GP register */
+ leaq doreti_iret(%rip),%rdi
+ cmpq %rdi,TF_RIP(%rsp)
+ je 1f /* kernel but with user gsbase!! */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 2f /* already running with kernel GS.base */
+1: swapgs
+2: movq PCPU(CURPCB),%rdi
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ testl $PSL_I,TF_RFLAGS(%rsp)
+ jz alltraps_pushregs_no_rdi
+ sti
+ jmp alltraps_pushregs_no_rdi
+
+/*
+ * Fast syscall entry point. We enter here with just our new %cs/%ss set,
+ * and the new privilege level.  We are still running on the old user stack
+ * pointer. We have to juggle a few things around to find our stack etc.
+ * swapgs gives us access to our PCPU space only.
+ *
+ * We do not support invoking this from a custom %cs or %ss (e.g. using
+ * entries from an LDT).
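+ *
+ * Note that the syscall instruction itself saves the return %rip in
+ * %rcx and the caller's %rflags in %r11, which is why those two
+ * registers are among the first stored into the emulated trapframe
+ * below.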
+ */
+IDTVEC(fast_syscall)
+ swapgs
+ movq %rsp,PCPU(SCRATCH_RSP)
+ movq PCPU(RSP0),%rsp
+ /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
+ subq $TF_SIZE,%rsp
+ /* defer TF_RSP till we have a spare register */
+ movq %r11,TF_RFLAGS(%rsp)
+ movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
+ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */
+ movq %r11,TF_RSP(%rsp) /* user stack pointer */
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movq PCPU(CURPCB),%r11
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%r11)
+ sti
+ movq $KUDSEL,TF_SS(%rsp)
+ movq $KUCSEL,TF_CS(%rsp)
+ movq $2,TF_ERR(%rsp)
+ movq %rdi,TF_RDI(%rsp) /* arg 1 */
+ movq %rsi,TF_RSI(%rsp) /* arg 2 */
+ movq %rdx,TF_RDX(%rsp) /* arg 3 */
+ movq %r10,TF_RCX(%rsp) /* arg 4 */
+ movq %r8,TF_R8(%rsp) /* arg 5 */
+ movq %r9,TF_R9(%rsp) /* arg 6 */
+ movq %rax,TF_RAX(%rsp) /* syscall number */
+ movq %rbx,TF_RBX(%rsp) /* C preserved */
+ movq %rbp,TF_RBP(%rsp) /* C preserved */
+ movq %r12,TF_R12(%rsp) /* C preserved */
+ movq %r13,TF_R13(%rsp) /* C preserved */
+ movq %r14,TF_R14(%rsp) /* C preserved */
+ movq %r15,TF_R15(%rsp) /* C preserved */
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq PCPU(CURTHREAD),%rdi
+ movq %rsp,TD_FRAME(%rdi)
+ movl TF_RFLAGS(%rsp),%esi
+ andl $PSL_T,%esi
+ call amd64_syscall
+1: movq PCPU(CURPCB),%rax
+ /* Disable interrupts before testing PCB_FULL_IRET. */
+ cli
+ testl $PCB_FULL_IRET,PCB_FLAGS(%rax)
+ jnz 3f
+ /* Check for and handle AST's on return to userland. */
+ movq PCPU(CURTHREAD),%rax
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
+ jne 2f
+ /* Restore preserved registers. */
+ MEXITCOUNT
+ movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */
+ movq TF_RSI(%rsp),%rsi /* bonus: preserve arg 2 */
+ movq TF_RDX(%rsp),%rdx /* return value 2 */
+ movq TF_RAX(%rsp),%rax /* return value 1 */
+ movq TF_RFLAGS(%rsp),%r11 /* original %rflags */
+ movq TF_RIP(%rsp),%rcx /* original %rip */
+ movq TF_RSP(%rsp),%rsp /* user stack pointer */
+ swapgs
+ sysretq
+
+2: /* AST scheduled. */
+ sti
+ movq %rsp,%rdi
+ call ast
+ jmp 1b
+
+3: /* Requested full context restore, use doreti for that. */
+ MEXITCOUNT
+ jmp doreti
+
+/*
+ * Here for CYA insurance, in case a "syscall" instruction gets
+ * issued from 32 bit compatibility mode.  MSR_CSTAR has to point
+ * to *something* if EFER_SCE is enabled.
+ */
+IDTVEC(fast_syscall32)
+ sysret
+
+/*
+ * NMI handling is special.
+ *
+ * First, NMIs do not respect the state of the processor's RFLAGS.IF
+ * bit. The NMI handler may be entered at any time, including when
+ * the processor is in a critical section with RFLAGS.IF == 0.
+ * The processor's GS.base value could be invalid on entry to the
+ * handler.
+ *
+ * Second, the processor treats NMIs specially, blocking further NMIs
+ * until an 'iretq' instruction is executed. We thus need to execute
+ * the NMI handler with interrupts disabled, to prevent a nested interrupt
+ * from executing an 'iretq' instruction and inadvertently taking the
+ * processor out of NMI mode.
+ *
+ * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
+ * GS.base value for the processor is stored just above the bottom of its
+ * NMI stack. For NMIs taken from kernel mode, the current value in
+ * the processor's GS.base is saved at entry to C-preserved register %r12,
+ * the canonical value for GS.base is then loaded into the processor, and
+ * the saved value is restored at exit time. For NMIs taken from user mode,
+ * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
+ */
+
+IDTVEC(nmi)
+ subq $TF_RIP,%rsp
+ movl $(T_NMI),TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ xorl %ebx,%ebx
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
+ jnz nmi_fromuserspace
+ /*
+ * We've interrupted the kernel. Preserve GS.base in %r12.
+ */
+ movl $MSR_GSBASE,%ecx
+ rdmsr
+ movq %rax,%r12
+ shlq $32,%rdx
+ orq %rdx,%r12
+ /* Retrieve and load the canonical value for GS.base. */
+ movq TF_SIZE(%rsp),%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+ jmp nmi_calltrap
+nmi_fromuserspace:
+ incl %ebx
+ swapgs
+/* Note: this label is also used by ddb and gdb: */
+nmi_calltrap:
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp,%rdi
+ call trap
+ MEXITCOUNT
+#ifdef HWPMC_HOOKS
+ /*
+ * Capture a userspace callchain if needed.
+ *
+ * - Check if the current trap was from user mode.
+ * - Check if the current thread is valid.
+ * - Check if the thread requires a user call chain to be
+ * captured.
+ *
+ * We are still in NMI mode at this point.
+ */
+ testl %ebx,%ebx
+ jz nocallchain /* not from userspace */
+ movq PCPU(CURTHREAD),%rax
+ orq %rax,%rax /* curthread present? */
+ jz nocallchain
+ testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+ jz nocallchain
+ /*
+ * A user callchain is to be captured, so:
+ * - Move execution to the regular kernel stack, to allow for
+ * nested NMI interrupts.
+ * - Take the processor out of "NMI" mode by faking an "iret".
+ * - Enable interrupts, so that copyin() can work.
+ */
+ movq %rsp,%rsi /* source stack pointer */
+ movq $TF_SIZE,%rcx
+ movq PCPU(RSP0),%rdx
+ subq %rcx,%rdx
+ movq %rdx,%rdi /* destination stack pointer */
+
+ shrq $3,%rcx /* trap frame size in long words */
+ cld
+ rep
+ movsq /* copy trapframe */
+
+ movl %ss,%eax
+ pushq %rax /* tf_ss */
+ pushq %rdx /* tf_rsp (on kernel stack) */
+ pushfq /* tf_rflags */
+ movl %cs,%eax
+ pushq %rax /* tf_cs */
+ pushq $outofnmi /* tf_rip */
+ iretq
+outofnmi:
+ /*
+ * At this point the processor has exited NMI mode and is running
+ * with interrupts turned off on the normal kernel stack.
+ *
+ * If a pending NMI gets recognized at or after this point, it
+ * will cause a kernel callchain to be traced.
+ *
+ * We turn interrupts back on, and call the user callchain capture hook.
+ */
+ movq pmc_hook,%rax
+ orq %rax,%rax
+ jz nocallchain
+ movq PCPU(CURTHREAD),%rdi /* thread */
+ movq $PMC_FN_USER_CALLCHAIN,%rsi /* command */
+ movq %rsp,%rdx /* frame */
+ sti
+ call *%rax
+ cli
+nocallchain:
+#endif
+ testl %ebx,%ebx
+ jnz doreti_exit
+nmi_kernelexit:
+ /*
+ * Put back the preserved MSR_GSBASE value.
+ */
+ movl $MSR_GSBASE,%ecx
+ movq %r12,%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+nmi_restoreregs:
+ movq TF_RDI(%rsp),%rdi
+ movq TF_RSI(%rsp),%rsi
+ movq TF_RDX(%rsp),%rdx
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R8(%rsp),%r8
+ movq TF_R9(%rsp),%r9
+ movq TF_RAX(%rsp),%rax
+ movq TF_RBX(%rsp),%rbx
+ movq TF_RBP(%rsp),%rbp
+ movq TF_R10(%rsp),%r10
+ movq TF_R11(%rsp),%r11
+ movq TF_R12(%rsp),%r12
+ movq TF_R13(%rsp),%r13
+ movq TF_R14(%rsp),%r14
+ movq TF_R15(%rsp),%r15
+ addq $TF_RIP,%rsp
+ jmp doreti_iret
+
+ENTRY(fork_trampoline)
+ movq %r12,%rdi /* function */
+ movq %rbx,%rsi /* arg1 */
+ movq %rsp,%rdx /* trapframe pointer */
+ call fork_exit
+ MEXITCOUNT
+ jmp doreti /* Handle any ASTs */
+
+/*
+ * To efficiently implement classification of trap and interrupt handlers
+ * for profiling, there must be only trap handlers between the labels btrap
+ * and bintr, and only interrupt handlers between the labels bintr and
+ * eintr. This is implemented (partly) by including files that contain
+ * some of the handlers. Before including the files, set up a normal asm
+ * environment so that the included files don't need to know that they are
+ * included.
+ */
+
+#ifdef COMPAT_FREEBSD32
+ .data
+ .p2align 4
+ .text
+ SUPERALIGN_TEXT
+
+#include <amd64/ia32/ia32_exception.S>
+#endif
+
+ .data
+ .p2align 4
+ .text
+ SUPERALIGN_TEXT
+MCOUNT_LABEL(bintr)
+
+#include <amd64/amd64/apic_vector.S>
+
+#ifdef DEV_ATPIC
+ .data
+ .p2align 4
+ .text
+ SUPERALIGN_TEXT
+
+#include <amd64/amd64/atpic_vector.S>
+#endif
+
+ .text
+MCOUNT_LABEL(eintr)
+
+/*
+ * void doreti(struct trapframe)
+ *
+ * Handle return from interrupts, traps and syscalls.
+ */
+ .text
+ SUPERALIGN_TEXT
+ .type doreti,@function
+doreti:
+ FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */
+ /*
+ * Check if ASTs can be handled now.
+ */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
+ jz doreti_exit /* can't handle ASTs now if not */
+
+doreti_ast:
+ /*
+ * Check for ASTs atomically with returning. Disabling CPU
+ * interrupts provides sufficient locking even in the SMP case,
+ * since we will be informed of any new ASTs by an IPI.
+ */
+ cli
+ movq PCPU(CURTHREAD),%rax
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
+ je doreti_exit
+ sti
+ movq %rsp,%rdi /* pass a pointer to the trapframe */
+ call ast
+ jmp doreti_ast
+
+ /*
+ * doreti_exit: pop registers, iret.
+ *
+ * The segment register pop is a special case, since it may
+ * fault if (for example) a sigreturn specifies bad segment
+ * registers. The fault is handled in trap.c.
+ */
+doreti_exit:
+ MEXITCOUNT
+ movq PCPU(CURPCB),%r8
+
+ /*
+ * Do not reload segment registers for kernel.
+ * Since we do not reload segments registers with sane
+ * values on kernel entry, descriptors referenced by
+ * segments registers might be not valid. This is fatal
+ * for user mode, but is not a problem for the kernel.
+ */
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
+ jz ld_regs
+ testl $PCB_FULL_IRET,PCB_FLAGS(%r8)
+ jz ld_regs
+ testl $TF_HASSEGS,TF_FLAGS(%rsp)
+ je set_segs
+
+do_segs:
+ /* Restore %fs and fsbase */
+ movw TF_FS(%rsp),%ax
+ .globl ld_fs
+ld_fs:
+ movw %ax,%fs
+ cmpw $KUF32SEL,%ax
+ jne 1f
+ movl $MSR_FSBASE,%ecx
+ movl PCB_FSBASE(%r8),%eax
+ movl PCB_FSBASE+4(%r8),%edx
+ .globl ld_fsbase
+ld_fsbase:
+ wrmsr
+1:
+ /* Restore %gs and gsbase */
+ movw TF_GS(%rsp),%si
+ pushfq
+ cli
+ movl $MSR_GSBASE,%ecx
+ rdmsr
+ .globl ld_gs
+ld_gs:
+ movw %si,%gs
+ wrmsr
+ popfq
+ cmpw $KUG32SEL,%si
+ jne 1f
+ movl $MSR_KGSBASE,%ecx
+ movl PCB_GSBASE(%r8),%eax
+ movl PCB_GSBASE+4(%r8),%edx
+ .globl ld_gsbase
+ld_gsbase:
+ wrmsr
+1:
+ .globl ld_es
+ld_es:
+ movw TF_ES(%rsp),%es
+ .globl ld_ds
+ld_ds:
+ movw TF_DS(%rsp),%ds
+ld_regs:
+ movq TF_RDI(%rsp),%rdi
+ movq TF_RSI(%rsp),%rsi
+ movq TF_RDX(%rsp),%rdx
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R8(%rsp),%r8
+ movq TF_R9(%rsp),%r9
+ movq TF_RAX(%rsp),%rax
+ movq TF_RBX(%rsp),%rbx
+ movq TF_RBP(%rsp),%rbp
+ movq TF_R10(%rsp),%r10
+ movq TF_R11(%rsp),%r11
+ movq TF_R12(%rsp),%r12
+ movq TF_R13(%rsp),%r13
+ movq TF_R14(%rsp),%r14
+ movq TF_R15(%rsp),%r15
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* keep running with kernel GS.base */
+ cli
+ swapgs
+1:
+ addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
+ .globl doreti_iret
+doreti_iret:
+ iretq
+
+set_segs:
+ movw $KUDSEL,%ax
+ movw %ax,TF_DS(%rsp)
+ movw %ax,TF_ES(%rsp)
+ movw $KUF32SEL,TF_FS(%rsp)
+ movw $KUG32SEL,TF_GS(%rsp)
+ jmp do_segs
+
+ /*
+ * doreti_iret_fault. Alternative return code for
+ * the case where we get a fault in the doreti_exit code
+ * above. trap() (amd64/amd64/trap.c) catches this specific
+ * case, sends the process a signal and continues in the
+ * corresponding place in the code below.
+ */
+ ALIGN_TEXT
+ .globl doreti_iret_fault
+doreti_iret_fault:
+ subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */
+ testl $PSL_I,TF_RFLAGS(%rsp)
+ jz 1f
+ sti
+1:
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq $0,TF_ERR(%rsp) /* XXX should be the error code */
+ movq $0,TF_ADDR(%rsp)
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ jmp calltrap
+
+ ALIGN_TEXT
+ .globl ds_load_fault
+ds_load_fault:
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movw $KUDSEL,TF_DS(%rsp)
+ jmp doreti
+
+ ALIGN_TEXT
+ .globl es_load_fault
+es_load_fault:
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movw $KUDSEL,TF_ES(%rsp)
+ jmp doreti
+
+ ALIGN_TEXT
+ .globl fs_load_fault
+fs_load_fault:
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movw $KUF32SEL,TF_FS(%rsp)
+ jmp doreti
+
+ ALIGN_TEXT
+ .globl gs_load_fault
+gs_load_fault:
+ popfq
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movw $KUG32SEL,TF_GS(%rsp)
+ jmp doreti
+
+ ALIGN_TEXT
+ .globl fsbase_load_fault
+fsbase_load_fault:
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movq PCPU(CURTHREAD),%r8
+ movq TD_PCB(%r8),%r8
+ movq $0,PCB_FSBASE(%r8)
+ jmp doreti
+
+ ALIGN_TEXT
+ .globl gsbase_load_fault
+gsbase_load_fault:
+ movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq %rsp,%rdi
+ call trap
+ movq PCPU(CURTHREAD),%r8
+ movq TD_PCB(%r8),%r8
+ movq $0,PCB_GSBASE(%r8)
+ jmp doreti
+
+#ifdef HWPMC_HOOKS
+ ENTRY(end_exceptions)
+#endif
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c
new file mode 100644
index 0000000..18130b5
--- /dev/null
+++ b/sys/amd64/amd64/fpu.c
@@ -0,0 +1,1012 @@
+/*-
+ * Copyright (c) 1990 William Jolitz.
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)npx.c 7.2 (Berkeley) 5/12/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <sys/signalvar.h>
+#include <vm/uma.h>
+
+#include <machine/cputypes.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/resource.h>
+#include <machine/specialreg.h>
+#include <machine/segments.h>
+#include <machine/ucontext.h>
+
+/*
+ * Floating point support.
+ */
+
+#if defined(__GNUCLIKE_ASM) && !defined(lint)
+
+#define fldcw(cw) __asm __volatile("fldcw %0" : : "m" (cw))
+#define fnclex() __asm __volatile("fnclex")
+#define fninit() __asm __volatile("fninit")
+#define fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr)))
+#define fnstsw(addr) __asm __volatile("fnstsw %0" : "=am" (*(addr)))
+#define fxrstor(addr) __asm __volatile("fxrstor %0" : : "m" (*(addr)))
+#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr)))
+#define ldmxcsr(csr) __asm __volatile("ldmxcsr %0" : : "m" (csr))
+#define stmxcsr(addr) __asm __volatile("stmxcsr %0" : : "m" (*(addr)))
+
+static __inline void
+xrstor(char *addr, uint64_t mask)
+{
+ uint32_t low, hi;
+
+ low = mask;
+ hi = mask >> 32;
+ __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi));
+}
+
+static __inline void
+xsave(char *addr, uint64_t mask)
+{
+ uint32_t low, hi;
+
+ low = mask;
+ hi = mask >> 32;
+ __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) :
+ "memory");
+}
+
+#else /* !(__GNUCLIKE_ASM && !lint) */
+
+void fldcw(u_short cw);
+void fnclex(void);
+void fninit(void);
+void fnstcw(caddr_t addr);
+void fnstsw(caddr_t addr);
+void fxsave(caddr_t addr);
+void fxrstor(caddr_t addr);
+void ldmxcsr(u_int csr);
+void stmxcsr(u_int *csr);
+void xrstor(char *addr, uint64_t mask);
+void xsave(char *addr, uint64_t mask);
+
+#endif /* __GNUCLIKE_ASM && !lint */
+
+#define start_emulating() load_cr0(rcr0() | CR0_TS)
+#define stop_emulating() clts()
+
+CTASSERT(sizeof(struct savefpu) == 512);
+CTASSERT(sizeof(struct xstate_hdr) == 64);
+CTASSERT(sizeof(struct savefpu_ymm) == 832);
+
+/*
+ * This requirement is to make it easier for asm code to calculate
+ * offset of the fpu save area from the pcb address. FPU save area
+ * must be 64-byte aligned.
+ */
+CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
+
+static void fpu_clean_state(void);
+
+SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
+ NULL, 1, "Floating point instructions executed in hardware");
+
+static int use_xsaveopt;
+int use_xsave; /* non-static for cpu_switch.S */
+uint64_t xsave_mask; /* the same */
+static uma_zone_t fpu_save_area_zone;
+static struct savefpu *fpu_initialstate;
+
+struct xsave_area_elm_descr {
+ u_int offset;
+ u_int size;
+} *xsave_area_desc;
+
+void
+fpusave(void *addr)
+{
+
+ if (use_xsave)
+ xsave((char *)addr, xsave_mask);
+ else
+ fxsave((char *)addr);
+}
+
+void
+fpurestore(void *addr)
+{
+
+ if (use_xsave)
+ xrstor((char *)addr, xsave_mask);
+ else
+ fxrstor((char *)addr);
+}
+
+/*
+ * Enable XSAVE if supported and allowed by user.
+ * Calculate the xsave_mask.
+ */
+static void
+fpuinit_bsp1(void)
+{
+ u_int cp[4];
+ uint64_t xsave_mask_user;
+
+ if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
+ use_xsave = 1;
+ TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
+ }
+ if (!use_xsave)
+ return;
+
+ cpuid_count(0xd, 0x0, cp);
+ xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
+ if ((cp[0] & xsave_mask) != xsave_mask)
+ panic("CPU0 does not support X87 or SSE: %x", cp[0]);
+ xsave_mask = ((uint64_t)cp[3] << 32) | cp[0];
+ xsave_mask_user = xsave_mask;
+ TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
+ xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
+ xsave_mask &= xsave_mask_user;
+
+ cpuid_count(0xd, 0x1, cp);
+ if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
+ /*
+ * Patch the XSAVE instruction in the cpu_switch code
+		 * to XSAVEOPT.  We assume that the XSAVE encoding uses
+		 * a REX byte, and set bit 4 of the ModRM byte.
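+		 * (XSAVE is encoded as 0f ae /4 and XSAVEOPT as
+		 * 0f ae /6, so OR-ing 0x10 into the ModRM byte turns
+		 * the /4 reg field into /6.)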
+ */
+ ctx_switch_xsave[3] |= 0x10;
+ use_xsaveopt = 1;
+ }
+}
+
+/*
+ * Calculate the fpu save area size.
+ */
+static void
+fpuinit_bsp2(void)
+{
+ u_int cp[4];
+
+ if (use_xsave) {
+ cpuid_count(0xd, 0x0, cp);
+ cpu_max_ext_state_size = cp[1];
+
+ /*
+ * Reload the cpu_feature2, since we enabled OSXSAVE.
+ */
+ do_cpuid(1, cp);
+ cpu_feature2 = cp[2];
+ } else
+ cpu_max_ext_state_size = sizeof(struct savefpu);
+}
+
+/*
+ * Initialize the floating point unit.
+ */
+void
+fpuinit(void)
+{
+ register_t saveintr;
+ u_int mxcsr;
+ u_short control;
+
+ if (IS_BSP())
+ fpuinit_bsp1();
+
+ if (use_xsave) {
+ load_cr4(rcr4() | CR4_XSAVE);
+ load_xcr(XCR0, xsave_mask);
+ }
+
+ /*
+	 * XCR0 must be set up before the CPU can report the save area size.
+ */
+ if (IS_BSP())
+ fpuinit_bsp2();
+
+ /*
+	 * It is too early for critical_enter() to work on the AP.
+ */
+ saveintr = intr_disable();
+ stop_emulating();
+ fninit();
+ control = __INITIAL_FPUCW__;
+ fldcw(control);
+ mxcsr = __INITIAL_MXCSR__;
+ ldmxcsr(mxcsr);
+ start_emulating();
+ intr_restore(saveintr);
+}
+
+/*
+ * On the boot CPU we generate a clean state that is used to
+ * initialize the floating point unit when it is first used by a
+ * process.
+ */
+static void
+fpuinitstate(void *arg __unused)
+{
+ register_t saveintr;
+ int cp[4], i, max_ext_n;
+
+ fpu_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF,
+ M_WAITOK | M_ZERO);
+ saveintr = intr_disable();
+ stop_emulating();
+
+ fpusave(fpu_initialstate);
+ if (fpu_initialstate->sv_env.en_mxcsr_mask)
+ cpu_mxcsr_mask = fpu_initialstate->sv_env.en_mxcsr_mask;
+ else
+ cpu_mxcsr_mask = 0xFFBF;
+
+ /*
+ * The fninit instruction does not modify XMM registers. The
+	 * fpusave call dumped whatever garbage the registers contained
+	 * after reset into the saved initial state.  Clear the XMM
+	 * register file image to make the startup program state and
+	 * signal handler XMM register content predictable.
+ */
+ bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
+
+ /*
+ * Create a table describing the layout of the CPU Extended
+ * Save Area.
+ */
+ if (use_xsaveopt) {
+ max_ext_n = flsl(xsave_mask);
+ xsave_area_desc = malloc(max_ext_n * sizeof(struct
+ xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
+ /* x87 state */
+ xsave_area_desc[0].offset = 0;
+ xsave_area_desc[0].size = 160;
+ /* XMM */
+ xsave_area_desc[1].offset = 160;
+ xsave_area_desc[1].size = 288 - 160;
+
+ for (i = 2; i < max_ext_n; i++) {
+ cpuid_count(0xd, i, cp);
+ xsave_area_desc[i].offset = cp[1];
+ xsave_area_desc[i].size = cp[0];
+ }
+ }
+
+ fpu_save_area_zone = uma_zcreate("FPU_save_area",
+ cpu_max_ext_state_size, NULL, NULL, NULL, NULL,
+ XSAVE_AREA_ALIGN - 1, 0);
+
+ start_emulating();
+ intr_restore(saveintr);
+}
+SYSINIT(fpuinitstate, SI_SUB_DRIVERS, SI_ORDER_ANY, fpuinitstate, NULL);
+
+/*
+ * Free coprocessor (if we have it).
+ */
+void
+fpuexit(struct thread *td)
+{
+
+ critical_enter();
+ if (curthread == PCPU_GET(fpcurthread)) {
+ stop_emulating();
+ fpusave(curpcb->pcb_save);
+ start_emulating();
+ PCPU_SET(fpcurthread, 0);
+ }
+ critical_exit();
+}
+
+int
+fpuformat(void)
+{
+
+ return (_MC_FPFMT_XMM);
+}
+
+/*
+ * The following mechanism is used to ensure that the FPE_... value
+ * that is passed as a trapcode to the signal handler of the user
+ * process does not have more than one bit set.
+ *
+ * Multiple bits may be set if the user process modifies the control
+ * word while a status word bit is already set. While this is a sign
+ * of bad coding, we have no choice but to narrow them down to one
+ * bit, since we must not send a trapcode that is not exactly one of
+ * the FPE_ macros.
+ *
+ * The mechanism has a static table with 127 entries. Each combination
+ * of the 7 FPU status word exception bits directly translates to a
+ * position in this table, where a single FPE_... value is stored.
+ * This FPE_... value stored there is considered the "most important"
+ * of the exception bits and will be sent as the signal code. The
+ * precedence of the bits is based upon Intel Document "Numerical
+ * Applications", Chapter "Special Computational Situations".
+ *
+ * The macro to choose one of these values does these steps: 1) Throw
+ * away status word bits that cannot be masked. 2) Throw away the bits
+ * currently masked in the control word, assuming the user isn't
+ * interested in them anymore. 3) Reinsert status word bit 7 (stack
+ * fault) if it is set, which cannot be masked but must be preserved.
+ * 4) Use the remaining bits to point into the trapcode table.
+ *
+ * The 6 maskable bits in order of their preference, as stated in the
+ * above referenced Intel manual:
+ * 1 Invalid operation (FP_X_INV)
+ * 1a Stack underflow
+ * 1b Stack overflow
+ * 1c Operand of unsupported format
+ * 1d SNaN operand.
+ * 2 QNaN operand (not an exception, irrelevant here)
+ * 3 Any other invalid-operation not mentioned above or zero divide
+ * (FP_X_INV, FP_X_DZ)
+ * 4 Denormal operand (FP_X_DNML)
+ * 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL)
+ * 6 Inexact result (FP_X_IMP)
+ */
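+
+/*
+ * A worked example (illustrative only): with a control word that
+ * masks everything except invalid-operation (low bits 0x3e), a
+ * status word of 0x41 (IE | SF) yields the index
+ * 0x41 & ((~0x3e & 0x3f) | 0x40) == 0x41, selecting FPE_FLTSUB from
+ * the table below, i.e. the stack fault is reported.
+ */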
+static char fpetable[128] = {
+ 0,
+ FPE_FLTINV, /* 1 - INV */
+ FPE_FLTUND, /* 2 - DNML */
+ FPE_FLTINV, /* 3 - INV | DNML */
+ FPE_FLTDIV, /* 4 - DZ */
+ FPE_FLTINV, /* 5 - INV | DZ */
+ FPE_FLTDIV, /* 6 - DNML | DZ */
+ FPE_FLTINV, /* 7 - INV | DNML | DZ */
+ FPE_FLTOVF, /* 8 - OFL */
+ FPE_FLTINV, /* 9 - INV | OFL */
+ FPE_FLTUND, /* A - DNML | OFL */
+ FPE_FLTINV, /* B - INV | DNML | OFL */
+ FPE_FLTDIV, /* C - DZ | OFL */
+ FPE_FLTINV, /* D - INV | DZ | OFL */
+ FPE_FLTDIV, /* E - DNML | DZ | OFL */
+ FPE_FLTINV, /* F - INV | DNML | DZ | OFL */
+ FPE_FLTUND, /* 10 - UFL */
+ FPE_FLTINV, /* 11 - INV | UFL */
+ FPE_FLTUND, /* 12 - DNML | UFL */
+ FPE_FLTINV, /* 13 - INV | DNML | UFL */
+ FPE_FLTDIV, /* 14 - DZ | UFL */
+ FPE_FLTINV, /* 15 - INV | DZ | UFL */
+ FPE_FLTDIV, /* 16 - DNML | DZ | UFL */
+ FPE_FLTINV, /* 17 - INV | DNML | DZ | UFL */
+ FPE_FLTOVF, /* 18 - OFL | UFL */
+ FPE_FLTINV, /* 19 - INV | OFL | UFL */
+ FPE_FLTUND, /* 1A - DNML | OFL | UFL */
+ FPE_FLTINV, /* 1B - INV | DNML | OFL | UFL */
+ FPE_FLTDIV, /* 1C - DZ | OFL | UFL */
+ FPE_FLTINV, /* 1D - INV | DZ | OFL | UFL */
+ FPE_FLTDIV, /* 1E - DNML | DZ | OFL | UFL */
+ FPE_FLTINV, /* 1F - INV | DNML | DZ | OFL | UFL */
+ FPE_FLTRES, /* 20 - IMP */
+ FPE_FLTINV, /* 21 - INV | IMP */
+ FPE_FLTUND, /* 22 - DNML | IMP */
+ FPE_FLTINV, /* 23 - INV | DNML | IMP */
+ FPE_FLTDIV, /* 24 - DZ | IMP */
+ FPE_FLTINV, /* 25 - INV | DZ | IMP */
+ FPE_FLTDIV, /* 26 - DNML | DZ | IMP */
+ FPE_FLTINV, /* 27 - INV | DNML | DZ | IMP */
+ FPE_FLTOVF, /* 28 - OFL | IMP */
+ FPE_FLTINV, /* 29 - INV | OFL | IMP */
+ FPE_FLTUND, /* 2A - DNML | OFL | IMP */
+ FPE_FLTINV, /* 2B - INV | DNML | OFL | IMP */
+ FPE_FLTDIV, /* 2C - DZ | OFL | IMP */
+ FPE_FLTINV, /* 2D - INV | DZ | OFL | IMP */
+ FPE_FLTDIV, /* 2E - DNML | DZ | OFL | IMP */
+ FPE_FLTINV, /* 2F - INV | DNML | DZ | OFL | IMP */
+ FPE_FLTUND, /* 30 - UFL | IMP */
+ FPE_FLTINV, /* 31 - INV | UFL | IMP */
+ FPE_FLTUND, /* 32 - DNML | UFL | IMP */
+ FPE_FLTINV, /* 33 - INV | DNML | UFL | IMP */
+ FPE_FLTDIV, /* 34 - DZ | UFL | IMP */
+ FPE_FLTINV, /* 35 - INV | DZ | UFL | IMP */
+ FPE_FLTDIV, /* 36 - DNML | DZ | UFL | IMP */
+ FPE_FLTINV, /* 37 - INV | DNML | DZ | UFL | IMP */
+ FPE_FLTOVF, /* 38 - OFL | UFL | IMP */
+ FPE_FLTINV, /* 39 - INV | OFL | UFL | IMP */
+ FPE_FLTUND, /* 3A - DNML | OFL | UFL | IMP */
+ FPE_FLTINV, /* 3B - INV | DNML | OFL | UFL | IMP */
+ FPE_FLTDIV, /* 3C - DZ | OFL | UFL | IMP */
+ FPE_FLTINV, /* 3D - INV | DZ | OFL | UFL | IMP */
+ FPE_FLTDIV, /* 3E - DNML | DZ | OFL | UFL | IMP */
+ FPE_FLTINV, /* 3F - INV | DNML | DZ | OFL | UFL | IMP */
+ FPE_FLTSUB, /* 40 - STK */
+ FPE_FLTSUB, /* 41 - INV | STK */
+ FPE_FLTUND, /* 42 - DNML | STK */
+ FPE_FLTSUB, /* 43 - INV | DNML | STK */
+ FPE_FLTDIV, /* 44 - DZ | STK */
+ FPE_FLTSUB, /* 45 - INV | DZ | STK */
+ FPE_FLTDIV, /* 46 - DNML | DZ | STK */
+ FPE_FLTSUB, /* 47 - INV | DNML | DZ | STK */
+ FPE_FLTOVF, /* 48 - OFL | STK */
+ FPE_FLTSUB, /* 49 - INV | OFL | STK */
+ FPE_FLTUND, /* 4A - DNML | OFL | STK */
+ FPE_FLTSUB, /* 4B - INV | DNML | OFL | STK */
+ FPE_FLTDIV, /* 4C - DZ | OFL | STK */
+ FPE_FLTSUB, /* 4D - INV | DZ | OFL | STK */
+ FPE_FLTDIV, /* 4E - DNML | DZ | OFL | STK */
+ FPE_FLTSUB, /* 4F - INV | DNML | DZ | OFL | STK */
+ FPE_FLTUND, /* 50 - UFL | STK */
+ FPE_FLTSUB, /* 51 - INV | UFL | STK */
+ FPE_FLTUND, /* 52 - DNML | UFL | STK */
+ FPE_FLTSUB, /* 53 - INV | DNML | UFL | STK */
+ FPE_FLTDIV, /* 54 - DZ | UFL | STK */
+ FPE_FLTSUB, /* 55 - INV | DZ | UFL | STK */
+ FPE_FLTDIV, /* 56 - DNML | DZ | UFL | STK */
+ FPE_FLTSUB, /* 57 - INV | DNML | DZ | UFL | STK */
+ FPE_FLTOVF, /* 58 - OFL | UFL | STK */
+ FPE_FLTSUB, /* 59 - INV | OFL | UFL | STK */
+ FPE_FLTUND, /* 5A - DNML | OFL | UFL | STK */
+ FPE_FLTSUB, /* 5B - INV | DNML | OFL | UFL | STK */
+ FPE_FLTDIV, /* 5C - DZ | OFL | UFL | STK */
+ FPE_FLTSUB, /* 5D - INV | DZ | OFL | UFL | STK */
+ FPE_FLTDIV, /* 5E - DNML | DZ | OFL | UFL | STK */
+ FPE_FLTSUB, /* 5F - INV | DNML | DZ | OFL | UFL | STK */
+ FPE_FLTRES, /* 60 - IMP | STK */
+ FPE_FLTSUB, /* 61 - INV | IMP | STK */
+ FPE_FLTUND, /* 62 - DNML | IMP | STK */
+ FPE_FLTSUB, /* 63 - INV | DNML | IMP | STK */
+ FPE_FLTDIV, /* 64 - DZ | IMP | STK */
+ FPE_FLTSUB, /* 65 - INV | DZ | IMP | STK */
+ FPE_FLTDIV, /* 66 - DNML | DZ | IMP | STK */
+ FPE_FLTSUB, /* 67 - INV | DNML | DZ | IMP | STK */
+ FPE_FLTOVF, /* 68 - OFL | IMP | STK */
+ FPE_FLTSUB, /* 69 - INV | OFL | IMP | STK */
+ FPE_FLTUND, /* 6A - DNML | OFL | IMP | STK */
+ FPE_FLTSUB, /* 6B - INV | DNML | OFL | IMP | STK */
+ FPE_FLTDIV, /* 6C - DZ | OFL | IMP | STK */
+ FPE_FLTSUB, /* 6D - INV | DZ | OFL | IMP | STK */
+ FPE_FLTDIV, /* 6E - DNML | DZ | OFL | IMP | STK */
+ FPE_FLTSUB, /* 6F - INV | DNML | DZ | OFL | IMP | STK */
+ FPE_FLTUND, /* 70 - UFL | IMP | STK */
+ FPE_FLTSUB, /* 71 - INV | UFL | IMP | STK */
+ FPE_FLTUND, /* 72 - DNML | UFL | IMP | STK */
+ FPE_FLTSUB, /* 73 - INV | DNML | UFL | IMP | STK */
+ FPE_FLTDIV, /* 74 - DZ | UFL | IMP | STK */
+ FPE_FLTSUB, /* 75 - INV | DZ | UFL | IMP | STK */
+ FPE_FLTDIV, /* 76 - DNML | DZ | UFL | IMP | STK */
+ FPE_FLTSUB, /* 77 - INV | DNML | DZ | UFL | IMP | STK */
+ FPE_FLTOVF, /* 78 - OFL | UFL | IMP | STK */
+ FPE_FLTSUB, /* 79 - INV | OFL | UFL | IMP | STK */
+ FPE_FLTUND, /* 7A - DNML | OFL | UFL | IMP | STK */
+ FPE_FLTSUB, /* 7B - INV | DNML | OFL | UFL | IMP | STK */
+ FPE_FLTDIV, /* 7C - DZ | OFL | UFL | IMP | STK */
+ FPE_FLTSUB, /* 7D - INV | DZ | OFL | UFL | IMP | STK */
+ FPE_FLTDIV, /* 7E - DNML | DZ | OFL | UFL | IMP | STK */
+ FPE_FLTSUB, /* 7F - INV | DNML | DZ | OFL | UFL | IMP | STK */
+};
+
+/*
+ * Read the FP status and control words, then generate si_code value
+ * for SIGFPE. The error code chosen will be one of the
+ * FPE_... macros. It will be sent as the second argument to old
+ * BSD-style signal handlers and as "siginfo_t->si_code" (second
+ * argument) to SA_SIGINFO signal handlers.
+ *
+ * Some time ago, we cleared the x87 exceptions with FNCLEX there.
+ * Clearing exceptions was necessary mainly to avoid IRQ13 bugs. The
+ * usermode code which understands the FPU hardware enough to enable
+ * the exceptions can also handle clearing the exception state in the
+ * handler. The only consequence of not clearing the exception is the
+ * rethrow of the SIGFPE on return from the signal handler and
+ * reexecution of the corresponding instruction.
+ *
+ * For XMM traps, the exceptions were never cleared.
+ */
+int
+fputrap_x87(void)
+{
+ struct savefpu *pcb_save;
+ u_short control, status;
+
+ critical_enter();
+
+ /*
+ * Interrupt handling (for another interrupt) may have pushed the
+ * state to memory. Fetch the relevant parts of the state from
+ * wherever they are.
+ */
+ if (PCPU_GET(fpcurthread) != curthread) {
+ pcb_save = curpcb->pcb_save;
+ control = pcb_save->sv_env.en_cw;
+ status = pcb_save->sv_env.en_sw;
+ } else {
+ fnstcw(&control);
+ fnstsw(&status);
+ }
+
+ critical_exit();
+ return (fpetable[status & ((~control & 0x3f) | 0x40)]);
+}
+
+int
+fputrap_sse(void)
+{
+ u_int mxcsr;
+
+ critical_enter();
+ if (PCPU_GET(fpcurthread) != curthread)
+ mxcsr = curpcb->pcb_save->sv_env.en_mxcsr;
+ else
+ stmxcsr(&mxcsr);
+ critical_exit();
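+	/*
+	 * MXCSR keeps the exception flags in bits 0-5 and their mask
+	 * bits in bits 7-12, so ~mxcsr >> 7 aligns the inverted masks
+	 * with the flags and keeps only the unmasked exceptions.
+	 */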
+ return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
+}
+
+/*
+ * Implement device not available (DNA) exception
+ *
+ * It would be better to switch FP context here (if curthread != fpcurthread)
+ * and not necessarily for every context switch, but it is too hard to
+ * access foreign pcb's.
+ */
+
+static int err_count = 0;
+
+void
+fpudna(void)
+{
+
+ critical_enter();
+ if (PCPU_GET(fpcurthread) == curthread) {
+ printf("fpudna: fpcurthread == curthread %d times\n",
+ ++err_count);
+ stop_emulating();
+ critical_exit();
+ return;
+ }
+ if (PCPU_GET(fpcurthread) != NULL) {
+ printf("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
+ PCPU_GET(fpcurthread),
+ PCPU_GET(fpcurthread)->td_proc->p_pid,
+ curthread, curthread->td_proc->p_pid);
+ panic("fpudna");
+ }
+ stop_emulating();
+ /*
+ * Record new context early in case frstor causes a trap.
+ */
+ PCPU_SET(fpcurthread, curthread);
+
+ fpu_clean_state();
+
+ if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) {
+ /*
+ * This is the first time this thread has used the FPU or
+ * the PCB doesn't contain a clean FPU state. Explicitly
+ * load an initial state.
+ *
+ * We prefer to restore the state from the actual save
+ * area in PCB instead of directly loading from
+ * fpu_initialstate, to ignite the XSAVEOPT
+ * tracking engine.
+ */
+ bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
+ fpurestore(curpcb->pcb_save);
+ if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
+ fldcw(curpcb->pcb_initial_fpucw);
+ if (PCB_USER_FPU(curpcb))
+ set_pcb_flags(curpcb,
+ PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ else
+ set_pcb_flags(curpcb, PCB_FPUINITDONE);
+ } else
+ fpurestore(curpcb->pcb_save);
+ critical_exit();
+}
+
+void
+fpudrop(void)
+{
+ struct thread *td;
+
+ td = PCPU_GET(fpcurthread);
+ KASSERT(td == curthread, ("fpudrop: fpcurthread != curthread"));
+ CRITICAL_ASSERT(td);
+ PCPU_SET(fpcurthread, NULL);
+ clear_pcb_flags(td->td_pcb, PCB_FPUINITDONE);
+ start_emulating();
+}
+
+/*
+ * Get the user state of the FPU into pcb->pcb_user_save without
+ * dropping ownership (if possible). It returns the FPU ownership
+ * status.
+ */
+int
+fpugetregs(struct thread *td)
+{
+ struct pcb *pcb;
+ uint64_t *xstate_bv, bit;
+ char *sa;
+ int max_ext_n, i;
+
+ pcb = td->td_pcb;
+ if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) {
+ bcopy(fpu_initialstate, get_pcb_user_save_pcb(pcb),
+ cpu_max_ext_state_size);
+ get_pcb_user_save_pcb(pcb)->sv_env.en_cw =
+ pcb->pcb_initial_fpucw;
+ fpuuserinited(td);
+ return (_MC_FPOWNED_PCB);
+ }
+ critical_enter();
+ if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
+ fpusave(get_pcb_user_save_pcb(pcb));
+ critical_exit();
+ return (_MC_FPOWNED_FPU);
+ } else {
+ critical_exit();
+ if (use_xsaveopt) {
+ /*
+ * Handle partially saved state.
+ */
+ sa = (char *)get_pcb_user_save_pcb(pcb);
+ xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) +
+ offsetof(struct xstate_hdr, xstate_bv));
+ max_ext_n = flsl(xsave_mask);
+ for (i = 0; i < max_ext_n; i++) {
+				bit = (uint64_t)1 << i;
+ if ((*xstate_bv & bit) != 0)
+ continue;
+ bcopy((char *)fpu_initialstate +
+ xsave_area_desc[i].offset,
+ sa + xsave_area_desc[i].offset,
+ xsave_area_desc[i].size);
+ *xstate_bv |= bit;
+ }
+ }
+ return (_MC_FPOWNED_PCB);
+ }
+}
+
+void
+fpuuserinited(struct thread *td)
+{
+ struct pcb *pcb;
+
+ pcb = td->td_pcb;
+ if (PCB_USER_FPU(pcb))
+ set_pcb_flags(pcb,
+ PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ else
+ set_pcb_flags(pcb, PCB_FPUINITDONE);
+}
+
+int
+fpusetxstate(struct thread *td, char *xfpustate, size_t xfpustate_size)
+{
+ struct xstate_hdr *hdr, *ehdr;
+ size_t len, max_len;
+ uint64_t bv;
+
+ /* XXXKIB should we clear all extended state in xstate_bv instead ? */
+ if (xfpustate == NULL)
+ return (0);
+ if (!use_xsave)
+ return (EOPNOTSUPP);
+
+ len = xfpustate_size;
+ if (len < sizeof(struct xstate_hdr))
+ return (EINVAL);
+ max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
+ if (len > max_len)
+ return (EINVAL);
+
+ ehdr = (struct xstate_hdr *)xfpustate;
+ bv = ehdr->xstate_bv;
+
+ /*
+ * Avoid #gp.
+ */
+ if (bv & ~xsave_mask)
+ return (EINVAL);
+ if ((bv & (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)) !=
+ (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE))
+ return (EINVAL);
+
+ hdr = (struct xstate_hdr *)(get_pcb_user_save_td(td) + 1);
+
+ hdr->xstate_bv = bv;
+ bcopy(xfpustate + sizeof(struct xstate_hdr),
+ (char *)(hdr + 1), len - sizeof(struct xstate_hdr));
+
+ return (0);
+}
+
+/*
+ * Set the state of the FPU.
+ */
+int
+fpusetregs(struct thread *td, struct savefpu *addr, char *xfpustate,
+ size_t xfpustate_size)
+{
+ struct pcb *pcb;
+ int error;
+
+ pcb = td->td_pcb;
+ critical_enter();
+ if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
+ error = fpusetxstate(td, xfpustate, xfpustate_size);
+ if (error != 0) {
+ critical_exit();
+ return (error);
+ }
+ bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr));
+ fpurestore(get_pcb_user_save_td(td));
+ critical_exit();
+ set_pcb_flags(pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ } else {
+ critical_exit();
+ error = fpusetxstate(td, xfpustate, xfpustate_size);
+ if (error != 0)
+ return (error);
+ bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr));
+ fpuuserinited(td);
+ }
+ return (0);
+}
+
+/*
+ * On AuthenticAMD processors, the fxrstor instruction does not restore
+ * the x87's stored last instruction pointer, last data pointer, and last
+ * opcode values, except in the rare case in which the exception summary
+ * (ES) bit in the x87 status word is set to 1.
+ *
+ * In order to avoid leaking this information across processes, we clean
+ * these values by performing a dummy load before executing fxrstor().
+ */
+static void
+fpu_clean_state(void)
+{
+ static float dummy_variable = 0.0;
+ u_short status;
+
+ /*
+ * Clear the ES bit in the x87 status word if it is currently
+ * set, in order to avoid causing a fault in the upcoming load.
+ */
+ fnstsw(&status);
+ if (status & 0x80)
+ fnclex();
+
+ /*
+ * Load the dummy variable into the x87 stack. This mangles
+ * the x87 stack, but we don't care since we're about to call
+ * fxrstor() anyway.
+ */
+ __asm __volatile("ffree %%st(7); flds %0" : : "m" (dummy_variable));
+}
+
+/*
+ * This really sucks. We want the acpi version only, but it requires
+ * the isa_if.h file in order to get the definitions.
+ */
+#include "opt_isa.h"
+#ifdef DEV_ISA
+#include <isa/isavar.h>
+/*
+ * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI.
+ */
+static struct isa_pnp_id fpupnp_ids[] = {
+ { 0x040cd041, "Legacy ISA coprocessor support" }, /* PNP0C04 */
+ { 0 }
+};
+
+static int
+fpupnp_probe(device_t dev)
+{
+ int result;
+
+ result = ISA_PNP_PROBE(device_get_parent(dev), dev, fpupnp_ids);
+ if (result <= 0)
+ device_quiet(dev);
+ return (result);
+}
+
+static int
+fpupnp_attach(device_t dev)
+{
+
+ return (0);
+}
+
+static device_method_t fpupnp_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, fpupnp_probe),
+ DEVMETHOD(device_attach, fpupnp_attach),
+ DEVMETHOD(device_detach, bus_generic_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ { 0, 0 }
+};
+
+static driver_t fpupnp_driver = {
+ "fpupnp",
+ fpupnp_methods,
+ 1, /* no softc */
+};
+
+static devclass_t fpupnp_devclass;
+
+DRIVER_MODULE(fpupnp, acpi, fpupnp_driver, fpupnp_devclass, 0, 0);
+#endif /* DEV_ISA */
+
+static MALLOC_DEFINE(M_FPUKERN_CTX, "fpukern_ctx",
+ "Kernel contexts for FPU state");
+
+#define FPU_KERN_CTX_FPUINITDONE 0x01
+
+struct fpu_kern_ctx {
+ struct savefpu *prev;
+ uint32_t flags;
+ char hwstate1[];
+};
+
+struct fpu_kern_ctx *
+fpu_kern_alloc_ctx(u_int flags)
+{
+ struct fpu_kern_ctx *res;
+ size_t sz;
+
+ sz = sizeof(struct fpu_kern_ctx) + XSAVE_AREA_ALIGN +
+ cpu_max_ext_state_size;
+ res = malloc(sz, M_FPUKERN_CTX, ((flags & FPU_KERN_NOWAIT) ?
+ M_NOWAIT : M_WAITOK) | M_ZERO);
+ return (res);
+}
+
+void
+fpu_kern_free_ctx(struct fpu_kern_ctx *ctx)
+{
+
+ /* XXXKIB clear the memory ? */
+ free(ctx, M_FPUKERN_CTX);
+}
+
+static struct savefpu *
+fpu_kern_ctx_savefpu(struct fpu_kern_ctx *ctx)
+{
+ vm_offset_t p;
+
+ p = (vm_offset_t)&ctx->hwstate1;
+ p = roundup2(p, XSAVE_AREA_ALIGN);
+ return ((struct savefpu *)p);
+}
+
+int
+fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx, u_int flags)
+{
+ struct pcb *pcb;
+
+ pcb = td->td_pcb;
+ KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save ==
+ get_pcb_user_save_pcb(pcb), ("mangled pcb_save"));
+ ctx->flags = 0;
+ if ((pcb->pcb_flags & PCB_FPUINITDONE) != 0)
+ ctx->flags |= FPU_KERN_CTX_FPUINITDONE;
+ fpuexit(td);
+ ctx->prev = pcb->pcb_save;
+ pcb->pcb_save = fpu_kern_ctx_savefpu(ctx);
+ set_pcb_flags(pcb, PCB_KERNFPU);
+ clear_pcb_flags(pcb, PCB_FPUINITDONE);
+ return (0);
+}
+
+int
+fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx)
+{
+ struct pcb *pcb;
+
+ pcb = td->td_pcb;
+ critical_enter();
+ if (curthread == PCPU_GET(fpcurthread))
+ fpudrop();
+ critical_exit();
+ pcb->pcb_save = ctx->prev;
+ if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) {
+ if ((pcb->pcb_flags & PCB_USERFPUINITDONE) != 0) {
+ set_pcb_flags(pcb, PCB_FPUINITDONE);
+ clear_pcb_flags(pcb, PCB_KERNFPU);
+ } else
+ clear_pcb_flags(pcb, PCB_FPUINITDONE | PCB_KERNFPU);
+ } else {
+ if ((ctx->flags & FPU_KERN_CTX_FPUINITDONE) != 0)
+ set_pcb_flags(pcb, PCB_FPUINITDONE);
+ else
+ clear_pcb_flags(pcb, PCB_FPUINITDONE);
+ KASSERT(!PCB_USER_FPU(pcb), ("unpaired fpu_kern_leave"));
+ }
+ return (0);
+}
+
+int
+fpu_kern_thread(u_int flags)
+{
+
+ KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0,
+ ("Only kthread may use fpu_kern_thread"));
+ KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb),
+ ("mangled pcb_save"));
+ KASSERT(PCB_USER_FPU(curpcb), ("recursive call"));
+
+ set_pcb_flags(curpcb, PCB_KERNFPU);
+ return (0);
+}
+
+int
+is_fpu_kern_thread(u_int flags)
+{
+
+ if ((curthread->td_pflags & TDP_KTHREAD) == 0)
+ return (0);
+ return ((curpcb->pcb_flags & PCB_KERNFPU) != 0);
+}
+
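+/*
+ * Illustrative usage sketch (not part of the original sources): how a
+ * kernel consumer brackets FPU-using code with the context API above,
+ * modeled on in-tree users.  The function name is hypothetical.
+ */
+#ifdef notyet
+static void
+example_fpu_consumer(void)
+{
+	struct fpu_kern_ctx *ctx;
+
+	ctx = fpu_kern_alloc_ctx(FPU_KERN_NORMAL);
+	fpu_kern_enter(curthread, ctx, FPU_KERN_NORMAL);
+	/* SSE/AVX computation runs here against a private save area. */
+	fpu_kern_leave(curthread, ctx);
+	fpu_kern_free_ctx(ctx);
+}
+#endif
+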
+/*
+ * FPU save area alloc/free/init utility routines
+ */
+struct savefpu *
+fpu_save_area_alloc(void)
+{
+
+ return (uma_zalloc(fpu_save_area_zone, 0));
+}
+
+void
+fpu_save_area_free(struct savefpu *fsa)
+{
+
+ uma_zfree(fpu_save_area_zone, fsa);
+}
+
+void
+fpu_save_area_reset(struct savefpu *fsa)
+{
+
+ bcopy(fpu_initialstate, fsa, cpu_max_ext_state_size);
+}
diff --git a/sys/amd64/amd64/gdb_machdep.c b/sys/amd64/amd64/gdb_machdep.c
new file mode 100644
index 0000000..5775c8f
--- /dev/null
+++ b/sys/amd64/amd64/gdb_machdep.c
@@ -0,0 +1,117 @@
+/*-
+ * Copyright (c) 2004 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/signal.h>
+
+#include <machine/frame.h>
+#include <machine/gdb_machdep.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/trap.h>
+#include <machine/endian.h>
+
+#include <gdb/gdb.h>
+
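+/*
+ * Register numbers follow GDB's amd64 ordering: 0-15 map to
+ * rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp and r8-r15, 16 is rip and
+ * 17-19 are rflags, cs and ss.  Caller-saved registers are available
+ * only from the trapframe, and only when the queried thread is the
+ * one that entered the debugger; the callee-saved registers are taken
+ * from the pcb, where cpu_switch() stores them.
+ */
+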
+void *
+gdb_cpu_getreg(int regnum, size_t *regsz)
+{
+
+ *regsz = gdb_cpu_regsz(regnum);
+
+ if (kdb_thread == curthread) {
+ switch (regnum) {
+ case 0: return (&kdb_frame->tf_rax);
+ case 2: return (&kdb_frame->tf_rcx);
+ case 3: return (&kdb_frame->tf_rdx);
+ case 4: return (&kdb_frame->tf_rsi);
+ case 5: return (&kdb_frame->tf_rdi);
+ case 8: return (&kdb_frame->tf_r8);
+ case 9: return (&kdb_frame->tf_r9);
+ case 10: return (&kdb_frame->tf_r10);
+ case 11: return (&kdb_frame->tf_r11);
+ case 17: return (&kdb_frame->tf_rflags);
+ case 18: return (&kdb_frame->tf_cs);
+ case 19: return (&kdb_frame->tf_ss);
+ }
+ }
+ switch (regnum) {
+ case 1: return (&kdb_thrctx->pcb_rbx);
+ case 6: return (&kdb_thrctx->pcb_rbp);
+ case 7: return (&kdb_thrctx->pcb_rsp);
+ case 12: return (&kdb_thrctx->pcb_r12);
+ case 13: return (&kdb_thrctx->pcb_r13);
+ case 14: return (&kdb_thrctx->pcb_r14);
+ case 15: return (&kdb_thrctx->pcb_r15);
+ case 16: return (&kdb_thrctx->pcb_rip);
+ }
+ return (NULL);
+}
+
+void
+gdb_cpu_setreg(int regnum, void *val)
+{
+
+ switch (regnum) {
+ case GDB_REG_PC:
+ kdb_thrctx->pcb_rip = *(register_t *)val;
+ if (kdb_thread == curthread)
+ kdb_frame->tf_rip = *(register_t *)val;
+ }
+}
+
+int
+gdb_cpu_signal(int type, int code)
+{
+
+ switch (type & ~T_USER) {
+ case 0: return (SIGFPE); /* Divide by zero. */
+ case 1: return (SIGTRAP); /* Debug exception. */
+ case 3: return (SIGTRAP); /* Breakpoint. */
+ case 4: return (SIGSEGV); /* into instr. (overflow). */
+ case 5: return (SIGURG); /* bound instruction. */
+ case 6: return (SIGILL); /* Invalid opcode. */
+ case 7: return (SIGFPE); /* Coprocessor not present. */
+ case 8: return (SIGEMT); /* Double fault. */
+ case 9: return (SIGSEGV); /* Coprocessor segment overrun. */
+ case 10: return (SIGTRAP); /* Invalid TSS (also single-step). */
+ case 11: return (SIGSEGV); /* Segment not present. */
+ case 12: return (SIGSEGV); /* Stack exception. */
+ case 13: return (SIGSEGV); /* General protection. */
+ case 14: return (SIGSEGV); /* Page fault. */
+ case 16: return (SIGEMT); /* Coprocessor error. */
+ }
+ return (SIGEMT);
+}
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
new file mode 100644
index 0000000..174927a
--- /dev/null
+++ b/sys/amd64/amd64/genassym.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 1982, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kstack_pages.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/assym.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+#include <sys/errno.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/resourcevar.h>
+#include <sys/ucontext.h>
+#include <machine/tss.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <nfs/nfsproto.h>
+#include <nfsclient/nfs.h>
+#include <nfs/nfsdiskless.h>
+#include <x86/apicreg.h>
+#include <machine/cpu.h>
+#include <machine/pcb.h>
+#include <machine/sigframe.h>
+#include <machine/proc.h>
+#include <machine/segments.h>
+
+ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
+ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
+ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+
+ASSYM(P_MD, offsetof(struct proc, p_md));
+ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
+ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd));
+
+ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
+ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags));
+ASSYM(TD_PROC, offsetof(struct thread, td_proc));
+ASSYM(TD_TID, offsetof(struct thread, td_tid));
+ASSYM(TD_FRAME, offsetof(struct thread, td_frame));
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+
+ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
+ASSYM(TDP_KTHREAD, TDP_KTHREAD);
+
+ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
+ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
+ASSYM(V_INTR, offsetof(struct vmmeter, v_intr));
+ASSYM(KSTACK_PAGES, KSTACK_PAGES);
+ASSYM(PAGE_SIZE, PAGE_SIZE);
+ASSYM(NPTEPG, NPTEPG);
+ASSYM(NPDEPG, NPDEPG);
+ASSYM(addr_PTmap, addr_PTmap);
+ASSYM(addr_PDmap, addr_PDmap);
+ASSYM(addr_PDPmap, addr_PDPmap);
+ASSYM(addr_PML4map, addr_PML4map);
+ASSYM(addr_PML4pml4e, addr_PML4pml4e);
+ASSYM(PDESIZE, sizeof(pd_entry_t));
+ASSYM(PTESIZE, sizeof(pt_entry_t));
+ASSYM(PTESHIFT, PTESHIFT);
+ASSYM(PAGE_SHIFT, PAGE_SHIFT);
+ASSYM(PAGE_MASK, PAGE_MASK);
+ASSYM(PDRSHIFT, PDRSHIFT);
+ASSYM(PDPSHIFT, PDPSHIFT);
+ASSYM(PML4SHIFT, PML4SHIFT);
+ASSYM(val_KPDPI, KPDPI);
+ASSYM(val_KPML4I, KPML4I);
+ASSYM(val_PML4PML4I, PML4PML4I);
+ASSYM(USRSTACK, USRSTACK);
+ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS);
+ASSYM(KERNBASE, KERNBASE);
+ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS);
+ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS);
+ASSYM(MCLBYTES, MCLBYTES);
+
+ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15));
+ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14));
+ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13));
+ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12));
+ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp));
+ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp));
+ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx));
+ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip));
+ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase));
+ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase));
+ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase));
+ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
+ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
+ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
+ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
+ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
+ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
+ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
+ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
+ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
+ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
+ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
+ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
+ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
+ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
+ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
+ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
+ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd));
+ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
+ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
+ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu));
+ASSYM(PCB_USERFPU, sizeof(struct pcb));
+ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
+ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
+ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
+ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
+ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
+ASSYM(PCB_XSMASK, offsetof(struct pcb, pcb_xsmask));
+ASSYM(PCB_FPUSUSPEND, offsetof(struct pcb, pcb_fpususpend));
+ASSYM(PCB_SIZE, sizeof(struct pcb));
+ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
+ASSYM(PCB_DBREGS, PCB_DBREGS);
+ASSYM(PCB_GS32BIT, PCB_GS32BIT);
+ASSYM(PCB_32BIT, PCB_32BIT);
+
+ASSYM(COMMON_TSS_RSP0, offsetof(struct amd64tss, tss_rsp0));
+
+ASSYM(TF_R15, offsetof(struct trapframe, tf_r15));
+ASSYM(TF_R14, offsetof(struct trapframe, tf_r14));
+ASSYM(TF_R13, offsetof(struct trapframe, tf_r13));
+ASSYM(TF_R12, offsetof(struct trapframe, tf_r12));
+ASSYM(TF_R11, offsetof(struct trapframe, tf_r11));
+ASSYM(TF_R10, offsetof(struct trapframe, tf_r10));
+ASSYM(TF_R9, offsetof(struct trapframe, tf_r9));
+ASSYM(TF_R8, offsetof(struct trapframe, tf_r8));
+ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi));
+ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi));
+ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp));
+ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx));
+ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx));
+ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx));
+ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax));
+ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
+ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr));
+ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
+ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip));
+ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
+ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags));
+ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp));
+ASSYM(TF_SS, offsetof(struct trapframe, tf_ss));
+ASSYM(TF_DS, offsetof(struct trapframe, tf_ds));
+ASSYM(TF_ES, offsetof(struct trapframe, tf_es));
+ASSYM(TF_FS, offsetof(struct trapframe, tf_fs));
+ASSYM(TF_GS, offsetof(struct trapframe, tf_gs));
+ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags));
+ASSYM(TF_SIZE, sizeof(struct trapframe));
+ASSYM(TF_HASSEGS, TF_HASSEGS);
+
+ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
+ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
+ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
+ASSYM(ENOENT, ENOENT);
+ASSYM(EFAULT, EFAULT);
+ASSYM(ENAMETOOLONG, ENAMETOOLONG);
+ASSYM(MAXCOMLEN, MAXCOMLEN);
+ASSYM(MAXPATHLEN, MAXPATHLEN);
+ASSYM(PC_SIZEOF, sizeof(struct pcpu));
+ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
+ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
+ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
+ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
+ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
+ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
+ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
+ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
+ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
+ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p));
+ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p));
+ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
+ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
+ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+
+ASSYM(LA_VER, offsetof(struct LAPIC, version));
+ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
+ASSYM(LA_EOI, offsetof(struct LAPIC, eoi));
+ASSYM(LA_SVR, offsetof(struct LAPIC, svr));
+ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo));
+ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi));
+ASSYM(LA_ISR, offsetof(struct LAPIC, isr0));
+
+ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL));
+ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL));
+ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL));
+ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL));
+ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL));
+ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL));
+ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL));
+ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL));
+ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL));
+ASSYM(SEL_RPL_MASK, SEL_RPL_MASK);
+
+#ifdef HWPMC_HOOKS
+ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
+#endif
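+
+/*
+ * Illustrative note (not part of the original sources): this file is
+ * compiled and post-processed into assym.s, where each ASSYM() entry
+ * becomes a plain assembler constant, roughly:
+ *
+ *	#define	PCB_RBX	0x30
+ *
+ * (the 0x30 offset here is hypothetical; the real value depends on
+ * the pcb layout).  Assembly such as cpu_switch.S then uses the
+ * constant directly, e.g.:
+ *
+ *	movq	%rbx,PCB_RBX(%r8)
+ */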
diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c
new file mode 100644
index 0000000..2517498
--- /dev/null
+++ b/sys/amd64/amd64/identcpu.c
@@ -0,0 +1,688 @@
+/*-
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * Copyright (c) 1997 KATO Takenori.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Id: machdep.c,v 1.193 1996/06/18 01:22:04 bde Exp
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_cpu.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/power.h>
+
+#include <machine/asmacros.h>
+#include <machine/clock.h>
+#include <machine/cputypes.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+
+#include <x86/isa/icu.h>
+
+/* XXX - should be in header file: */
+void printcpuinfo(void);
+void identify_cpu(void);
+void earlysetcpuclass(void);
+void panicifcpuunsupported(void);
+
+static u_int find_cpu_vendor_id(void);
+static void print_AMD_info(void);
+static void print_AMD_assoc(int i);
+static void print_via_padlock_info(void);
+
+int cpu_class;
+char machine[] = "amd64";
+
+#ifdef SCTL_MASK32
+extern int adaptive_machine_arch;
+#endif
+
+static int
+sysctl_hw_machine(SYSCTL_HANDLER_ARGS)
+{
+#ifdef SCTL_MASK32
+ static const char machine32[] = "i386";
+#endif
+ int error;
+
+#ifdef SCTL_MASK32
+ if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch)
+ error = SYSCTL_OUT(req, machine32, sizeof(machine32));
+ else
+#endif
+ error = SYSCTL_OUT(req, machine, sizeof(machine));
+ return (error);
+}
+SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_hw_machine, "A", "Machine class");
+
+static char cpu_model[128];
+SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD,
+ cpu_model, 0, "Machine model");
+
+static int hw_clockrate;
+SYSCTL_INT(_hw, OID_AUTO, clockrate, CTLFLAG_RD,
+ &hw_clockrate, 0, "CPU instruction clock rate");
+
+static eventhandler_tag tsc_post_tag;
+
+static char cpu_brand[48];
+
+static struct {
+ char *cpu_name;
+ int cpu_class;
+} amd64_cpus[] = {
+ { "Clawhammer", CPUCLASS_K8 }, /* CPU_CLAWHAMMER */
+ { "Sledgehammer", CPUCLASS_K8 }, /* CPU_SLEDGEHAMMER */
+};
+
+static struct {
+ char *vendor;
+ u_int vendor_id;
+} cpu_vendors[] = {
+ { INTEL_VENDOR_ID, CPU_VENDOR_INTEL }, /* GenuineIntel */
+ { AMD_VENDOR_ID, CPU_VENDOR_AMD }, /* AuthenticAMD */
+ { CENTAUR_VENDOR_ID, CPU_VENDOR_CENTAUR }, /* CentaurHauls */
+};
+
+void
+printcpuinfo(void)
+{
+ u_int regs[4], i;
+ char *brand;
+
+ cpu_class = amd64_cpus[cpu].cpu_class;
+ printf("CPU: ");
+ strncpy(cpu_model, amd64_cpus[cpu].cpu_name, sizeof (cpu_model));
+
+ /* Check for extended CPUID information and a processor name. */
+ if (cpu_exthigh >= 0x80000004) {
+ brand = cpu_brand;
+ for (i = 0x80000002; i < 0x80000005; i++) {
+ do_cpuid(i, regs);
+ memcpy(brand, regs, sizeof(regs));
+ brand += sizeof(regs);
+ }
+ }
+
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_INTEL:
+ /* Please make up your mind folks! */
+ strcat(cpu_model, "EM64T");
+ break;
+ case CPU_VENDOR_AMD:
+ /*
+ * Values taken from AMD Processor Recognition
+ * http://www.amd.com/K6/k6docs/pdf/20734g.pdf
+ * (also describes ``Features'' encodings).
+ */
+ strcpy(cpu_model, "AMD ");
+ if ((cpu_id & 0xf00) == 0xf00)
+ strcat(cpu_model, "AMD64 Processor");
+ else
+ strcat(cpu_model, "Unknown");
+ break;
+ case CPU_VENDOR_CENTAUR:
+ strcpy(cpu_model, "VIA ");
+ if ((cpu_id & 0xff0) == 0x6f0)
+ strcat(cpu_model, "Nano Processor");
+ else
+ strcat(cpu_model, "Unknown");
+ break;
+ default:
+ strcat(cpu_model, "Unknown");
+ break;
+ }
+
+ /*
+ * Replace cpu_model with cpu_brand minus leading spaces if
+ * we have one.
+ */
+ brand = cpu_brand;
+ while (*brand == ' ')
+ ++brand;
+ if (*brand != '\0')
+ strcpy(cpu_model, brand);
+
+ printf("%s (", cpu_model);
+ switch(cpu_class) {
+ case CPUCLASS_K8:
+ if (tsc_freq != 0) {
+ hw_clockrate = (tsc_freq + 5000) / 1000000;
+ printf("%jd.%02d-MHz ",
+ (intmax_t)(tsc_freq + 4999) / 1000000,
+ (u_int)((tsc_freq + 4999) / 10000) % 100);
+ }
+ printf("K8");
+ break;
+ default:
+ printf("Unknown"); /* will panic below... */
+ }
+ printf("-class CPU)\n");
+ if (*cpu_vendor)
+ printf(" Origin = \"%s\"", cpu_vendor);
+ if (cpu_id)
+ printf(" Id = 0x%x", cpu_id);
+
+ if (cpu_vendor_id == CPU_VENDOR_INTEL ||
+ cpu_vendor_id == CPU_VENDOR_AMD ||
+ cpu_vendor_id == CPU_VENDOR_CENTAUR) {
+ printf(" Family = 0x%x", CPUID_TO_FAMILY(cpu_id));
+ printf(" Model = 0x%x", CPUID_TO_MODEL(cpu_id));
+ printf(" Stepping = %u", cpu_id & CPUID_STEPPING);
+
+ /*
+ * AMD CPUID Specification
+ * http://support.amd.com/us/Embedded_TechDocs/25481.pdf
+ *
+ * Intel Processor Identification and CPUID Instruction
+ * http://www.intel.com/assets/pdf/appnote/241618.pdf
+ */
+ if (cpu_high > 0) {
+
+ /*
+ * Here we should probably set up flags indicating
+ * whether or not various features are available.
+ * The interesting ones are probably VME, PSE, PAE,
+ * and PGE. The code already assumes without bothering
+ * to check that all CPUs >= Pentium have a TSC and
+ * MSRs.
+ */
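+			/*
+			 * Note on the %b strings below: the leading
+			 * "\020" (decimal 16) selects hex output for
+			 * the value, and each "\<n>NAME" sequence
+			 * prints NAME when the n-th (1-based) bit is
+			 * set.  For example, cpu_feature == 0x3 would
+			 * print "0x3<FPU,VME>".
+			 */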
+ printf("\n Features=0x%b", cpu_feature,
+ "\020"
+ "\001FPU" /* Integral FPU */
+ "\002VME" /* Extended VM86 mode support */
+ "\003DE" /* Debugging Extensions (CR4.DE) */
+ "\004PSE" /* 4MByte page tables */
+ "\005TSC" /* Timestamp counter */
+ "\006MSR" /* Machine specific registers */
+ "\007PAE" /* Physical address extension */
+ "\010MCE" /* Machine Check support */
+ "\011CX8" /* CMPEXCH8 instruction */
+ "\012APIC" /* SMP local APIC */
+ "\013oldMTRR" /* Previous implementation of MTRR */
+ "\014SEP" /* Fast System Call */
+ "\015MTRR" /* Memory Type Range Registers */
+ "\016PGE" /* PG_G (global bit) support */
+ "\017MCA" /* Machine Check Architecture */
+ "\020CMOV" /* CMOV instruction */
+ "\021PAT" /* Page attributes table */
+ "\022PSE36" /* 36 bit address space support */
+ "\023PN" /* Processor Serial number */
+ "\024CLFLUSH" /* Has the CLFLUSH instruction */
+ "\025<b20>"
+ "\026DTS" /* Debug Trace Store */
+ "\027ACPI" /* ACPI support */
+ "\030MMX" /* MMX instructions */
+ "\031FXSR" /* FXSAVE/FXRSTOR */
+ "\032SSE" /* Streaming SIMD Extensions */
+ "\033SSE2" /* Streaming SIMD Extensions #2 */
+ "\034SS" /* Self snoop */
+ "\035HTT" /* Hyperthreading (see EBX bit 16-23) */
+ "\036TM" /* Thermal Monitor clock slowdown */
+ "\037IA64" /* CPU can execute IA64 instructions */
+ "\040PBE" /* Pending Break Enable */
+ );
+
+ if (cpu_feature2 != 0) {
+ printf("\n Features2=0x%b", cpu_feature2,
+ "\020"
+ "\001SSE3" /* SSE3 */
+ "\002PCLMULQDQ" /* Carry-Less Mul Quadword */
+ "\003DTES64" /* 64-bit Debug Trace */
+ "\004MON" /* MONITOR/MWAIT Instructions */
+ "\005DS_CPL" /* CPL Qualified Debug Store */
+ "\006VMX" /* Virtual Machine Extensions */
+ "\007SMX" /* Safer Mode Extensions */
+ "\010EST" /* Enhanced SpeedStep */
+ "\011TM2" /* Thermal Monitor 2 */
+ "\012SSSE3" /* SSSE3 */
+ "\013CNXT-ID" /* L1 context ID available */
+ "\014<b11>"
+ "\015FMA" /* Fused Multiply Add */
+ "\016CX16" /* CMPXCHG16B Instruction */
+ "\017xTPR" /* Send Task Priority Messages*/
+ "\020PDCM" /* Perf/Debug Capability MSR */
+ "\021<b16>"
+ "\022PCID" /* Process-context Identifiers*/
+ "\023DCA" /* Direct Cache Access */
+ "\024SSE4.1" /* SSE 4.1 */
+ "\025SSE4.2" /* SSE 4.2 */
+ "\026x2APIC" /* xAPIC Extensions */
+ "\027MOVBE" /* MOVBE Instruction */
+ "\030POPCNT" /* POPCNT Instruction */
+ "\031TSCDLT" /* TSC-Deadline Timer */
+ "\032AESNI" /* AES Crypto */
+ "\033XSAVE" /* XSAVE/XRSTOR States */
+ "\034OSXSAVE" /* OS-Enabled State Management*/
+ "\035AVX" /* Advanced Vector Extensions */
+ "\036F16C" /* Half-precision conversions */
+ "\037RDRAND" /* RDRAND Instruction */
+ "\040HV" /* Hypervisor */
+ );
+ }
+
+ if (amd_feature != 0) {
+ printf("\n AMD Features=0x%b", amd_feature,
+ "\020" /* in hex */
+ "\001<s0>" /* Same */
+ "\002<s1>" /* Same */
+ "\003<s2>" /* Same */
+ "\004<s3>" /* Same */
+ "\005<s4>" /* Same */
+ "\006<s5>" /* Same */
+ "\007<s6>" /* Same */
+ "\010<s7>" /* Same */
+ "\011<s8>" /* Same */
+ "\012<s9>" /* Same */
+ "\013<b10>" /* Undefined */
+ "\014SYSCALL" /* Have SYSCALL/SYSRET */
+ "\015<s12>" /* Same */
+ "\016<s13>" /* Same */
+ "\017<s14>" /* Same */
+ "\020<s15>" /* Same */
+ "\021<s16>" /* Same */
+ "\022<s17>" /* Same */
+ "\023<b18>" /* Reserved, unknown */
+ "\024MP" /* Multiprocessor Capable */
+ "\025NX" /* Has EFER.NXE, NX */
+ "\026<b21>" /* Undefined */
+ "\027MMX+" /* AMD MMX Extensions */
+ "\030<s23>" /* Same */
+ "\031<s24>" /* Same */
+ "\032FFXSR" /* Fast FXSAVE/FXRSTOR */
+ "\033Page1GB" /* 1-GB large page support */
+ "\034RDTSCP" /* RDTSCP */
+ "\035<b28>" /* Undefined */
+ "\036LM" /* 64 bit long mode */
+ "\0373DNow!+" /* AMD 3DNow! Extensions */
+ "\0403DNow!" /* AMD 3DNow! */
+ );
+ }
+
+ if (amd_feature2 != 0) {
+ printf("\n AMD Features2=0x%b", amd_feature2,
+ "\020"
+ "\001LAHF" /* LAHF/SAHF in long mode */
+ "\002CMP" /* CMP legacy */
+ "\003SVM" /* Secure Virtual Mode */
+ "\004ExtAPIC" /* Extended APIC register */
+ "\005CR8" /* CR8 in legacy mode */
+ "\006ABM" /* LZCNT instruction */
+ "\007SSE4A" /* SSE4A */
+ "\010MAS" /* Misaligned SSE mode */
+ "\011Prefetch" /* 3DNow! Prefetch/PrefetchW */
+ "\012OSVW" /* OS visible workaround */
+ "\013IBS" /* Instruction based sampling */
+ "\014XOP" /* XOP extended instructions */
+ "\015SKINIT" /* SKINIT/STGI */
+ "\016WDT" /* Watchdog timer */
+ "\017<b14>"
+ "\020LWP" /* Lightweight Profiling */
+ "\021FMA4" /* 4-operand FMA instructions */
+ "\022<b17>"
+ "\023<b18>"
+ "\024NodeId" /* NodeId MSR support */
+ "\025<b20>"
+ "\026TBM" /* Trailing Bit Manipulation */
+ "\027Topology" /* Topology Extensions */
+ "\030<b23>"
+ "\031<b24>"
+ "\032<b25>"
+ "\033<b26>"
+ "\034<b27>"
+ "\035<b28>"
+ "\036<b29>"
+ "\037<b30>"
+ "\040<b31>"
+ );
+ }
+
+ if (cpu_stdext_feature != 0) {
+ printf("\n Standard Extended Features=0x%b",
+ cpu_stdext_feature,
+ "\020"
+ "\001GSFSBASE"
+ "\002TSCADJ"
+ "\010SMEP"
+ "\012ENHMOVSB"
+ "\013INVPCID"
+ );
+ }
+
+ if (via_feature_rng != 0 || via_feature_xcrypt != 0)
+ print_via_padlock_info();
+
+ if ((cpu_feature & CPUID_HTT) &&
+ cpu_vendor_id == CPU_VENDOR_AMD)
+ cpu_feature &= ~CPUID_HTT;
+
+ /*
+ * If this CPU supports P-state invariant TSC then
+ * mention the capability.
+ */
+ if (tsc_is_invariant) {
+ printf("\n TSC: P-state invariant");
+ if (tsc_perf_stat)
+ printf(", performance statistics");
+ }
+
+ }
+ }
+ /* Avoid ugly blank lines: only print newline when we have to. */
+ if (*cpu_vendor || cpu_id)
+ printf("\n");
+
+ if (!bootverbose)
+ return;
+
+ if (cpu_vendor_id == CPU_VENDOR_AMD)
+ print_AMD_info();
+}
+
+void
+panicifcpuunsupported(void)
+{
+
+#ifndef HAMMER
+#error "You need to specify a cpu type"
+#endif
+ /*
+ * Now that we have told the user what they have,
+ * let them know if that machine type isn't configured.
+ */
+ switch (cpu_class) {
+ case CPUCLASS_X86:
+#ifndef HAMMER
+ case CPUCLASS_K8:
+#endif
+ panic("CPU class not configured");
+ default:
+ break;
+ }
+}
+
+/* Update TSC freq with the value indicated by the caller. */
+static void
+tsc_freq_changed(void *arg __unused, const struct cf_level *level, int status)
+{
+
+ /* If there was an error during the transition, don't do anything. */
+ if (status != 0)
+ return;
+
+ /* Total setting for this level gives the new frequency in MHz. */
+ hw_clockrate = level->total_set.freq;
+}
+
+static void
+hook_tsc_freq(void *arg __unused)
+{
+
+ if (tsc_is_invariant)
+ return;
+
+ tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
+ tsc_freq_changed, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL);
+
+/*
+ * Final stage of CPU identification.
+ */
+void
+identify_cpu(void)
+{
+ u_int regs[4], cpu_stdext_disable;
+
+ do_cpuid(0, regs);
+ cpu_high = regs[0];
+ ((u_int *)&cpu_vendor)[0] = regs[1];
+ ((u_int *)&cpu_vendor)[1] = regs[3];
+ ((u_int *)&cpu_vendor)[2] = regs[2];
+ cpu_vendor[12] = '\0';
+ cpu_vendor_id = find_cpu_vendor_id();
+
+ do_cpuid(1, regs);
+ cpu_id = regs[0];
+ cpu_procinfo = regs[1];
+ cpu_feature = regs[3];
+ cpu_feature2 = regs[2];
+
+ /*
+ * Clear "Limit CPUID Maxval" bit and get the largest standard CPUID
+ * function number again if it is set from BIOS. It is necessary
+ * for probing correct CPU topology later.
+ * XXX This is only done on the BSP package.
+ */
+ if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high > 0 && cpu_high < 4) {
+ uint64_t msr;
+ msr = rdmsr(MSR_IA32_MISC_ENABLE);
+ if ((msr & 0x400000ULL) != 0) {
+ wrmsr(MSR_IA32_MISC_ENABLE, msr & ~0x400000ULL);
+ do_cpuid(0, regs);
+ cpu_high = regs[0];
+ }
+ }
+
+ if (cpu_high >= 7) {
+ cpuid_count(7, 0, regs);
+ cpu_stdext_feature = regs[1];
+
+ /*
+ * Some hypervisors fail to filter out unsupported
+ * extended features. For now, disable the
+ * extensions, activation of which requires setting a
+ * bit in CR4, and which VM monitors do not support.
+ */
+ if (cpu_feature2 & CPUID2_HV) {
+ cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
+ CPUID_STDEXT_SMEP;
+ } else
+ cpu_stdext_disable = 0;
+ TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+ cpu_stdext_feature &= ~cpu_stdext_disable;
+ }
+
+ if (cpu_vendor_id == CPU_VENDOR_INTEL ||
+ cpu_vendor_id == CPU_VENDOR_AMD ||
+ cpu_vendor_id == CPU_VENDOR_CENTAUR) {
+ do_cpuid(0x80000000, regs);
+ cpu_exthigh = regs[0];
+ }
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ amd_feature = regs[3] & ~(cpu_feature & 0x0183f3ff);
+ amd_feature2 = regs[2];
+ }
+ if (cpu_exthigh >= 0x80000007) {
+ do_cpuid(0x80000007, regs);
+ amd_pminfo = regs[3];
+ }
+ if (cpu_exthigh >= 0x80000008) {
+ do_cpuid(0x80000008, regs);
+ cpu_procinfo2 = regs[2];
+ }
+
+ /* XXX */
+ cpu = CPU_CLAWHAMMER;
+}
+
+static u_int
+find_cpu_vendor_id(void)
+{
+ int i;
+
+ for (i = 0; i < sizeof(cpu_vendors) / sizeof(cpu_vendors[0]); i++)
+ if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0)
+ return (cpu_vendors[i].vendor_id);
+ return (0);
+}
+
+static void
+print_AMD_assoc(int i)
+{
+ if (i == 255)
+ printf(", fully associative\n");
+ else
+ printf(", %d-way associative\n", i);
+}
+
+static void
+print_AMD_l2_assoc(int i)
+{
+ switch (i & 0x0f) {
+ case 0: printf(", disabled/not present\n"); break;
+ case 1: printf(", direct mapped\n"); break;
+ case 2: printf(", 2-way associative\n"); break;
+ case 4: printf(", 4-way associative\n"); break;
+ case 6: printf(", 8-way associative\n"); break;
+ case 8: printf(", 16-way associative\n"); break;
+ case 15: printf(", fully associative\n"); break;
+ default: printf(", reserved configuration\n"); break;
+ }
+}
+
+static void
+print_AMD_info(void)
+{
+ u_int regs[4];
+
+ if (cpu_exthigh < 0x80000005)
+ return;
+
+ do_cpuid(0x80000005, regs);
+ printf("L1 2MB data TLB: %d entries", (regs[0] >> 16) & 0xff);
+ print_AMD_assoc(regs[0] >> 24);
+
+ printf("L1 2MB instruction TLB: %d entries", regs[0] & 0xff);
+ print_AMD_assoc((regs[0] >> 8) & 0xff);
+
+ printf("L1 4KB data TLB: %d entries", (regs[1] >> 16) & 0xff);
+ print_AMD_assoc(regs[1] >> 24);
+
+ printf("L1 4KB instruction TLB: %d entries", regs[1] & 0xff);
+ print_AMD_assoc((regs[1] >> 8) & 0xff);
+
+ printf("L1 data cache: %d kbytes", regs[2] >> 24);
+ printf(", %d bytes/line", regs[2] & 0xff);
+ printf(", %d lines/tag", (regs[2] >> 8) & 0xff);
+ print_AMD_assoc((regs[2] >> 16) & 0xff);
+
+ printf("L1 instruction cache: %d kbytes", regs[3] >> 24);
+ printf(", %d bytes/line", regs[3] & 0xff);
+ printf(", %d lines/tag", (regs[3] >> 8) & 0xff);
+ print_AMD_assoc((regs[3] >> 16) & 0xff);
+
+ if (cpu_exthigh >= 0x80000006) {
+ do_cpuid(0x80000006, regs);
+ if ((regs[0] >> 16) != 0) {
+ printf("L2 2MB data TLB: %d entries",
+ (regs[0] >> 16) & 0xfff);
+ print_AMD_l2_assoc(regs[0] >> 28);
+ printf("L2 2MB instruction TLB: %d entries",
+ regs[0] & 0xfff);
+ print_AMD_l2_assoc((regs[0] >> 28) & 0xf);
+ } else {
+ printf("L2 2MB unified TLB: %d entries",
+ regs[0] & 0xfff);
+ print_AMD_l2_assoc((regs[0] >> 28) & 0xf);
+ }
+ if ((regs[1] >> 16) != 0) {
+ printf("L2 4KB data TLB: %d entries",
+ (regs[1] >> 16) & 0xfff);
+ print_AMD_l2_assoc(regs[1] >> 28);
+
+ printf("L2 4KB instruction TLB: %d entries",
+ regs[1] & 0xfff);
+ print_AMD_l2_assoc((regs[1] >> 28) & 0xf);
+ } else {
+ printf("L2 4KB unified TLB: %d entries",
+ regs[1] & 0xfff);
+ print_AMD_l2_assoc((regs[1] >> 28) & 0xf);
+ }
+ printf("L2 unified cache: %d kbytes", regs[2] >> 16);
+ printf(", %d bytes/line", regs[2] & 0xff);
+ printf(", %d lines/tag", (regs[2] >> 8) & 0x0f);
+ print_AMD_l2_assoc((regs[2] >> 12) & 0x0f);
+ }
+
+ /*
+ * Opteron Rev E has a bug that, on very rare occasions, causes a read
+ * memory barrier not to be performed as expected if it is followed by
+ * a non-atomic read-modify-write instruction.
+ * As long as that bug surfaces only rarely (intensive machine usage
+ * on other operating systems generally produces one unexplainable
+ * crash about every 2 months) and as long as a model-specific fix
+ * would be impractical at this stage, print a warning if the broken
+ * model and family are identified.
+ */
+ if (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x20 &&
+ CPUID_TO_MODEL(cpu_id) <= 0x3f)
+ printf("WARNING: This architecture revision has known SMP "
+ "hardware bugs which may cause random instability\n");
+}
+
+static void
+print_via_padlock_info(void)
+{
+ u_int regs[4];
+
+ do_cpuid(0xc0000001, regs);
+ printf("\n VIA Padlock Features=0x%b", regs[3],
+ "\020"
+ "\003RNG" /* RNG */
+ "\007AES" /* ACE */
+ "\011AES-CTR" /* ACE2 */
+ "\013SHA1,SHA256" /* PHE */
+ "\015RSA" /* PMM */
+ );
+}
diff --git a/sys/amd64/amd64/in_cksum.c b/sys/amd64/amd64/in_cksum.c
new file mode 100644
index 0000000..ae02e91
--- /dev/null
+++ b/sys/amd64/amd64/in_cksum.c
@@ -0,0 +1,241 @@
+/* $NetBSD: in_cksum.c,v 1.7 1997/09/02 13:18:15 thorpej Exp $ */
+
+/*-
+ * Copyright (c) 1988, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1996
+ * Matt Thomas <matt@3am-software.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+#include <sys/systm.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <machine/in_cksum.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers
+ * (Portable Alpha version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
+#define REDUCE32 \
+ { \
+ q_util.q = sum; \
+ sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+ }
+#define REDUCE16 \
+ { \
+ q_util.q = sum; \
+ l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+ sum = l_util.s[0] + l_util.s[1]; \
+ ADDCARRY(sum); \
+ }
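+
+/*
+ * Worked example (illustrative): folding the 64-bit accumulator
+ * 0x123456789 with REDUCE16 on a little-endian machine:
+ *
+ *	q_util.s[] = { 0x6789, 0x2345, 0x0001, 0x0000 }
+ *	l_util.l   = 0x6789 + 0x2345 + 0x0001 + 0x0000 = 0x8acf
+ *	sum        = 0x8acf + 0x0000 = 0x8acf
+ *
+ * No carry remains, so ADDCARRY() leaves 0x8acf as the folded
+ * 16-bit sum.
+ */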
+
+static const u_int32_t in_masks[] = {
+ /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
+ 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
+ 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
+ 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
+ 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
+};
+
+union l_util {
+ u_int16_t s[2];
+ u_int32_t l;
+};
+union q_util {
+ u_int16_t s[4];
+ u_int32_t l[2];
+ u_int64_t q;
+};
+
+static u_int64_t
+in_cksumdata(const void *buf, int len)
+{
+ const u_int32_t *lw = (const u_int32_t *) buf;
+ u_int64_t sum = 0;
+ u_int64_t prefilled;
+ int offset;
+ union q_util q_util;
+
+ if ((3 & (long) lw) == 0 && len == 20) {
+ sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
+ REDUCE32;
+ return sum;
+ }
+
+ if ((offset = 3 & (long) lw) != 0) {
+ const u_int32_t *masks = in_masks + (offset << 2);
+ lw = (u_int32_t *) (((long) lw) - offset);
+ sum = *lw++ & masks[len >= 3 ? 3 : len];
+ len -= 4 - offset;
+ if (len <= 0) {
+ REDUCE32;
+ return sum;
+ }
+ }
+#if 0
+ /*
+ * Force to cache line boundary.
+ */
+ offset = 32 - (0x1f & (long) lw);
+ if (offset < 32 && len > offset) {
+ len -= offset;
+ if (4 & offset) {
+ sum += (u_int64_t) lw[0];
+ lw += 1;
+ }
+ if (8 & offset) {
+ sum += (u_int64_t) lw[0] + lw[1];
+ lw += 2;
+ }
+ if (16 & offset) {
+ sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
+ lw += 4;
+ }
+ }
+#endif
+ /*
+ * Touch the next cache line early ("prefill") so its load starts
+ * while the current cache line is being summed, and carry the
+ * prefetched word into the next loop iteration.
+ */
+ prefilled = lw[0];
+ while ((len -= 32) >= 4) {
+ u_int64_t prefilling = lw[8];
+ sum += prefilled + lw[1] + lw[2] + lw[3]
+ + lw[4] + lw[5] + lw[6] + lw[7];
+ lw += 8;
+ prefilled = prefilling;
+ }
+ if (len >= 0) {
+ sum += prefilled + lw[1] + lw[2] + lw[3]
+ + lw[4] + lw[5] + lw[6] + lw[7];
+ lw += 8;
+ } else {
+ len += 32;
+ }
+ while ((len -= 16) >= 0) {
+ sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
+ lw += 4;
+ }
+ len += 16;
+ while ((len -= 4) >= 0) {
+ sum += (u_int64_t) *lw++;
+ }
+ len += 4;
+ if (len > 0)
+ sum += (u_int64_t) (in_masks[len] & *lw);
+ REDUCE32;
+ return sum;
+}
+
+u_short
+in_addword(u_short a, u_short b)
+{
+ u_int64_t sum = a + b;
+
+ ADDCARRY(sum);
+ return (sum);
+}
+
+u_short
+in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c)
+{
+ u_int64_t sum;
+ union q_util q_util;
+ union l_util l_util;
+
+ sum = (u_int64_t) a + b + c;
+ REDUCE16;
+ return (sum);
+}
+
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+ u_int64_t sum = 0;
+ int mlen = 0;
+ int clen = 0;
+ caddr_t addr;
+ union q_util q_util;
+ union l_util l_util;
+
+ len -= skip;
+ for (; skip && m; m = m->m_next) {
+ if (m->m_len > skip) {
+ mlen = m->m_len - skip;
+ addr = mtod(m, caddr_t) + skip;
+ goto skip_start;
+ } else {
+ skip -= m->m_len;
+ }
+ }
+
+ for (; m && len; m = m->m_next) {
+ if (m->m_len == 0)
+ continue;
+ mlen = m->m_len;
+ addr = mtod(m, caddr_t);
+skip_start:
+ if (len < mlen)
+ mlen = len;
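+		/*
+		 * in_cksumdata() sums 32-bit words from an aligned
+		 * base.  When the parity of the byte count summed so
+		 * far differs from the parity of this buffer's
+		 * address, the partial sum must be byte-swapped; in
+		 * one's-complement arithmetic, shifting the folded
+		 * sum left by 8 does exactly that.
+		 */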
+ if ((clen ^ (long) addr) & 1)
+ sum += in_cksumdata(addr, mlen) << 8;
+ else
+ sum += in_cksumdata(addr, mlen);
+
+ clen += mlen;
+ len -= mlen;
+ }
+ REDUCE16;
+ return (~sum & 0xffff);
+}
+
+u_int
+in_cksum_hdr(const struct ip *ip)
+{
+ u_int64_t sum = in_cksumdata(ip, sizeof(struct ip));
+ union q_util q_util;
+ union l_util l_util;
+ REDUCE16;
+ return (~sum & 0xffff);
+}
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
new file mode 100644
index 0000000..4abed4c
--- /dev/null
+++ b/sys/amd64/amd64/initcpu.c
@@ -0,0 +1,217 @@
+/*-
+ * Copyright (c) KATO Takenori, 1997, 1998.
+ *
+ * All rights reserved. Unpublished rights reserved under the copyright
+ * laws of Japan.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_cpu.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+static int hw_instruction_sse;
+SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
+ &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
+/*
+ * -1: automatic (default)
+ * 0: keep CLFLUSH enabled
+ * 1: force CLFLUSH disabled
+ */
+static int hw_clflush_disable = -1;
+
+int cpu; /* Are we 386, 386sx, 486, etc? */
+u_int cpu_feature; /* Feature flags */
+u_int cpu_feature2; /* Feature flags */
+u_int amd_feature; /* AMD feature flags */
+u_int amd_feature2; /* AMD feature flags */
+u_int amd_pminfo; /* AMD advanced power management info */
+u_int via_feature_rng; /* VIA RNG features */
+u_int via_feature_xcrypt; /* VIA ACE features */
+u_int cpu_high; /* Highest arg to CPUID */
+u_int cpu_exthigh; /* Highest arg to extended CPUID */
+u_int cpu_id; /* Stepping ID */
+u_int cpu_procinfo; /* HyperThreading Info / Brand Index / CLFLUSH */
+u_int cpu_procinfo2; /* Multicore info */
+char cpu_vendor[20]; /* CPU Origin code */
+u_int cpu_vendor_id; /* CPU vendor ID */
+u_int cpu_fxsr; /* SSE enabled */
+u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
+u_int cpu_clflush_line_size = 32;
+u_int cpu_stdext_feature;
+u_int cpu_max_ext_state_size;
+
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
+ &via_feature_rng, 0, "VIA RNG feature available in CPU");
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD,
+ &via_feature_xcrypt, 0, "VIA xcrypt feature available in CPU");
+
+static void
+init_amd(void)
+{
+
+ /*
+ * Work around Erratum 721 for Family 10h and 12h processors.
+ * These processors may incorrectly update the stack pointer
+ * after a long series of push and/or near-call instructions,
+ * or a long series of pop and/or near-return instructions.
+ *
+ * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf
+ * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf
+ *
+ * Hypervisors do not provide access to the errata MSR,
+ * causing #GP exception on attempt to apply the errata. The
+ * MSR write shall be done on host and persist globally
+ * anyway, so do not try to do it when under virtualization.
+ */
+ switch (CPUID_TO_FAMILY(cpu_id)) {
+ case 0x10:
+ case 0x12:
+ if ((cpu_feature2 & CPUID2_HV) == 0)
+ wrmsr(0xc0011029, rdmsr(0xc0011029) | 1);
+ break;
+ }
+}
+
+/*
+ * Initialize special VIA features
+ */
+static void
+init_via(void)
+{
+ u_int regs[4], val;
+
+ /*
+ * Check extended CPUID for PadLock features.
+ *
+ * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf
+ */
+ do_cpuid(0xc0000000, regs);
+ if (regs[0] >= 0xc0000001) {
+ do_cpuid(0xc0000001, regs);
+ val = regs[3];
+ } else
+ return;
+
+ /* Enable RNG if present. */
+ if ((val & VIA_CPUID_HAS_RNG) != 0) {
+ via_feature_rng = VIA_HAS_RNG;
+ wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG);
+ }
+
+ /* Enable PadLock if present. */
+ if ((val & VIA_CPUID_HAS_ACE) != 0)
+ via_feature_xcrypt |= VIA_HAS_AES;
+ if ((val & VIA_CPUID_HAS_ACE2) != 0)
+ via_feature_xcrypt |= VIA_HAS_AESCTR;
+ if ((val & VIA_CPUID_HAS_PHE) != 0)
+ via_feature_xcrypt |= VIA_HAS_SHA;
+ if ((val & VIA_CPUID_HAS_PMM) != 0)
+ via_feature_xcrypt |= VIA_HAS_MM;
+ if (via_feature_xcrypt != 0)
+ wrmsr(0x1107, rdmsr(0x1107) | (1 << 28));
+}
+
+/*
+ * Initialize CPU control registers
+ */
+void
+initializecpu(void)
+{
+ uint64_t msr;
+ uint32_t cr4;
+
+ cr4 = rcr4();
+ if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
+ cr4 |= CR4_FXSR | CR4_XMM;
+ cpu_fxsr = hw_instruction_sse = 1;
+ }
+ if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
+ cr4 |= CR4_FSGSBASE;
+
+ /*
+ * Postpone enabling the SMEP on the boot CPU until the page
+ * tables are switched from the boot loader identity mapping
+ * to the kernel tables. The boot loader enables the U bit in
+ * its tables.
+ */
+ if (!IS_BSP() && (cpu_stdext_feature & CPUID_STDEXT_SMEP))
+ cr4 |= CR4_SMEP;
+ load_cr4(cr4);
+ if ((amd_feature & AMDID_NX) != 0) {
+ msr = rdmsr(MSR_EFER) | EFER_NXE;
+ wrmsr(MSR_EFER, msr);
+ pg_nx = PG_NX;
+ }
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_AMD:
+ init_amd();
+ break;
+ case CPU_VENDOR_CENTAUR:
+ init_via();
+ break;
+ }
+}
+
+void
+initializecpucache(void)
+{
+
+ /*
+ * CPUID with %eax = 1, %ebx returns
+ * Bits 15-8: CLFLUSH line size
+ * (Value * 8 = cache line size in bytes)
+ */
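+	/*
+	 * Example: a value of 0x08 in that byte yields the common
+	 * 8 * 8 = 64 byte cache line.
+	 */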
+ if ((cpu_feature & CPUID_CLFSH) != 0)
+ cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8;
+ /*
+ * XXXKIB: (temporary) hack to work around traps generated
+ * when CLFLUSHing APIC register window under virtualization
+ * environments. These environments tend to disable the
+ * CPUID_SS feature even though the native CPU supports it.
+ */
+ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
+ if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1)
+ cpu_feature &= ~CPUID_CLFSH;
+ /*
+ * Allow to disable CLFLUSH feature manually by
+ * hw.clflush_disable tunable.
+ */
+ if (hw_clflush_disable == 1)
+ cpu_feature &= ~CPUID_CLFSH;
+}
diff --git a/sys/amd64/amd64/io.c b/sys/amd64/amd64/io.c
new file mode 100644
index 0000000..c2d0d51
--- /dev/null
+++ b/sys/amd64/amd64/io.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2004 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/proc.h>
+
+#include <machine/frame.h>
+#include <machine/iodev.h>
+#include <machine/psl.h>
+
+int
+iodev_open(struct thread *td)
+{
+
+ td->td_frame->tf_rflags |= PSL_IOPL;
+ return (0);
+}
+
+int
+iodev_close(struct thread *td)
+{
+
+ td->td_frame->tf_rflags &= ~PSL_IOPL;
+ return (0);
+}
+
+/* ARGSUSED */
+int
+iodev_ioctl(u_long cmd __unused, caddr_t data __unused)
+{
+
+ return (ENOIOCTL);
+}
diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
new file mode 100644
index 0000000..55cda3a
--- /dev/null
+++ b/sys/amd64/amd64/locore.S
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/psl.h>
+#include <machine/pmap.h>
+#include <machine/specialreg.h>
+
+#include "assym.s"
+
+/*
+ * Compiled KERNBASE location
+ */
+ .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend
+ .set kernbase,KERNBASE
+ .set loc_PTmap,addr_PTmap
+ .set loc_PDmap,addr_PDmap
+ .set loc_PDPmap,addr_PDPmap
+ .set loc_PML4map,addr_PML4map
+ .set loc_PML4pml4e,addr_PML4pml4e
+ .set dmapbase,DMAP_MIN_ADDRESS
+ .set dmapend,DMAP_MAX_ADDRESS
+
+ .text
+/**********************************************************************
+ *
+ * This is where the loader trampoline starts us; set the ball rolling...
+ *
+ * We are called with the stack looking like this:
+ * 0(%rsp) = 32 bit return address (cannot be used)
+ * 4(%rsp) = 32 bit modulep
+ * 8(%rsp) = 32 bit kernend
+ *
+ * We are already in long mode, on a 64 bit %cs and running at KERNBASE.
+ */
+NON_GPROF_ENTRY(btext)
+
+ /* Tell the bios to warmboot next time */
+ movw $0x1234,0x472
+
+ /* Don't trust what the loader gives for rflags. */
+ pushq $PSL_KERNEL
+ popfq
+
+ /* Find the metadata pointers before we lose them */
+ movq %rsp, %rbp
+ movl 4(%rbp),%edi /* modulep (arg 1) */
+ movl 8(%rbp),%esi /* kernend (arg 2) */
+
+ /* Get onto a stack that we can trust - there is no going back now. */
+ movq $bootstack,%rsp
+ xorl %ebp, %ebp
+
+ call hammer_time /* set up cpu for unix operation */
+ movq %rax,%rsp /* set up kstack for mi_startup() */
+ call mi_startup /* autoconfiguration, mountroot etc */
+0: hlt
+ jmp 0b
+
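+/*
+ * In C terms the handoff above is roughly (sketch only; see the extern
+ * declaration and the return value of hammer_time() in machdep.c):
+ *
+ *	u_int64_t hammer_time(u_int64_t modulep, u_int64_t kernend);
+ *
+ *	%rsp = hammer_time(modulep, kernend);	-- returns thread0's kstack
+ *	mi_startup();				-- never returns
+ */
+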
+ .bss
+ ALIGN_DATA /* just to be sure */
+ .space 0x1000 /* space for bootstack - temporary stack */
+bootstack:
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
new file mode 100644
index 0000000..f5e1437
--- /dev/null
+++ b/sys/amd64/amd64/machdep.c
@@ -0,0 +1,2556 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_atalk.h"
+#include "opt_atpic.h"
+#include "opt_compat.h"
+#include "opt_cpu.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_ipx.h"
+#include "opt_isa.h"
+#include "opt_kstack_pages.h"
+#include "opt_maxmem.h"
+#include "opt_mp_watchdog.h"
+#include "opt_perfmon.h"
+#include "opt_sched.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/cons.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/msgbuf.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/ptrace.h>
+#include <sys/reboot.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/signalvar.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/ucontext.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#ifdef DDB
+#ifndef KDB
+#error KDB must be enabled in order for DDB to work!
+#endif
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+#endif
+
+#include <net/netisr.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <machine/intr_machdep.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/mp_watchdog.h>
+#include <machine/pc/bios.h>
+#include <machine/pcb.h>
+#include <machine/proc.h>
+#include <machine/reg.h>
+#include <machine/sigframe.h>
+#include <machine/specialreg.h>
+#ifdef PERFMON
+#include <machine/perfmon.h>
+#endif
+#include <machine/tss.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+
+#ifdef DEV_ATPIC
+#include <x86/isa/icu.h>
+#else
+#include <machine/apicvar.h>
+#endif
+
+#include <isa/isareg.h>
+#include <isa/rtc.h>
+
+/* Sanity check for __curthread() */
+CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
+
+extern u_int64_t hammer_time(u_int64_t, u_int64_t);
+
+extern void printcpuinfo(void); /* XXX header file */
+extern void identify_cpu(void);
+extern void panicifcpuunsupported(void);
+
+#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
+#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+
+static void cpu_startup(void *);
+static void get_fpcontext(struct thread *td, mcontext_t *mcp,
+ char *xfpusave, size_t xfpusave_len);
+static int set_fpcontext(struct thread *td, const mcontext_t *mcp,
+ char *xfpustate, size_t xfpustate_len);
+SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
+
+/*
+ * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is
+ * the physical address at which the kernel is loaded.
+ */
+extern char kernphys[];
+#ifdef DDB
+extern vm_offset_t ksym_start, ksym_end;
+#endif
+
+struct msgbuf *msgbufp;
+
+/* Intel ICH registers */
+#define ICH_PMBASE 0x400
+#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
+
+int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
+
+int cold = 1;
+
+long Maxmem = 0;
+long realmem = 0;
+
+/*
+ * The number of PHYSMAP entries must be one less than the number of
+ * PHYSSEG entries because the PHYSMAP entry that spans the largest
+ * physical address that is accessible by ISA DMA is split into two
+ * PHYSSEG entries.
+ */
+#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
+
+vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
+vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
+
+/* Must be 2 less so that a 0,0 pair can signal the end of the chunks. */
+#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
+#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
+
+struct kva_md_info kmi;
+
+static struct trapframe proc0_tf;
+struct region_descriptor r_gdt, r_idt;
+
+struct pcpu __pcpu[MAXCPU];
+
+struct mtx icu_lock;
+
+struct mem_range_softc mem_range_softc;
+
+struct mtx dt_lock; /* lock for GDT and LDT */
+
+static void
+cpu_startup(void *dummy)
+{
+ uintmax_t memsize;
+ char *sysenv;
+
+	/*
+	 * On MacBooks, we need to prevent the legacy USB circuit from
+	 * generating an SMI#, because this can cause several problems,
+	 * namely: incorrect CPU frequency detection and failure to
+	 * start the APs.
+	 * We do this by disabling a bit in the SMI_EN (SMI Control and
+	 * Enable) register of the Intel ICH LPC Interface Bridge.
+	 */
+ sysenv = getenv("smbios.system.product");
+ if (sysenv != NULL) {
+ if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
+ strncmp(sysenv, "MacBook3,1", 10) == 0 ||
+ strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
+ strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
+ strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
+ strncmp(sysenv, "Macmini1,1", 10) == 0) {
+ if (bootverbose)
+ printf("Disabling LEGACY_USB_EN bit on "
+ "Intel ICH.\n");
+ outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
+ }
+ freeenv(sysenv);
+ }
+
+ /*
+ * Good {morning,afternoon,evening,night}.
+ */
+ startrtclock();
+ printcpuinfo();
+ panicifcpuunsupported();
+#ifdef PERFMON
+ perfmon_init();
+#endif
+ realmem = Maxmem;
+
+ /*
+ * Display physical memory if SMBIOS reports reasonable amount.
+ */
+ memsize = 0;
+ sysenv = getenv("smbios.memory.enabled");
+ if (sysenv != NULL) {
+ memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
+ freeenv(sysenv);
+ }
+ if (memsize < ptoa((uintmax_t)cnt.v_free_count))
+ memsize = ptoa((uintmax_t)Maxmem);
+ printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
+
+ /*
+ * Display any holes after the first chunk of extended memory.
+ */
+ if (bootverbose) {
+ int indx;
+
+ printf("Physical memory chunk(s):\n");
+ for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
+ vm_paddr_t size;
+
+ size = phys_avail[indx + 1] - phys_avail[indx];
+ printf(
+ "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
+ (uintmax_t)phys_avail[indx],
+ (uintmax_t)phys_avail[indx + 1] - 1,
+ (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
+ }
+ }
+
+ vm_ksubmap_init(&kmi);
+
+ printf("avail memory = %ju (%ju MB)\n",
+ ptoa((uintmax_t)cnt.v_free_count),
+ ptoa((uintmax_t)cnt.v_free_count) / 1048576);
+
+ /*
+ * Set up buffers, so they can be used to read disk labels.
+ */
+ bufinit();
+ vm_pager_bufferinit();
+
+ cpu_setregs();
+}
+
+/*
+ * Send an interrupt to a process.
+ *
+ * The stack is set up so that the sigcode stored at its top calls the
+ * handler routine, followed by a call to the sigreturn routine below.
+ * After sigreturn resets the signal mask, the stack, and the frame
+ * pointer, it returns to the user-specified pc and psl.
+ */
+void
+sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct sigframe sf, *sfp;
+ struct pcb *pcb;
+ struct proc *p;
+ struct thread *td;
+ struct sigacts *psp;
+ char *sp;
+ struct trapframe *regs;
+ char *xfpusave;
+ size_t xfpusave_len;
+ int sig;
+ int oonstack;
+
+ td = curthread;
+ pcb = td->td_pcb;
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sig = ksi->ksi_signo;
+ psp = p->p_sigacts;
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
+ if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
+ xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
+ xfpusave = __builtin_alloca(xfpusave_len);
+ } else {
+ xfpusave_len = 0;
+ xfpusave = NULL;
+ }
+
+ /* Save user context. */
+ bzero(&sf, sizeof(sf));
+ sf.sf_uc.uc_sigmask = *mask;
+ sf.sf_uc.uc_stack = td->td_sigstk;
+ sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
+ bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
+ sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
+ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
+ fpstate_drop(td);
+ sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
+ sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
+ bzero(sf.sf_uc.uc_mcontext.mc_spare,
+ sizeof(sf.sf_uc.uc_mcontext.mc_spare));
+ bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
+
+ /* Allocate space for the signal handler context. */
+ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig)) {
+ sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
+#if defined(COMPAT_43)
+ td->td_sigstk.ss_flags |= SS_ONSTACK;
+#endif
+ } else
+ sp = (char *)regs->tf_rsp - 128;
+ if (xfpusave != NULL) {
+ sp -= xfpusave_len;
+ sp = (char *)((unsigned long)sp & ~0x3Ful);
+ sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
+ }
+ sp -= sizeof(struct sigframe);
+ /* Align to 16 bytes. */
+ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
+
+ /* Translate the signal if appropriate. */
+ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ /* Build the argument list for the signal handler. */
+ regs->tf_rdi = sig; /* arg 1 in %rdi */
+ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
+ bzero(&sf.sf_si, sizeof(sf.sf_si));
+ if (SIGISMEMBER(psp->ps_siginfo, sig)) {
+ /* Signal handler installed with SA_SIGINFO. */
+ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
+ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
+
+ /* Fill in POSIX parts */
+ sf.sf_si = ksi->ksi_info;
+ sf.sf_si.si_signo = sig; /* maybe a translated signal */
+ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
+ } else {
+ /* Old FreeBSD-style arguments. */
+ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
+ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
+ sf.sf_ahu.sf_handler = catcher;
+ }
+ mtx_unlock(&psp->ps_mtx);
+ PROC_UNLOCK(p);
+
+ /*
+ * Copy the sigframe out to the user's stack.
+ */
+ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
+ (xfpusave != NULL && copyout(xfpusave,
+ (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
+ != 0)) {
+#ifdef DEBUG
+ printf("process %ld has trashed its stack\n", (long)p->p_pid);
+#endif
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ regs->tf_rsp = (long)sfp;
+ regs->tf_rip = p->p_sysent->sv_sigcode_base;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucodesel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _ufssel;
+ regs->tf_gs = _ugssel;
+ regs->tf_flags = TF_HASSEGS;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
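+
+/*
+ * Resulting user stack layout (sketch; the exact layout is defined by
+ * <machine/sigframe.h>):
+ *
+ *	[ extended FPU state ]	<- mc_xfpustate, 64-byte aligned (optional)
+ *	[ struct sigframe    ]	<- sfp, 16-byte aligned, new %rsp
+ *
+ * with handler arguments %rdi = signal number, %rsi = &sfp->sf_si (or the
+ * signal code for old-style handlers), %rdx = &sfp->sf_uc and
+ * %rcx = fault address.
+ */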
+
+/*
+ * System call to clean up state after a signal
+ * has been taken. Reset signal mask and
+ * stack state from context left by sendsig (above).
+ * Return to previous pc and psl as specified by
+ * context left by sendsig. Check carefully to
+ * make sure that the user has not modified the
+ * state to gain improper privileges.
+ *
+ * MPSAFE
+ */
+int
+sys_sigreturn(td, uap)
+ struct thread *td;
+ struct sigreturn_args /* {
+ const struct __ucontext *sigcntxp;
+ } */ *uap;
+{
+ ucontext_t uc;
+ struct pcb *pcb;
+ struct proc *p;
+ struct trapframe *regs;
+ ucontext_t *ucp;
+ char *xfpustate;
+ size_t xfpustate_len;
+ long rflags;
+ int cs, error, ret;
+ ksiginfo_t ksi;
+
+ pcb = td->td_pcb;
+ p = td->td_proc;
+
+ error = copyin(uap->sigcntxp, &uc, sizeof(uc));
+ if (error != 0) {
+ uprintf("pid %d (%s): sigreturn copyin failed\n",
+ p->p_pid, td->td_name);
+ return (error);
+ }
+ ucp = &uc;
+ if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
+ uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
+ td->td_name, ucp->uc_mcontext.mc_flags);
+ return (EINVAL);
+ }
+ regs = td->td_frame;
+ rflags = ucp->uc_mcontext.mc_rflags;
+ /*
+ * Don't allow users to change privileged or reserved flags.
+ */
+ /*
+ * XXX do allow users to change the privileged flag PSL_RF.
+ * The cpu sets PSL_RF in tf_rflags for faults. Debuggers
+ * should sometimes set it there too. tf_rflags is kept in
+ * the signal context during signal handling and there is no
+ * other place to remember it, so the PSL_RF bit may be
+ * corrupted by the signal handler without us knowing.
+ * Corruption of the PSL_RF bit at worst causes one more or
+ * one less debugger trap, so allowing it is fairly harmless.
+ */
+ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
+ td->td_name, rflags);
+ return (EINVAL);
+ }
+
+ /*
+ * Don't allow users to load a valid privileged %cs. Let the
+ * hardware check for invalid selectors, excess privilege in
+ * other selectors, invalid %eip's and invalid %esp's.
+ */
+ cs = ucp->uc_mcontext.mc_cs;
+ if (!CS_SECURE(cs)) {
+ uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
+ td->td_name, cs);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+ return (EINVAL);
+ }
+
+ if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
+ xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
+ if (xfpustate_len > cpu_max_ext_state_size -
+ sizeof(struct savefpu)) {
+ uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
+ p->p_pid, td->td_name, xfpustate_len);
+ return (EINVAL);
+ }
+ xfpustate = __builtin_alloca(xfpustate_len);
+ error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
+ xfpustate, xfpustate_len);
+ if (error != 0) {
+ uprintf(
+ "pid %d (%s): sigreturn copying xfpustate failed\n",
+ p->p_pid, td->td_name);
+ return (error);
+ }
+ } else {
+ xfpustate = NULL;
+ xfpustate_len = 0;
+ }
+ ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
+ if (ret != 0) {
+ uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
+ p->p_pid, td->td_name, ret);
+ return (ret);
+ }
+ bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
+ pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
+ pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
+
+#if defined(COMPAT_43)
+ if (ucp->uc_mcontext.mc_onstack & 1)
+ td->td_sigstk.ss_flags |= SS_ONSTACK;
+ else
+ td->td_sigstk.ss_flags &= ~SS_ONSTACK;
+#endif
+
+ kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ return (EJUSTRETURN);
+}
+
+#ifdef COMPAT_FREEBSD4
+int
+freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
+{
+
+	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
+}
+#endif
+
+/*
+ * Machine-dependent boot() routine.
+ *
+ * I haven't seen anything to put here yet; possibly some stuff might be
+ * grafted back here from boot().
+ */
+void
+cpu_boot(int howto)
+{
+}
+
+/*
+ * Flush the D-cache for non-DMA I/O so that the I-cache can
+ * be made coherent later.
+ */
+void
+cpu_flush_dcache(void *ptr, size_t len)
+{
+ /* Not applicable */
+}
+
+/* Get current clock frequency for the given cpu id. */
+int
+cpu_est_clockrate(int cpu_id, uint64_t *rate)
+{
+ uint64_t tsc1, tsc2;
+ uint64_t acnt, mcnt, perf;
+ register_t reg;
+
+ if (pcpu_find(cpu_id) == NULL || rate == NULL)
+ return (EINVAL);
+
+ /*
+	 * If the TSC is P-state invariant and the APERF/MPERF MSRs do not
+	 * exist, the DELAY(9)-based calibration below cannot work.
+ */
+ if (tsc_is_invariant && !tsc_perf_stat)
+ return (EOPNOTSUPP);
+
+#ifdef SMP
+ if (smp_cpus > 1) {
+ /* Schedule ourselves on the indicated cpu. */
+ thread_lock(curthread);
+ sched_bind(curthread, cpu_id);
+ thread_unlock(curthread);
+ }
+#endif
+
+ /* Calibrate by measuring a short delay. */
+ reg = intr_disable();
+ if (tsc_is_invariant) {
+ wrmsr(MSR_MPERF, 0);
+ wrmsr(MSR_APERF, 0);
+ tsc1 = rdtsc();
+ DELAY(1000);
+ mcnt = rdmsr(MSR_MPERF);
+ acnt = rdmsr(MSR_APERF);
+ tsc2 = rdtsc();
+ intr_restore(reg);
+ perf = 1000 * acnt / mcnt;
+ *rate = (tsc2 - tsc1) * perf;
+ } else {
+ tsc1 = rdtsc();
+ DELAY(1000);
+ tsc2 = rdtsc();
+ intr_restore(reg);
+ *rate = (tsc2 - tsc1) * 1000;
+ }
+
+#ifdef SMP
+ if (smp_cpus > 1) {
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ }
+#endif
+
+ return (0);
+}
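+
+/*
+ * Unit check for the calibration above (sketch): DELAY(1000) waits about
+ * one millisecond, so (tsc2 - tsc1) is TSC ticks per millisecond and the
+ * factor of 1000 converts that to ticks per second.  With APERF/MPERF,
+ * acnt/mcnt scales the invariant TSC rate by the ratio of actual to
+ * reference cycles over the same interval:
+ *
+ *	rate = (tsc2 - tsc1) * (1000 * acnt / mcnt)	[Hz]
+ */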
+
+/*
+ * Shutdown the CPU as much as possible
+ */
+void
+cpu_halt(void)
+{
+ for (;;)
+ halt();
+}
+
+void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
+static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
+static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
+TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
+SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
+ 0, "Use MONITOR/MWAIT for short idle");
+
+#define STATE_RUNNING 0x0
+#define STATE_MWAIT 0x1
+#define STATE_SLEEPING 0x2
+
+static void
+cpu_idle_acpi(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ *state = STATE_SLEEPING;
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable())
+ enable_intr();
+ else if (cpu_idle_hook)
+ cpu_idle_hook(sbt);
+ else
+ __asm __volatile("sti; hlt");
+ *state = STATE_RUNNING;
+}
+
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ *state = STATE_SLEEPING;
+
+ /*
+ * Since we may be in a critical section from cpu_idle(), if
+ * an interrupt fires during that critical section we may have
+ * a pending preemption. If the CPU halts, then that thread
+ * may not execute until a later interrupt awakens the CPU.
+ * To handle this race, check for a runnable thread after
+ * disabling interrupts and immediately return if one is
+	 * found.  Also, we must absolutely guarantee that hlt is
+	 * the next instruction after sti.  This ensures that any
+	 * interrupt that fires after the call to disable_intr() will
+	 * immediately awaken the CPU from hlt.  Finally, note that this
+	 * works on x86 because sti sets IF immediately, while interrupt
+	 * delivery is inhibited until the instruction following sti has
+	 * completed; no interrupt can be taken between sti and hlt, and
+	 * a pending interrupt instead wakes the CPU from hlt.
+ */
+ disable_intr();
+ if (sched_runnable())
+ enable_intr();
+ else
+ __asm __volatile("sti; hlt");
+ *state = STATE_RUNNING;
+}
+
+/*
+ * MWAIT cpu power states. Lower 4 bits are sub-states.
+ */
+#define MWAIT_C0 0xf0
+#define MWAIT_C1 0x00
+#define MWAIT_C2 0x10
+#define MWAIT_C3 0x20
+#define MWAIT_C4 0x30
+
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ *state = STATE_MWAIT;
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable()) {
+ enable_intr();
+ *state = STATE_RUNNING;
+ return;
+ }
+ cpu_monitor(state, 0, 0);
+ if (*state == STATE_MWAIT)
+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+ else
+ enable_intr();
+ *state = STATE_RUNNING;
+}
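+
+/*
+ * Wakeup protocol sketch for the MWAIT path above: cpu_idle_wakeup()
+ * (below) stores STATE_RUNNING into the monitored word.  That store both
+ * satisfies an armed MONITOR, aborting a pending mwait, and is caught by
+ * the "*state == STATE_MWAIT" re-check between cpu_monitor() and
+ * "sti; mwait", closing the race window:
+ *
+ *	idle CPU			waking CPU
+ *	*state = STATE_MWAIT;
+ *	cpu_monitor(state, 0, 0);
+ *					*state = STATE_RUNNING;
+ *	if (*state == STATE_MWAIT)	(re-check fails, mwait is skipped)
+ */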
+
+static void
+cpu_idle_spin(sbintime_t sbt)
+{
+ int *state;
+ int i;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ *state = STATE_RUNNING;
+
+	/*
+	 * The sched_runnable() call is racy, but since this runs in a
+	 * loop, missing the race once has little impact, if any (and it
+	 * is much better than not checking at all).
+	 */
+ for (i = 0; i < 1000; i++) {
+ if (sched_runnable())
+ return;
+ cpu_spinwait();
+ }
+}
+
+/*
+ * C1E renders the local APIC timer dead, so we disable it by
+ * reading the Interrupt Pending Message register and clearing
+ * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
+ *
+ * Reference:
+ * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
+ * #32559 revision 3.00+
+ */
+#define MSR_AMDK8_IPM 0xc0010055
+#define AMDK8_SMIONCMPHALT (1ULL << 27)
+#define AMDK8_C1EONCMPHALT (1ULL << 28)
+#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
+
+static void
+cpu_probe_amdc1e(void)
+{
+
+ /*
+	 * Detect the presence of the C1E capability, found mostly on
+	 * recent dual-core (and later) K8-family CPUs.
+ */
+ if (cpu_vendor_id == CPU_VENDOR_AMD &&
+ (cpu_id & 0x00000f00) == 0x00000f00 &&
+ (cpu_id & 0x0fff0000) >= 0x00040000) {
+ cpu_ident_amdc1e = 1;
+ }
+}
+
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
+
+void
+cpu_idle(int busy)
+{
+ uint64_t msr;
+ sbintime_t sbt = -1;
+
+ CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
+ busy, curcpu);
+#ifdef MP_WATCHDOG
+ ap_watchdog(PCPU_GET(cpuid));
+#endif
+	/* If we are busy, try to use fast methods. */
+ if (busy) {
+ if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
+ cpu_idle_mwait(busy);
+ goto out;
+ }
+ }
+
+	/* If we have time, switch timers into idle mode. */
+ if (!busy) {
+ critical_enter();
+ sbt = cpu_idleclock();
+ }
+
+ /* Apply AMD APIC timer C1E workaround. */
+ if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
+ msr = rdmsr(MSR_AMDK8_IPM);
+ if (msr & AMDK8_CMPHALT)
+ wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
+ }
+
+ /* Call main idle method. */
+ cpu_idle_fn(sbt);
+
+	/* Switch timers back into active mode. */
+ if (!busy) {
+ cpu_activeclock();
+ critical_exit();
+ }
+out:
+ CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
+ busy, curcpu);
+}
+
+int
+cpu_idle_wakeup(int cpu)
+{
+ struct pcpu *pcpu;
+ int *state;
+
+ pcpu = pcpu_find(cpu);
+ state = (int *)pcpu->pc_monitorbuf;
+ /*
+ * This doesn't need to be atomic since missing the race will
+ * simply result in unnecessary IPIs.
+ */
+ if (*state == STATE_SLEEPING)
+ return (0);
+ if (*state == STATE_MWAIT)
+ *state = STATE_RUNNING;
+ return (1);
+}
+
+/*
+ * Ordered by speed/power consumption.
+ */
+struct {
+	void	(*id_fn)(sbintime_t);
+	char	*id_name;
+} idle_tbl[] = {
+ { cpu_idle_spin, "spin" },
+ { cpu_idle_mwait, "mwait" },
+ { cpu_idle_hlt, "hlt" },
+ { cpu_idle_acpi, "acpi" },
+ { NULL, NULL }
+};
+
+static int
+idle_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+ char *avail, *p;
+ int error;
+ int i;
+
+ avail = malloc(256, M_TEMP, M_WAITOK);
+ p = avail;
+ for (i = 0; idle_tbl[i].id_name != NULL; i++) {
+ if (strstr(idle_tbl[i].id_name, "mwait") &&
+ (cpu_feature2 & CPUID2_MON) == 0)
+ continue;
+ if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+ cpu_idle_hook == NULL)
+ continue;
+ p += sprintf(p, "%s%s", p != avail ? ", " : "",
+ idle_tbl[i].id_name);
+ }
+ error = sysctl_handle_string(oidp, avail, 0, req);
+ free(avail, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, idle_sysctl_available, "A", "list of available idle functions");
+
+static int
+idle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int error;
+ char *p;
+ int i;
+
+ p = "unknown";
+ for (i = 0; idle_tbl[i].id_name != NULL; i++) {
+ if (idle_tbl[i].id_fn == cpu_idle_fn) {
+ p = idle_tbl[i].id_name;
+ break;
+ }
+ }
+ strncpy(buf, p, sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ for (i = 0; idle_tbl[i].id_name != NULL; i++) {
+ if (strstr(idle_tbl[i].id_name, "mwait") &&
+ (cpu_feature2 & CPUID2_MON) == 0)
+ continue;
+ if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+ cpu_idle_hook == NULL)
+ continue;
+ if (strcmp(idle_tbl[i].id_name, buf))
+ continue;
+ cpu_idle_fn = idle_tbl[i].id_fn;
+ return (0);
+ }
+ return (EINVAL);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+ idle_sysctl, "A", "currently selected idle function");
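+
+/*
+ * Example (from userland; the output is illustrative and depends on the
+ * CPU's feature set):
+ *
+ *	# sysctl machdep.idle_available
+ *	machdep.idle_available: spin, mwait, hlt, acpi
+ *	# sysctl machdep.idle=hlt
+ *	machdep.idle: acpi -> hlt
+ */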
+
+/*
+ * Reset registers to default values on exec.
+ */
+void
+exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
+{
+ struct trapframe *regs = td->td_frame;
+ struct pcb *pcb = td->td_pcb;
+
+ mtx_lock(&dt_lock);
+ if (td->td_proc->p_md.md_ldt != NULL)
+ user_ldt_free(td);
+ else
+ mtx_unlock(&dt_lock);
+
+ pcb->pcb_fsbase = 0;
+ pcb->pcb_gsbase = 0;
+ clear_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
+ pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ bzero((char *)regs, sizeof(struct trapframe));
+ regs->tf_rip = imgp->entry_addr;
+ regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
+ regs->tf_rdi = stack; /* argv */
+ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
+ regs->tf_ss = _udatasel;
+ regs->tf_cs = _ucodesel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _ufssel;
+ regs->tf_gs = _ugssel;
+ regs->tf_flags = TF_HASSEGS;
+ td->td_retval[1] = 0;
+
+ /*
+ * Reset the hardware debug registers if they were in use.
+ * They won't have any meaning for the newly exec'd process.
+ */
+ if (pcb->pcb_flags & PCB_DBREGS) {
+ pcb->pcb_dr0 = 0;
+ pcb->pcb_dr1 = 0;
+ pcb->pcb_dr2 = 0;
+ pcb->pcb_dr3 = 0;
+ pcb->pcb_dr6 = 0;
+ pcb->pcb_dr7 = 0;
+ if (pcb == curpcb) {
+ /*
+ * Clear the debug registers on the running
+ * CPU, otherwise they will end up affecting
+ * the next process we switch to.
+ */
+ reset_dbregs();
+ }
+ clear_pcb_flags(pcb, PCB_DBREGS);
+ }
+
+ /*
+ * Drop the FP state if we hold it, so that the process gets a
+ * clean FP state if it uses the FPU again.
+ */
+ fpstate_drop(td);
+}
+
+void
+cpu_setregs(void)
+{
+ register_t cr0;
+
+ cr0 = rcr0();
+ /*
+ * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
+ * BSP. See the comments there about why we set them.
+ */
+ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
+ load_cr0(cr0);
+}
+
+/*
+ * Initialize amd64 and configure to run kernel
+ */
+
+/*
+ * Initialize segments & interrupt table
+ */
+
+struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
+static struct gate_descriptor idt0[NIDT];
+struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
+
+static char dblfault_stack[PAGE_SIZE] __aligned(16);
+
+static char nmi0_stack[PAGE_SIZE] __aligned(16);
+CTASSERT(sizeof(struct nmi_pcpu) == 16);
+
+struct amd64tss common_tss[MAXCPU];
+
+/*
+ * Software prototypes -- in more palatable form.
+ *
+ * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
+ * slots as corresponding segments for i386 kernel.
+ */
+struct soft_segment_descriptor gdt_segs[] = {
+/* GNULL_SEL 0 Null Descriptor */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0x0,
+ .ssd_type = 0,
+ .ssd_dpl = 0,
+ .ssd_p = 0,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+/* GNULL2_SEL 1 Null Descriptor */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0x0,
+ .ssd_type = 0,
+ .ssd_dpl = 0,
+ .ssd_p = 0,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMRWA,
+ .ssd_dpl = SEL_UPL,
+ .ssd_p = 1,
+ .ssd_long = 0,
+ .ssd_def32 = 1,
+ .ssd_gran = 1 },
+/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMRWA,
+ .ssd_dpl = SEL_UPL,
+ .ssd_p = 1,
+ .ssd_long = 0,
+ .ssd_def32 = 1,
+ .ssd_gran = 1 },
+/* GCODE_SEL 4 Code Descriptor for kernel */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMERA,
+ .ssd_dpl = SEL_KPL,
+ .ssd_p = 1,
+ .ssd_long = 1,
+ .ssd_def32 = 0,
+ .ssd_gran = 1 },
+/* GDATA_SEL 5 Data Descriptor for kernel */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMRWA,
+ .ssd_dpl = SEL_KPL,
+ .ssd_p = 1,
+ .ssd_long = 1,
+ .ssd_def32 = 0,
+ .ssd_gran = 1 },
+/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMERA,
+ .ssd_dpl = SEL_UPL,
+ .ssd_p = 1,
+ .ssd_long = 0,
+ .ssd_def32 = 1,
+ .ssd_gran = 1 },
+/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMRWA,
+ .ssd_dpl = SEL_UPL,
+ .ssd_p = 1,
+ .ssd_long = 0,
+ .ssd_def32 = 1,
+ .ssd_gran = 1 },
+/* GUCODE_SEL 8 64 bit Code Descriptor for user */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0xfffff,
+ .ssd_type = SDT_MEMERA,
+ .ssd_dpl = SEL_UPL,
+ .ssd_p = 1,
+ .ssd_long = 1,
+ .ssd_def32 = 0,
+ .ssd_gran = 1 },
+/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
+{ .ssd_base = 0x0,
+ .ssd_limit = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE - 1,
+ .ssd_type = SDT_SYSTSS,
+ .ssd_dpl = SEL_KPL,
+ .ssd_p = 1,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+/* The TSS is a double-size system descriptor; slot 10 is its upper half */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0x0,
+ .ssd_type = 0,
+ .ssd_dpl = 0,
+ .ssd_p = 0,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+/* GUSERLDT_SEL 11 LDT Descriptor */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0x0,
+ .ssd_type = 0,
+ .ssd_dpl = 0,
+ .ssd_p = 0,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+/* GUSERLDT_SEL 12 LDT Descriptor, double size */
+{ .ssd_base = 0x0,
+ .ssd_limit = 0x0,
+ .ssd_type = 0,
+ .ssd_dpl = 0,
+ .ssd_p = 0,
+ .ssd_long = 0,
+ .ssd_def32 = 0,
+ .ssd_gran = 0 },
+};
+
+void
+setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
+{
+ struct gate_descriptor *ip;
+
+ ip = idt + idx;
+ ip->gd_looffset = (uintptr_t)func;
+ ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
+ ip->gd_ist = ist;
+ ip->gd_xx = 0;
+ ip->gd_type = typ;
+ ip->gd_dpl = dpl;
+ ip->gd_p = 1;
+	ip->gd_hioffset = ((uintptr_t)func) >> 16;
+}
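+
+/*
+ * Usage sketch (see hammer_time() below): for example,
+ *
+ *	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
+ *
+ * installs the double fault handler as a 64-bit interrupt gate with
+ * DPL 0, running on the dedicated IST1 stack that hammer_time() points
+ * at dblfault_stack via common_tss[0].tss_ist1.
+ */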
+
+extern inthand_t
+ IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
+ IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
+ IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
+ IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
+ IDTVEC(xmm), IDTVEC(dblfault),
+#ifdef KDTRACE_HOOKS
+ IDTVEC(dtrace_ret),
+#endif
+ IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+
+#ifdef DDB
+/*
+ * Display the index and function name of any IDT entries that don't use
+ * the default 'rsvd' entry point.
+ */
+DB_SHOW_COMMAND(idt, db_show_idt)
+{
+ struct gate_descriptor *ip;
+ int idx;
+ uintptr_t func;
+
+ ip = idt;
+ for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func != (uintptr_t)&IDTVEC(rsvd)) {
+ db_printf("%3d\t", idx);
+ db_printsym(func, DB_STGY_PROC);
+ db_printf("\n");
+ }
+ ip++;
+ }
+}
+#endif
+
+void
+sdtossd(struct user_segment_descriptor *sd,
+    struct soft_segment_descriptor *ssd)
+{
+
+ ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
+ ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
+ ssd->ssd_type = sd->sd_type;
+ ssd->ssd_dpl = sd->sd_dpl;
+ ssd->ssd_p = sd->sd_p;
+ ssd->ssd_long = sd->sd_long;
+ ssd->ssd_def32 = sd->sd_def32;
+ ssd->ssd_gran = sd->sd_gran;
+}
+
+void
+ssdtosd(struct soft_segment_descriptor *ssd,
+    struct user_segment_descriptor *sd)
+{
+
+ sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
+ sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
+ sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
+ sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
+ sd->sd_type = ssd->ssd_type;
+ sd->sd_dpl = ssd->ssd_dpl;
+ sd->sd_p = ssd->ssd_p;
+ sd->sd_long = ssd->ssd_long;
+ sd->sd_def32 = ssd->ssd_def32;
+ sd->sd_gran = ssd->ssd_gran;
+}
+
+void
+ssdtosyssd(struct soft_segment_descriptor *ssd,
+    struct system_segment_descriptor *sd)
+{
+
+ sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
+ sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
+ sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
+ sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
+ sd->sd_type = ssd->ssd_type;
+ sd->sd_dpl = ssd->ssd_dpl;
+ sd->sd_p = ssd->ssd_p;
+ sd->sd_gran = ssd->ssd_gran;
+}
+
+#if !defined(DEV_ATPIC) && defined(DEV_ISA)
+#include <isa/isavar.h>
+#include <isa/isareg.h>
+/*
+ * Return a bitmap of the current interrupt requests. This is 8259-specific
+ * and is only suitable for use at probe time.
+ * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
+ * It shouldn't be here.  There should probably be an APIC-centric
+ * implementation in the apic driver code, if at all.
+ */
+intrmask_t
+isa_irq_pending(void)
+{
+ u_char irr1;
+ u_char irr2;
+
+ irr1 = inb(IO_ICU1);
+ irr2 = inb(IO_ICU2);
+ return ((irr2 << 8) | irr1);
+}
+#endif
+
+u_int basemem;
+
+static int
+add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
+{
+ int i, insert_idx, physmap_idx;
+
+ physmap_idx = *physmap_idxp;
+
+ if (boothowto & RB_VERBOSE)
+ printf("SMAP type=%02x base=%016lx len=%016lx\n",
+ smap->type, smap->base, smap->length);
+
+ if (smap->type != SMAP_TYPE_MEMORY)
+ return (1);
+
+ if (smap->length == 0)
+ return (0);
+
+ /*
+ * Find insertion point while checking for overlap. Start off by
+ * assuming the new entry will be added to the end.
+ */
+ insert_idx = physmap_idx + 2;
+ for (i = 0; i <= physmap_idx; i += 2) {
+ if (smap->base < physmap[i + 1]) {
+ if (smap->base + smap->length <= physmap[i]) {
+ insert_idx = i;
+ break;
+ }
+ if (boothowto & RB_VERBOSE)
+ printf(
+ "Overlapping memory regions, ignoring second region\n");
+ return (1);
+ }
+ }
+
+ /* See if we can prepend to the next entry. */
+ if (insert_idx <= physmap_idx &&
+ smap->base + smap->length == physmap[insert_idx]) {
+ physmap[insert_idx] = smap->base;
+ return (1);
+ }
+
+ /* See if we can append to the previous entry. */
+ if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
+ physmap[insert_idx - 1] += smap->length;
+ return (1);
+ }
+
+ physmap_idx += 2;
+ *physmap_idxp = physmap_idx;
+ if (physmap_idx == PHYSMAP_SIZE) {
+ printf(
+ "Too many segments in the physical address map, giving up\n");
+ return (0);
+ }
+
+ /*
+ * Move the last 'N' entries down to make room for the new
+ * entry if needed.
+ */
+ for (i = physmap_idx; i > insert_idx; i -= 2) {
+ physmap[i] = physmap[i - 2];
+ physmap[i + 1] = physmap[i - 1];
+ }
+
+ /* Insert the new entry. */
+ physmap[insert_idx] = smap->base;
+ physmap[insert_idx + 1] = smap->base + smap->length;
+ return (1);
+}
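+
+/*
+ * physmap layout sketch: even indices hold segment bases and odd indices
+ * hold segment bounds, so after two typical SMAP entries (values are
+ * illustrative only):
+ *
+ *	physmap[0] = 0x0000000000000000	physmap[1] = 0x000000000009f000
+ *	physmap[2] = 0x0000000000100000	physmap[3] = 0x00000000bfe90000
+ *
+ * physmap_idx always indexes the base of the last segment.
+ */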
+
+/*
+ * Populate the (physmap) array with base/bound pairs describing the
+ * available physical memory in the system, then test this memory and
+ * build the phys_avail array describing the actually-available memory.
+ *
+ * Total memory size may be set by the kernel environment variable
+ * hw.physmem or the compile-time define MAXMEM.
+ *
+ * XXX first should be vm_paddr_t.
+ */
+static void
+getmemsize(caddr_t kmdp, u_int64_t first)
+{
+ int i, physmap_idx, pa_indx, da_indx;
+ vm_paddr_t pa, physmap[PHYSMAP_SIZE];
+ u_long physmem_start, physmem_tunable, memtest;
+ pt_entry_t *pte;
+ struct bios_smap *smapbase, *smap, *smapend;
+ u_int32_t smapsize;
+ quad_t dcons_addr, dcons_size;
+
+ bzero(physmap, sizeof(physmap));
+ basemem = 0;
+ physmap_idx = 0;
+
+ /*
+	 * Get the memory map from INT 15:E820, kindly supplied by the loader.
+	 *
+	 * subr_module.c says:
+	 * "Consumer may safely assume that size value precedes data."
+	 * i.e., an int32_t immediately precedes the smap.
+ */
+ smapbase = (struct bios_smap *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP);
+ if (smapbase == NULL)
+ panic("No BIOS smap info from loader!");
+
+ smapsize = *((u_int32_t *)smapbase - 1);
+ smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+
+ for (smap = smapbase; smap < smapend; smap++)
+ if (!add_smap_entry(smap, physmap, &physmap_idx))
+ break;
+
+ /*
+ * Find the 'base memory' segment for SMP
+ */
+ basemem = 0;
+ for (i = 0; i <= physmap_idx; i += 2) {
+ if (physmap[i] == 0x00000000) {
+ basemem = physmap[i + 1] / 1024;
+ break;
+ }
+ }
+ if (basemem == 0)
+ panic("BIOS smap did not include a basemem segment!");
+
+#ifdef SMP
+ /* make hole for AP bootstrap code */
+ physmap[1] = mp_bootaddress(physmap[1] / 1024);
+#endif
+
+ /*
+	 * Maxmem isn't the "maximum memory"; it's one larger than the
+ * highest page of the physical address space. It should be
+ * called something like "Maxphyspage". We may adjust this
+ * based on ``hw.physmem'' and the results of the memory test.
+ */
+ Maxmem = atop(physmap[physmap_idx + 1]);
+
+#ifdef MAXMEM
+ Maxmem = MAXMEM / 4;
+#endif
+
+ if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
+ Maxmem = atop(physmem_tunable);
+
+ /*
+ * By default enable the memory test on real hardware, and disable
+ * it if we appear to be running in a VM. This avoids touching all
+ * pages unnecessarily, which doesn't matter on real hardware but is
+ * bad for shared VM hosts. Use a general name so that
+ * one could eventually do more with the code than just disable it.
+ */
+ memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
+ TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
+
+ /*
+ * Don't allow MAXMEM or hw.physmem to extend the amount of memory
+ * in the system.
+ */
+ if (Maxmem > atop(physmap[physmap_idx + 1]))
+ Maxmem = atop(physmap[physmap_idx + 1]);
+
+ if (atop(physmap[physmap_idx + 1]) != Maxmem &&
+ (boothowto & RB_VERBOSE))
+ printf("Physical memory use set to %ldK\n", Maxmem * 4);
+
+ /* call pmap initialization to make new kernel address space */
+ pmap_bootstrap(&first);
+
+ /*
+ * Size up each available chunk of physical memory.
+ *
+ * XXX Some BIOSes corrupt low 64KB between suspend and resume.
+ * By default, mask off the first 16 pages unless we appear to be
+ * running in a VM.
+ */
+ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
+ TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
+ if (physmem_start < PAGE_SIZE)
+ physmap[0] = PAGE_SIZE;
+ else if (physmem_start >= physmap[1])
+ physmap[0] = round_page(physmap[1] - PAGE_SIZE);
+ else
+ physmap[0] = round_page(physmem_start);
+ pa_indx = 0;
+ da_indx = 1;
+ phys_avail[pa_indx++] = physmap[0];
+ phys_avail[pa_indx] = physmap[0];
+ dump_avail[da_indx] = physmap[0];
+ pte = CMAP1;
+
+ /*
+ * Get dcons buffer address
+ */
+ if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
+ getenv_quad("dcons.size", &dcons_size) == 0)
+ dcons_addr = 0;
+
+ /*
+ * physmap is in bytes, so when converting to page boundaries,
+ * round up the start address and round down the end address.
+ */
+ for (i = 0; i <= physmap_idx; i += 2) {
+ vm_paddr_t end;
+
+ end = ptoa((vm_paddr_t)Maxmem);
+ if (physmap[i + 1] < end)
+ end = trunc_page(physmap[i + 1]);
+ for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
+ int tmp, page_bad, full;
+ int *ptr = (int *)CADDR1;
+
+ full = FALSE;
+ /*
+ * block out kernel memory as not available.
+ */
+ if (pa >= (vm_paddr_t)kernphys && pa < first)
+ goto do_dump_avail;
+
+ /*
+ * block out dcons buffer
+ */
+ if (dcons_addr > 0
+ && pa >= trunc_page(dcons_addr)
+ && pa < dcons_addr + dcons_size)
+ goto do_dump_avail;
+
+ page_bad = FALSE;
+ if (memtest == 0)
+ goto skip_memtest;
+
+			/*
+			 * Map the page into the kernel: valid, read/write,
+			 * non-cacheable.
+			 */
+ *pte = pa | PG_V | PG_RW | PG_N;
+ invltlb();
+
+ tmp = *(int *)ptr;
+ /*
+ * Test for alternating 1's and 0's
+ */
+ *(volatile int *)ptr = 0xaaaaaaaa;
+ if (*(volatile int *)ptr != 0xaaaaaaaa)
+ page_bad = TRUE;
+ /*
+ * Test for alternating 0's and 1's
+ */
+ *(volatile int *)ptr = 0x55555555;
+ if (*(volatile int *)ptr != 0x55555555)
+ page_bad = TRUE;
+ /*
+ * Test for all 1's
+ */
+ *(volatile int *)ptr = 0xffffffff;
+ if (*(volatile int *)ptr != 0xffffffff)
+ page_bad = TRUE;
+ /*
+ * Test for all 0's
+ */
+ *(volatile int *)ptr = 0x0;
+ if (*(volatile int *)ptr != 0x0)
+ page_bad = TRUE;
+ /*
+ * Restore original value.
+ */
+ *(int *)ptr = tmp;
+
+skip_memtest:
+ /*
+ * Adjust array of valid/good pages.
+ */
+ if (page_bad == TRUE)
+ continue;
+			/*
+			 * If this good page is a continuation of the
+			 * previous set of good pages, then just increase
+			 * the end pointer.  Otherwise start a new chunk.
+			 * Note that "end" points one page past the last
+			 * valid page, making the range >= start and < end.
+			 * If we're also doing a speculative memory test and
+			 * we are at or past the end, bump up Maxmem so that
+			 * we keep going.  The first bad page will terminate
+			 * the loop.
+			 */
+ if (phys_avail[pa_indx] == pa) {
+ phys_avail[pa_indx] += PAGE_SIZE;
+ } else {
+ pa_indx++;
+ if (pa_indx == PHYS_AVAIL_ARRAY_END) {
+ printf(
+ "Too many holes in the physical address space, giving up\n");
+ pa_indx--;
+ full = TRUE;
+ goto do_dump_avail;
+ }
+ phys_avail[pa_indx++] = pa; /* start */
+ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
+ }
+ physmem++;
+do_dump_avail:
+ if (dump_avail[da_indx] == pa) {
+ dump_avail[da_indx] += PAGE_SIZE;
+ } else {
+ da_indx++;
+ if (da_indx == DUMP_AVAIL_ARRAY_END) {
+ da_indx--;
+ goto do_next;
+ }
+ dump_avail[da_indx++] = pa; /* start */
+ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
+ }
+do_next:
+ if (full)
+ break;
+ }
+ }
+ *pte = 0;
+ invltlb();
+
+ /*
+ * XXX
+ * The last chunk must contain at least one page plus the message
+ * buffer to avoid complicating other code (message buffer address
+ * calculation, etc.).
+ */
+ while (phys_avail[pa_indx - 1] + PAGE_SIZE +
+ round_page(msgbufsize) >= phys_avail[pa_indx]) {
+ physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
+ phys_avail[pa_indx--] = 0;
+ phys_avail[pa_indx--] = 0;
+ }
+
+ Maxmem = atop(phys_avail[pa_indx]);
+
+ /* Trim off space for the message buffer. */
+ phys_avail[pa_indx] -= round_page(msgbufsize);
+
+ /* Map the message buffer. */
+ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
+}
+
+u_int64_t
+hammer_time(u_int64_t modulep, u_int64_t physfree)
+{
+ caddr_t kmdp;
+ int gsel_tss, x;
+ struct pcpu *pc;
+ struct nmi_pcpu *np;
+ struct xstate_hdr *xhdr;
+ u_int64_t msr;
+ char *env;
+ size_t kstack0_sz;
+
+ thread0.td_kstack = physfree + KERNBASE;
+ thread0.td_kstack_pages = KSTACK_PAGES;
+ kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
+ bzero((void *)thread0.td_kstack, kstack0_sz);
+ physfree += kstack0_sz;
+
+ /*
+	 * This may be done better later if it gets more high-level
+	 * components in it.  If so, just link td->td_proc here.
+ */
+ proc_linkup0(&proc0, &thread0);
+
+ preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
+ preload_bootstrap_relocate(KERNBASE);
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
+ kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
+#ifdef DDB
+ ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
+ ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
+#endif
+
+ /* Init basic tunables, hz etc */
+ init_param1();
+
+ /*
+	 * Make the GDT memory segments.
+ */
+ for (x = 0; x < NGDT; x++) {
+ if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
+ x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
+ ssdtosd(&gdt_segs[x], &gdt[x]);
+ }
+ gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
+
+ r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
+ r_gdt.rd_base = (long) gdt;
+ lgdt(&r_gdt);
+ pc = &__pcpu[0];
+
+ wrmsr(MSR_FSBASE, 0); /* User value */
+ wrmsr(MSR_GSBASE, (u_int64_t)pc);
+ wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
+
+ pcpu_init(pc, 0, sizeof(struct pcpu));
+ dpcpu_init((void *)(physfree + KERNBASE), 0);
+ physfree += DPCPU_SIZE;
+ PCPU_SET(prvspace, pc);
+ PCPU_SET(curthread, &thread0);
+ PCPU_SET(tssp, &common_tss[0]);
+ PCPU_SET(commontssp, &common_tss[0]);
+ PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
+ PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
+ PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
+ PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
+
+ /*
+ * Initialize mutexes.
+ *
+ * icu_lock: in order to allow an interrupt to occur in a critical
+ * section, to set pcpu->ipending (etc...) properly, we
+ * must be able to get the icu lock, so it can't be
+ * under witness.
+ */
+ mutex_init();
+ mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
+ mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
+
+ /* exceptions */
+ for (x = 0; x < NIDT; x++)
+ setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
+ setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
+ setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
+ setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+#ifdef KDTRACE_HOOKS
+ setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
+#endif
+
+ r_idt.rd_limit = sizeof(idt0) - 1;
+ r_idt.rd_base = (long) idt;
+ lidt(&r_idt);
+
+ /*
+ * Initialize the i8254 before the console so that console
+ * initialization can use DELAY().
+ */
+ i8254_init();
+
+ /*
+ * Initialize the console before we print anything out.
+ */
+ cninit();
+
+#ifdef DEV_ISA
+#ifdef DEV_ATPIC
+ elcr_probe();
+ atpic_startup();
+#else
+ /* Reset and mask the atpics and leave them shut down. */
+ atpic_reset();
+
+ /*
+ * Point the ICU spurious interrupt vectors at the APIC spurious
+ * interrupt handler.
+ */
+ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+#endif
+#else
+#error "have you forgotten the isa device?"
+#endif
+
+ kdb_init();
+
+#ifdef KDB
+ if (boothowto & RB_KDB)
+ kdb_enter(KDB_WHY_BOOTFLAGS,
+ "Boot flags requested debugger");
+#endif
+
+ identify_cpu(); /* Final stage of CPU initialization */
+ initializecpu(); /* Initialize CPU registers */
+ initializecpucache();
+
+ /* doublefault stack space, runs on ist1 */
+ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
+
+ /*
+ * NMI stack, runs on ist2. The pcpu pointer is stored just
+ * above the start of the ist2 stack.
+ */
+ np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
+ np->np_pcpu = (register_t) pc;
+ common_tss[0].tss_ist2 = (long) np;
+
+ /* Set the IO permission bitmap (empty due to tss seg limit) */
+ common_tss[0].tss_iobase = sizeof(struct amd64tss) +
+ IOPAGES * PAGE_SIZE;
+
+ gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
+ ltr(gsel_tss);
+
+ /* Set up the fast syscall stuff */
+ msr = rdmsr(MSR_EFER) | EFER_SCE;
+ wrmsr(MSR_EFER, msr);
+ wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
+ wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
+ msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
+ ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
+ wrmsr(MSR_STAR, msr);
+ wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
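+
+	/*
+	 * MSR layout sketch for the syscall setup above: LSTAR/CSTAR hold
+	 * the 64- and 32-bit entry points; STAR bits 47:32 select the
+	 * kernel CS/SS pair loaded on syscall and bits 63:48 the selector
+	 * base used by sysret (the 64-bit user selectors are derived from
+	 * it at architecturally fixed offsets); SF_MASK lists the rflags
+	 * bits cleared on entry (here NT, T, I, C and D).
+	 */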
+
+ getmemsize(kmdp, physfree);
+ init_param2(physmem);
+
+	/* Now running on new page tables, configured, and u/iom is accessible. */
+
+ msgbufinit(msgbufp, msgbufsize);
+ fpuinit();
+
+ /*
+	 * Set up thread0's pcb after fpuinit has calculated the pcb +
+	 * FPU save area size.  Zero out the extended state header in
+	 * the FPU save area.
+ */
+ thread0.td_pcb = get_pcb_td(&thread0);
+ bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
+ if (use_xsave) {
+ xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
+ 1);
+ xhdr->xstate_bv = xsave_mask;
+ }
+ /* make an initial tss so cpu can get interrupt stack on syscall! */
+ common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
+ /* Ensure the stack is aligned to 16 bytes */
+ common_tss[0].tss_rsp0 &= ~0xFul;
+ PCPU_SET(rsp0, common_tss[0].tss_rsp0);
+ PCPU_SET(curpcb, thread0.td_pcb);
+
+ /* transfer to user mode */
+
+ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
+ _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
+ _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
+ _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
+ _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
+
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+
+ /* setup proc 0's pcb */
+ thread0.td_pcb->pcb_flags = 0;
+ thread0.td_pcb->pcb_cr3 = KPML4phys;
+ thread0.td_frame = &proc0_tf;
+
+ env = getenv("kernelname");
+ if (env != NULL)
+ strlcpy(kernelname, env, sizeof(kernelname));
+
+#ifdef XENHVM
+ if (inw(0x10) == 0x49d2) {
+ if (bootverbose)
+ printf("Xen detected: disabling emulated block and network devices\n");
+ outw(0x10, 3);
+ }
+#endif
+
+ cpu_probe_amdc1e();
+
+ /* Location of kernel stack for locore */
+ return ((u_int64_t)thread0.td_pcb);
+}
+
+void
+cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
+{
+
+ pcpu->pc_acpi_id = 0xffffffff;
+}
+
+void
+spinlock_enter(void)
+{
+ struct thread *td;
+ register_t flags;
+
+ td = curthread;
+ if (td->td_md.md_spinlock_count == 0) {
+ flags = intr_disable();
+ td->td_md.md_spinlock_count = 1;
+ td->td_md.md_saved_flags = flags;
+ } else
+ td->td_md.md_spinlock_count++;
+ critical_enter();
+}
+
+void
+spinlock_exit(void)
+{
+ struct thread *td;
+ register_t flags;
+
+ td = curthread;
+ critical_exit();
+ flags = td->td_md.md_saved_flags;
+ td->td_md.md_spinlock_count--;
+ if (td->td_md.md_spinlock_count == 0)
+ intr_restore(flags);
+}
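+
+/*
+ * Nesting sketch: spinlock_enter()/spinlock_exit() pairs may nest; only
+ * the outermost enter disables interrupts and saves the flags, and only
+ * the outermost exit restores them:
+ *
+ *	spinlock_enter();	md_spinlock_count 0 -> 1, interrupts off
+ *	spinlock_enter();	md_spinlock_count 1 -> 2
+ *	spinlock_exit();	md_spinlock_count 2 -> 1
+ *	spinlock_exit();	md_spinlock_count 1 -> 0, flags restored
+ */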
+
+/*
+ * Construct a PCB from a trapframe. This is called from kdb_trap() where
+ * we want to start a backtrace from the function that caused us to enter
+ * the debugger. We have the context in the trapframe, but base the trace
+ * on the PCB. The PCB doesn't have to be perfect, as long as it contains
+ * enough for a backtrace.
+ */
+void
+makectx(struct trapframe *tf, struct pcb *pcb)
+{
+
+ pcb->pcb_r12 = tf->tf_r12;
+ pcb->pcb_r13 = tf->tf_r13;
+ pcb->pcb_r14 = tf->tf_r14;
+ pcb->pcb_r15 = tf->tf_r15;
+ pcb->pcb_rbp = tf->tf_rbp;
+ pcb->pcb_rbx = tf->tf_rbx;
+ pcb->pcb_rip = tf->tf_rip;
+ pcb->pcb_rsp = tf->tf_rsp;
+}
+
+int
+ptrace_set_pc(struct thread *td, unsigned long addr)
+{
+ td->td_frame->tf_rip = addr;
+ return (0);
+}
+
+int
+ptrace_single_step(struct thread *td)
+{
+ td->td_frame->tf_rflags |= PSL_T;
+ return (0);
+}
+
+int
+ptrace_clear_single_step(struct thread *td)
+{
+ td->td_frame->tf_rflags &= ~PSL_T;
+ return (0);
+}
+
+int
+fill_regs(struct thread *td, struct reg *regs)
+{
+ struct trapframe *tp;
+
+ tp = td->td_frame;
+ return (fill_frame_regs(tp, regs));
+}
+
+int
+fill_frame_regs(struct trapframe *tp, struct reg *regs)
+{
+ regs->r_r15 = tp->tf_r15;
+ regs->r_r14 = tp->tf_r14;
+ regs->r_r13 = tp->tf_r13;
+ regs->r_r12 = tp->tf_r12;
+ regs->r_r11 = tp->tf_r11;
+ regs->r_r10 = tp->tf_r10;
+ regs->r_r9 = tp->tf_r9;
+ regs->r_r8 = tp->tf_r8;
+ regs->r_rdi = tp->tf_rdi;
+ regs->r_rsi = tp->tf_rsi;
+ regs->r_rbp = tp->tf_rbp;
+ regs->r_rbx = tp->tf_rbx;
+ regs->r_rdx = tp->tf_rdx;
+ regs->r_rcx = tp->tf_rcx;
+ regs->r_rax = tp->tf_rax;
+ regs->r_rip = tp->tf_rip;
+ regs->r_cs = tp->tf_cs;
+ regs->r_rflags = tp->tf_rflags;
+ regs->r_rsp = tp->tf_rsp;
+ regs->r_ss = tp->tf_ss;
+ if (tp->tf_flags & TF_HASSEGS) {
+ regs->r_ds = tp->tf_ds;
+ regs->r_es = tp->tf_es;
+ regs->r_fs = tp->tf_fs;
+ regs->r_gs = tp->tf_gs;
+ } else {
+ regs->r_ds = 0;
+ regs->r_es = 0;
+ regs->r_fs = 0;
+ regs->r_gs = 0;
+ }
+ return (0);
+}
+
+int
+set_regs(struct thread *td, struct reg *regs)
+{
+ struct trapframe *tp;
+ register_t rflags;
+
+ tp = td->td_frame;
+ rflags = regs->r_rflags & 0xffffffff;
+ if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
+ return (EINVAL);
+ tp->tf_r15 = regs->r_r15;
+ tp->tf_r14 = regs->r_r14;
+ tp->tf_r13 = regs->r_r13;
+ tp->tf_r12 = regs->r_r12;
+ tp->tf_r11 = regs->r_r11;
+ tp->tf_r10 = regs->r_r10;
+ tp->tf_r9 = regs->r_r9;
+ tp->tf_r8 = regs->r_r8;
+ tp->tf_rdi = regs->r_rdi;
+ tp->tf_rsi = regs->r_rsi;
+ tp->tf_rbp = regs->r_rbp;
+ tp->tf_rbx = regs->r_rbx;
+ tp->tf_rdx = regs->r_rdx;
+ tp->tf_rcx = regs->r_rcx;
+ tp->tf_rax = regs->r_rax;
+ tp->tf_rip = regs->r_rip;
+ tp->tf_cs = regs->r_cs;
+ tp->tf_rflags = rflags;
+ tp->tf_rsp = regs->r_rsp;
+ tp->tf_ss = regs->r_ss;
+ if (0) { /* XXXKIB */
+ tp->tf_ds = regs->r_ds;
+ tp->tf_es = regs->r_es;
+ tp->tf_fs = regs->r_fs;
+ tp->tf_gs = regs->r_gs;
+ tp->tf_flags = TF_HASSEGS;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ }
+ return (0);
+}
+
+/* XXX check all this stuff! */
+/* externalize from sv_xmm */
+static void
+fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
+{
+ struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
+ struct envxmm *penv_xmm = &sv_xmm->sv_env;
+ int i;
+
+ /* pcb -> fpregs */
+ bzero(fpregs, sizeof(*fpregs));
+
+ /* FPU control/status */
+ penv_fpreg->en_cw = penv_xmm->en_cw;
+ penv_fpreg->en_sw = penv_xmm->en_sw;
+ penv_fpreg->en_tw = penv_xmm->en_tw;
+ penv_fpreg->en_opcode = penv_xmm->en_opcode;
+ penv_fpreg->en_rip = penv_xmm->en_rip;
+ penv_fpreg->en_rdp = penv_xmm->en_rdp;
+ penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
+ penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
+
+ /* FPU registers */
+ for (i = 0; i < 8; ++i)
+ bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
+
+ /* SSE registers */
+ for (i = 0; i < 16; ++i)
+ bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
+}
+
+/* internalize from fpregs into sv_xmm */
+static void
+set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
+{
+ struct envxmm *penv_xmm = &sv_xmm->sv_env;
+ struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
+ int i;
+
+ /* fpregs -> pcb */
+ /* FPU control/status */
+ penv_xmm->en_cw = penv_fpreg->en_cw;
+ penv_xmm->en_sw = penv_fpreg->en_sw;
+ penv_xmm->en_tw = penv_fpreg->en_tw;
+ penv_xmm->en_opcode = penv_fpreg->en_opcode;
+ penv_xmm->en_rip = penv_fpreg->en_rip;
+ penv_xmm->en_rdp = penv_fpreg->en_rdp;
+ penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
+ penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
+
+ /* FPU registers */
+ for (i = 0; i < 8; ++i)
+ bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
+
+ /* SSE registers */
+ for (i = 0; i < 16; ++i)
+ bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
+}
+
+/* externalize from td->pcb */
+int
+fill_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
+ P_SHOULDSTOP(td->td_proc),
+ ("not suspended thread %p", td));
+ fpugetregs(td);
+ fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
+ return (0);
+}
+
+/* internalize to td->pcb */
+int
+set_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
+ fpuuserinited(td);
+ return (0);
+}
+
+/*
+ * Get machine context.
+ */
+int
+get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
+{
+ struct pcb *pcb;
+ struct trapframe *tp;
+
+ pcb = td->td_pcb;
+ tp = td->td_frame;
+ PROC_LOCK(curthread->td_proc);
+ mcp->mc_onstack = sigonstack(tp->tf_rsp);
+ PROC_UNLOCK(curthread->td_proc);
+ mcp->mc_r15 = tp->tf_r15;
+ mcp->mc_r14 = tp->tf_r14;
+ mcp->mc_r13 = tp->tf_r13;
+ mcp->mc_r12 = tp->tf_r12;
+ mcp->mc_r11 = tp->tf_r11;
+ mcp->mc_r10 = tp->tf_r10;
+ mcp->mc_r9 = tp->tf_r9;
+ mcp->mc_r8 = tp->tf_r8;
+ mcp->mc_rdi = tp->tf_rdi;
+ mcp->mc_rsi = tp->tf_rsi;
+ mcp->mc_rbp = tp->tf_rbp;
+ mcp->mc_rbx = tp->tf_rbx;
+ mcp->mc_rcx = tp->tf_rcx;
+ mcp->mc_rflags = tp->tf_rflags;
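+	/*
+	 * GET_MC_CLEAR_RET is used by getcontext(2)-style callers so that
+	 * the saved context, when later resumed, appears to return
+	 * success: %rax/%rdx are zeroed and the carry flag (the syscall
+	 * error indication) is cleared.
+	 */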
+ if (flags & GET_MC_CLEAR_RET) {
+ mcp->mc_rax = 0;
+ mcp->mc_rdx = 0;
+ mcp->mc_rflags &= ~PSL_C;
+ } else {
+ mcp->mc_rax = tp->tf_rax;
+ mcp->mc_rdx = tp->tf_rdx;
+ }
+ mcp->mc_rip = tp->tf_rip;
+ mcp->mc_cs = tp->tf_cs;
+ mcp->mc_rsp = tp->tf_rsp;
+ mcp->mc_ss = tp->tf_ss;
+ mcp->mc_ds = tp->tf_ds;
+ mcp->mc_es = tp->tf_es;
+ mcp->mc_fs = tp->tf_fs;
+ mcp->mc_gs = tp->tf_gs;
+ mcp->mc_flags = tp->tf_flags;
+ mcp->mc_len = sizeof(*mcp);
+ get_fpcontext(td, mcp, NULL, 0);
+ mcp->mc_fsbase = pcb->pcb_fsbase;
+ mcp->mc_gsbase = pcb->pcb_gsbase;
+ mcp->mc_xfpustate = 0;
+ mcp->mc_xfpustate_len = 0;
+ bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
+ return (0);
+}
+
+/*
+ * Set machine context.
+ *
+ * Note that we only set the user-modifiable flags, and we won't
+ * touch the %cs selector.
+ */
+int
+set_mcontext(struct thread *td, const mcontext_t *mcp)
+{
+ struct pcb *pcb;
+ struct trapframe *tp;
+ char *xfpustate;
+ long rflags;
+ int ret;
+
+ pcb = td->td_pcb;
+ tp = td->td_frame;
+ if (mcp->mc_len != sizeof(*mcp) ||
+ (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
+ return (EINVAL);
+ rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
+ (tp->tf_rflags & ~PSL_USERCHANGE);
+ if (mcp->mc_flags & _MC_HASFPXSTATE) {
+ if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
+ sizeof(struct savefpu))
+ return (EINVAL);
+ xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+ ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
+ mcp->mc_xfpustate_len);
+ if (ret != 0)
+ return (ret);
+ } else
+ xfpustate = NULL;
+ ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
+ if (ret != 0)
+ return (ret);
+ tp->tf_r15 = mcp->mc_r15;
+ tp->tf_r14 = mcp->mc_r14;
+ tp->tf_r13 = mcp->mc_r13;
+ tp->tf_r12 = mcp->mc_r12;
+ tp->tf_r11 = mcp->mc_r11;
+ tp->tf_r10 = mcp->mc_r10;
+ tp->tf_r9 = mcp->mc_r9;
+ tp->tf_r8 = mcp->mc_r8;
+ tp->tf_rdi = mcp->mc_rdi;
+ tp->tf_rsi = mcp->mc_rsi;
+ tp->tf_rbp = mcp->mc_rbp;
+ tp->tf_rbx = mcp->mc_rbx;
+ tp->tf_rdx = mcp->mc_rdx;
+ tp->tf_rcx = mcp->mc_rcx;
+ tp->tf_rax = mcp->mc_rax;
+ tp->tf_rip = mcp->mc_rip;
+ tp->tf_rflags = rflags;
+ tp->tf_rsp = mcp->mc_rsp;
+ tp->tf_ss = mcp->mc_ss;
+ tp->tf_flags = mcp->mc_flags;
+ if (tp->tf_flags & TF_HASSEGS) {
+ tp->tf_ds = mcp->mc_ds;
+ tp->tf_es = mcp->mc_es;
+ tp->tf_fs = mcp->mc_fs;
+ tp->tf_gs = mcp->mc_gs;
+ }
+ if (mcp->mc_flags & _MC_HASBASES) {
+ pcb->pcb_fsbase = mcp->mc_fsbase;
+ pcb->pcb_gsbase = mcp->mc_gsbase;
+ }
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ return (0);
+}
+
+static void
+get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
+ size_t xfpusave_len)
+{
+ size_t max_len, len;
+
+ mcp->mc_ownedfp = fpugetregs(td);
+ bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate,
+ sizeof(mcp->mc_fpstate));
+ mcp->mc_fpformat = fpuformat();
+ if (!use_xsave || xfpusave_len == 0)
+ return;
+ max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
+ len = xfpusave_len;
+ if (len > max_len) {
+ len = max_len;
+		bzero(xfpusave + max_len, xfpusave_len - max_len);
+ }
+ mcp->mc_flags |= _MC_HASFPXSTATE;
+ mcp->mc_xfpustate_len = len;
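+	/* The extended state follows the legacy savefpu area, hence "+ 1". */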
+ bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
+}
+
+static int
+set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate,
+ size_t xfpustate_len)
+{
+ struct savefpu *fpstate;
+ int error;
+
+ if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
+ return (0);
+ else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
+ return (EINVAL);
+ else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
+ /* We don't care what state is left in the FPU or PCB. */
+ fpstate_drop(td);
+ error = 0;
+ } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
+ mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
+ fpstate = (struct savefpu *)&mcp->mc_fpstate;
+ fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
+ error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
+ } else
+ return (EINVAL);
+ return (error);
+}
+
+void
+fpstate_drop(struct thread *td)
+{
+
+ KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
+ critical_enter();
+ if (PCPU_GET(fpcurthread) == td)
+ fpudrop();
+ /*
+ * XXX force a full drop of the fpu. The above only drops it if we
+ * owned it.
+ *
+ * XXX I don't much like fpugetuserregs()'s semantics of doing a full
+ * drop. Dropping only to the pcb matches fnsave's behaviour.
+ * We only need to drop to !PCB_INITDONE in sendsig(). But
+ * sendsig() is the only caller of fpugetuserregs()... perhaps we just
+ * have too many layers.
+ */
+ clear_pcb_flags(curthread->td_pcb,
+ PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ critical_exit();
+}
+
+int
+fill_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+ struct pcb *pcb;
+
+ if (td == NULL) {
+ dbregs->dr[0] = rdr0();
+ dbregs->dr[1] = rdr1();
+ dbregs->dr[2] = rdr2();
+ dbregs->dr[3] = rdr3();
+ dbregs->dr[6] = rdr6();
+ dbregs->dr[7] = rdr7();
+ } else {
+ pcb = td->td_pcb;
+ dbregs->dr[0] = pcb->pcb_dr0;
+ dbregs->dr[1] = pcb->pcb_dr1;
+ dbregs->dr[2] = pcb->pcb_dr2;
+ dbregs->dr[3] = pcb->pcb_dr3;
+ dbregs->dr[6] = pcb->pcb_dr6;
+ dbregs->dr[7] = pcb->pcb_dr7;
+ }
+ dbregs->dr[4] = 0;
+ dbregs->dr[5] = 0;
+ dbregs->dr[8] = 0;
+ dbregs->dr[9] = 0;
+ dbregs->dr[10] = 0;
+ dbregs->dr[11] = 0;
+ dbregs->dr[12] = 0;
+ dbregs->dr[13] = 0;
+ dbregs->dr[14] = 0;
+ dbregs->dr[15] = 0;
+ return (0);
+}
+
+int
+set_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+ struct pcb *pcb;
+ int i;
+
+ if (td == NULL) {
+ load_dr0(dbregs->dr[0]);
+ load_dr1(dbregs->dr[1]);
+ load_dr2(dbregs->dr[2]);
+ load_dr3(dbregs->dr[3]);
+ load_dr6(dbregs->dr[6]);
+ load_dr7(dbregs->dr[7]);
+ } else {
+ /*
+		 * Don't let an illegal value for dr7 get set.  Specifically,
+		 * check for undefined settings.  Setting these bit patterns
+		 * results in undefined behaviour and can lead to an unexpected
+		 * TRCTRAP or a general protection fault right here.  The 0x02
+		 * access type (break on I/O) is only defined when CR4.DE is
+		 * set.  Upper bits of dr6 and dr7 must not be set.
+ */
+ for (i = 0; i < 4; i++) {
+ if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
+ return (EINVAL);
+ if (td->td_frame->tf_cs == _ucode32sel &&
+ DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
+ return (EINVAL);
+ }
+ if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
+ (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
+ return (EINVAL);
+
+ pcb = td->td_pcb;
+
+ /*
+ * Don't let a process set a breakpoint that is not within the
+ * process's address space. If a process could do this, it
+ * could halt the system by setting a breakpoint in the kernel
+ * (if ddb was enabled). Thus, we need to check to make sure
+ * that no breakpoints are being enabled for addresses outside
+		 * the process's address space.
+ *
+ * XXX - what about when the watched area of the user's
+ * address space is written into from within the kernel
+ * ... wouldn't that still cause a breakpoint to be generated
+ * from within kernel mode?
+ */
+
+ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
+ /* dr0 is enabled */
+ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ }
+ if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
+ /* dr1 is enabled */
+ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ }
+ if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
+ /* dr2 is enabled */
+ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ }
+ if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
+ /* dr3 is enabled */
+ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ }
+
+ pcb->pcb_dr0 = dbregs->dr[0];
+ pcb->pcb_dr1 = dbregs->dr[1];
+ pcb->pcb_dr2 = dbregs->dr[2];
+ pcb->pcb_dr3 = dbregs->dr[3];
+ pcb->pcb_dr6 = dbregs->dr[6];
+ pcb->pcb_dr7 = dbregs->dr[7];
+
+ set_pcb_flags(pcb, PCB_DBREGS);
+ }
+
+ return (0);
+}
+
+void
+reset_dbregs(void)
+{
+
+ load_dr7(0); /* Turn off the control bits first */
+ load_dr0(0);
+ load_dr1(0);
+ load_dr2(0);
+ load_dr3(0);
+ load_dr6(0);
+}
+
+/*
+ * Return > 0 if a hardware breakpoint has been hit, and the
+ * breakpoint was in user space. Return 0, otherwise.
+ */
+int
+user_dbreg_trap(void)
+{
+ u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
+ u_int64_t bp; /* breakpoint bits extracted from dr6 */
+ int nbp; /* number of breakpoints that triggered */
+ caddr_t addr[4]; /* breakpoint addresses */
+ int i;
+
+ dr7 = rdr7();
+ if ((dr7 & 0x000000ff) == 0) {
+ /*
+		 * None of the breakpoint enable bits (L0-L3, G0-G3) in
+		 * dr7 are set, so the trap couldn't have been caused by
+		 * the hardware debug registers.
+ */
+ return 0;
+ }
+
+ nbp = 0;
+ dr6 = rdr6();
+ bp = dr6 & 0x0000000f;
+
+ if (!bp) {
+ /*
+		 * None of the breakpoint bits are set, meaning this
+		 * trap was not caused by any of the debug registers.
+ */
+ return 0;
+ }
+
+ /*
+	 * At least one of the breakpoints was hit; check which ones,
+	 * and whether any of them are user-space addresses.
+ */
+
+ if (bp & 0x01) {
+ addr[nbp++] = (caddr_t)rdr0();
+ }
+ if (bp & 0x02) {
+ addr[nbp++] = (caddr_t)rdr1();
+ }
+ if (bp & 0x04) {
+ addr[nbp++] = (caddr_t)rdr2();
+ }
+ if (bp & 0x08) {
+ addr[nbp++] = (caddr_t)rdr3();
+ }
+
+ for (i = 0; i < nbp; i++) {
+ if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
+ /*
+ * addr[i] is in user space
+ */
+ return nbp;
+ }
+ }
+
+ /*
+ * None of the breakpoints are in user space.
+ */
+ return 0;
+}
+
+#ifdef KDB
+
+/*
+ * Provide inb() and outb() as functions.  They are normally only available
+ * as inline functions and thus cannot be called from the debugger.
+ */
+
+/* silence compiler warnings */
+u_char inb_(u_short);
+void outb_(u_short, u_char);
+
+u_char
+inb_(u_short port)
+{
+ return inb(port);
+}
+
+void
+outb_(u_short port, u_char data)
+{
+ outb(port, data);
+}
+
+#endif /* KDB */
diff --git a/sys/amd64/amd64/mem.c b/sys/amd64/amd64/mem.c
new file mode 100644
index 0000000..abbbb21
--- /dev/null
+++ b/sys/amd64/amd64/mem.c
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1986, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department, and code derived from software contributed to
+ * Berkeley by William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: mem.c 1.13 89/10/08$
+ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Memory special file
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/ioccom.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+
+#include <machine/memdev.h>
+
+/*
+ * Used in /dev/mem drivers and elsewhere
+ */
+MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors");
+
+/* ARGSUSED */
+int
+memrw(struct cdev *dev, struct uio *uio, int flags)
+{
+ int o;
+ u_long c = 0, v;
+ struct iovec *iov;
+ int error = 0;
+ vm_offset_t addr, eaddr;
+
+ GIANT_REQUIRED;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ if (uio->uio_iovcnt < 0)
+ panic("memrw");
+ continue;
+ }
+ if (dev2unit(dev) == CDEV_MINOR_MEM) {
+ v = uio->uio_offset;
+kmemphys:
+ o = v & PAGE_MASK;
+ c = min(uio->uio_resid, (u_int)(PAGE_SIZE - o));
+ error = uiomove((void *)PHYS_TO_DMAP(v), (int)c, uio);
+ continue;
+ }
+ else if (dev2unit(dev) == CDEV_MINOR_KMEM) {
+ v = uio->uio_offset;
+
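+			/*
+			 * Direct-map addresses can be served through the
+			 * physical-memory path.
+			 */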
+ if (v >= DMAP_MIN_ADDRESS && v < DMAP_MAX_ADDRESS) {
+ v = DMAP_TO_PHYS(v);
+ goto kmemphys;
+ }
+
+ c = iov->iov_len;
+
+ /*
+ * Make sure that all of the pages are currently
+ * resident so that we don't create any zero-fill
+ * pages.
+ */
+ addr = trunc_page(v);
+ eaddr = round_page(v + c);
+
+ if (addr < VM_MIN_KERNEL_ADDRESS)
+ return (EFAULT);
+ for (; addr < eaddr; addr += PAGE_SIZE)
+ if (pmap_extract(kernel_pmap, addr) == 0)
+ return (EFAULT);
+
+ if (!kernacc((caddr_t)(long)v, c,
+ uio->uio_rw == UIO_READ ?
+ VM_PROT_READ : VM_PROT_WRITE))
+ return (EFAULT);
+
+ error = uiomove((caddr_t)(long)v, (int)c, uio);
+ continue;
+ }
+ /* else panic! */
+ }
+ return (error);
+}
+
+/*
+ * Allow user processes to mmap(2) some memory sections
+ * instead of going through read/write.
+ */
+/* ARGSUSED */
+int
+memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int prot __unused, vm_memattr_t *memattr __unused)
+{
+ if (dev2unit(dev) == CDEV_MINOR_MEM)
+ *paddr = offset;
+ else if (dev2unit(dev) == CDEV_MINOR_KMEM)
+ *paddr = vtophys(offset);
+ /* else panic! */
+ return (0);
+}
+
+/*
+ * Operations for changing memory attributes.
+ *
+ * This is basically just an ioctl shim for mem_range_attr_get
+ * and mem_range_attr_set.
+ */
+/* ARGSUSED */
+int
+memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
+ struct thread *td)
+{
+ int nd, error = 0;
+ struct mem_range_op *mo = (struct mem_range_op *)data;
+ struct mem_range_desc *md;
+
+ /* is this for us? */
+ if ((cmd != MEMRANGE_GET) &&
+ (cmd != MEMRANGE_SET))
+ return (ENOTTY);
+
+ /* any chance we can handle this? */
+ if (mem_range_softc.mr_op == NULL)
+ return (EOPNOTSUPP);
+
+ /* do we have any descriptors? */
+ if (mem_range_softc.mr_ndesc == 0)
+ return (ENXIO);
+
+ switch (cmd) {
+ case MEMRANGE_GET:
+ nd = imin(mo->mo_arg[0], mem_range_softc.mr_ndesc);
+ if (nd > 0) {
+ md = (struct mem_range_desc *)
+ malloc(nd * sizeof(struct mem_range_desc),
+ M_MEMDESC, M_WAITOK);
+ error = mem_range_attr_get(md, &nd);
+ if (!error)
+ error = copyout(md, mo->mo_desc,
+ nd * sizeof(struct mem_range_desc));
+ free(md, M_MEMDESC);
+ }
+ else
+ nd = mem_range_softc.mr_ndesc;
+ mo->mo_arg[0] = nd;
+ break;
+
+ case MEMRANGE_SET:
+ md = (struct mem_range_desc *)malloc(sizeof(struct mem_range_desc),
+ M_MEMDESC, M_WAITOK);
+ error = copyin(mo->mo_desc, md, sizeof(struct mem_range_desc));
+ /* clamp description string */
+ md->mr_owner[sizeof(md->mr_owner) - 1] = 0;
+ if (error == 0)
+ error = mem_range_attr_set(md, &mo->mo_arg[0]);
+ free(md, M_MEMDESC);
+ break;
+ }
+ return (error);
+}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
new file mode 100644
index 0000000..79d8bde
--- /dev/null
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -0,0 +1,479 @@
+/*-
+ * Copyright (c) 2006 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pmap.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/kernel.h>
+#include <sys/kerneldump.h>
+#include <sys/msgbuf.h>
+#include <sys/watchdog.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+#include <vm/pmap.h>
+#include <machine/atomic.h>
+#include <machine/elf.h>
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+#include <machine/minidump.h>
+
+CTASSERT(sizeof(struct kerneldumpheader) == 512);
+
+/*
+ * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
+ * is to protect us from metadata and to protect metadata from us.
+ */
+#define SIZEOF_METADATA (64*1024)
+
+#define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
+#define DEV_ALIGN(x) (((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
+
+uint64_t *vm_page_dump;
+int vm_page_dump_size;
+
+static struct kerneldumpheader kdh;
+static off_t dumplo;
+
+/* Handle chunked writes. */
+static size_t fragsz;
+static void *dump_va;
+static size_t counter, progress, dumpsize;
+
+CTASSERT(sizeof(*vm_page_dump) == 8);
+
+static int
+is_dumpable(vm_paddr_t pa)
+{
+ vm_page_t m;
+ int i;
+
+ if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
+ return ((m->flags & PG_NODUMP) == 0);
+ for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
+ if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
+ return (1);
+ }
+ return (0);
+}
+
+#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)
+
+static int
+blk_flush(struct dumperinfo *di)
+{
+ int error;
+
+ if (fragsz == 0)
+ return (0);
+
+ error = dump_write(di, dump_va, 0, dumplo, fragsz);
+ dumplo += fragsz;
+ fragsz = 0;
+ return (error);
+}
+
+static struct {
+ int min_per;
+ int max_per;
+ int visited;
+} progress_track[10] = {
+ { 0, 10, 0},
+ { 10, 20, 0},
+ { 20, 30, 0},
+ { 30, 40, 0},
+ { 40, 50, 0},
+ { 50, 60, 0},
+ { 60, 70, 0},
+ { 70, 80, 0},
+ { 80, 90, 0},
+ { 90, 100, 0}
+};
+
+static void
+report_progress(size_t progress, size_t dumpsize)
+{
+ int sofar, i;
+
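+	/* 'progress' counts bytes remaining, so invert it for percent done. */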
+ sofar = 100 - ((progress * 100) / dumpsize);
+ for (i = 0; i < 10; i++) {
+ if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per)
+ continue;
+ if (progress_track[i].visited)
+ return;
+ progress_track[i].visited = 1;
+ printf("..%d%%", sofar);
+ return;
+ }
+}
+
+static int
+blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
+{
+ size_t len;
+ int error, i, c;
+ u_int maxdumpsz;
+
+ maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE);
+ if (maxdumpsz == 0) /* seatbelt */
+ maxdumpsz = PAGE_SIZE;
+ error = 0;
+ if ((sz % PAGE_SIZE) != 0) {
+ printf("size not page aligned\n");
+ return (EINVAL);
+ }
+ if (ptr != NULL && pa != 0) {
+		printf("can't have both va and pa!\n");
+ return (EINVAL);
+ }
+ if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) {
+ printf("address not page aligned\n");
+ return (EINVAL);
+ }
+ if (ptr != NULL) {
+		/*
+		 * If we're doing a virtual dump, flush any pre-existing
+		 * pa pages.
+		 */
+ error = blk_flush(di);
+ if (error)
+ return (error);
+ }
+ while (sz) {
+ len = maxdumpsz - fragsz;
+ if (len > sz)
+ len = sz;
+ counter += len;
+ progress -= len;
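+		/* Report progress roughly every 16MB (2^24 bytes) written. */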
+ if (counter >> 24) {
+ report_progress(progress, dumpsize);
+ counter &= (1<<24) - 1;
+ }
+
+ wdog_kern_pat(WD_LASTVAL);
+
+ if (ptr) {
+ error = dump_write(di, ptr, 0, dumplo, len);
+ if (error)
+ return (error);
+ dumplo += len;
+ ptr += len;
+ sz -= len;
+ } else {
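+			/*
+			 * Gather physical pages into the crashdump map;
+			 * pmap_kenter_temporary() returns the base of
+			 * that map.
+			 */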
+ for (i = 0; i < len; i += PAGE_SIZE)
+ dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT);
+ fragsz += len;
+ pa += len;
+ sz -= len;
+ if (fragsz == maxdumpsz) {
+ error = blk_flush(di);
+ if (error)
+ return (error);
+ }
+ }
+
+ /* Check for user abort. */
+ c = cncheckc();
+ if (c == 0x03)
+ return (ECANCELED);
+ if (c != -1)
+ printf(" (CTRL-C to abort) ");
+ }
+
+ return (0);
+}
+
+/* A fake page table page, to avoid having to handle both 4K and 2M pages */
+static pd_entry_t fakepd[NPDEPG];
+
+void
+minidumpsys(struct dumperinfo *di)
+{
+ uint32_t pmapsize;
+ vm_offset_t va;
+ int error;
+ uint64_t bits;
+ uint64_t *pdp, *pd, *pt, pa;
+ int i, j, k, n, bit;
+ int retry_count;
+ struct minidumphdr mdhdr;
+
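+	/*
+	 * Two passes: first walk the kernel page tables and mark every
+	 * dumpable page in vm_page_dump; then write the headers, msgbuf,
+	 * bitmap, page tables and finally the marked pages themselves.
+	 * If the dump map grows while we write, retry a few times.
+	 */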
+ retry_count = 0;
+ retry:
+ retry_count++;
+ counter = 0;
+ /* Walk page table pages, set bits in vm_page_dump */
+ pmapsize = 0;
+ pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
+ kernel_vm_end); ) {
+ /*
+ * We always write a page, even if it is zero. Each
+ * page written corresponds to 1GB of space
+ */
+ pmapsize += PAGE_SIZE;
+ i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+ if ((pdp[i] & PG_V) == 0) {
+ va += NBPDP;
+ continue;
+ }
+
+ /*
+ * 1GB page is represented as 512 2MB pages in a dump.
+ */
+ if ((pdp[i] & PG_PS) != 0) {
+ va += NBPDP;
+ pa = pdp[i] & PG_PS_FRAME;
+ for (n = 0; n < NPDEPG * NPTEPG; n++) {
+ if (is_dumpable(pa))
+ dump_add_page(pa);
+ pa += PAGE_SIZE;
+ }
+ continue;
+ }
+
+ pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+ for (n = 0; n < NPDEPG; n++, va += NBPDR) {
+ j = (va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1);
+
+ if ((pd[j] & PG_V) == 0)
+ continue;
+
+ if ((pd[j] & PG_PS) != 0) {
+ /* This is an entire 2M page. */
+ pa = pd[j] & PG_PS_FRAME;
+ for (k = 0; k < NPTEPG; k++) {
+ if (is_dumpable(pa))
+ dump_add_page(pa);
+ pa += PAGE_SIZE;
+ }
+ continue;
+ }
+
+ pa = pd[j] & PG_FRAME;
+ /* set bit for this PTE page */
+ if (is_dumpable(pa))
+ dump_add_page(pa);
+ /* and for each valid page in this 2MB block */
+ pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
+ for (k = 0; k < NPTEPG; k++) {
+ if ((pt[k] & PG_V) == 0)
+ continue;
+ pa = pt[k] & PG_FRAME;
+ if (is_dumpable(pa))
+ dump_add_page(pa);
+ }
+ }
+ }
+
+ /* Calculate dump size. */
+ dumpsize = pmapsize;
+ dumpsize += round_page(msgbufp->msg_size);
+ dumpsize += round_page(vm_page_dump_size);
+ for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+ bits = vm_page_dump[i];
+ while (bits) {
+ bit = bsfq(bits);
+ pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+ /* Clear out undumpable pages now if needed */
+ if (is_dumpable(pa)) {
+ dumpsize += PAGE_SIZE;
+ } else {
+ dump_drop_page(pa);
+ }
+ bits &= ~(1ul << bit);
+ }
+ }
+ dumpsize += PAGE_SIZE;
+
+ /* Determine dump offset on device. */
+ if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
+ error = E2BIG;
+ goto fail;
+ }
+ dumplo = di->mediaoffset + di->mediasize - dumpsize;
+ dumplo -= sizeof(kdh) * 2;
+ progress = dumpsize;
+
+ /* Initialize mdhdr */
+ bzero(&mdhdr, sizeof(mdhdr));
+ strcpy(mdhdr.magic, MINIDUMP_MAGIC);
+ mdhdr.version = MINIDUMP_VERSION;
+ mdhdr.msgbufsize = msgbufp->msg_size;
+ mdhdr.bitmapsize = vm_page_dump_size;
+ mdhdr.pmapsize = pmapsize;
+ mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
+ mdhdr.dmapbase = DMAP_MIN_ADDRESS;
+ mdhdr.dmapend = DMAP_MAX_ADDRESS;
+
+ mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, dumpsize, di->blocksize);
+
+	printf("Dumping %llu out of %ju MB:", (unsigned long long)dumpsize >> 20,
+ ptoa((uintmax_t)physmem) / 1048576);
+
+ /* Dump leader */
+ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
+ if (error)
+ goto fail;
+ dumplo += sizeof(kdh);
+
+ /* Dump my header */
+ bzero(&fakepd, sizeof(fakepd));
+ bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
+ error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
+ if (error)
+ goto fail;
+
+ /* Dump msgbuf up front */
+ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size));
+ if (error)
+ goto fail;
+
+ /* Dump bitmap */
+ error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size));
+ if (error)
+ goto fail;
+
+ /* Dump kernel page directory pages */
+ bzero(fakepd, sizeof(fakepd));
+ pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
+ kernel_vm_end); va += NBPDP) {
+ i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+
+ /* We always write a page, even if it is zero */
+ if ((pdp[i] & PG_V) == 0) {
+ error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
+ if (error)
+ goto fail;
+ /* flush, in case we reuse fakepd in the same block */
+ error = blk_flush(di);
+ if (error)
+ goto fail;
+ continue;
+ }
+
+ /* 1GB page is represented as 512 2MB pages in a dump */
+ if ((pdp[i] & PG_PS) != 0) {
+ /* PDPE and PDP have identical layout in this case */
+ fakepd[0] = pdp[i];
+ for (j = 1; j < NPDEPG; j++)
+ fakepd[j] = fakepd[j - 1] + NBPDR;
+ error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
+ if (error)
+ goto fail;
+ /* flush, in case we reuse fakepd in the same block */
+ error = blk_flush(di);
+ if (error)
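+	/*
+	 * Every level 4 and level 3 slot aliases the same low 1GB, so the
+	 * trampoline stays mapped both at its physical address and at its
+	 * KERNBASE alias until the AP switches to the real kernel page
+	 * tables.
+	 */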
+ goto fail;
+ bzero(fakepd, sizeof(fakepd));
+ continue;
+ }
+
+ pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+ error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
+ if (error)
+ goto fail;
+ error = blk_flush(di);
+ if (error)
+ goto fail;
+ }
+
+ /* Dump memory chunks */
+ /* XXX cluster it up and use blk_dump() */
+ for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+ bits = vm_page_dump[i];
+ while (bits) {
+ bit = bsfq(bits);
+ pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+ error = blk_write(di, 0, pa, PAGE_SIZE);
+ if (error)
+ goto fail;
+ bits &= ~(1ul << bit);
+ }
+ }
+
+ error = blk_flush(di);
+ if (error)
+ goto fail;
+
+ /* Dump trailer */
+ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
+ if (error)
+ goto fail;
+ dumplo += sizeof(kdh);
+
+ /* Signal completion, signoff and exit stage left. */
+ dump_write(di, NULL, 0, 0, 0);
+ printf("\nDump complete\n");
+ return;
+
+ fail:
+ if (error < 0)
+ error = -error;
+
+ printf("\n");
+ if (error == ENOSPC) {
+ printf("Dump map grown while dumping. ");
+ if (retry_count < 5) {
+ printf("Retrying...\n");
+ goto retry;
+ }
+ printf("Dump failed.\n");
+ }
+ else if (error == ECANCELED)
+ printf("Dump aborted\n");
+ else if (error == E2BIG)
+ printf("Dump failed. Partition too small.\n");
+ else
+ printf("** DUMP FAILED (ERROR %d) **\n", error);
+}
+
+void
+dump_add_page(vm_paddr_t pa)
+{
+ int idx, bit;
+
+ pa >>= PAGE_SHIFT;
+ idx = pa >> 6; /* 2^6 = 64 */
+ bit = pa & 63;
+ atomic_set_long(&vm_page_dump[idx], 1ul << bit);
+}
+
+void
+dump_drop_page(vm_paddr_t pa)
+{
+ int idx, bit;
+
+ pa >>= PAGE_SHIFT;
+ idx = pa >> 6; /* 2^6 = 64 */
+ bit = pa & 63;
+ atomic_clear_long(&vm_page_dump[idx], 1ul << bit);
+}
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
new file mode 100644
index 0000000..31dbb3f
--- /dev/null
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -0,0 +1,1488 @@
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * Copyright (c) 2003, by Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_cpu.h"
+#include "opt_kstack_pages.h"
+#include "opt_sched.h"
+#include "opt_smp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cpuset.h>
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+#include <x86/apicreg.h>
+#include <machine/clock.h>
+#include <machine/cputypes.h>
+#include <machine/cpufunc.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <machine/tss.h>
+
+#define WARMBOOT_TARGET 0
+#define WARMBOOT_OFF (KERNBASE + 0x0467)
+#define WARMBOOT_SEG (KERNBASE + 0x0469)
+
+#define CMOS_REG (0x70)
+#define CMOS_DATA (0x71)
+#define BIOS_RESET (0x0f)
+#define BIOS_WARM (0x0a)
+
+/* lock region used by kernel profiling */
+int mcount_lock;
+
+int	mp_naps;		/* # of Application Processors */
+int boot_cpu_id = -1; /* designated BSP */
+
+extern struct pcpu __pcpu[];
+
+/* AP uses this during bootstrap. Do not staticize. */
+char *bootSTK;
+static int bootAP;
+
+/* Free these after use */
+void *bootstacks[MAXCPU];
+
+/* Temporary variables for init_secondary() */
+char *doublefault_stack;
+char *nmi_stack;
+void *dpcpu;
+
+struct pcb stoppcbs[MAXCPU];
+struct pcb **susppcbs;
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+
+#ifdef COUNT_IPIS
+/* Interrupt counts. */
+static u_long *ipi_preempt_counts[MAXCPU];
+static u_long *ipi_ast_counts[MAXCPU];
+u_long *ipi_invltlb_counts[MAXCPU];
+u_long *ipi_invlrng_counts[MAXCPU];
+u_long *ipi_invlpg_counts[MAXCPU];
+u_long *ipi_invlcache_counts[MAXCPU];
+u_long *ipi_rendezvous_counts[MAXCPU];
+static u_long *ipi_hardclock_counts[MAXCPU];
+#endif
+
+extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+
+/*
+ * Local data and functions.
+ */
+
+static volatile cpuset_t ipi_nmi_pending;
+
+/* Used to hold the APs until we are ready to release them. */
+static struct mtx ap_boot_mtx;
+
+/* Set to 1 once we're ready to let the APs out of the pen. */
+static volatile int aps_ready = 0;
+
+/*
+ * Store data from cpu_add() until later in the boot when we actually setup
+ * the APs.
+ */
+struct cpu_info {
+ int cpu_present:1;
+ int cpu_bsp:1;
+ int cpu_disabled:1;
+ int cpu_hyperthread:1;
+} static cpu_info[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
+int apic_cpuids[MAX_APIC_ID + 1];
+
+/* Holds pending bitmap based IPIs per CPU */
+static volatile u_int cpu_ipi_pending[MAXCPU];
+
+static u_int boot_address;
+static int cpu_logical; /* logical cpus per core */
+static int cpu_cores; /* cores per package */
+
+static void assign_cpu_ids(void);
+static void set_interrupt_apic_ids(void);
+static int start_all_aps(void);
+static int start_ap(int apic_id);
+static void release_aps(void *dummy);
+
+static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */
+static int hyperthreading_allowed = 1;
+static u_int bootMP_size;
+
+static void
+mem_range_AP_init(void)
+{
+ if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
+ mem_range_softc.mr_op->initAP(&mem_range_softc);
+}
+
+static void
+topo_probe_amd(void)
+{
+ int core_id_bits;
+ int id;
+
+ /* AMD processors do not support HTT. */
+ cpu_logical = 1;
+
+ if ((amd_feature2 & AMDID2_CMP) == 0) {
+ cpu_cores = 1;
+ return;
+ }
+
+ core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
+ AMDID_COREID_SIZE_SHIFT;
+ if (core_id_bits == 0) {
+ cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
+ return;
+ }
+
+ /* Fam 10h and newer should get here. */
+ for (id = 0; id <= MAX_APIC_ID; id++) {
+ /* Check logical CPU availability. */
+ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
+ continue;
+ /* Check if logical CPU has the same package ID. */
+ if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
+ continue;
+ cpu_cores++;
+ }
+}
+
+/*
+ * Round up to the next power of two, if necessary, and then
+ * take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+mask_width(u_int x)
+{
+
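+	/*
+	 * powerof2(x) is true for powers of two (and zero), so the value
+	 * is shifted by 0 in that case and by 1 otherwise; e.g.
+	 * mask_width(6) = fls(12) - 1 = 3 = ceil(log2(6)).
+	 */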
+ return (fls(x << (1 - powerof2(x))) - 1);
+}
+
+static void
+topo_probe_0x4(void)
+{
+ u_int p[4];
+ int pkg_id_bits;
+ int core_id_bits;
+ int max_cores;
+ int max_logical;
+ int id;
+
+ /* Both zero and one here mean one logical processor per package. */
+ max_logical = (cpu_feature & CPUID_HTT) != 0 ?
+ (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
+ if (max_logical <= 1)
+ return;
+
+ /*
+ * Because of uniformity assumption we examine only
+ * those logical processors that belong to the same
+ * package as BSP. Further, we count number of
+ * logical processors that belong to the same core
+ * as BSP thus deducing number of threads per core.
+ */
+ if (cpu_high >= 0x4) {
+ cpuid_count(0x04, 0, p);
+ max_cores = ((p[0] >> 26) & 0x3f) + 1;
+ } else
+ max_cores = 1;
+ core_id_bits = mask_width(max_logical/max_cores);
+ if (core_id_bits < 0)
+ return;
+ pkg_id_bits = core_id_bits + mask_width(max_cores);
+
+ for (id = 0; id <= MAX_APIC_ID; id++) {
+ /* Check logical CPU availability. */
+ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
+ continue;
+ /* Check if logical CPU has the same package ID. */
+ if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
+ continue;
+ cpu_cores++;
+ /* Check if logical CPU has the same package and core IDs. */
+ if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
+ cpu_logical++;
+ }
+
+ KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
+ ("topo_probe_0x4 couldn't find BSP"));
+
+ cpu_cores /= cpu_logical;
+ hyperthreading_cpus = cpu_logical;
+}
+
+static void
+topo_probe_0xb(void)
+{
+ u_int p[4];
+ int bits;
+ int cnt;
+ int i;
+ int logical;
+ int type;
+ int x;
+
+ /* We only support three levels for now. */
+ for (i = 0; i < 3; i++) {
+ cpuid_count(0x0b, i, p);
+
+ /* Fall back if CPU leaf 11 doesn't really exist. */
+ if (i == 0 && p[1] == 0) {
+ topo_probe_0x4();
+ return;
+ }
+
+ bits = p[0] & 0x1f;
+ logical = p[1] &= 0xffff;
+ type = (p[2] >> 8) & 0xff;
+ if (type == 0 || logical == 0)
+ break;
+ /*
+ * Because of uniformity assumption we examine only
+ * those logical processors that belong to the same
+ * package as BSP.
+ */
+ for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
+ if (!cpu_info[x].cpu_present ||
+ cpu_info[x].cpu_disabled)
+ continue;
+ if (x >> bits == boot_cpu_id >> bits)
+ cnt++;
+ }
+ if (type == CPUID_TYPE_SMT)
+ cpu_logical = cnt;
+ else if (type == CPUID_TYPE_CORE)
+ cpu_cores = cnt;
+ }
+ if (cpu_logical == 0)
+ cpu_logical = 1;
+ cpu_cores /= cpu_logical;
+}
+
+/*
+ * Both topology discovery code and code that consumes topology
+ * information assume top-down uniformity of the topology.
+ * That is, all physical packages must be identical and each
+ * core in a package must have the same number of threads.
+ * Topology information is queried only on BSP, on which this
+ * code runs and for which it can query CPUID information.
+ * Then topology is extrapolated on all packages using the
+ * uniformity assumption.
+ */
+static void
+topo_probe(void)
+{
+ static int cpu_topo_probed = 0;
+
+ if (cpu_topo_probed)
+ return;
+
+ CPU_ZERO(&logical_cpus_mask);
+ if (mp_ncpus <= 1)
+ cpu_cores = cpu_logical = 1;
+ else if (cpu_vendor_id == CPU_VENDOR_AMD)
+ topo_probe_amd();
+ else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
+ /*
+ * See Intel(R) 64 Architecture Processor
+ * Topology Enumeration article for details.
+ *
+ * Note that 0x1 <= cpu_high < 4 case should be
+ * compatible with topo_probe_0x4() logic when
+ * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
+ * or it should trigger the fallback otherwise.
+ */
+ if (cpu_high >= 0xb)
+ topo_probe_0xb();
+ else if (cpu_high >= 0x1)
+ topo_probe_0x4();
+ }
+
+ /*
+ * Fallback: assume each logical CPU is in separate
+ * physical package. That is, no multi-core, no SMT.
+ */
+ if (cpu_cores == 0 || cpu_logical == 0)
+ cpu_cores = cpu_logical = 1;
+ cpu_topo_probed = 1;
+}
+
+struct cpu_group *
+cpu_topo(void)
+{
+ int cg_flags;
+
+ /*
+	 * Determine whether any threading flags are
+	 * necessary.
+ */
+ topo_probe();
+ if (cpu_logical > 1 && hyperthreading_cpus)
+ cg_flags = CG_FLAG_HTT;
+ else if (cpu_logical > 1)
+ cg_flags = CG_FLAG_SMT;
+ else
+ cg_flags = 0;
+ if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
+ printf("WARNING: Non-uniform processors.\n");
+ printf("WARNING: Using suboptimal topology.\n");
+ return (smp_topo_none());
+ }
+ /*
+ * No multi-core or hyper-threaded.
+ */
+ if (cpu_logical * cpu_cores == 1)
+ return (smp_topo_none());
+ /*
+ * Only HTT no multi-core.
+ */
+ if (cpu_logical > 1 && cpu_cores == 1)
+ return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
+ /*
+ * Only multi-core no HTT.
+ */
+ if (cpu_cores > 1 && cpu_logical == 1)
+ return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
+ /*
+ * Both HTT and multi-core.
+ */
+ return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
+ CG_SHARE_L1, cpu_logical, cg_flags));
+}
+
+/*
+ * Calculate usable address in base memory for AP trampoline code.
+ */
+u_int
+mp_bootaddress(u_int basemem)
+{
+
+ bootMP_size = mptramp_end - mptramp_start;
+ boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
+ if (((basemem * 1024) - boot_address) < bootMP_size)
+ boot_address -= PAGE_SIZE; /* not enough, lower by 4k */
+ /* 3 levels of page table pages */
+ mptramp_pagetables = boot_address - (PAGE_SIZE * 3);
+
+ return mptramp_pagetables;
+}
+
+void
+cpu_add(u_int apic_id, char boot_cpu)
+{
+
+ if (apic_id > MAX_APIC_ID) {
+ panic("SMP: APIC ID %d too high", apic_id);
+ return;
+ }
+ KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
+ apic_id));
+ cpu_info[apic_id].cpu_present = 1;
+ if (boot_cpu) {
+ KASSERT(boot_cpu_id == -1,
+ ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
+ boot_cpu_id));
+ boot_cpu_id = apic_id;
+ cpu_info[apic_id].cpu_bsp = 1;
+ }
+ if (mp_ncpus < MAXCPU) {
+ mp_ncpus++;
+ mp_maxid = mp_ncpus - 1;
+ }
+ if (bootverbose)
+ printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
+ "AP");
+}
+
+void
+cpu_mp_setmaxid(void)
+{
+
+ /*
+ * mp_maxid should be already set by calls to cpu_add().
+ * Just sanity check its value here.
+ */
+ if (mp_ncpus == 0)
+ KASSERT(mp_maxid == 0,
+ ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
+ else if (mp_ncpus == 1)
+ mp_maxid = 0;
+ else
+ KASSERT(mp_maxid >= mp_ncpus - 1,
+ ("%s: counters out of sync: max %d, count %d", __func__,
+ mp_maxid, mp_ncpus));
+}
+
+int
+cpu_mp_probe(void)
+{
+
+ /*
+ * Always record BSP in CPU map so that the mbuf init code works
+ * correctly.
+ */
+ CPU_SETOF(0, &all_cpus);
+ if (mp_ncpus == 0) {
+ /*
+ * No CPUs were found, so this must be a UP system. Setup
+ * the variables to represent a system with a single CPU
+ * with an id of 0.
+ */
+ mp_ncpus = 1;
+ return (0);
+ }
+
+ /* At least one CPU was found. */
+ if (mp_ncpus == 1) {
+ /*
+ * One CPU was found, so this must be a UP system with
+ * an I/O APIC.
+ */
+ mp_maxid = 0;
+ return (0);
+ }
+
+ /* At least two CPUs were found. */
+ return (1);
+}
+
+/*
+ * Initialize the IPI handlers and start up the APs.
+ */
+void
+cpu_mp_start(void)
+{
+ int i;
+
+ /* Initialize the logical ID to APIC ID table. */
+ for (i = 0; i < MAXCPU; i++) {
+ cpu_apic_ids[i] = -1;
+ cpu_ipi_pending[i] = 0;
+ }
+
+ /* Install an inter-CPU IPI for TLB invalidation */
+ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Install an inter-CPU IPI for cache invalidation. */
+ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Install an inter-CPU IPI for all-CPU rendezvous */
+ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Install generic inter-CPU IPI handler */
+ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
+ SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Install an inter-CPU IPI for CPU stop/restart */
+ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Install an inter-CPU IPI for CPU suspend/resume */
+ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
+
+ /* Set boot_cpu_id if needed. */
+ if (boot_cpu_id == -1) {
+ boot_cpu_id = PCPU_GET(apic_id);
+ cpu_info[boot_cpu_id].cpu_bsp = 1;
+ } else
+ KASSERT(boot_cpu_id == PCPU_GET(apic_id),
+ ("BSP's APIC ID doesn't match boot_cpu_id"));
+
+ /* Probe logical/physical core configuration. */
+ topo_probe();
+
+ assign_cpu_ids();
+
+ /* Start each Application Processor */
+ start_all_aps();
+
+ set_interrupt_apic_ids();
+}
+
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+ const char *hyperthread;
+ int i;
+
+ printf("FreeBSD/SMP: %d package(s) x %d core(s)",
+ mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
+ if (hyperthreading_cpus > 1)
+ printf(" x %d HTT threads", cpu_logical);
+ else if (cpu_logical > 1)
+ printf(" x %d SMT threads", cpu_logical);
+ printf("\n");
+
+ /* List active CPUs first. */
+ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
+ for (i = 1; i < mp_ncpus; i++) {
+ if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
+ hyperthread = "/HT";
+ else
+ hyperthread = "";
+ printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
+ cpu_apic_ids[i]);
+ }
+
+ /* List disabled CPUs last. */
+ for (i = 0; i <= MAX_APIC_ID; i++) {
+ if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
+ continue;
+ if (cpu_info[i].cpu_hyperthread)
+ hyperthread = "/HT";
+ else
+ hyperthread = "";
+ printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
+ i);
+ }
+}
+
+/*
+ * AP CPUs call this to initialize themselves.
+ */
+void
+init_secondary(void)
+{
+ struct pcpu *pc;
+ struct nmi_pcpu *np;
+ u_int64_t msr, cr0;
+ u_int cpuid;
+ int cpu, gsel_tss, x;
+ struct region_descriptor ap_gdt;
+
+ /* Set by the startup code for us to use */
+ cpu = bootAP;
+
+ /* Init tss */
+ common_tss[cpu] = common_tss[0];
+ common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */
+ common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
+ IOPAGES * PAGE_SIZE;
+ common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
+
+ /* The NMI stack runs on IST2. */
+ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
+ common_tss[cpu].tss_ist2 = (long) np;
+
+ /* Prepare private GDT */
+ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
+ for (x = 0; x < NGDT; x++) {
+ if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
+ x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
+ ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
+ }
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
+ ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
+ ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
+ lgdt(&ap_gdt); /* does magic intra-segment return */
+
+ /* Get per-cpu data */
+ pc = &__pcpu[cpu];
+
+ /* prime data page for it to use */
+ pcpu_init(pc, cpu, sizeof(struct pcpu));
+ dpcpu_init(dpcpu, cpu);
+ pc->pc_apic_id = cpu_apic_ids[cpu];
+ pc->pc_prvspace = pc;
+ pc->pc_curthread = 0;
+ pc->pc_tssp = &common_tss[cpu];
+ pc->pc_commontssp = &common_tss[cpu];
+ pc->pc_rsp0 = 0;
+ pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
+ GPROC0_SEL];
+ pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
+ pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
+ pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
+ GUSERLDT_SEL];
+
+ /* Save the per-cpu pointer for use by the NMI handler. */
+ np->np_pcpu = (register_t) pc;
+
+ wrmsr(MSR_FSBASE, 0); /* User value */
+ wrmsr(MSR_GSBASE, (u_int64_t)pc);
+ wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
+
+ lidt(&r_idt);
+
+ gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
+ ltr(gsel_tss);
+
+ /*
+ * Set to a known state:
+ * Set by mpboot.s: CR0_PG, CR0_PE
+ * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
+ */
+ cr0 = rcr0();
+ cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
+ load_cr0(cr0);
+
+ /* Set up the fast syscall stuff */
+ msr = rdmsr(MSR_EFER) | EFER_SCE;
+ wrmsr(MSR_EFER, msr);
+ wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
+ wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
+ msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
+ ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
+ wrmsr(MSR_STAR, msr);
+ wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
+
+ /* Disable local APIC just to be sure. */
+ lapic_disable();
+
+ /* signal our startup to the BSP. */
+ mp_naps++;
+
+	/* Spin until the BSP releases the APs. */
+ while (!aps_ready)
+ ia32_pause();
+
+ /* Initialize the PAT MSR. */
+ pmap_init_pat();
+
+ /* set up CPU registers and state */
+ cpu_setregs();
+
+ /* set up SSE/NX registers */
+ initializecpu();
+
+ /* set up FPU state on the AP */
+ fpuinit();
+
+ /* A quick check from sanity claus */
+ cpuid = PCPU_GET(cpuid);
+ if (PCPU_GET(apic_id) != lapic_id()) {
+ printf("SMP: cpuid = %d\n", cpuid);
+ printf("SMP: actual apic_id = %d\n", lapic_id());
+ printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
+ panic("cpuid mismatch! boom!!");
+ }
+
+ /* Initialize curthread. */
+ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
+ PCPU_SET(curthread, PCPU_GET(idlethread));
+
+ mca_init();
+
+ mtx_lock_spin(&ap_boot_mtx);
+
+ /* Init local apic for irq's */
+ lapic_setup(1);
+
+ /* Set memory range attributes for this CPU to match the BSP */
+ mem_range_AP_init();
+
+ smp_cpus++;
+
+ CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
+ printf("SMP: AP CPU #%d Launched!\n", cpuid);
+
+ /* Determine if we are a logical CPU. */
+ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
+ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
+ CPU_SET(cpuid, &logical_cpus_mask);
+
+ if (bootverbose)
+ lapic_dump("AP");
+
+ if (smp_cpus == mp_ncpus) {
+ /* enable IPI's, tlb shootdown, freezes etc */
+ atomic_store_rel_int(&smp_started, 1);
+ smp_active = 1; /* historic */
+ }
+
+ /*
+ * Enable global pages TLB extension
+ * This also implicitly flushes the TLB
+ */
+
+ load_cr4(rcr4() | CR4_PGE);
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+ mtx_unlock_spin(&ap_boot_mtx);
+
+	/* Wait until all the APs are up. */
+ while (smp_started == 0)
+ ia32_pause();
+
+ /* Start per-CPU event timers. */
+ cpu_initclocks_ap();
+
+ sched_throw(NULL);
+
+ panic("scheduler returned us to %s", __func__);
+ /* NOTREACHED */
+}
+
+/*******************************************************************
+ * local functions and data
+ */
+
+/*
+ * We tell the I/O APIC code about all the CPUs we want to receive
+ * interrupts. If we don't want certain CPUs to receive IRQs we
+ * can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
+ */
+static void
+set_interrupt_apic_ids(void)
+{
+ u_int i, apic_id;
+
+ for (i = 0; i < MAXCPU; i++) {
+ apic_id = cpu_apic_ids[i];
+ if (apic_id == -1)
+ continue;
+ if (cpu_info[apic_id].cpu_bsp)
+ continue;
+ if (cpu_info[apic_id].cpu_disabled)
+ continue;
+
+ /* Don't let hyperthreads service interrupts. */
+ if (hyperthreading_cpus > 1 &&
+ apic_id % hyperthreading_cpus != 0)
+ continue;
+
+ intr_add_cpu(i);
+ }
+}
+
+/*
+ * Assign logical CPU IDs to local APICs.
+ */
+static void
+assign_cpu_ids(void)
+{
+ u_int i;
+
+ TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
+ &hyperthreading_allowed);
+
+ /* Check for explicitly disabled CPUs. */
+ for (i = 0; i <= MAX_APIC_ID; i++) {
+ if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
+ continue;
+
+ if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
+ cpu_info[i].cpu_hyperthread = 1;
+
+ /*
+ * Don't use HT CPU if it has been disabled by a
+ * tunable.
+ */
+ if (hyperthreading_allowed == 0) {
+ cpu_info[i].cpu_disabled = 1;
+ continue;
+ }
+ }
+
+ /* Don't use this CPU if it has been disabled by a tunable. */
+ if (resource_disabled("lapic", i)) {
+ cpu_info[i].cpu_disabled = 1;
+ continue;
+ }
+ }
+
+ if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) {
+ hyperthreading_cpus = 0;
+ cpu_logical = 1;
+ }
+
+ /*
+ * Assign CPU IDs to local APIC IDs and disable any CPUs
+ * beyond MAXCPU. CPU 0 is always assigned to the BSP.
+ *
+ * To minimize confusion for userland, we attempt to number
+ * CPUs such that all threads and cores in a package are
+ * grouped together. For now we assume that the BSP is always
+ * the first thread in a package and just start adding APs
+ * starting with the BSP's APIC ID.
+ */
+ mp_ncpus = 1;
+ cpu_apic_ids[0] = boot_cpu_id;
+ apic_cpuids[boot_cpu_id] = 0;
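+	/* Walk the APIC ID space circularly, starting just past the BSP. */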
+ for (i = boot_cpu_id + 1; i != boot_cpu_id;
+ i == MAX_APIC_ID ? i = 0 : i++) {
+ if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
+ cpu_info[i].cpu_disabled)
+ continue;
+
+ if (mp_ncpus < MAXCPU) {
+ cpu_apic_ids[mp_ncpus] = i;
+ apic_cpuids[i] = mp_ncpus;
+ mp_ncpus++;
+ } else
+ cpu_info[i].cpu_disabled = 1;
+ }
+ KASSERT(mp_maxid >= mp_ncpus - 1,
+ ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+ mp_ncpus));
+}
+
+/*
+ * start each AP in our list
+ */
+static int
+start_all_aps(void)
+{
+ vm_offset_t va = boot_address + KERNBASE;
+ u_int64_t *pt4, *pt3, *pt2;
+ u_int32_t mpbioswarmvec;
+ int apic_id, cpu, i;
+ u_char mpbiosreason;
+
+ mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+
+ /* install the AP 1st level boot code */
+ pmap_kenter(va, boot_address);
+ pmap_invalidate_page(kernel_pmap, va);
+ bcopy(mptramp_start, (void *)va, bootMP_size);
+
+ /* Locate the page tables, they'll be below the trampoline */
+ pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
+ pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
+ pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
+
+ /* Create the initial 1GB replicated page tables */
+ for (i = 0; i < 512; i++) {
+ /* Each slot of the level 4 pages points to the same level 3 page */
+ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
+ pt4[i] |= PG_V | PG_RW | PG_U;
+
+ /* Each slot of the level 3 pages points to the same level 2 page */
+ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
+ pt3[i] |= PG_V | PG_RW | PG_U;
+
+ /* The level 2 page slots are mapped with 2MB pages for 1GB. */
+ pt2[i] = i * (2 * 1024 * 1024);
+ pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+ }
+
+ /* save the current value of the warm-start vector */
+ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
+ outb(CMOS_REG, BIOS_RESET);
+ mpbiosreason = inb(CMOS_DATA);
+
+ /* setup a vector to our boot code */
+ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
+ *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
+
+ /* start each AP */
+ for (cpu = 1; cpu < mp_ncpus; cpu++) {
+ apic_id = cpu_apic_ids[cpu];
+
+ /* allocate and set up an idle stack data page */
+ bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
+ doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
+ nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
+ dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
+
+ bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
+ bootAP = cpu;
+
+ /* attempt to start the Application Processor */
+ if (!start_ap(apic_id)) {
+ /* restore the warmstart vector */
+ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
+ panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
+ }
+
+ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */
+ }
+
+ /* restore the warmstart vector */
+ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
+
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, mpbiosreason);
+
+ /* number of APs actually started */
+ return mp_naps;
+}
+
+
+/*
+ * This function starts the AP (application processor) identified
+ * by the APIC ID 'physicalCpu'. It does quite a "song and dance"
+ * to accomplish this. This is necessary because of the nuances
+ * of the different hardware we might encounter. It isn't pretty,
+ * but it seems to work.
+ */
+static int
+start_ap(int apic_id)
+{
+ int vector, ms;
+ int cpus;
+
+ /* calculate the vector */
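+	/* The STARTUP IPI vector selects the 4KB page of the real-mode entry. */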
+ vector = (boot_address >> 12) & 0xff;
+
+ /* used as a watchpoint to signal AP startup */
+ cpus = mp_naps;
+
+ ipi_startup(apic_id, vector);
+
+ /* Wait up to 5 seconds for it to start. */
+ for (ms = 0; ms < 5000; ms++) {
+ if (mp_naps > cpus)
+ return 1; /* return SUCCESS */
+ DELAY(1000);
+ }
+ return 0; /* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
+ &ipi_range_size, 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+ &ipi_masked_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+ &ipi_masked_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+ &ipi_masked_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+ &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
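+	/* The classic INIT / STARTUP / STARTUP (INIT-SIPI-SIPI) sequence. */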
+ /*
+	 * First we do an INIT IPI: this INIT IPI might be run, resetting
+	 * and running the target CPU; OR it might be latched (P5 bug),
+	 * leaving the CPU waiting for a STARTUP IPI; OR it might simply
+	 * be ignored.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+ lapic_ipi_wait(-1);
+ DELAY(10000); /* wait ~10mS */
+
+ /*
+ * next we do a STARTUP IPI: the previous INIT IPI might still be
+ * latched, (P5 bug) this 1st STARTUP would then terminate
+ * immediately, and the previously started INIT IPI would continue. OR
+	 * the previous INIT IPI has already run, and this STARTUP IPI will
+	 * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
+	 * will run.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ lapic_ipi_wait(-1);
+	DELAY(200);		/* wait ~200us */
+
+ /*
+ * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+ * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+ * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+ * recognized after hardware RESET or INIT IPI.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ lapic_ipi_wait(-1);
+	DELAY(200);		/* wait ~200us */
+}
+
+/*
+ * Send an IPI to specified CPU handling the bitmap logic.
+ */
+static void
+ipi_send_cpu(int cpu, u_int ipi)
+{
+ u_int bitmap, old_pending, new_pending;
+
+ KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
+
+ if (IPI_IS_BITMAPED(ipi)) {
+ bitmap = 1 << ipi;
+ ipi = IPI_BITMAP_VECTOR;
+ do {
+ old_pending = cpu_ipi_pending[cpu];
+ new_pending = old_pending | bitmap;
+ } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
+ old_pending, new_pending));
+ if (old_pending)
+ return;
+ }
+ lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+}
+
+/*
+ * Flush the TLB on all other CPUs
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ u_int ncpu;
+
+	ncpu = mp_ncpus - 1;	/* does not shoot down self */
+ if (ncpu < 1)
+ return; /* no other cpus */
+ if (!(read_rflags() & PSL_I))
+ panic("%s: interrupts disabled", __func__);
+ mtx_lock_spin(&smp_ipi_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ ipi_all_but_self(vector);
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+ int cpu, ncpu, othercpus;
+
+ othercpus = mp_ncpus - 1;
+ if (CPU_ISFULLSET(&mask)) {
+ if (othercpus < 1)
+ return;
+ } else {
+ CPU_CLR(PCPU_GET(cpuid), &mask);
+ if (CPU_EMPTY(&mask))
+ return;
+ }
+ if (!(read_rflags() & PSL_I))
+ panic("%s: interrupts disabled", __func__);
+ mtx_lock_spin(&smp_ipi_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ atomic_store_rel_int(&smp_tlb_wait, 0);
+ if (CPU_ISFULLSET(&mask)) {
+ ncpu = othercpus;
+ ipi_all_but_self(vector);
+ } else {
+ ncpu = 0;
+ while ((cpu = cpusetobj_ffs(&mask)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &mask);
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+ cpu, vector);
+ ipi_send_cpu(cpu, vector);
+ ncpu++;
+ }
+ }
+ while (smp_tlb_wait < ncpu)
+ ia32_pause();
+ mtx_unlock_spin(&smp_ipi_mtx);
+}
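
Both shootdown functions rely on a completion counter: the initiator publishes the address range, zeroes smp_tlb_wait with release semantics, and spins until every targeted CPU has acknowledged. The acknowledging side lives in the IPI vector code rather than in this hunk; it is assumed to look roughly like the sketch below (the function name is hypothetical):

/*
 * Assumed shape of the receiving side (hypothetical name; the real
 * handler is reached via apic_vector.S): perform the invalidation the
 * vector asks for, then acknowledge so the initiator stops spinning.
 */
static void
invltlb_handler_sketch(void)
{
	invltlb();				/* flush this CPU's TLB */
	atomic_add_int(&smp_tlb_wait, 1);	/* acknowledge */
}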
+
+void
+smp_cache_flush(void)
+{
+
+ if (smp_started)
+ smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+}
+
+void
+smp_invltlb(void)
+{
+
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
+}
+
+void
+smp_invlpg(vm_offset_t addr)
+{
+
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+}
+
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+
+ if (smp_started) {
+ smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+}
+
+void
+smp_masked_invltlb(cpuset_t mask)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_global++;
+#endif
+ }
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_page++;
+#endif
+ }
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_masked_range++;
+ ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+}
+
+void
+ipi_bitmap_handler(struct trapframe frame)
+{
+ struct trapframe *oldframe;
+ struct thread *td;
+ int cpu = PCPU_GET(cpuid);
+ u_int ipi_bitmap;
+
+ critical_enter();
+ td = curthread;
+ td->td_intr_nesting_level++;
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = &frame;
+ ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
+ if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+#ifdef COUNT_IPIS
+ (*ipi_preempt_counts[cpu])++;
+#endif
+ sched_preempt(td);
+ }
+ if (ipi_bitmap & (1 << IPI_AST)) {
+#ifdef COUNT_IPIS
+ (*ipi_ast_counts[cpu])++;
+#endif
+ /* Nothing to do for AST */
+ }
+ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
+#ifdef COUNT_IPIS
+ (*ipi_hardclock_counts[cpu])++;
+#endif
+ hardclockintr();
+ }
+ td->td_intr_frame = oldframe;
+ td->td_intr_nesting_level--;
+ critical_exit();
+}
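
ipi_send_cpu() and this handler form a simple coalescing protocol: only the sender that transitions cpu_ipi_pending from zero posts the hardware vector, and the handler drains every accumulated bit in one atomic swap. A standalone sketch of that protocol in C11 atomics, with the compare-and-set loop collapsed into fetch_or for brevity:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint pending;	/* stand-in for cpu_ipi_pending[cpu] */

static int
send_bit(unsigned int bit)	/* returns 1 if a vector must be sent */
{

	return (atomic_fetch_or(&pending, 1u << bit) == 0);
}

int
main(void)
{

	printf("vector? %d\n", send_bit(1));	/* 1: first setter posts */
	printf("vector? %d\n", send_bit(2));	/* 0: coalesced */
	printf("drained %#x\n", atomic_exchange(&pending, 0)); /* 0x6 */
	return (0);
}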
+
+/*
+ * send an IPI to a set of cpus.
+ */
+void
+ipi_selected(cpuset_t cpus, u_int ipi)
+{
+ int cpu;
+
+ /*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);
+
+ while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &cpus);
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+ ipi_send_cpu(cpu, ipi);
+ }
+}
+
+/*
+ * send an IPI to a specific CPU.
+ */
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+
+ /*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);
+
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+ ipi_send_cpu(cpu, ipi);
+}
+
+/*
+ * send an IPI to all CPUs EXCEPT myself
+ */
+void
+ipi_all_but_self(u_int ipi)
+{
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+
+ if (IPI_IS_BITMAPED(ipi)) {
+ ipi_selected(other_cpus, ipi);
+ return;
+ }
+
+ /*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);
+
+ CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+ lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+}
+
+int
+ipi_nmi_handler(void)
+{
+ u_int cpuid;
+
+ /*
+	 * As long as there is not a simple way to know about an NMI's
+ * source, if the bitmask for the current CPU is present in
+ * the global pending bitword an IPI_STOP_HARD has been issued
+ * and should be handled.
+ */
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &ipi_nmi_pending))
+ return (1);
+
+ CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending);
+ cpustop_handler();
+ return (0);
+}
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+ u_int cpu;
+
+ cpu = PCPU_GET(cpuid);
+
+ savectx(&stoppcbs[cpu]);
+
+ /* Indicate that we are stopped */
+ CPU_SET_ATOMIC(cpu, &stopped_cpus);
+
+ /* Wait for restart */
+ while (!CPU_ISSET(cpu, &started_cpus))
+ ia32_pause();
+
+ CPU_CLR_ATOMIC(cpu, &started_cpus);
+ CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+
+ if (cpu == 0 && cpustop_restartfunc != NULL) {
+ cpustop_restartfunc();
+ cpustop_restartfunc = NULL;
+ }
+}
+
+/*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+ u_int cpu;
+
+ cpu = PCPU_GET(cpuid);
+
+ if (savectx(susppcbs[cpu])) {
+ ctx_fpusave(susppcbs[cpu]->pcb_fpususpend);
+ wbinvd();
+ CPU_SET_ATOMIC(cpu, &suspended_cpus);
+ } else {
+ pmap_init_pat();
+ initializecpu();
+ PCPU_SET(switchtime, 0);
+ PCPU_SET(switchticks, ticks);
+
+ /* Indicate that we are resumed */
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+ }
+
+ /* Wait for resume */
+ while (!CPU_ISSET(cpu, &started_cpus))
+ ia32_pause();
+
+ /* Resume MCA and local APIC */
+ mca_resume();
+ lapic_setup(0);
+
+ CPU_CLR_ATOMIC(cpu, &started_cpus);
+}
+
+/*
+ * This is called once the rest of the system is up and running and we're
+ * ready to let the APs out of the pen.
+ */
+static void
+release_aps(void *dummy __unused)
+{
+
+ if (mp_ncpus == 1)
+ return;
+ atomic_store_rel_int(&aps_ready, 1);
+ while (smp_started == 0)
+ ia32_pause();
+}
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
+
+#ifdef COUNT_IPIS
+/*
+ * Setup interrupt counters for IPI handlers.
+ */
+static void
+mp_ipi_intrcnt(void *dummy)
+{
+ char buf[64];
+ int i;
+
+ CPU_FOREACH(i) {
+ snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
+ intrcnt_add(buf, &ipi_invltlb_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
+ intrcnt_add(buf, &ipi_invlrng_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
+ intrcnt_add(buf, &ipi_invlpg_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
+ intrcnt_add(buf, &ipi_invlcache_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
+ intrcnt_add(buf, &ipi_preempt_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:ast", i);
+ intrcnt_add(buf, &ipi_ast_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
+ intrcnt_add(buf, &ipi_rendezvous_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
+ intrcnt_add(buf, &ipi_hardclock_counts[i]);
+ }
+}
+SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
+#endif
+
diff --git a/sys/amd64/amd64/mp_watchdog.c b/sys/amd64/amd64/mp_watchdog.c
new file mode 100644
index 0000000..5cbd649
--- /dev/null
+++ b/sys/amd64/amd64/mp_watchdog.c
@@ -0,0 +1,211 @@
+/*-
+ * Copyright (c) 2004 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_mp_watchdog.h"
+#include "opt_sched.h"
+
+#ifdef SCHED_ULE
+#error MP_WATCHDOG cannot currently be used with SCHED_ULE
+#endif
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+#include <machine/apicvar.h>
+#include <machine/mp_watchdog.h>
+
+/*
+ * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work
+ * from being scheduled there, and uses it as a "watchdog" to detect kernel
+ * failure on other CPUs. This is made reasonable by inclusion of logical
+ * processors in Xeon hardware. The watchdog is configured by setting the
+ * debug.watchdog sysctl/tunable to the CPU of interest. A callout will then
+ * begin executing, resetting a timer that is gradually lowered by the watching
+ * thread. If the timer reaches 0, the watchdog fires by either dropping
+ * directly to the debugger, or by sending an NMI IPI to the boot processor.
+ * This is a somewhat less efficient substitute for dedicated watchdog
+ * hardware, but can be quite an effective tool for debugging hangs.
+ *
+ * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but
+ * doesn't yet.
+ */
+static int watchdog_cpu = -1;
+static int watchdog_dontfire = 1;
+static int watchdog_timer = -1;
+static int watchdog_nmi = 1;
+
+TUNABLE_INT("debug.watchdog", &watchdog_cpu);
+SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RW, &watchdog_nmi, 0,
+ "IPI the boot processor with an NMI to enter the debugger");
+
+static struct callout watchdog_callout;
+
+static void watchdog_change(int wdcpu);
+
+/*
+ * Number of seconds before the watchdog will fire if the callout fails to
+ * reset the timer.
+ */
+#define WATCHDOG_THRESHOLD 10
+
+static void
+watchdog_init(void *arg)
+{
+
+ callout_init(&watchdog_callout, CALLOUT_MPSAFE);
+ if (watchdog_cpu != -1)
+ watchdog_change(watchdog_cpu);
+}
+
+/*
+ * This callout resets a timer until the watchdog kicks in. It acquires some
+ * critical locks to make sure things haven't gotten wedged with those locks
+ * held.
+ */
+static void
+watchdog_function(void *arg)
+{
+
+ /*
+ * Since the timer ran, we must not be wedged. Acquire some critical
+ * locks to make sure. Then reset the timer.
+ */
+ mtx_lock(&Giant);
+ watchdog_timer = WATCHDOG_THRESHOLD;
+ mtx_unlock(&Giant);
+ callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL);
+}
+SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL);
+
+static void
+watchdog_change(int wdcpu)
+{
+
+ if (wdcpu == -1 || wdcpu == 0xffffffff) {
+ /*
+ * Disable the watchdog.
+ */
+ watchdog_cpu = -1;
+ watchdog_dontfire = 1;
+ callout_stop(&watchdog_callout);
+ printf("watchdog stopped\n");
+ } else {
+ watchdog_timer = WATCHDOG_THRESHOLD;
+ watchdog_dontfire = 0;
+ watchdog_cpu = wdcpu;
+ callout_reset(&watchdog_callout, 1 * hz, watchdog_function,
+ NULL);
+ }
+}
+
+/*
+ * This sysctl sets which CPU is the watchdog CPU. Set to -1 or 0xffffffff
+ * to disable the watchdog.
+ */
+static int
+sysctl_watchdog(SYSCTL_HANDLER_ARGS)
+{
+ int error, temp;
+
+ temp = watchdog_cpu;
+ error = sysctl_handle_int(oidp, &temp, 0, req);
+ if (error)
+ return (error);
+
+ if (req->newptr != NULL)
+ watchdog_change(temp);
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_watchdog, "I", "");
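
The handler above makes debug.watchdog writable at runtime, so the watchdog CPU can be chosen with "sysctl debug.watchdog=1" or programmatically; a minimal userland sketch using the standard sysctlbyname(3) interface:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>

int
main(void)
{
	int cpu = 1;		/* CPU to dedicate to the watchdog */

	if (sysctlbyname("debug.watchdog", NULL, NULL, &cpu,
	    sizeof(cpu)) != 0)
		err(1, "debug.watchdog");
	return (0);
}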
+
+/*
+ * Drop into the debugger by sending an IPI NMI to the boot processor.
+ */
+static void
+watchdog_ipi_nmi(void)
+{
+
+ /*
+ * Deliver NMI to the boot processor. Why not?
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI,
+ boot_cpu_id);
+ lapic_ipi_wait(-1);
+}
+
+/*
+ * ap_watchdog() is called by the SMP idle loop code. It works on the same
+ * premise that the disabling of logical processors does: that if the cpu is
+ * idle, then it can ignore the world from then on, as nothing will be
+ * scheduled on it. Leaving aside multi-runqueue schedulers (SCHED_ULE) and
+ * explicit process migration (sched_bind()), this is not an unreasonable
+ * assumption.
+ */
+void
+ap_watchdog(u_int cpuid)
+{
+ char old_pcomm[MAXCOMLEN + 1];
+ struct proc *p;
+
+ if (watchdog_cpu != cpuid)
+ return;
+
+ printf("watchdog started on cpu %d\n", cpuid);
+ p = curproc;
+ bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1);
+ snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid);
+ while (1) {
+ DELAY(1000000); /* One second. */
+ if (watchdog_cpu != cpuid)
+ break;
+ atomic_subtract_int(&watchdog_timer, 1);
+ if (watchdog_timer < 4)
+ printf("Watchdog timer: %d\n", watchdog_timer);
+ if (watchdog_timer == 0 && watchdog_dontfire == 0) {
+ printf("Watchdog firing!\n");
+ watchdog_dontfire = 1;
+ if (watchdog_nmi)
+ watchdog_ipi_nmi();
+ else
+ kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog");
+ }
+ }
+ bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1);
+ printf("watchdog stopped on cpu %d\n", cpuid);
+}
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
new file mode 100644
index 0000000..ec30c72
--- /dev/null
+++ b/sys/amd64/amd64/mpboot.S
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h> /* miscellaneous asm macros */
+#include <machine/specialreg.h>
+
+#include "assym.s"
+
+ .data /* So we can modify it */
+
+ .p2align 4,0
+ .globl mptramp_start
+mptramp_start:
+ .code16
+ /*
+ * The AP enters here in response to the startup IPI.
+ * We are in real mode. %cs is the only segment register set.
+ */
+ cli /* make sure no interrupts */
+ mov %cs, %ax /* copy %cs to %ds. Remember these */
+ mov %ax, %ds /* are offsets rather than selectors */
+ mov %ax, %ss
+
+ /*
+	 * Find relocation base and patch the gdt descriptor and ljmp targets
+ */
+ xorl %ebx,%ebx
+ mov %cs, %bx
+ sall $4, %ebx /* %ebx is now our relocation base */
+ orl %ebx, lgdt_desc-mptramp_start+2
+ orl %ebx, jmp_32-mptramp_start+2
+ orl %ebx, jmp_64-mptramp_start+1
+
+ /*
+ * Load the descriptor table pointer. We'll need it when running
+ * in 16 bit protected mode.
+ */
+ lgdt lgdt_desc-mptramp_start
+
+ /* Enable protected mode */
+ movl $CR0_PE, %eax
+ mov %eax, %cr0
+
+ /*
+ * Now execute a far jump to turn on protected mode. This
+ * causes the segment registers to turn into selectors and causes
+ * %cs to be loaded from the gdt.
+ *
+ * The following instruction is:
+ * ljmpl $bootcode-gdt, $protmode-mptramp_start
+ * but gas cannot assemble that. And besides, we patch the targets
+	 * in early startup and it's a little clearer what we are patching.
+ */
+jmp_32:
+ .byte 0x66 /* size override to 32 bits */
+ .byte 0xea /* opcode for far jump */
+ .long protmode-mptramp_start /* offset in segment */
+ .word bootcode-gdt /* index in gdt for 32 bit code */
+
+ /*
+ * At this point, we are running in 32 bit legacy protected mode.
+ */
+ .code32
+protmode:
+ mov $bootdata-gdt, %eax
+ mov %ax, %ds
+
+	/* Turn on the PAE and PSE bits for when paging is enabled */
+ mov %cr4, %eax
+ orl $(CR4_PAE | CR4_PSE), %eax
+ mov %eax, %cr4
+
+ /*
+ * Enable EFER.LME so that we get long mode when all the prereqs are
+ * in place. In this case, it turns on when CR0_PG is finally enabled.
+	 * Pick up a few other EFER bits that we'll need while we're here.
+ */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ orl $EFER_LME | EFER_SCE, %eax
+ wrmsr
+
+ /*
+ * Point to the embedded page tables for startup. Note that this
+ * only gets accessed after we're actually in 64 bit mode, however
+ * we can only set the bottom 32 bits of %cr3 in this state. This
+ * means we are required to use a temporary page table that is below
+ * the 4GB limit. %ebx is still our relocation base. We could just
+ * subtract 3 * PAGE_SIZE, but that would be too easy.
+ */
+ leal mptramp_pagetables-mptramp_start(%ebx),%eax
+ movl (%eax), %eax
+ mov %eax, %cr3
+
+ /*
+	 * Finally, switch to long mode by enabling paging. We have
+	 * to be very careful here because all the segmentation disappears
+	 * out from underneath us. The spec says we can depend on the
+	 * subsequent pipelined branch to execute, but *only if* everything
+ * is still identity mapped. If any mappings change, the pipeline
+ * will flush.
+ */
+ mov %cr0, %eax
+ orl $CR0_PG, %eax
+ mov %eax, %cr0
+
+ /*
+	 * At this point paging is enabled, and we are in "compatibility" mode.
+ * We do another far jump to reload %cs with the 64 bit selector.
+ * %cr3 points to a 4-level page table page.
+ * We cannot yet jump all the way to the kernel because we can only
+ * specify a 32 bit linear address. So, yet another trampoline.
+ *
+ * The following instruction is:
+ * ljmp $kernelcode-gdt, $tramp_64-mptramp_start
+ * but gas cannot assemble that. And besides, we patch the targets
+	 * in early startup and it's a little clearer what we are patching.
+ */
+jmp_64:
+ .byte 0xea /* opcode for far jump */
+ .long tramp_64-mptramp_start /* offset in segment */
+ .word kernelcode-gdt /* index in gdt for 64 bit code */
+
+ /*
+ * Yeehar! We're running in 64 bit mode! We can mostly ignore our
+ * segment registers, and get on with it.
+ * Note that we are running at the correct virtual address, but with
+	 * a 1:1 1GB mirrored mapping over the entire address space. We had better
+ * switch to a real %cr3 promptly so that we can get to the direct map
+ * space. Remember that jmp is relative and that we've been relocated,
+ * so use an indirect jump.
+ */
+ .code64
+tramp_64:
+ movabsq $entry_64,%rax /* 64 bit immediate load */
+ jmp *%rax
+
+ .p2align 4,0
+gdt:
+ /*
+ * All segment descriptor tables start with a null descriptor
+ */
+ .long 0x00000000
+ .long 0x00000000
+
+ /*
+ * This is the 64 bit long mode code descriptor. There is no
+ * 64 bit data descriptor.
+ */
+kernelcode:
+ .long 0x00000000
+ .long 0x00209800
+
+ /*
+ * This is the descriptor for the 32 bit boot code.
+ * %cs: +A, +R, -C, DPL=0, +P, +D, +G
+ * Accessed, Readable, Present, 32 bit, 4G granularity
+ */
+bootcode:
+ .long 0x0000ffff
+ .long 0x00cf9b00
+
+ /*
+ * This is the descriptor for the 32 bit boot data.
+ * We load it into %ds and %ss. The bits for each selector
+ * are interpreted slightly differently.
+ * %ds: +A, +W, -E, DPL=0, +P, +D, +G
+ * %ss: +A, +W, -E, DPL=0, +P, +B, +G
+ * Accessed, Writeable, Expand up, Present, 32 bit, 4GB
+ * For %ds, +D means 'default operand size is 32 bit'.
+ * For %ss, +B means the stack register is %esp rather than %sp.
+ */
+bootdata:
+ .long 0x0000ffff
+ .long 0x00cf9300
+
+gdtend:
+
+ /*
+ * The address of our page table pages that the boot code
+ * uses to trampoline up to kernel address space.
+ */
+ .globl mptramp_pagetables
+mptramp_pagetables:
+ .long 0
+
+ /*
+ * The pseudo descriptor for lgdt to use.
+ */
+lgdt_desc:
+ .word gdtend-gdt /* Length */
+ .long gdt-mptramp_start /* Offset plus %ds << 4 */
+
+ .globl mptramp_end
+mptramp_end:
+
+ /*
+ * From here on down is executed in the kernel .text section.
+ *
+ * Load a real %cr3 that has all the direct map stuff and switches
+ * off the 1GB replicated mirror. Load a stack pointer and jump
+ * into AP startup code in C.
+ */
+ .text
+ .code64
+ .p2align 4,0
+entry_64:
+ movq KPML4phys, %rax
+ movq %rax, %cr3
+ movq bootSTK, %rsp
+ jmp init_secondary
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
new file mode 100644
index 0000000..1b1c86c
--- /dev/null
+++ b/sys/amd64/amd64/pmap.c
@@ -0,0 +1,5538 @@
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department and William Jolitz of UUNET Technologies Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
+ */
+/*-
+ * Copyright (c) 2003 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Jake Burkholder,
+ * Safeport Network Services, and Network Associates Laboratories, the
+ * Security Research Division of Network Associates, Inc. under
+ * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
+ * CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Manages physical address maps.
+ *
+ * Since the information managed by this module is
+ * also stored by the logical address mapping module,
+ * this module may throw away valid virtual-to-physical
+ * mappings at almost any time. However, invalidations
+ * of virtual-to-physical mappings must be done as
+ * requested.
+ *
+ * In order to cope with hardware architectures which
+ * make virtual-to-physical map invalidates expensive,
+ * this module may delay invalidation or reduced-protection
+ * operations until such time as they are actually
+ * necessary. This module is given full information as
+ * to which processors are currently using which maps,
+ * and to when physical maps must be made correct.
+ */
+
+#include "opt_pmap.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/vmmeter.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#ifdef SMP
+#include <sys/smp.h>
+#else
+#include <sys/cpuset.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_radix.h>
+#include <vm/vm_reserv.h>
+#include <vm/uma.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/specialreg.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+
+#if !defined(DIAGNOSTIC)
+#ifdef __GNUC_GNU_INLINE__
+#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
+#else
+#define PMAP_INLINE extern inline
+#endif
+#else
+#define PMAP_INLINE
+#endif
+
+#ifdef PV_STATS
+#define PV_STAT(x) do { x ; } while (0)
+#else
+#define PV_STAT(x) do { } while (0)
+#endif
+
+#define pa_index(pa) ((pa) >> PDRSHIFT)
+#define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
+
+#define NPV_LIST_LOCKS MAXCPU
+
+#define PHYS_TO_PV_LIST_LOCK(pa) \
+ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+
+#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
+ struct rwlock **_lockp = (lockp); \
+ struct rwlock *_new_lock; \
+ \
+ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
+ if (_new_lock != *_lockp) { \
+ if (*_lockp != NULL) \
+ rw_wunlock(*_lockp); \
+ *_lockp = _new_lock; \
+ rw_wlock(*_lockp); \
+ } \
+} while (0)
+
+#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
+ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
+
+#define RELEASE_PV_LIST_LOCK(lockp) do { \
+ struct rwlock **_lockp = (lockp); \
+ \
+ if (*_lockp != NULL) { \
+ rw_wunlock(*_lockp); \
+ *_lockp = NULL; \
+ } \
+} while (0)
+
+#define VM_PAGE_TO_PV_LIST_LOCK(m) \
+ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
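
The striping above simply hashes a physical address to one of NPV_LIST_LOCKS rwlocks by its 2MB frame number, so all 4KB pages within one superpage frame contend on the same lock. A standalone check of that property (PDRSHIFT is the 2MB shift; the lock count here stands in for MAXCPU):

#include <stdio.h>

#define PDRSHIFT	21		/* log2(2MB) */
#define NPV_LIST_LOCKS	64		/* stand-in for MAXCPU */

int
main(void)
{
	unsigned long pa1 = 0x12345678UL;	/* two addresses in the */
	unsigned long pa2 = 0x123f0000UL;	/* same 2MB frame */

	printf("lock(%#lx) = %lu\n", pa1, (pa1 >> PDRSHIFT) % NPV_LIST_LOCKS);
	printf("lock(%#lx) = %lu\n", pa2, (pa2 >> PDRSHIFT) % NPV_LIST_LOCKS);
	/* both print index 17 */
	return (0);
}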
+
+struct pmap kernel_pmap_store;
+
+vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
+vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
+
+int nkpt;
+SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
+ "Number of kernel page table pages allocated on bootup");
+
+static int ndmpdp;
+static vm_paddr_t dmaplimit;
+vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
+pt_entry_t pg_nx;
+
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
+
+static int pat_works = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
+ "Is page attribute table fully functional?");
+
+static int pg_ps_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
+ "Are large page mappings enabled?");
+
+#define PAT_INDEX_SIZE 8
+static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
+
+static u_int64_t KPTphys; /* phys addr of kernel level 1 */
+static u_int64_t KPDphys; /* phys addr of kernel level 2 */
+u_int64_t KPDPphys; /* phys addr of kernel level 3 */
+u_int64_t KPML4phys; /* phys addr of kernel level 4 */
+
+static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
+static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
+
+static struct rwlock_padalign pvh_global_lock;
+
+/*
+ * Data for the pv entry allocation mechanism
+ */
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
+static struct mtx pv_chunks_mutex;
+static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
+static struct md_page *pv_table;
+
+/*
+ * All those kernel PT submaps that BSD is so fond of
+ */
+pt_entry_t *CMAP1 = 0;
+caddr_t CADDR1 = 0;
+
+/*
+ * Crashdump maps.
+ */
+static caddr_t crashdumpmap;
+
+static void free_pv_chunk(struct pv_chunk *pc);
+static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
+static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
+static int popcnt_pc_map_elem(uint64_t elem);
+static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
+static void reserve_pv_entries(pmap_t pmap, int needed,
+ struct rwlock **lockp);
+static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp);
+static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp);
+static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp);
+static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
+static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
+ vm_offset_t va);
+static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
+
+static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
+static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
+ vm_offset_t va, struct rwlock **lockp);
+static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
+ vm_offset_t va);
+static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot, struct rwlock **lockp);
+static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
+ vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
+static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
+static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
+static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
+static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
+static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
+static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
+static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+ struct rwlock **lockp);
+static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
+ vm_prot_t prot);
+static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
+static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
+ vm_page_t *free, struct rwlock **lockp);
+static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
+ vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
+ struct rwlock **lockp);
+static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
+static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
+ vm_page_t *free);
+static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
+ vm_page_t m, struct rwlock **lockp);
+static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
+ pd_entry_t newpde);
+static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+
+static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
+ struct rwlock **lockp);
+static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
+ struct rwlock **lockp);
+static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
+ struct rwlock **lockp);
+
+static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_page_t *free);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
+static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
+
+CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
+CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
+
+/*
+ * Move the kernel virtual free pointer to the next
+ * 2MB boundary. This is used to help improve performance
+ * by using a large (2MB) page for much of the kernel
+ * (.text, .data, .bss).
+ */
+static vm_offset_t
+pmap_kmem_choose(vm_offset_t addr)
+{
+ vm_offset_t newaddr = addr;
+
+ newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
+ return (newaddr);
+}
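
With NBPDR = 2MB, the expression above rounds any address up to the next superpage boundary; a quick standalone check:

#include <stdio.h>

#define NBPDR	(1UL << 21)		/* 2MB */

int
main(void)
{
	unsigned long addr = 0xffffffff8062a000UL;

	printf("%#lx -> %#lx\n", addr, (addr + NBPDR - 1) & ~(NBPDR - 1));
	/* prints: 0xffffffff8062a000 -> 0xffffffff80800000 */
	return (0);
}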
+
+/********************/
+/* Inline functions */
+/********************/
+
+/* Return a non-clipped PD index for a given VA */
+static __inline vm_pindex_t
+pmap_pde_pindex(vm_offset_t va)
+{
+ return (va >> PDRSHIFT);
+}
+
+
+/* Return various clipped indexes for a given VA */
+static __inline vm_pindex_t
+pmap_pte_index(vm_offset_t va)
+{
+
+ return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pde_index(vm_offset_t va)
+{
+
+ return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pdpe_index(vm_offset_t va)
+{
+
+ return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pml4e_index(vm_offset_t va)
+{
+
+ return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
+}
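
Each helper above peels nine bits off the virtual address, one level per 512-entry table. A standalone check for a sample kernel VA (KERNBASE + 2MB + 4KB), using the amd64 shift values:

#include <stdio.h>

int
main(void)
{
	unsigned long va = 0xffffffff80201000UL;

	printf("pml4 %lu pdp %lu pd %lu pt %lu\n",
	    (va >> 39) & 0x1ff,		/* PML4SHIFT */
	    (va >> 30) & 0x1ff,		/* PDPSHIFT */
	    (va >> 21) & 0x1ff,		/* PDRSHIFT */
	    (va >> 12) & 0x1ff);	/* PAGE_SHIFT */
	/* prints: pml4 511 pdp 510 pd 1 pt 1 */
	return (0);
}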
+
+/* Return a pointer to the PML4 slot that corresponds to a VA */
+static __inline pml4_entry_t *
+pmap_pml4e(pmap_t pmap, vm_offset_t va)
+{
+
+ return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+}
+
+/* Return a pointer to the PDP slot that corresponds to a VA */
+static __inline pdp_entry_t *
+pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
+{
+ pdp_entry_t *pdpe;
+
+ pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
+ return (&pdpe[pmap_pdpe_index(va)]);
+}
+
+/* Return a pointer to the PDP slot that corresponds to a VA */
+static __inline pdp_entry_t *
+pmap_pdpe(pmap_t pmap, vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+
+ pml4e = pmap_pml4e(pmap, va);
+ if ((*pml4e & PG_V) == 0)
+ return (NULL);
+ return (pmap_pml4e_to_pdpe(pml4e, va));
+}
+
+/* Return a pointer to the PD slot that corresponds to a VA */
+static __inline pd_entry_t *
+pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
+{
+ pd_entry_t *pde;
+
+ pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
+ return (&pde[pmap_pde_index(va)]);
+}
+
+/* Return a pointer to the PD slot that corresponds to a VA */
+static __inline pd_entry_t *
+pmap_pde(pmap_t pmap, vm_offset_t va)
+{
+ pdp_entry_t *pdpe;
+
+ pdpe = pmap_pdpe(pmap, va);
+ if (pdpe == NULL || (*pdpe & PG_V) == 0)
+ return (NULL);
+ return (pmap_pdpe_to_pde(pdpe, va));
+}
+
+/* Return a pointer to the PT slot that corresponds to a VA */
+static __inline pt_entry_t *
+pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
+{
+ pt_entry_t *pte;
+
+ pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
+ return (&pte[pmap_pte_index(va)]);
+}
+
+/* Return a pointer to the PT slot that corresponds to a VA */
+static __inline pt_entry_t *
+pmap_pte(pmap_t pmap, vm_offset_t va)
+{
+ pd_entry_t *pde;
+
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL || (*pde & PG_V) == 0)
+ return (NULL);
+ if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
+ return ((pt_entry_t *)pde);
+ return (pmap_pde_to_pte(pde, va));
+}
+
+static __inline void
+pmap_resident_count_inc(pmap_t pmap, int count)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ pmap->pm_stats.resident_count += count;
+}
+
+static __inline void
+pmap_resident_count_dec(pmap_t pmap, int count)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ pmap->pm_stats.resident_count -= count;
+}
+
+PMAP_INLINE pt_entry_t *
+vtopte(vm_offset_t va)
+{
+ u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+
+ return (PTmap + ((va >> PAGE_SHIFT) & mask));
+}
+
+static __inline pd_entry_t *
+vtopde(vm_offset_t va)
+{
+ u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+
+ return (PDmap + ((va >> PDRSHIFT) & mask));
+}
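
vtopte() and vtopde() work because of the recursive PML4 slot installed later in create_pagetables(): with the level-4 table mapped into itself, every PTE in the system appears at a fixed linear window (PTmap) indexed by the page number of the VA. A standalone sketch of where such a window lands, assuming the self-map sits in PML4 slot 256 (the slot number is an assumption for illustration):

#include <stdio.h>

int
main(void)
{
	unsigned long slot = 256;	/* assumed self-map PML4 index */
	unsigned long raw = slot << 39;	/* slot selects bits 47..39 */
	long va = (long)(raw << 16) >> 16;	/* sign-extend bit 47 */

	printf("PTmap base = %#lx\n", (unsigned long)va);
	/* prints: PTmap base = 0xffff800000000000 */
	return (0);
}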
+
+static u_int64_t
+allocpages(vm_paddr_t *firstaddr, int n)
+{
+ u_int64_t ret;
+
+ ret = *firstaddr;
+ bzero((void *)ret, n * PAGE_SIZE);
+ *firstaddr += n * PAGE_SIZE;
+ return (ret);
+}
+
+CTASSERT(powerof2(NDMPML4E));
+
+/* number of kernel PDP slots */
+#define NKPDPE(ptpgs) howmany((ptpgs), NPDEPG)
+
+static void
+nkpt_init(vm_paddr_t addr)
+{
+ int pt_pages;
+
+#ifdef NKPT
+ pt_pages = NKPT;
+#else
+ pt_pages = howmany(addr, 1 << PDRSHIFT);
+ pt_pages += NKPDPE(pt_pages);
+
+ /*
+ * Add some slop beyond the bare minimum required for bootstrapping
+ * the kernel.
+ *
+ * This is quite important when allocating KVA for kernel modules.
+ * The modules are required to be linked in the negative 2GB of
+ * the address space. If we run out of KVA in this region then
+ * pmap_growkernel() will need to allocate page table pages to map
+ * the entire 512GB of KVA space which is an unnecessary tax on
+ * physical memory.
+ */
+ pt_pages += 8; /* 16MB additional slop for kernel modules */
+#endif
+ nkpt = pt_pages;
+}
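
A worked example of that sizing, for a bootstrap ceiling of 4GB of physical address space (addr is in bytes here for simplicity; the kernel computes from page counts):

#include <stdio.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define NPDEPG		512
#define NKPDPE(ptpgs)	howmany((ptpgs), NPDEPG)

int
main(void)
{
	unsigned long addr = 4UL << 30;		/* 4GB */
	unsigned long pt_pages;

	pt_pages = howmany(addr, 1UL << 21);	/* 2048 PT pages */
	pt_pages += NKPDPE(pt_pages);		/* + 4 PD pages */
	pt_pages += 8;				/* + module slop */
	printf("nkpt = %lu\n", pt_pages);	/* prints: nkpt = 2060 */
	return (0);
}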
+
+static void
+create_pagetables(vm_paddr_t *firstaddr)
+{
+ int i, j, ndm1g, nkpdpe;
+
+ /* Allocate page table pages for the direct map */
+ ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
+ if (ndmpdp < 4) /* Minimum 4GB of dirmap */
+ ndmpdp = 4;
+ DMPDPphys = allocpages(firstaddr, NDMPML4E);
+ ndm1g = 0;
+ if ((amd_feature & AMDID_PAGE1GB) != 0)
+ ndm1g = ptoa(Maxmem) >> PDPSHIFT;
+ if (ndm1g < ndmpdp)
+ DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
+ dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
+
+ /* Allocate pages */
+ KPML4phys = allocpages(firstaddr, 1);
+ KPDPphys = allocpages(firstaddr, NKPML4E);
+
+ /*
+ * Allocate the initial number of kernel page table pages required to
+ * bootstrap. We defer this until after all memory-size dependent
+ * allocations are done (e.g. direct map), so that we don't have to
+ * build in too much slop in our estimate.
+ */
+ nkpt_init(*firstaddr);
+ nkpdpe = NKPDPE(nkpt);
+
+ KPTphys = allocpages(firstaddr, nkpt);
+ KPDphys = allocpages(firstaddr, nkpdpe);
+
+ /* Fill in the underlying page table pages */
+ /* Read-only from zero to physfree */
+ /* XXX not fully used, underneath 2M pages */
+ for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
+ ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
+ ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
+ }
+
+ /* Now map the page tables at their location within PTmap */
+ for (i = 0; i < nkpt; i++) {
+ ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
+ ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
+ }
+
+ /* Map from zero to end of allocations under 2M pages */
+ /* This replaces some of the KPTphys entries above */
+ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
+ ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
+ ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
+ }
+
+ /* And connect up the PD to the PDP */
+ for (i = 0; i < nkpdpe; i++) {
+ ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
+ (i << PAGE_SHIFT);
+ ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
+ }
+
+ /*
+ * Now, set up the direct map region using 2MB and/or 1GB pages. If
+ * the end of physical memory is not aligned to a 1GB page boundary,
+ * then the residual physical memory is mapped with 2MB pages. Later,
+ * if pmap_mapdev{_attr}() uses the direct map for non-write-back
+ * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
+ * that are partially used.
+ */
+ for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
+ ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+ /* Preset PG_M and PG_A because demotion expects it. */
+ ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
+ PG_M | PG_A;
+ }
+ for (i = 0; i < ndm1g; i++) {
+ ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+ /* Preset PG_M and PG_A because demotion expects it. */
+ ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
+ PG_M | PG_A;
+ }
+ for (j = 0; i < ndmpdp; i++, j++) {
+ ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
+ ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+ }
+
+ /* And recursively map PML4 to itself in order to get PTmap */
+ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
+ ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+
+ /* Connect the Direct Map slot(s) up to the PML4. */
+ for (i = 0; i < NDMPML4E; i++) {
+ ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
+ (i << PAGE_SHIFT);
+ ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+ }
+
+ /* Connect the KVA slot up to the PML4 */
+ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
+ ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+}
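
A worked example of the direct-map sizing at the top of create_pagetables(), for a machine with 16GB of RAM and 1GB-page support (sizes in bytes here; the kernel computes from Maxmem in pages): every PDP slot is backed by a 1GB mapping, no 2MB-level pages are needed, and dmaplimit lands at 16GB.

#include <stdio.h>

#define PDPSHIFT	30		/* 1GB per PDP entry */
#define NBPDP		(1UL << PDPSHIFT)

int
main(void)
{
	unsigned long maxmem = 16UL << 30;	/* 16GB of RAM */
	unsigned long ndmpdp = (maxmem + NBPDP - 1) >> PDPSHIFT;

	if (ndmpdp < 4)				/* same 4GB floor */
		ndmpdp = 4;
	printf("ndmpdp = %lu dmaplimit = %#lx\n", ndmpdp,
	    ndmpdp << PDPSHIFT);
	/* prints: ndmpdp = 16 dmaplimit = 0x400000000 */
	return (0);
}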
+
+/*
+ * Bootstrap the system enough to run with virtual memory.
+ *
+ * On amd64 this is called after mapping has already been enabled
+ * and just syncs the pmap module with what has already been done.
+ * [We can't call it easily with mapping off since the kernel is not
+ * mapped with PA == VA, hence we would have to relocate every address
+ * from the linked base (virtual) address "KERNBASE" to the actual
+ * (physical) address starting relative to 0]
+ */
+void
+pmap_bootstrap(vm_paddr_t *firstaddr)
+{
+ vm_offset_t va;
+ pt_entry_t *pte, *unused;
+
+ /*
+ * Create an initial set of page tables to run the kernel in.
+ */
+ create_pagetables(firstaddr);
+
+ virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
+ virtual_avail = pmap_kmem_choose(virtual_avail);
+
+ virtual_end = VM_MAX_KERNEL_ADDRESS;
+
+
+ /* XXX do %cr0 as well */
+ load_cr4(rcr4() | CR4_PGE | CR4_PSE);
+ load_cr3(KPML4phys);
+ if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
+ load_cr4(rcr4() | CR4_SMEP);
+
+ /*
+ * Initialize the kernel pmap (which is statically allocated).
+ */
+ PMAP_LOCK_INIT(kernel_pmap);
+ kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
+ TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+
+ /*
+ * Initialize the global pv list lock.
+ */
+ rw_init(&pvh_global_lock, "pmap pv global");
+
+ /*
+ * Reserve some special page table entries/VA space for temporary
+ * mapping of pages.
+ */
+#define SYSMAP(c, p, v, n) \
+ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
+
+ va = virtual_avail;
+ pte = vtopte(va);
+
+ /*
+ * CMAP1 is only used for the memory test.
+ */
+ SYSMAP(caddr_t, CMAP1, CADDR1, 1)
+
+ /*
+ * Crashdump maps.
+ */
+ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
+
+ virtual_avail = va;
+
+ /* Initialize the PAT MSR. */
+ pmap_init_pat();
+}
+
+/*
+ * Setup the PAT MSR.
+ */
+void
+pmap_init_pat(void)
+{
+ int pat_table[PAT_INDEX_SIZE];
+ uint64_t pat_msr;
+ u_long cr0, cr4;
+ int i;
+
+ /* Bail if this CPU doesn't implement PAT. */
+ if ((cpu_feature & CPUID_PAT) == 0)
+ panic("no PAT??");
+
+ /* Set default PAT index table. */
+ for (i = 0; i < PAT_INDEX_SIZE; i++)
+ pat_table[i] = -1;
+ pat_table[PAT_WRITE_BACK] = 0;
+ pat_table[PAT_WRITE_THROUGH] = 1;
+ pat_table[PAT_UNCACHEABLE] = 3;
+ pat_table[PAT_WRITE_COMBINING] = 3;
+ pat_table[PAT_WRITE_PROTECTED] = 3;
+ pat_table[PAT_UNCACHED] = 3;
+
+ /* Initialize default PAT entries. */
+ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+
+ if (pat_works) {
+ /*
+ * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
+ * Program 5 and 6 as WP and WC.
+ * Leave 4 and 7 as WB and UC.
+ */
+ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
+ pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
+ PAT_VALUE(6, PAT_WRITE_COMBINING);
+ pat_table[PAT_UNCACHED] = 2;
+ pat_table[PAT_WRITE_PROTECTED] = 5;
+ pat_table[PAT_WRITE_COMBINING] = 6;
+ } else {
+ /*
+ * Just replace PAT Index 2 with WC instead of UC-.
+ */
+ pat_msr &= ~PAT_MASK(2);
+ pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
+ pat_table[PAT_WRITE_COMBINING] = 2;
+ }
+
+ /* Disable PGE. */
+ cr4 = rcr4();
+ load_cr4(cr4 & ~CR4_PGE);
+
+ /* Disable caches (CD = 1, NW = 0). */
+ cr0 = rcr0();
+ load_cr0((cr0 & ~CR0_NW) | CR0_CD);
+
+ /* Flushes caches and TLBs. */
+ wbinvd();
+ invltlb();
+
+ /* Update PAT and index table. */
+ wrmsr(MSR_PAT, pat_msr);
+ for (i = 0; i < PAT_INDEX_SIZE; i++)
+ pat_index[i] = pat_table[i];
+
+ /* Flush caches and TLBs again. */
+ wbinvd();
+ invltlb();
+
+ /* Restore caches and PGE. */
+ load_cr0(cr0);
+ load_cr4(cr4);
+}
+
+/*
+ * Initialize a vm_page's machine-dependent fields.
+ */
+void
+pmap_page_init(vm_page_t m)
+{
+
+ TAILQ_INIT(&m->md.pv_list);
+ m->md.pat_mode = PAT_WRITE_BACK;
+}
+
+/*
+ * Initialize the pmap module.
+ * Called by vm_init, to initialize any structures that the pmap
+ * system needs to map virtual memory.
+ */
+void
+pmap_init(void)
+{
+ vm_page_t mpte;
+ vm_size_t s;
+ int i, pv_npg;
+
+ /*
+ * Initialize the vm page array entries for the kernel pmap's
+ * page table pages.
+ */
+ for (i = 0; i < nkpt; i++) {
+ mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
+ KASSERT(mpte >= vm_page_array &&
+ mpte < &vm_page_array[vm_page_array_size],
+ ("pmap_init: page table page is out of range"));
+ mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
+ mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
+ }
+
+ /*
+ * If the kernel is running in a virtual machine on an AMD Family 10h
+ * processor, then it must assume that MCA is enabled by the virtual
+ * machine monitor.
+ */
+ if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
+ CPUID_TO_FAMILY(cpu_id) == 0x10)
+ workaround_erratum383 = 1;
+
+ /*
+ * Are large page mappings enabled?
+ */
+ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
+ if (pg_ps_enabled) {
+ KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
+ ("pmap_init: can't assign to pagesizes[1]"));
+ pagesizes[1] = NBPDR;
+ }
+
+ /*
+ * Initialize the pv chunk list mutex.
+ */
+ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
+
+ /*
+ * Initialize the pool of pv list locks.
+ */
+ for (i = 0; i < NPV_LIST_LOCKS; i++)
+ rw_init(&pv_list_locks[i], "pmap pv list");
+
+ /*
+ * Calculate the size of the pv head table for superpages.
+ */
+ for (i = 0; phys_avail[i + 1]; i += 2);
+ pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
+
+ /*
+ * Allocate memory for the pv head table for superpages.
+ */
+ s = (vm_size_t)(pv_npg * sizeof(struct md_page));
+ s = round_page(s);
+ pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
+ for (i = 0; i < pv_npg; i++)
+ TAILQ_INIT(&pv_table[i].pv_list);
+}
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
+ "2MB page mapping counters");
+
+static u_long pmap_pde_demotions;
+SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
+ &pmap_pde_demotions, 0, "2MB page demotions");
+
+static u_long pmap_pde_mappings;
+SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
+ &pmap_pde_mappings, 0, "2MB page mappings");
+
+static u_long pmap_pde_p_failures;
+SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
+ &pmap_pde_p_failures, 0, "2MB page promotion failures");
+
+static u_long pmap_pde_promotions;
+SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
+ &pmap_pde_promotions, 0, "2MB page promotions");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
+ "1GB page mapping counters");
+
+static u_long pmap_pdpe_demotions;
+SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
+ &pmap_pdpe_demotions, 0, "1GB page demotions");
+
+/***************************************************
+ * Low level helper routines.....
+ ***************************************************/
+
+/*
+ * Determine the appropriate bits to set in a PTE or PDE for a specified
+ * caching mode.
+ */
+static int
+pmap_cache_bits(int mode, boolean_t is_pde)
+{
+ int cache_bits, pat_flag, pat_idx;
+
+ if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
+ panic("Unknown caching mode %d\n", mode);
+
+	/* The PAT bit is different for PTEs and PDEs. */
+ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+
+ /* Map the caching mode to a PAT index. */
+ pat_idx = pat_index[mode];
+
+ /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+ cache_bits = 0;
+ if (pat_idx & 0x4)
+ cache_bits |= pat_flag;
+ if (pat_idx & 0x2)
+ cache_bits |= PG_NC_PCD;
+ if (pat_idx & 0x1)
+ cache_bits |= PG_NC_PWT;
+ return (cache_bits);
+}
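
A standalone check of that decoding for write-combining under the "pat_works" layout above: WC maps to PAT index 6 (binary 110), which in a PTE sets the PAT and PCD bits but not PWT. The bit values mirror the hardware PTE flags.

#include <stdio.h>

#define PG_NC_PWT	0x008		/* page write through */
#define PG_NC_PCD	0x010		/* page cache disable */
#define PG_PTE_PAT	0x080		/* PAT bit in a 4KB PTE */

int
main(void)
{
	int pat_idx = 6, bits = 0;	/* WC with pat_works */

	if (pat_idx & 0x4)
		bits |= PG_PTE_PAT;
	if (pat_idx & 0x2)
		bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		bits |= PG_NC_PWT;
	printf("cache bits = %#x\n", bits);	/* prints: 0x90 */
	return (0);
}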
+
+/*
+ * After changing the page size for the specified virtual address in the page
+ * table, flush the corresponding entries from the processor's TLB. Only the
+ * calling processor's TLB is affected.
+ *
+ * The calling thread must be pinned to a processor.
+ */
+static void
+pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
+{
+ u_long cr4;
+
+ if ((newpde & PG_PS) == 0)
+ /* Demotion: flush a specific 2MB page mapping. */
+ invlpg(va);
+ else if ((newpde & PG_G) == 0)
+ /*
+ * Promotion: flush every 4KB page mapping from the TLB
+ * because there are too many to flush individually.
+ */
+ invltlb();
+ else {
+ /*
+ * Promotion: flush every 4KB page mapping from the TLB,
+ * including any global (PG_G) mappings.
+ */
+ cr4 = rcr4();
+ load_cr4(cr4 & ~CR4_PGE);
+ /*
+ * Although preemption at this point could be detrimental to
+ * performance, it would not lead to an error. PG_G is simply
+ * ignored if CR4.PGE is clear. Moreover, in case this block
+ * is re-entered, the load_cr4() either above or below will
+		 * modify CR4.PGE, flushing the TLB.
+ */
+ load_cr4(cr4 | CR4_PGE);
+ }
+}
+#ifdef SMP
+/*
+ * For SMP, these functions have to use the IPI mechanism for coherence.
+ *
+ * N.B.: Before calling any of the following TLB invalidation functions,
+ * the calling processor must ensure that all stores updating a non-
+ * kernel page table are globally performed. Otherwise, another
+ * processor could cache an old, pre-update entry without being
+ * invalidated. This can happen one of two ways: (1) The pmap becomes
+ * active on another processor after its pm_active field is checked by
+ * one of the following functions but before a store updating the page
+ * table is globally performed. (2) The pmap becomes active on another
+ * processor before its pm_active field is checked but due to
+ * speculative loads one of the following functions still reads the
+ * pmap as inactive on the other processor.
+ *
+ * The kernel page table is exempt because its pm_active field is
+ * immutable. The kernel page table is always active on every
+ * processor.
+ */
+void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+ cpuset_t other_cpus;
+ u_int cpuid;
+
+ sched_pin();
+ if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ invlpg(va);
+ smp_invlpg(va);
+ } else {
+ cpuid = PCPU_GET(cpuid);
+ other_cpus = all_cpus;
+ CPU_CLR(cpuid, &other_cpus);
+ if (CPU_ISSET(cpuid, &pmap->pm_active))
+ invlpg(va);
+ CPU_AND(&other_cpus, &pmap->pm_active);
+ if (!CPU_EMPTY(&other_cpus))
+ smp_masked_invlpg(other_cpus, va);
+ }
+ sched_unpin();
+}
+
+void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ cpuset_t other_cpus;
+ vm_offset_t addr;
+ u_int cpuid;
+
+ sched_pin();
+ if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ smp_invlpg_range(sva, eva);
+ } else {
+ cpuid = PCPU_GET(cpuid);
+ other_cpus = all_cpus;
+ CPU_CLR(cpuid, &other_cpus);
+ if (CPU_ISSET(cpuid, &pmap->pm_active))
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ CPU_AND(&other_cpus, &pmap->pm_active);
+ if (!CPU_EMPTY(&other_cpus))
+ smp_masked_invlpg_range(other_cpus, sva, eva);
+ }
+ sched_unpin();
+}
+
+void
+pmap_invalidate_all(pmap_t pmap)
+{
+ cpuset_t other_cpus;
+ u_int cpuid;
+
+ sched_pin();
+ if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ invltlb();
+ smp_invltlb();
+ } else {
+ cpuid = PCPU_GET(cpuid);
+ other_cpus = all_cpus;
+ CPU_CLR(cpuid, &other_cpus);
+ if (CPU_ISSET(cpuid, &pmap->pm_active))
+ invltlb();
+ CPU_AND(&other_cpus, &pmap->pm_active);
+ if (!CPU_EMPTY(&other_cpus))
+ smp_masked_invltlb(other_cpus);
+ }
+ sched_unpin();
+}
+
+void
+pmap_invalidate_cache(void)
+{
+
+ sched_pin();
+ wbinvd();
+ smp_cache_flush();
+ sched_unpin();
+}
+
+struct pde_action {
+ cpuset_t invalidate; /* processors that invalidate their TLB */
+ vm_offset_t va;
+ pd_entry_t *pde;
+ pd_entry_t newpde;
+ u_int store; /* processor that updates the PDE */
+};
+
+static void
+pmap_update_pde_action(void *arg)
+{
+ struct pde_action *act = arg;
+
+ if (act->store == PCPU_GET(cpuid))
+ pde_store(act->pde, act->newpde);
+}
+
+static void
+pmap_update_pde_teardown(void *arg)
+{
+ struct pde_action *act = arg;
+
+ if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
+ pmap_update_pde_invalidate(act->va, act->newpde);
+}
+
+/*
+ * Change the page size for the specified virtual address in a way that
+ * prevents any possibility of the TLB ever having two entries that map the
+ * same virtual address using different page sizes. This is the recommended
+ * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
+ * machine check exception for a TLB state that is improperly diagnosed as a
+ * hardware error.
+ */
+static void
+pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
+{
+ struct pde_action act;
+ cpuset_t active, other_cpus;
+ u_int cpuid;
+
+ sched_pin();
+ cpuid = PCPU_GET(cpuid);
+ other_cpus = all_cpus;
+ CPU_CLR(cpuid, &other_cpus);
+ if (pmap == kernel_pmap)
+ active = all_cpus;
+ else
+ active = pmap->pm_active;
+ if (CPU_OVERLAP(&active, &other_cpus)) {
+ act.store = cpuid;
+ act.invalidate = active;
+ act.va = va;
+ act.pde = pde;
+ act.newpde = newpde;
+ CPU_SET(cpuid, &active);
+ smp_rendezvous_cpus(active,
+ smp_no_rendevous_barrier, pmap_update_pde_action,
+ pmap_update_pde_teardown, &act);
+ } else {
+ pde_store(pde, newpde);
+ if (CPU_ISSET(cpuid, &active))
+ pmap_update_pde_invalidate(va, newpde);
+ }
+ sched_unpin();
+}
+#else /* !SMP */
+/*
+ * Normal, non-SMP, invalidation functions.
+ * We inline these within pmap.c for speed.
+ */
+PMAP_INLINE void
+pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
+{
+
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invlpg(va);
+}
+
+PMAP_INLINE void
+pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ vm_offset_t addr;
+
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+}
+
+PMAP_INLINE void
+pmap_invalidate_all(pmap_t pmap)
+{
+
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invltlb();
+}
+
+PMAP_INLINE void
+pmap_invalidate_cache(void)
+{
+
+ wbinvd();
+}
+
+static void
+pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
+{
+
+ pde_store(pde, newpde);
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ pmap_update_pde_invalidate(va, newpde);
+}
+#endif /* !SMP */
+
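+/*
+ * Ranges at least this large are presumed cheaper to handle with a full
+ * cache invalidation than with one CLFLUSH per cache line.
+ */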
+#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
+
+void
+pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
+{
+
+ KASSERT((sva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: sva not page-aligned"));
+ KASSERT((eva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: eva not page-aligned"));
+
+ if (cpu_feature & CPUID_SS)
+ ; /* If "Self Snoop" is supported, do nothing. */
+ else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+ eva - sva < PMAP_CLFLUSH_THRESHOLD) {
+
+ /*
+ * XXX: Some CPUs fault, hang, or trash the local APIC
+ * registers if we use CLFLUSH on the local APIC
+ * range. The local APIC is always uncached, so we
+ * don't need to flush for that range anyway.
+ */
+ if (pmap_kextract(sva) == lapic_paddr)
+ return;
+
+ /*
+ * Otherwise, do per-cache line flush. Use the mfence
+ * instruction to ensure that previous stores are
+ * included in the write-back. The processor
+ * propagates flush to other processors in the cache
+ * coherence domain.
+ */
+ mfence();
+ for (; sva < eva; sva += cpu_clflush_line_size)
+ clflush(sva);
+ mfence();
+ } else {
+
+ /*
+		 * No targeted cache flush methods are supported by the CPU,
+		 * or the supplied range is bigger than 2MB.
+		 * Globally invalidate the cache.
+ */
+ pmap_invalidate_cache();
+ }
+}
+
+/*
+ * Remove the specified set of pages from the data and instruction caches.
+ *
+ * In contrast to pmap_invalidate_cache_range(), this function does not
+ * rely on the CPU's self-snoop feature, because it is intended for use
+ * when moving pages into a different cache domain.
+ */
+void
+pmap_invalidate_cache_pages(vm_page_t *pages, int count)
+{
+ vm_offset_t daddr, eva;
+ int i;
+
+ if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
+ (cpu_feature & CPUID_CLFSH) == 0)
+ pmap_invalidate_cache();
+ else {
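+		/*
+		 * Flush each page through its permanent direct map (DMAP)
+		 * address; this works even for pages that are not currently
+		 * mapped at any other virtual address.
+		 */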
+ mfence();
+ for (i = 0; i < count; i++) {
+ daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
+ eva = daddr + PAGE_SIZE;
+ for (; daddr < eva; daddr += cpu_clflush_line_size)
+ clflush(daddr);
+ }
+ mfence();
+ }
+}
+
+/*
+ * Are we current address space or kernel?
+ */
+static __inline int
+pmap_is_current(pmap_t pmap)
+{
+ return (pmap == kernel_pmap ||
+ (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
+}
+
+/*
+ * Routine: pmap_extract
+ * Function:
+ * Extract the physical page address associated
+ * with the given map/virtual_address pair.
+ */
+vm_paddr_t
+pmap_extract(pmap_t pmap, vm_offset_t va)
+{
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ vm_paddr_t pa;
+
+ pa = 0;
+ PMAP_LOCK(pmap);
+ pdpe = pmap_pdpe(pmap, va);
+ if (pdpe != NULL && (*pdpe & PG_V) != 0) {
+ if ((*pdpe & PG_PS) != 0)
+ pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
+ else {
+ pde = pmap_pdpe_to_pde(pdpe, va);
+ if ((*pde & PG_V) != 0) {
+ if ((*pde & PG_PS) != 0) {
+ pa = (*pde & PG_PS_FRAME) |
+ (va & PDRMASK);
+ } else {
+ pte = pmap_pde_to_pte(pde, va);
+ pa = (*pte & PG_FRAME) |
+ (va & PAGE_MASK);
+ }
+ }
+ }
+ }
+ PMAP_UNLOCK(pmap);
+ return (pa);
+}
+
+/*
+ * Routine: pmap_extract_and_hold
+ * Function:
+ * Atomically extract and hold the physical page
+ * with the given pmap and virtual address pair
+ * if that mapping permits the given protection.
+ */
+vm_page_t
+pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
+{
+ pd_entry_t pde, *pdep;
+ pt_entry_t pte;
+ vm_paddr_t pa;
+ vm_page_t m;
+
+ pa = 0;
+ m = NULL;
+ PMAP_LOCK(pmap);
+retry:
+ pdep = pmap_pde(pmap, va);
+ if (pdep != NULL && (pde = *pdep)) {
+ if (pde & PG_PS) {
+ if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
+ if (vm_page_pa_tryrelock(pmap, (pde &
+ PG_PS_FRAME) | (va & PDRMASK), &pa))
+ goto retry;
+ m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
+ (va & PDRMASK));
+ vm_page_hold(m);
+ }
+ } else {
+ pte = *pmap_pde_to_pte(pdep, va);
+ if ((pte & PG_V) &&
+ ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
+ if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
+ &pa))
+ goto retry;
+ m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
+ vm_page_hold(m);
+ }
+ }
+ }
+ PA_UNLOCK_COND(pa);
+ PMAP_UNLOCK(pmap);
+ return (m);
+}
+
+vm_paddr_t
+pmap_kextract(vm_offset_t va)
+{
+ pd_entry_t pde;
+ vm_paddr_t pa;
+
+ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
+ pa = DMAP_TO_PHYS(va);
+ } else {
+ pde = *vtopde(va);
+ if (pde & PG_PS) {
+ pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
+ } else {
+ /*
+ * Beware of a concurrent promotion that changes the
+ * PDE at this point! For example, vtopte() must not
+ * be used to access the PTE because it would use the
+ * new PDE. It is, however, safe to use the old PDE
+ * because the page table page is preserved by the
+ * promotion.
+ */
+ pa = *pmap_pde_to_pte(&pde, va);
+ pa = (pa & PG_FRAME) | (va & PAGE_MASK);
+ }
+ }
+ return (pa);
+}
+
+/***************************************************
+ * Low level mapping routines.....
+ ***************************************************/
+
+/*
+ * Add a wired page to the kva.
+ * Note: not SMP coherent.
+ */
+PMAP_INLINE void
+pmap_kenter(vm_offset_t va, vm_paddr_t pa)
+{
+ pt_entry_t *pte;
+
+ pte = vtopte(va);
+ pte_store(pte, pa | PG_RW | PG_V | PG_G);
+}
+
+static __inline void
+pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
+{
+ pt_entry_t *pte;
+
+ pte = vtopte(va);
+ pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
+}
+
+/*
+ * Remove a page from the kernel pagetables.
+ * Note: not SMP coherent.
+ */
+PMAP_INLINE void
+pmap_kremove(vm_offset_t va)
+{
+ pt_entry_t *pte;
+
+ pte = vtopte(va);
+ pte_clear(pte);
+}
+
+/*
+ * Used to map a range of physical addresses into kernel
+ * virtual address space.
+ *
+ * The value passed in '*virt' is a suggested virtual address for
+ * the mapping. Architectures which can support a direct-mapped
+ * physical to virtual region can return the appropriate address
+ * within that region, leaving '*virt' unchanged. Other
+ * architectures should map the pages starting at '*virt' and
+ * update '*virt' with the first usable address after the mapped
+ * region.
+ */
+vm_offset_t
+pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
+{
+	return (PHYS_TO_DMAP(start));
+}
+
+/*
+ * Add a list of wired pages to the kva.  This routine is only used for
+ * temporary kernel mappings that do not need to have page modification
+ * or references recorded.  Note that old mappings are simply written
+ * over.  The page *must* be wired.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
+ */
+void
+pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
+{
+ pt_entry_t *endpte, oldpte, pa, *pte;
+ vm_page_t m;
+
+ oldpte = 0;
+ pte = vtopte(sva);
+ endpte = pte + count;
+ while (pte < endpte) {
+ m = *ma++;
+ pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
+ if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
+ oldpte |= *pte;
+ pte_store(pte, pa | PG_G | PG_RW | PG_V);
+ }
+ pte++;
+ }
+ if (__predict_false((oldpte & PG_V) != 0))
+ pmap_invalidate_range(kernel_pmap, sva, sva + count *
+ PAGE_SIZE);
+}
+
+/*
+ * This routine tears out page mappings from the
+ * kernel -- it is meant only for temporary mappings.
+ * Note: SMP coherent. Uses a ranged shootdown IPI.
+ */
+void
+pmap_qremove(vm_offset_t sva, int count)
+{
+ vm_offset_t va;
+
+ va = sva;
+ while (count-- > 0) {
+ KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
+ pmap_kremove(va);
+ va += PAGE_SIZE;
+ }
+ pmap_invalidate_range(kernel_pmap, sva, va);
+}
+
+/***************************************************
+ * Page table page management routines.....
+ ***************************************************/
+static __inline void
+pmap_free_zero_pages(vm_page_t free)
+{
+ vm_page_t m;
+
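+	/*
+	 * The free list is threaded through the pages' otherwise unused
+	 * "object" fields, which pmap_add_delayed_free_list() repurposes
+	 * as next pointers.
+	 */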
+ while (free != NULL) {
+ m = free;
+ free = (void *)m->object;
+ m->object = NULL;
+ /* Preserve the page's PG_ZERO setting. */
+ vm_page_free_toq(m);
+ }
+}
+
+/*
+ * Schedule the specified unused page table page to be freed. Specifically,
+ * add the page to the specified list of pages that will be released to the
+ * physical memory manager after the TLB has been updated.
+ */
+static __inline void
+pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
+{
+
+ if (set_PG_ZERO)
+ m->flags |= PG_ZERO;
+ else
+ m->flags &= ~PG_ZERO;
+ m->object = (void *)*free;
+ *free = m;
+}
+
+/*
+ * Inserts the specified page table page into the specified pmap's collection
+ * of idle page table pages. Each of a pmap's page table pages is responsible
+ * for mapping a distinct range of virtual addresses. The pmap's collection is
+ * ordered by this virtual address range.
+ */
+static __inline void
+pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ vm_radix_insert(&pmap->pm_root, mpte);
+}
+
+/*
+ * Looks for a page table page mapping the specified virtual address in the
+ * specified pmap's collection of idle page table pages. Returns NULL if there
+ * is no page table page corresponding to the specified virtual address.
+ */
+static __inline vm_page_t
+pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
+}
+
+/*
+ * Removes the specified page table page from the specified pmap's collection
+ * of idle page table pages. The specified page table page must be a member of
+ * the pmap's collection.
+ */
+static __inline void
+pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ vm_radix_remove(&pmap->pm_root, mpte->pindex);
+}
+
+/*
+ * Decrements a page table page's wire count, which is used to record the
+ * number of valid page table entries within the page. If the wire count
+ * drops to zero, then the page table page is unmapped. Returns TRUE if the
+ * page table page was unmapped and FALSE otherwise.
+ */
+static inline boolean_t
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+{
+
+ --m->wire_count;
+ if (m->wire_count == 0) {
+ _pmap_unwire_ptp(pmap, va, m, free);
+ return (TRUE);
+ } else
+ return (FALSE);
+}
+
+static void
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ /*
+ * unmap the page table page
+ */
+ if (m->pindex >= (NUPDE + NUPDPE)) {
+ /* PDP page */
+ pml4_entry_t *pml4;
+ pml4 = pmap_pml4e(pmap, va);
+ *pml4 = 0;
+ } else if (m->pindex >= NUPDE) {
+ /* PD page */
+ pdp_entry_t *pdp;
+ pdp = pmap_pdpe(pmap, va);
+ *pdp = 0;
+ } else {
+ /* PTE page */
+ pd_entry_t *pd;
+ pd = pmap_pde(pmap, va);
+ *pd = 0;
+ }
+ pmap_resident_count_dec(pmap, 1);
+ if (m->pindex < NUPDE) {
+ /* We just released a PT, unhold the matching PD */
+ vm_page_t pdpg;
+
+ pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
+ pmap_unwire_ptp(pmap, va, pdpg, free);
+ }
+ if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
+ /* We just released a PD, unhold the matching PDP */
+ vm_page_t pdppg;
+
+ pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
+ pmap_unwire_ptp(pmap, va, pdppg, free);
+ }
+
+ /*
+ * This is a release store so that the ordinary store unmapping
+ * the page table page is globally performed before TLB shoot-
+ * down is begun.
+ */
+ atomic_subtract_rel_int(&cnt.v_wire_count, 1);
+
+ /*
+ * Put page on a list so that it is released after
+ * *ALL* TLB shootdown is done
+ */
+ pmap_add_delayed_free_list(m, free, TRUE);
+}
+
+/*
+ * After removing a page table entry, this routine is used to
+ * conditionally free the page, and manage the hold/wire counts.
+ */
+static int
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
+{
+ vm_page_t mpte;
+
+ if (va >= VM_MAXUSER_ADDRESS)
+ return (0);
+	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde == 0"));
+ mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
+ return (pmap_unwire_ptp(pmap, va, mpte, free));
+}
+
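+/*
+ * Initialize a pmap that reuses the kernel's preconstructed page table
+ * (KPML4phys) rather than allocating a fresh PML4 page; this is used for
+ * the initial process's vmspace.
+ */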
+void
+pmap_pinit0(pmap_t pmap)
+{
+
+ PMAP_LOCK_INIT(pmap);
+ pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ pmap->pm_root.rt_root = 0;
+ CPU_ZERO(&pmap->pm_active);
+ PCPU_SET(curpmap, pmap);
+ TAILQ_INIT(&pmap->pm_pvchunk);
+ bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+}
+
+/*
+ * Initialize a preallocated and zeroed pmap structure,
+ * such as one in a vmspace structure.
+ */
+int
+pmap_pinit(pmap_t pmap)
+{
+ vm_page_t pml4pg;
+ int i;
+
+ PMAP_LOCK_INIT(pmap);
+
+ /*
+ * allocate the page directory page
+ */
+ while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+ VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
+ VM_WAIT;
+
+ pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+
+ if ((pml4pg->flags & PG_ZERO) == 0)
+ pagezero(pmap->pm_pml4);
+
+ /* Wire in kernel global address entries. */
+ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+ for (i = 0; i < NDMPML4E; i++) {
+ pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
+ PG_RW | PG_V | PG_U;
+ }
+
+ /* install self-referential address mapping entry(s) */
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
+
+ pmap->pm_root.rt_root = 0;
+ CPU_ZERO(&pmap->pm_active);
+ TAILQ_INIT(&pmap->pm_pvchunk);
+ bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
+ return (1);
+}
+
+/*
+ * This routine is called if the desired page table page does not exist.
+ *
+ * If page table page allocation fails, this routine may sleep before
+ * returning NULL. It sleeps only if a lock pointer was given.
+ *
+ * Note: If a page allocation fails at page table level two or three,
+ * one or two pages may be held during the wait, only to be released
+ * afterwards.  This conservative approach makes it easy to argue that
+ * no race conditions can occur.
+ */
+static vm_page_t
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
+{
+ vm_page_t m, pdppg, pdpg;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /*
+ * Allocate a page table page.
+ */
+ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
+ if (lockp != NULL) {
+ RELEASE_PV_LIST_LOCK(lockp);
+ PMAP_UNLOCK(pmap);
+ rw_runlock(&pvh_global_lock);
+ VM_WAIT;
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
+ }
+
+ /*
+ * Indicate the need to retry. While waiting, the page table
+ * page may have been allocated.
+ */
+ return (NULL);
+ }
+ if ((m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+
+ /*
+ * Map the pagetable page into the process address space, if
+ * it isn't already there.
+ */
+
+ if (ptepindex >= (NUPDE + NUPDPE)) {
+ pml4_entry_t *pml4;
+ vm_pindex_t pml4index;
+
+ /* Wire up a new PDPE page */
+ pml4index = ptepindex - (NUPDE + NUPDPE);
+ pml4 = &pmap->pm_pml4[pml4index];
+ *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+
+ } else if (ptepindex >= NUPDE) {
+ vm_pindex_t pml4index;
+ vm_pindex_t pdpindex;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+
+ /* Wire up a new PDE page */
+ pdpindex = ptepindex - NUPDE;
+ pml4index = pdpindex >> NPML4EPGSHIFT;
+
+ pml4 = &pmap->pm_pml4[pml4index];
+ if ((*pml4 & PG_V) == 0) {
+ /* Have to allocate a new pdp, recurse */
+ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
+ lockp) == NULL) {
+ --m->wire_count;
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ } else {
+ /* Add reference to pdp page */
+ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
+ pdppg->wire_count++;
+ }
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+
+ /* Now find the pdp page */
+ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+
+ } else {
+ vm_pindex_t pml4index;
+ vm_pindex_t pdpindex;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+
+ /* Wire up a new PTE page */
+ pdpindex = ptepindex >> NPDPEPGSHIFT;
+ pml4index = pdpindex >> NPML4EPGSHIFT;
+
+		/* First, find the pdp and check that it's valid. */
+ pml4 = &pmap->pm_pml4[pml4index];
+ if ((*pml4 & PG_V) == 0) {
+ /* Have to allocate a new pd, recurse */
+ if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+ lockp) == NULL) {
+ --m->wire_count;
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ } else {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ if ((*pdp & PG_V) == 0) {
+ /* Have to allocate a new pd, recurse */
+ if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+ lockp) == NULL) {
+ --m->wire_count;
+ atomic_subtract_int(&cnt.v_wire_count,
+ 1);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ } else {
+ /* Add reference to the pd page */
+ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ pdpg->wire_count++;
+ }
+ }
+ pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
+
+ /* Now we know where the page directory page is */
+ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
+ *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+ }
+
+ pmap_resident_count_inc(pmap, 1);
+
+ return (m);
+}
+
+static vm_page_t
+pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+{
+ vm_pindex_t pdpindex, ptepindex;
+ pdp_entry_t *pdpe;
+ vm_page_t pdpg;
+
+retry:
+ pdpe = pmap_pdpe(pmap, va);
+ if (pdpe != NULL && (*pdpe & PG_V) != 0) {
+ /* Add a reference to the pd page. */
+ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
+ pdpg->wire_count++;
+ } else {
+ /* Allocate a pd page. */
+ ptepindex = pmap_pde_pindex(va);
+ pdpindex = ptepindex >> NPDPEPGSHIFT;
+ pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+ if (pdpg == NULL && lockp != NULL)
+ goto retry;
+ }
+ return (pdpg);
+}
+
+static vm_page_t
+pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+{
+ vm_pindex_t ptepindex;
+ pd_entry_t *pd;
+ vm_page_t m;
+
+ /*
+ * Calculate pagetable page index
+ */
+ ptepindex = pmap_pde_pindex(va);
+retry:
+ /*
+ * Get the page directory entry
+ */
+ pd = pmap_pde(pmap, va);
+
+ /*
+ * This supports switching from a 2MB page to a
+ * normal 4K page.
+ */
+ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
+ if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
+ /*
+ * Invalidation of the 2MB page mapping may have caused
+ * the deallocation of the underlying PD page.
+ */
+ pd = NULL;
+ }
+ }
+
+ /*
+ * If the page table page is mapped, we just increment the
+ * hold count, and activate it.
+ */
+ if (pd != NULL && (*pd & PG_V) != 0) {
+ m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
+ m->wire_count++;
+ } else {
+ /*
+ * Here if the pte page isn't mapped, or if it has been
+ * deallocated.
+ */
+ m = _pmap_allocpte(pmap, ptepindex, lockp);
+ if (m == NULL && lockp != NULL)
+ goto retry;
+ }
+ return (m);
+}
+
+/***************************************************
+ * Pmap allocation/deallocation routines.
+ ***************************************************/
+
+/*
+ * Release any resources held by the given physical map.
+ * Called when a pmap initialized by pmap_pinit is being released.
+ * Should only be called if the map contains no valid mappings.
+ */
+void
+pmap_release(pmap_t pmap)
+{
+ vm_page_t m;
+ int i;
+
+ KASSERT(pmap->pm_stats.resident_count == 0,
+ ("pmap_release: pmap resident count %ld != 0",
+ pmap->pm_stats.resident_count));
+ KASSERT(vm_radix_is_empty(&pmap->pm_root),
+ ("pmap_release: pmap has reserved page table page(s)"));
+
+ m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
+
+ pmap->pm_pml4[KPML4I] = 0; /* KVA */
+ for (i = 0; i < NDMPML4E; i++) /* Direct Map */
+ pmap->pm_pml4[DMPML4I + i] = 0;
+ pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
+
+ m->wire_count--;
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ vm_page_free_zero(m);
+ PMAP_LOCK_DESTROY(pmap);
+}
+
+static int
+kvm_size(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+
+	return (sysctl_handle_long(oidp, &ksize, 0, req));
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
+ 0, 0, kvm_size, "LU", "Size of KVM");
+
+static int
+kvm_free(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+
+	return (sysctl_handle_long(oidp, &kfree, 0, req));
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
+ 0, 0, kvm_free, "LU", "Amount of KVM free");
+
+/*
+ * grow the number of kernel page table entries, if needed
+ */
+void
+pmap_growkernel(vm_offset_t addr)
+{
+ vm_paddr_t paddr;
+ vm_page_t nkpg;
+ pd_entry_t *pde, newpdir;
+ pdp_entry_t *pdpe;
+
+ mtx_assert(&kernel_map->system_mtx, MA_OWNED);
+
+ /*
+ * Return if "addr" is within the range of kernel page table pages
+ * that were preallocated during pmap bootstrap. Moreover, leave
+ * "kernel_vm_end" and the kernel page table as they were.
+ *
+ * The correctness of this action is based on the following
+ * argument: vm_map_findspace() allocates contiguous ranges of the
+ * kernel virtual address space. It calls this function if a range
+ * ends after "kernel_vm_end". If the kernel is mapped between
+ * "kernel_vm_end" and "addr", then the range cannot begin at
+ * "kernel_vm_end". In fact, its beginning address cannot be less
+ * than the kernel. Thus, there is no immediate need to allocate
+ * any new kernel page table pages between "kernel_vm_end" and
+ * "KERNBASE".
+ */
+ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
+ return;
+
+ addr = roundup2(addr, NBPDR);
+ if (addr - 1 >= kernel_map->max_offset)
+ addr = kernel_map->max_offset;
+ while (kernel_vm_end < addr) {
+ pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
+ if ((*pdpe & PG_V) == 0) {
+ /* We need a new PDP entry */
+ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
+ VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+ if (nkpg == NULL)
+ panic("pmap_growkernel: no memory to grow kernel");
+ if ((nkpg->flags & PG_ZERO) == 0)
+ pmap_zero_page(nkpg);
+ paddr = VM_PAGE_TO_PHYS(nkpg);
+ *pdpe = (pdp_entry_t)
+ (paddr | PG_V | PG_RW | PG_A | PG_M);
+ continue; /* try again */
+ }
+ pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
+ if ((*pde & PG_V) != 0) {
+ kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
+ if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+ kernel_vm_end = kernel_map->max_offset;
+ break;
+ }
+ continue;
+ }
+
+ nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
+ VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO);
+ if (nkpg == NULL)
+ panic("pmap_growkernel: no memory to grow kernel");
+ if ((nkpg->flags & PG_ZERO) == 0)
+ pmap_zero_page(nkpg);
+ paddr = VM_PAGE_TO_PHYS(nkpg);
+ newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
+ pde_store(pde, newpdir);
+
+ kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
+ if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+ kernel_vm_end = kernel_map->max_offset;
+ break;
+ }
+ }
+}
+
+/***************************************************
+ * page management routines.
+ ***************************************************/
+
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 3);
+CTASSERT(_NPCPV == 168);
+
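+/*
+ * A PV chunk is exactly one page in size (see the CTASSERT above) and page
+ * aligned, so the chunk header for any PV entry can be recovered by masking
+ * off the low PAGE_MASK bits of the entry's address.
+ */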
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+ return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define PC_FREE0 0xfffffffffffffffful
+#define PC_FREE1 0xfffffffffffffffful
+#define PC_FREE2 0x000000fffffffffful
+
+static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+ "Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+ "Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+ "Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+ "Number of times tried to get a chunk page but failed.");
+
+static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+ "Current number of pv entry frees");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
+ "Current number of pv entry allocs");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+ "Current number of pv entries");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+ "Current number of spare pv entries");
+#endif
+
+/*
+ * We are in a serious low memory condition. Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk.
+ *
+ * Returns NULL if PV entries were reclaimed from the specified pmap.
+ *
+ * We do not, however, unmap 2mpages because subsequent accesses will
+ * allocate per-page pv entries until repromotion occurs, thereby
+ * exacerbating the shortage of free pv entries.
+ */
+static vm_page_t
+reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
+{
+ struct pch new_tail;
+ struct pv_chunk *pc;
+ struct md_page *pvh;
+ pd_entry_t *pde;
+ pmap_t pmap;
+ pt_entry_t *pte, tpte;
+ pv_entry_t pv;
+ vm_offset_t va;
+ vm_page_t free, m, m_pc;
+ uint64_t inuse;
+ int bit, field, freed;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+ KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
+ pmap = NULL;
+ free = m_pc = NULL;
+ TAILQ_INIT(&new_tail);
+ mtx_lock(&pv_chunks_mutex);
+ while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
+ TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+ mtx_unlock(&pv_chunks_mutex);
+ if (pmap != pc->pc_pmap) {
+ if (pmap != NULL) {
+ pmap_invalidate_all(pmap);
+ if (pmap != locked_pmap)
+ PMAP_UNLOCK(pmap);
+ }
+ pmap = pc->pc_pmap;
+ /* Avoid deadlock and lock recursion. */
+ if (pmap > locked_pmap) {
+ RELEASE_PV_LIST_LOCK(lockp);
+ PMAP_LOCK(pmap);
+ } else if (pmap != locked_pmap &&
+ !PMAP_TRYLOCK(pmap)) {
+ pmap = NULL;
+ TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+ mtx_lock(&pv_chunks_mutex);
+ continue;
+ }
+ }
+
+ /*
+ * Destroy every non-wired, 4 KB page mapping in the chunk.
+ */
+ freed = 0;
+ for (field = 0; field < _NPCM; field++) {
+ for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+ inuse != 0; inuse &= ~(1UL << bit)) {
+ bit = bsfq(inuse);
+ pv = &pc->pc_pventry[field * 64 + bit];
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ if ((*pde & PG_PS) != 0)
+ continue;
+ pte = pmap_pde_to_pte(pde, va);
+ if ((*pte & PG_W) != 0)
+ continue;
+ tpte = pte_load_clear(pte);
+ if ((tpte & PG_G) != 0)
+ pmap_invalidate_page(pmap, va);
+ m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
+ if ((tpte & PG_A) != 0)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ if (TAILQ_EMPTY(&m->md.pv_list) &&
+ (m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ if (TAILQ_EMPTY(&pvh->pv_list)) {
+ vm_page_aflag_clear(m,
+ PGA_WRITEABLE);
+ }
+ }
+ pc->pc_map[field] |= 1UL << bit;
+ pmap_unuse_pt(pmap, va, *pde, &free);
+ freed++;
+ }
+ }
+ if (freed == 0) {
+ TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+ mtx_lock(&pv_chunks_mutex);
+ continue;
+ }
+ /* Every freed mapping is for a 4 KB page. */
+ pmap_resident_count_dec(pmap, freed);
+ PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+ PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+ PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
+ pc->pc_map[2] == PC_FREE2) {
+ PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+ PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+ PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+ /* Entire chunk is free; return it. */
+ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+ dump_drop_page(m_pc->phys_addr);
+ mtx_lock(&pv_chunks_mutex);
+ break;
+ }
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+ mtx_lock(&pv_chunks_mutex);
+ /* One freed pv entry in locked_pmap is sufficient. */
+ if (pmap == locked_pmap)
+ break;
+ }
+ TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+ mtx_unlock(&pv_chunks_mutex);
+ if (pmap != NULL) {
+ pmap_invalidate_all(pmap);
+ if (pmap != locked_pmap)
+ PMAP_UNLOCK(pmap);
+ }
+ if (m_pc == NULL && free != NULL) {
+ m_pc = free;
+ free = (void *)m_pc->object;
+ /* Recycle a freed page table page. */
+ m_pc->wire_count = 1;
+ atomic_add_int(&cnt.v_wire_count, 1);
+ }
+ pmap_free_zero_pages(free);
+ return (m_pc);
+}
+
+/*
+ * free the pv_entry back to the free list
+ */
+static void
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
+{
+ struct pv_chunk *pc;
+ int idx, field, bit;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ PV_STAT(atomic_add_long(&pv_entry_frees, 1));
+ PV_STAT(atomic_add_int(&pv_entry_spare, 1));
+ PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
+ pc = pv_to_chunk(pv);
+ idx = pv - &pc->pc_pventry[0];
+ field = idx / 64;
+ bit = idx % 64;
+ pc->pc_map[field] |= 1ul << bit;
+ if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
+ pc->pc_map[2] != PC_FREE2) {
+ /* 98% of the time, pc is already at the head of the list. */
+ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ }
+ return;
+ }
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+ vm_page_t m;
+
+ mtx_lock(&pv_chunks_mutex);
+ TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+ mtx_unlock(&pv_chunks_mutex);
+ PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+ PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+ PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+ /* entire chunk is free, return it */
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+ dump_drop_page(m->phys_addr);
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+}
+
+/*
+ * Returns a new PV entry, allocating a new PV chunk from the system when
+ * needed. If this PV chunk allocation fails and a PV list lock pointer was
+ * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
+ * returned.
+ *
+ * The given PV list lock may be released.
+ */
+static pv_entry_t
+get_pv_entry(pmap_t pmap, struct rwlock **lockp)
+{
+ int bit, field;
+ pv_entry_t pv;
+ struct pv_chunk *pc;
+ vm_page_t m;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
+retry:
+ pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+ if (pc != NULL) {
+ for (field = 0; field < _NPCM; field++) {
+ if (pc->pc_map[field]) {
+ bit = bsfq(pc->pc_map[field]);
+ break;
+ }
+ }
+ if (field < _NPCM) {
+ pv = &pc->pc_pventry[field * 64 + bit];
+ pc->pc_map[field] &= ~(1ul << bit);
+ /* If this was the last item, move it to tail */
+ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
+ pc->pc_map[2] == 0) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
+ pc_list);
+ }
+ PV_STAT(atomic_add_long(&pv_entry_count, 1));
+ PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
+ return (pv);
+ }
+ }
+ /* No free items, allocate another chunk */
+ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED);
+ if (m == NULL) {
+ if (lockp == NULL) {
+ PV_STAT(pc_chunk_tryfail++);
+ return (NULL);
+ }
+ m = reclaim_pv_chunk(pmap, lockp);
+ if (m == NULL)
+ goto retry;
+ }
+ PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+ PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+ dump_add_page(m->phys_addr);
+ pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+ pc->pc_pmap = pmap;
+ pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
+ pc->pc_map[1] = PC_FREE1;
+ pc->pc_map[2] = PC_FREE2;
+ mtx_lock(&pv_chunks_mutex);
+ TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+ mtx_unlock(&pv_chunks_mutex);
+ pv = &pc->pc_pventry[0];
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ PV_STAT(atomic_add_long(&pv_entry_count, 1));
+ PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
+ return (pv);
+}
+
+/*
+ * Returns the number of one bits within the given PV chunk map element.
+ */
+static int
+popcnt_pc_map_elem(uint64_t elem)
+{
+ int count;
+
+ /*
+ * This simple method of counting the one bits performs well because
+ * the given element typically contains more zero bits than one bits.
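+	 * Each iteration of "elem &= elem - 1" clears exactly the lowest
+	 * set bit, so the loop runs once per one bit; e.g., 0x90 -> 0x80
+	 * -> 0 takes two iterations.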
+ */
+ count = 0;
+ for (; elem != 0; elem &= elem - 1)
+ count++;
+ return (count);
+}
+
+/*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+ struct pch new_tail;
+ struct pv_chunk *pc;
+ int avail, free;
+ vm_page_t m;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+ /*
+ * Newly allocated PV chunks must be stored in a private list until
+ * the required number of PV chunks have been allocated. Otherwise,
+ * reclaim_pv_chunk() could recycle one of these chunks.  In contrast,
+ * these chunks must be added to the pmap's own chunk list immediately
+ * upon allocation, so that their free PV entries can be found.
+ */
+ TAILQ_INIT(&new_tail);
+retry:
+ avail = 0;
+ TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
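+		/*
+		 * Count this chunk's free entries, using the hardware POPCNT
+		 * instruction when the CPU provides it and the software
+		 * fallback otherwise.
+		 */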
+ if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
+ free = popcnt_pc_map_elem(pc->pc_map[0]);
+ free += popcnt_pc_map_elem(pc->pc_map[1]);
+ free += popcnt_pc_map_elem(pc->pc_map[2]);
+ } else {
+ free = popcntq(pc->pc_map[0]);
+ free += popcntq(pc->pc_map[1]);
+ free += popcntq(pc->pc_map[2]);
+ }
+ if (free == 0)
+ break;
+ avail += free;
+ if (avail >= needed)
+ break;
+ }
+ for (; avail < needed; avail += _NPCPV) {
+ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED);
+ if (m == NULL) {
+ m = reclaim_pv_chunk(pmap, lockp);
+ if (m == NULL)
+ goto retry;
+ }
+ PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+ PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+ dump_add_page(m->phys_addr);
+ pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+ pc->pc_pmap = pmap;
+ pc->pc_map[0] = PC_FREE0;
+ pc->pc_map[1] = PC_FREE1;
+ pc->pc_map[2] = PC_FREE2;
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+ PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
+ }
+ if (!TAILQ_EMPTY(&new_tail)) {
+ mtx_lock(&pv_chunks_mutex);
+ TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+ mtx_unlock(&pv_chunks_mutex);
+ }
+}
+
+/*
+ * First find and then remove the pv entry for the specified pmap and virtual
+ * address from the specified pv list. Returns the pv entry if found and NULL
+ * otherwise. This operation can be performed on pv lists for either 4KB or
+ * 2MB page mappings.
+ */
+static __inline pv_entry_t
+pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
+{
+ pv_entry_t pv;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ break;
+ }
+ }
+ return (pv);
+}
+
+/*
+ * After demotion from a 2MB page mapping to 512 4KB page mappings,
+ * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
+ * entries for each of the 4KB page mappings.
+ */
+static void
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp)
+{
+ struct md_page *pvh;
+ struct pv_chunk *pc;
+ pv_entry_t pv;
+ vm_offset_t va_last;
+ vm_page_t m;
+ int bit, field;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT((pa & PDRMASK) == 0,
+ ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
+ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+ /*
+ * Transfer the 2mpage's pv entry for this mapping to the first
+ * page's pv list. Once this transfer begins, the pv list lock
+ * must not be released until the last pv entry is reinstantiated.
+ */
+ pvh = pa_to_pvh(pa);
+ va = trunc_2mpage(va);
+ pv = pmap_pvh_remove(pvh, pmap, va);
+ KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
+ m = PHYS_TO_VM_PAGE(pa);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ /* Instantiate the remaining NPTEPG - 1 pv entries. */
+ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
+ va_last = va + NBPDR - PAGE_SIZE;
+ for (;;) {
+ pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+ KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+ pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
+ for (field = 0; field < _NPCM; field++) {
+ while (pc->pc_map[field]) {
+ bit = bsfq(pc->pc_map[field]);
+ pc->pc_map[field] &= ~(1ul << bit);
+ pv = &pc->pc_pventry[field * 64 + bit];
+ va += PAGE_SIZE;
+ pv->pv_va = va;
+ m++;
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_pv_demote_pde: page %p is not managed", m));
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ if (va == va_last)
+ goto out;
+ }
+ }
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+ }
+out:
+ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+ }
+ PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
+ PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
+}
+
+/*
+ * After promotion from 512 4KB page mappings to a single 2MB page mapping,
+ * replace the many pv entries for the 4KB page mappings by a single pv entry
+ * for the 2MB page mapping.
+ */
+static void
+pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp)
+{
+ struct md_page *pvh;
+ pv_entry_t pv;
+ vm_offset_t va_last;
+ vm_page_t m;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ KASSERT((pa & PDRMASK) == 0,
+ ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
+ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+ /*
+ * Transfer the first page's pv entry for this mapping to the 2mpage's
+ * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
+ * a transfer avoids the possibility that get_pv_entry() calls
+ * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
+ * mappings that is being promoted.
+ */
+ m = PHYS_TO_VM_PAGE(pa);
+ va = trunc_2mpage(va);
+ pv = pmap_pvh_remove(&m->md, pmap, va);
+ KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
+ pvh = pa_to_pvh(pa);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ /* Free the remaining NPTEPG - 1 pv entries. */
+ va_last = va + NBPDR - PAGE_SIZE;
+ do {
+ m++;
+ va += PAGE_SIZE;
+ pmap_pvh_free(&m->md, pmap, va);
+ } while (va < va_last);
+}
+
+/*
+ * First find and then destroy the pv entry for the specified pmap and virtual
+ * address. This operation can be performed on pv lists for either 4KB or 2MB
+ * page mappings.
+ */
+static void
+pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
+{
+ pv_entry_t pv;
+
+ pv = pmap_pvh_remove(pvh, pmap, va);
+ KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
+ free_pv_entry(pmap, pv);
+}
+
+/*
+ * Conditionally create the PV entry for a 4KB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
+ */
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ struct rwlock **lockp)
+{
+ pv_entry_t pv;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ /* Pass NULL instead of the lock pointer to disable reclamation. */
+ if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
+ pv->pv_va = va;
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ return (TRUE);
+ } else
+ return (FALSE);
+}
+
+/*
+ * Conditionally create the PV entry for a 2MB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
+ */
+static boolean_t
+pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+ struct rwlock **lockp)
+{
+ struct md_page *pvh;
+ pv_entry_t pv;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ /* Pass NULL instead of the lock pointer to disable reclamation. */
+ if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
+ pv->pv_va = va;
+ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+ pvh = pa_to_pvh(pa);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ return (TRUE);
+ } else
+ return (FALSE);
+}
+
+/*
+ * Fills a page table page with mappings to consecutive physical pages.
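+ * The caller supplies the PTE for the first 4KB page; each successive PTE
+ * maps a physical address PAGE_SIZE higher than its predecessor.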
+ */
+static void
+pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
+{
+ pt_entry_t *pte;
+
+ for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
+ *pte = newpte;
+ newpte += PAGE_SIZE;
+ }
+}
+
+/*
+ * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
+ * mapping is invalidated.
+ */
+static boolean_t
+pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+{
+ struct rwlock *lock;
+ boolean_t rv;
+
+ lock = NULL;
+ rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
+ if (lock != NULL)
+ rw_wunlock(lock);
+ return (rv);
+}
+
+static boolean_t
+pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+ struct rwlock **lockp)
+{
+ pd_entry_t newpde, oldpde;
+ pt_entry_t *firstpte, newpte;
+ vm_paddr_t mptepa;
+ vm_page_t free, mpte;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ oldpde = *pde;
+ KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
+ ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
+ mpte = pmap_lookup_pt_page(pmap, va);
+ if (mpte != NULL)
+ pmap_remove_pt_page(pmap, mpte);
+ else {
+ KASSERT((oldpde & PG_W) == 0,
+ ("pmap_demote_pde: page table page for a wired mapping"
+ " is missing"));
+
+ /*
+ * Invalidate the 2MB page mapping and return "failure" if the
+ * mapping was never accessed or the allocation of the new
+ * page table page fails. If the 2MB page mapping belongs to
+ * the direct map region of the kernel's address space, then
+ * the page allocation request specifies the highest possible
+ * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is
+ * normal. Page table pages are preallocated for every other
+ * part of the kernel address space, so the direct map region
+ * is the only part of the kernel address space that must be
+ * handled here.
+ */
+ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
+ pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
+ DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
+ VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
+ free = NULL;
+ pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
+ lockp);
+ pmap_invalidate_page(pmap, trunc_2mpage(va));
+ pmap_free_zero_pages(free);
+ CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return (FALSE);
+ }
+ if (va < VM_MAXUSER_ADDRESS)
+ pmap_resident_count_inc(pmap, 1);
+ }
+ mptepa = VM_PAGE_TO_PHYS(mpte);
+ firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
+ newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
+ KASSERT((oldpde & PG_A) != 0,
+ ("pmap_demote_pde: oldpde is missing PG_A"));
+ KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
+ ("pmap_demote_pde: oldpde is missing PG_M"));
+ newpte = oldpde & ~PG_PS;
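+	/*
+	 * The PAT bit occupies a different position in a PDE (bit 12,
+	 * PG_PDE_PAT) than in a PTE (bit 7, PG_PTE_PAT), so relocate it
+	 * when converting the 2MB entry into a 4KB entry template.
+	 */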
+ if ((newpte & PG_PDE_PAT) != 0)
+ newpte ^= PG_PDE_PAT | PG_PTE_PAT;
+
+ /*
+ * If the page table page is new, initialize it.
+ */
+ if (mpte->wire_count == 1) {
+ mpte->wire_count = NPTEPG;
+ pmap_fill_ptp(firstpte, newpte);
+ }
+ KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
+ ("pmap_demote_pde: firstpte and newpte map different physical"
+ " addresses"));
+
+ /*
+ * If the mapping has changed attributes, update the page table
+ * entries.
+ */
+ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
+ pmap_fill_ptp(firstpte, newpte);
+
+ /*
+ * The spare PV entries must be reserved prior to demoting the
+ * mapping, that is, prior to changing the PDE. Otherwise, the state
+ * of the PDE and the PV lists will be inconsistent, which can result
+ * in reclaim_pv_chunk() attempting to remove a PV entry from the
+ * wrong PV list and pmap_pv_demote_pde() failing to find the expected
+ * PV entry for the 2MB page mapping that is being demoted.
+ */
+ if ((oldpde & PG_MANAGED) != 0)
+ reserve_pv_entries(pmap, NPTEPG - 1, lockp);
+
+ /*
+ * Demote the mapping. This pmap is locked. The old PDE has
+ * PG_A set. If the old PDE has PG_RW set, it also has PG_M
+ * set. Thus, there is no danger of a race with another
+ * processor changing the setting of PG_A and/or PG_M between
+ * the read above and the store below.
+ */
+ if (workaround_erratum383)
+ pmap_update_pde(pmap, va, pde, newpde);
+ else
+ pde_store(pde, newpde);
+
+ /*
+ * Invalidate a stale recursive mapping of the page table page.
+ */
+ if (va >= VM_MAXUSER_ADDRESS)
+ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
+
+ /*
+ * Demote the PV entry.
+ */
+ if ((oldpde & PG_MANAGED) != 0)
+ pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
+
+ atomic_add_long(&pmap_pde_demotions, 1);
+ CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
+ " in pmap %p", va, pmap);
+ return (TRUE);
+}
+
+/*
+ * pmap_remove_pde: unmap a 2MB superpage from a process's address space
+ */
+static int
+pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
+ vm_page_t *free, struct rwlock **lockp)
+{
+ struct md_page *pvh;
+ pd_entry_t oldpde;
+ vm_offset_t eva, va;
+ vm_page_t m, mpte;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT((sva & PDRMASK) == 0,
+ ("pmap_remove_pde: sva is not 2mpage aligned"));
+ oldpde = pte_load_clear(pdq);
+ if (oldpde & PG_W)
+ pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
+
+ /*
+	 * Machines that don't support invlpg also don't support PG_G.
+ */
+ if (oldpde & PG_G)
+ pmap_invalidate_page(kernel_pmap, sva);
+ pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
+ if (oldpde & PG_MANAGED) {
+ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
+ pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
+ pmap_pvh_free(pvh, pmap, sva);
+ eva = sva + NBPDR;
+ for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
+ va < eva; va += PAGE_SIZE, m++) {
+ if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
+ if (oldpde & PG_A)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ if (TAILQ_EMPTY(&m->md.pv_list) &&
+ TAILQ_EMPTY(&pvh->pv_list))
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ }
+ }
+ if (pmap == kernel_pmap) {
+ if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
+ panic("pmap_remove_pde: failed demotion");
+ } else {
+ mpte = pmap_lookup_pt_page(pmap, sva);
+ if (mpte != NULL) {
+ pmap_remove_pt_page(pmap, mpte);
+ pmap_resident_count_dec(pmap, 1);
+ KASSERT(mpte->wire_count == NPTEPG,
+ ("pmap_remove_pde: pte page wire count error"));
+ mpte->wire_count = 0;
+ pmap_add_delayed_free_list(mpte, free, FALSE);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ }
+ }
+ return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
+}
+
+/*
+ * pmap_remove_pte: unmap a single 4KB page from a process's address space
+ */
+static int
+pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
+ pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
+{
+ struct md_page *pvh;
+ pt_entry_t oldpte;
+ vm_page_t m;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ oldpte = pte_load_clear(ptq);
+ if (oldpte & PG_W)
+ pmap->pm_stats.wired_count -= 1;
+ pmap_resident_count_dec(pmap, 1);
+ if (oldpte & PG_MANAGED) {
+ m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
+ if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
+ if (oldpte & PG_A)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+ pmap_pvh_free(&m->md, pmap, va);
+ if (TAILQ_EMPTY(&m->md.pv_list) &&
+ (m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ if (TAILQ_EMPTY(&pvh->pv_list))
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ }
+ }
+ return (pmap_unuse_pt(pmap, va, ptepde, free));
+}
+
+/*
+ * Remove a single page from a process address space
+ */
+static void
+pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
+{
+ struct rwlock *lock;
+ pt_entry_t *pte;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if ((*pde & PG_V) == 0)
+ return;
+ pte = pmap_pde_to_pte(pde, va);
+ if ((*pte & PG_V) == 0)
+ return;
+ lock = NULL;
+ pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
+ if (lock != NULL)
+ rw_wunlock(lock);
+ pmap_invalidate_page(pmap, va);
+}
+
+/*
+ * Remove the given range of addresses from the specified map.
+ *
+ * It is assumed that the start and end are properly
+ * rounded to the page size.
+ */
+void
+pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ struct rwlock *lock;
+ vm_offset_t va, va_next;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t ptpaddr, *pde;
+ pt_entry_t *pte;
+ vm_page_t free = NULL;
+ int anyvalid;
+
+ /*
+ * Perform an unsynchronized read. This is, however, safe.
+ */
+ if (pmap->pm_stats.resident_count == 0)
+ return;
+
+ anyvalid = 0;
+
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
+
+ /*
+	 * Special-case the removal of a single page, a very common
+	 * operation for which the general code below can be
+	 * short-circuited.
+ */
+ if (sva + PAGE_SIZE == eva) {
+ pde = pmap_pde(pmap, sva);
+ if (pde && (*pde & PG_PS) == 0) {
+ pmap_remove_page(pmap, sva, pde, &free);
+ goto out;
+ }
+ }
+
+ lock = NULL;
+ for (; sva < eva; sva = va_next) {
+
+ if (pmap->pm_stats.resident_count == 0)
+ break;
+
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+
+ /*
+ * Calculate index for next page table.
+ */
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ ptpaddr = *pde;
+
+ /*
+ * Weed out invalid mappings.
+ */
+ if (ptpaddr == 0)
+ continue;
+
+ /*
+ * Check for large page.
+ */
+ if ((ptpaddr & PG_PS) != 0) {
+ /*
+ * Are we removing the entire large page? If not,
+ * demote the mapping and fall through.
+ */
+ if (sva + NBPDR == va_next && eva >= va_next) {
+ /*
+ * The TLB entry for a PG_G mapping is
+ * invalidated by pmap_remove_pde().
+ */
+ if ((ptpaddr & PG_G) == 0)
+ anyvalid = 1;
+ pmap_remove_pde(pmap, pde, sva, &free, &lock);
+ continue;
+ } else if (!pmap_demote_pde_locked(pmap, pde, sva,
+ &lock)) {
+ /* The large page mapping was destroyed. */
+ continue;
+ } else
+ ptpaddr = *pde;
+ }
+
+ /*
+ * Limit our scan to either the end of the va represented
+ * by the current page table page, or to the end of the
+ * range being removed.
+ */
+ if (va_next > eva)
+ va_next = eva;
+
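+		/*
+		 * "va" marks the start of a pending run of global (PG_G)
+		 * mappings that still needs a targeted TLB invalidation;
+		 * va == va_next means that no run is pending.
+		 */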
+ va = va_next;
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ if (*pte == 0) {
+ if (va != va_next) {
+ pmap_invalidate_range(pmap, va, sva);
+ va = va_next;
+ }
+ continue;
+ }
+ if ((*pte & PG_G) == 0)
+ anyvalid = 1;
+ else if (va == va_next)
+ va = sva;
+ if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
+ &lock)) {
+ sva += PAGE_SIZE;
+ break;
+ }
+ }
+ if (va != va_next)
+ pmap_invalidate_range(pmap, va, sva);
+ }
+ if (lock != NULL)
+ rw_wunlock(lock);
+out:
+ if (anyvalid)
+ pmap_invalidate_all(pmap);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ pmap_free_zero_pages(free);
+}
+
+/*
+ * Routine: pmap_remove_all
+ * Function:
+ * Removes this physical page from
+ * all physical maps in which it resides.
+ * Reflects back modify bits to the pager.
+ *
+ * Notes:
+ * Original versions of this routine were very
+ * inefficient because they iteratively called
+ * pmap_remove (slow...)
+ */
+
+void
+pmap_remove_all(vm_page_t m)
+{
+ struct md_page *pvh;
+ pv_entry_t pv;
+ pmap_t pmap;
+ pt_entry_t *pte, tpte;
+ pd_entry_t *pde;
+ vm_offset_t va;
+ vm_page_t free;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_remove_all: page %p is not managed", m));
+ free = NULL;
+ rw_wlock(&pvh_global_lock);
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ goto small_mappings;
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ (void)pmap_demote_pde(pmap, pde, va);
+ PMAP_UNLOCK(pmap);
+ }
+small_mappings:
+ while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pmap_resident_count_dec(pmap, 1);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
+ " a 2mpage in page %p's pv list", m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ tpte = pte_load_clear(pte);
+ if (tpte & PG_W)
+ pmap->pm_stats.wired_count--;
+ if (tpte & PG_A)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+
+ /*
+ * Update the vm_page_t clean and reference bits.
+ */
+ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
+ pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ free_pv_entry(pmap, pv);
+ PMAP_UNLOCK(pmap);
+ }
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ rw_wunlock(&pvh_global_lock);
+ pmap_free_zero_pages(free);
+}
+
+/*
+ * pmap_protect_pde: do the things to protect a 2mpage in a process
+ */
+static boolean_t
+pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
+{
+ pd_entry_t newpde, oldpde;
+ vm_offset_t eva, va;
+ vm_page_t m;
+ boolean_t anychanged;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT((sva & PDRMASK) == 0,
+ ("pmap_protect_pde: sva is not 2mpage aligned"));
+ anychanged = FALSE;
+retry:
+ oldpde = newpde = *pde;
+ if (oldpde & PG_MANAGED) {
+ eva = sva + NBPDR;
+ for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
+ va < eva; va += PAGE_SIZE, m++)
+ if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
+ }
+ if ((prot & VM_PROT_WRITE) == 0)
+ newpde &= ~(PG_RW | PG_M);
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ newpde |= pg_nx;
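+	/*
+	 * Install the new PDE with a compare-and-swap so that concurrent
+	 * hardware updates of PG_A or PG_M are not lost; retry from the
+	 * reload above if the swap fails.
+	 */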
+ if (newpde != oldpde) {
+ if (!atomic_cmpset_long(pde, oldpde, newpde))
+ goto retry;
+ if (oldpde & PG_G)
+ pmap_invalidate_page(pmap, sva);
+ else
+ anychanged = TRUE;
+ }
+ return (anychanged);
+}
+
+/*
+ * Set the physical protection on the
+ * specified range of this map as requested.
+ */
+void
+pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+{
+ vm_offset_t va_next;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t ptpaddr, *pde;
+ pt_entry_t *pte;
+ boolean_t anychanged, pv_lists_locked;
+
+ if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
+ pmap_remove(pmap, sva, eva);
+ return;
+ }
+
+ if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
+ (VM_PROT_WRITE|VM_PROT_EXECUTE))
+ return;
+
+ pv_lists_locked = FALSE;
+resume:
+ anychanged = FALSE;
+
+ PMAP_LOCK(pmap);
+ for (; sva < eva; sva = va_next) {
+
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ ptpaddr = *pde;
+
+ /*
+ * Weed out invalid mappings.
+ */
+ if (ptpaddr == 0)
+ continue;
+
+ /*
+ * Check for large page.
+ */
+ if ((ptpaddr & PG_PS) != 0) {
+ /*
+ * Are we protecting the entire large page? If not,
+ * demote the mapping and fall through.
+ */
+ if (sva + NBPDR == va_next && eva >= va_next) {
+ /*
+ * The TLB entry for a PG_G mapping is
+ * invalidated by pmap_protect_pde().
+ */
+ if (pmap_protect_pde(pmap, pde, sva, prot))
+ anychanged = TRUE;
+ continue;
+ } else {
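+				/*
+				 * Demoting the 2MB mapping requires the pv
+				 * lists.  Try to read-lock pvh_global_lock
+				 * without blocking; if that fails, flush any
+				 * pending invalidations, drop the pmap lock,
+				 * block on the lock, and restart the scan at
+				 * "resume".
+				 */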
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ if (anychanged)
+ pmap_invalidate_all(
+ pmap);
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto resume;
+ }
+ }
+ if (!pmap_demote_pde(pmap, pde, sva)) {
+ /*
+ * The large page mapping was
+ * destroyed.
+ */
+ continue;
+ }
+ }
+ }
+
+ if (va_next > eva)
+ va_next = eva;
+
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ pt_entry_t obits, pbits;
+ vm_page_t m;
+
+retry:
+ obits = pbits = *pte;
+ if ((pbits & PG_V) == 0)
+ continue;
+
+ if ((prot & VM_PROT_WRITE) == 0) {
+ if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
+ (PG_MANAGED | PG_M | PG_RW)) {
+ m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
+ vm_page_dirty(m);
+ }
+ pbits &= ~(PG_RW | PG_M);
+ }
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pbits |= pg_nx;
+
+ if (pbits != obits) {
+ if (!atomic_cmpset_long(pte, obits, pbits))
+ goto retry;
+ if (obits & PG_G)
+ pmap_invalidate_page(pmap, sva);
+ else
+ anychanged = TRUE;
+ }
+ }
+ }
+ if (anychanged)
+ pmap_invalidate_all(pmap);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Tries to promote the 512 contiguous 4KB page mappings that are within a
+ * single page table page (PTP) to a single 2MB page mapping. For promotion
+ * to occur, two conditions must be met: (1) the 4KB page mappings must map
+ * aligned, contiguous physical memory and (2) the 4KB page mappings must have
+ * identical characteristics.
+ */
+static void
+pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+ struct rwlock **lockp)
+{
+ pd_entry_t newpde;
+ pt_entry_t *firstpte, oldpte, pa, *pte;
+ vm_offset_t oldpteva;
+ vm_page_t mpte;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /*
+ * Examine the first PTE in the specified PTP. Abort if this PTE is
+ * either invalid, unused, or does not map the first 4KB physical page
+ * within a 2MB page.
+ */
+ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
+setpde:
+ newpde = *firstpte;
+ if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return;
+ }
+ if ((newpde & (PG_M | PG_RW)) == PG_RW) {
+ /*
+ * When PG_M is already clear, PG_RW can be cleared without
+ * a TLB invalidation.
+ */
+ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
+ goto setpde;
+ newpde &= ~PG_RW;
+ }
+
+ /*
+ * Examine each of the other PTEs in the specified PTP. Abort if this
+ * PTE maps an unexpected 4KB physical page or does not have identical
+ * characteristics to the first PTE.
+ */
+ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
+ for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
+setpte:
+ oldpte = *pte;
+ if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return;
+ }
+ if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
+ /*
+ * When PG_M is already clear, PG_RW can be cleared
+ * without a TLB invalidation.
+ */
+ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
+ goto setpte;
+ oldpte &= ~PG_RW;
+ oldpteva = (oldpte & PG_FRAME & PDRMASK) |
+ (va & ~PDRMASK);
+ CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
+ " in pmap %p", oldpteva, pmap);
+ }
+ if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return;
+ }
+ pa -= PAGE_SIZE;
+ }
+
+ /*
+ * Save the page table page in its current state until the PDE
+ * mapping the superpage is demoted by pmap_demote_pde() or
+ * destroyed by pmap_remove_pde().
+ */
+ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+ KASSERT(mpte >= vm_page_array &&
+ mpte < &vm_page_array[vm_page_array_size],
+ ("pmap_promote_pde: page table page is out of range"));
+ KASSERT(mpte->pindex == pmap_pde_pindex(va),
+ ("pmap_promote_pde: page table page's pindex is wrong"));
+ pmap_insert_pt_page(pmap, mpte);
+
+ /*
+ * Promote the pv entries.
+ */
+ if ((newpde & PG_MANAGED) != 0)
+ pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
+
+ /*
+ * Propagate the PAT index to its proper position.
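+	 * In a 4KB PTE the PAT bit is bit 7 (PG_PTE_PAT), but in a 2MB
+	 * PDE bit 7 is PG_PS, so the PAT bit moves to bit 12
+	 * (PG_PDE_PAT); when the bit is set, the XOR below clears the
+	 * PTE position and sets the PDE position.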
+ */
+ if ((newpde & PG_PTE_PAT) != 0)
+ newpde ^= PG_PDE_PAT | PG_PTE_PAT;
+
+ /*
+ * Map the superpage.
+ */
+ if (workaround_erratum383)
+ pmap_update_pde(pmap, va, pde, PG_PS | newpde);
+ else
+ pde_store(pde, PG_PS | newpde);
+
+ atomic_add_long(&pmap_pde_promotions, 1);
+ CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
+ " in pmap %p", va, pmap);
+}
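+
+/*
+ * A sketch of the promotion test above, using hypothetical PTE values:
+ * the first PTE of a promotable run must map a 2MB-aligned frame and
+ * have PG_A and PG_V set, so
+ *
+ *	0x200000 | PG_A | PG_V		passes (2MB-aligned frame)
+ *	0x201000 | PG_A | PG_V		fails: (PG_FRAME & PDRMASK) != 0
+ *
+ * and every later PTE must match the first in all PG_PTE_PROMOTE bits.
+ */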
+
+/*
+ * Insert the given physical page (m) at
+ * the specified virtual address (va) in the
+ * target physical map with the protection requested.
+ *
+ * If specified, the page will be wired down, meaning
+ * that the related pte cannot be reclaimed.
+ *
+ * NB: This is the only routine which MAY NOT lazy-evaluate
+ * or lose information. That is, this routine must actually
+ * insert this page into the given map NOW.
+ */
+void
+pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
+ vm_prot_t prot, boolean_t wired)
+{
+ struct rwlock *lock;
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ pt_entry_t newpte, origpte;
+ pv_entry_t pv;
+ vm_paddr_t opa, pa;
+ vm_page_t mpte, om;
+
+ va = trunc_page(va);
+ KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
+ KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
+ ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
+ va));
+ KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
+ va >= kmi.clean_eva,
+ ("pmap_enter: managed mapping within the clean submap"));
+ if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ pa = VM_PAGE_TO_PHYS(m);
+ newpte = (pt_entry_t)(pa | PG_A | PG_V);
+ if ((access & VM_PROT_WRITE) != 0)
+ newpte |= PG_M;
+ if ((prot & VM_PROT_WRITE) != 0)
+ newpte |= PG_RW;
+ KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
+ ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ newpte |= pg_nx;
+ if (wired)
+ newpte |= PG_W;
+ if (va < VM_MAXUSER_ADDRESS)
+ newpte |= PG_U;
+ if (pmap == kernel_pmap)
+ newpte |= PG_G;
+ newpte |= pmap_cache_bits(m->md.pat_mode, 0);
+
+ mpte = NULL;
+
+ lock = NULL;
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
+
+ /*
+ * In the case that a page table page is not
+ * resident, we are creating it here.
+ */
+retry:
+ pde = pmap_pde(pmap, va);
+ if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
+ pmap_demote_pde_locked(pmap, pde, va, &lock))) {
+ pte = pmap_pde_to_pte(pde, va);
+ if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
+ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+ mpte->wire_count++;
+ }
+ } else if (va < VM_MAXUSER_ADDRESS) {
+ /*
+ * We get here if the pte page isn't mapped, or if it has been
+ * deallocated.
+ */
+ mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
+ goto retry;
+ } else
+ panic("pmap_enter: invalid page directory va=%#lx", va);
+
+ origpte = *pte;
+
+ /*
+ * Is the specified virtual address already mapped?
+ */
+ if ((origpte & PG_V) != 0) {
+ /*
+ * Wiring change, just update stats. We don't worry about
+ * wiring PT pages as they remain resident as long as there
+ * are valid mappings in them. Hence, if a user page is wired,
+ * the PT page will be also.
+ */
+ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
+ pmap->pm_stats.wired_count++;
+ else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
+ pmap->pm_stats.wired_count--;
+
+ /*
+ * Remove the extra PT page reference.
+ */
+ if (mpte != NULL) {
+ mpte->wire_count--;
+ KASSERT(mpte->wire_count > 0,
+ ("pmap_enter: missing reference to page table page,"
+ " va: 0x%lx", va));
+ }
+
+ /*
+ * Has the physical page changed?
+ */
+ opa = origpte & PG_FRAME;
+ if (opa == pa) {
+ /*
+ * No, might be a protection or wiring change.
+ */
+ if ((origpte & PG_MANAGED) != 0) {
+ newpte |= PG_MANAGED;
+ if ((newpte & PG_RW) != 0)
+ vm_page_aflag_set(m, PGA_WRITEABLE);
+ }
+ if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
+ goto unchanged;
+ goto validate;
+ }
+ } else {
+ /*
+ * Increment the counters.
+ */
+ if ((newpte & PG_W) != 0)
+ pmap->pm_stats.wired_count++;
+ pmap_resident_count_inc(pmap, 1);
+ }
+
+ /*
+ * Enter on the PV list if part of our managed memory.
+ */
+ if ((m->oflags & VPO_UNMANAGED) == 0) {
+ newpte |= PG_MANAGED;
+ pv = get_pv_entry(pmap, &lock);
+ pv->pv_va = va;
+ CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ if ((newpte & PG_RW) != 0)
+ vm_page_aflag_set(m, PGA_WRITEABLE);
+ }
+
+ /*
+ * Update the PTE.
+ */
+ if ((origpte & PG_V) != 0) {
+validate:
+ origpte = pte_load_store(pte, newpte);
+ opa = origpte & PG_FRAME;
+ if (opa != pa) {
+ if ((origpte & PG_MANAGED) != 0) {
+ om = PHYS_TO_VM_PAGE(opa);
+ if ((origpte & (PG_M | PG_RW)) == (PG_M |
+ PG_RW))
+ vm_page_dirty(om);
+ if ((origpte & PG_A) != 0)
+ vm_page_aflag_set(om, PGA_REFERENCED);
+ CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
+ pmap_pvh_free(&om->md, pmap, va);
+ if ((om->aflags & PGA_WRITEABLE) != 0 &&
+ TAILQ_EMPTY(&om->md.pv_list) &&
+ ((om->flags & PG_FICTITIOUS) != 0 ||
+ TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
+ vm_page_aflag_clear(om, PGA_WRITEABLE);
+ }
+ } else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
+ PG_RW)) == (PG_M | PG_RW)) {
+ if ((origpte & PG_MANAGED) != 0)
+ vm_page_dirty(m);
+
+ /*
+ * Although the PTE may still have PG_RW set, TLB
+ * invalidation may nonetheless be required because
+ * the PTE no longer has PG_M set.
+ */
+ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
+ /*
+ * This PTE change does not require TLB invalidation.
+ */
+ goto unchanged;
+ }
+ if ((origpte & PG_A) != 0)
+ pmap_invalidate_page(pmap, va);
+ } else
+ pte_store(pte, newpte);
+
+unchanged:
+
+ /*
+ * If both the page table page and the reservation are fully
+ * populated, then attempt promotion.
+ */
+ if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+ pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
+ vm_reserv_level_iffullpop(m) == 0)
+ pmap_promote_pde(pmap, pde, va, &lock);
+
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE
+ * otherwise. Fails if (1) a page table page cannot be allocated without
+ * blocking, (2) a mapping already exists at the specified virtual address, or
+ * (3) a pv entry cannot be allocated without reclaiming another pv entry.
+ */
+static boolean_t
+pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+ struct rwlock **lockp)
+{
+ pd_entry_t *pde, newpde;
+ vm_page_t free, mpde;
+
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
+ CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return (FALSE);
+ }
+ pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
+ pde = &pde[pmap_pde_index(va)];
+ if ((*pde & PG_V) != 0) {
+ KASSERT(mpde->wire_count > 1,
+ ("pmap_enter_pde: mpde's wire count is too low"));
+ mpde->wire_count--;
+ CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return (FALSE);
+ }
+ newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
+ PG_PS | PG_V;
+ if ((m->oflags & VPO_UNMANAGED) == 0) {
+ newpde |= PG_MANAGED;
+
+ /*
+ * Abort this mapping if its PV entry could not be created.
+ */
+ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
+ lockp)) {
+ free = NULL;
+ if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
+ pmap_invalidate_page(pmap, va);
+ pmap_free_zero_pages(free);
+ }
+ CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return (FALSE);
+ }
+ }
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ newpde |= pg_nx;
+ if (va < VM_MAXUSER_ADDRESS)
+ newpde |= PG_U;
+
+ /*
+ * Increment counters.
+ */
+ pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
+
+ /*
+ * Map the superpage.
+ */
+ pde_store(pde, newpde);
+
+ atomic_add_long(&pmap_pde_mappings, 1);
+ CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
+ " in pmap %p", va, pmap);
+ return (TRUE);
+}
+
+/*
+ * Maps a sequence of resident pages belonging to the same object.
+ * The sequence begins with the given page m_start. This page is
+ * mapped at the given virtual address start. Each subsequent page is
+ * mapped at a virtual address that is offset from start by the same
+ * amount as the page is offset from m_start within the object. The
+ * last page in the sequence is the page with the largest offset from
+ * m_start that can be mapped at a virtual address less than the given
+ * virtual address end. Not every virtual page between start and end
+ * is mapped; only those for which a resident page exists with the
+ * corresponding offset from m_start are mapped.
+ */
+void
+pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
+ vm_page_t m_start, vm_prot_t prot)
+{
+ struct rwlock *lock;
+ vm_offset_t va;
+ vm_page_t m, mpte;
+ vm_pindex_t diff, psize;
+
+ VM_OBJECT_ASSERT_WLOCKED(m_start->object);
+ psize = atop(end - start);
+ mpte = NULL;
+ m = m_start;
+ lock = NULL;
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
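+	/*
+	 * Try a 2MB page mapping whenever the virtual and physical
+	 * addresses are both 2MB aligned, at least 2MB remains before
+	 * "end", superpages are enabled, and the page's reservation is
+	 * fully populated; on success, skip ahead to the last 4KB page
+	 * of the superpage.
+	 */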
+ while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
+ va = start + ptoa(diff);
+ if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
+ (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
+ pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
+ pmap_enter_pde(pmap, va, m, prot, &lock))
+ m = &m[NBPDR / PAGE_SIZE - 1];
+ else
+ mpte = pmap_enter_quick_locked(pmap, va, m, prot,
+ mpte, &lock);
+ m = TAILQ_NEXT(m, listq);
+ }
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+}
+
+/*
+ * This code makes some *MAJOR* assumptions:
+ * 1. The current pmap and the target pmap exist.
+ * 2. The mapping is not wired.
+ * 3. Only read access is needed.
+ * 4. No page table pages.
+ * but it is *MUCH* faster than pmap_enter...
+ */
+
+void
+pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+ struct rwlock *lock;
+
+ lock = NULL;
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
+ (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+}
+
+static vm_page_t
+pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
+{
+ vm_page_t free;
+ pt_entry_t *pte;
+ vm_paddr_t pa;
+
+ KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
+ (m->oflags & VPO_UNMANAGED) != 0,
+ ("pmap_enter_quick_locked: managed mapping within the clean submap"));
+ rw_assert(&pvh_global_lock, RA_LOCKED);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /*
+ * In the case that a page table page is not
+ * resident, we are creating it here.
+ */
+ if (va < VM_MAXUSER_ADDRESS) {
+ vm_pindex_t ptepindex;
+ pd_entry_t *ptepa;
+
+ /*
+ * Calculate pagetable page index
+ */
+ ptepindex = pmap_pde_pindex(va);
+ if (mpte && (mpte->pindex == ptepindex)) {
+ mpte->wire_count++;
+ } else {
+ /*
+ * Get the page directory entry
+ */
+ ptepa = pmap_pde(pmap, va);
+
+ /*
+ * If the page table page is mapped, we just increment
+ * the hold count, and activate it. Otherwise, we
+ * attempt to allocate a page table page. If this
+ * attempt fails, we don't retry. Instead, we give up.
+ */
+ if (ptepa && (*ptepa & PG_V) != 0) {
+ if (*ptepa & PG_PS)
+ return (NULL);
+ mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
+ mpte->wire_count++;
+ } else {
+ /*
+ * Pass NULL instead of the PV list lock
+ * pointer, because we don't intend to sleep.
+ */
+ mpte = _pmap_allocpte(pmap, ptepindex, NULL);
+ if (mpte == NULL)
+ return (mpte);
+ }
+ }
+ pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
+ pte = &pte[pmap_pte_index(va)];
+ } else {
+ mpte = NULL;
+ pte = vtopte(va);
+ }
+ if (*pte) {
+ if (mpte != NULL) {
+ mpte->wire_count--;
+ mpte = NULL;
+ }
+ return (mpte);
+ }
+
+ /*
+ * Enter on the PV list if part of our managed memory.
+ */
+ if ((m->oflags & VPO_UNMANAGED) == 0 &&
+ !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
+ if (mpte != NULL) {
+ free = NULL;
+ if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
+ pmap_invalidate_page(pmap, va);
+ pmap_free_zero_pages(free);
+ }
+ mpte = NULL;
+ }
+ return (mpte);
+ }
+
+ /*
+ * Increment counters
+ */
+ pmap_resident_count_inc(pmap, 1);
+
+ pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pa |= pg_nx;
+
+ /*
+ * Now validate mapping with RO protection
+ */
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ pte_store(pte, pa | PG_V | PG_U);
+ else
+ pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
+ return (mpte);
+}
+
+/*
+ * Make a temporary mapping for a physical address. This is only intended
+ * to be used for panic dumps.
+ */
+void *
+pmap_kenter_temporary(vm_paddr_t pa, int i)
+{
+ vm_offset_t va;
+
+ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
+ pmap_kenter(va, pa);
+ invlpg(va);
+ return ((void *)crashdumpmap);
+}
+
+/*
+ * This code maps large physical mmap regions into the
+ * processor address space using 2MB page mappings when
+ * possible.  Note that some shortcuts are taken, but the
+ * code works.
+ */
+void
+pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
+ vm_pindex_t pindex, vm_size_t size)
+{
+ pd_entry_t *pde;
+ vm_paddr_t pa, ptepa;
+ vm_page_t p, pdpg;
+ int pat_mode;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
+ ("pmap_object_init_pt: non-device object"));
+ if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+ if (!vm_object_populate(object, pindex, pindex + atop(size)))
+ return;
+ p = vm_page_lookup(object, pindex);
+ KASSERT(p->valid == VM_PAGE_BITS_ALL,
+ ("pmap_object_init_pt: invalid page %p", p));
+ pat_mode = p->md.pat_mode;
+
+ /*
+ * Abort the mapping if the first page is not physically
+ * aligned to a 2MB page boundary.
+ */
+ ptepa = VM_PAGE_TO_PHYS(p);
+ if (ptepa & (NBPDR - 1))
+ return;
+
+ /*
+ * Skip the first page. Abort the mapping if the rest of
+ * the pages are not physically contiguous or have differing
+ * memory attributes.
+ */
+ p = TAILQ_NEXT(p, listq);
+ for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
+ pa += PAGE_SIZE) {
+ KASSERT(p->valid == VM_PAGE_BITS_ALL,
+ ("pmap_object_init_pt: invalid page %p", p));
+ if (pa != VM_PAGE_TO_PHYS(p) ||
+ pat_mode != p->md.pat_mode)
+ return;
+ p = TAILQ_NEXT(p, listq);
+ }
+
+ /*
+ * Map using 2MB pages. Since "ptepa" is 2M aligned and
+ * "size" is a multiple of 2M, adding the PAT setting to "pa"
+ * will not affect the termination of this loop.
+ */
+ PMAP_LOCK(pmap);
+ for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
+ size; pa += NBPDR) {
+ pdpg = pmap_allocpde(pmap, addr, NULL);
+ if (pdpg == NULL) {
+ /*
+ * The creation of mappings below is only an
+ * optimization. If a page directory page
+ * cannot be allocated without blocking,
+ * continue on to the next mapping rather than
+ * blocking.
+ */
+ addr += NBPDR;
+ continue;
+ }
+ pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
+ pde = &pde[pmap_pde_index(addr)];
+ if ((*pde & PG_V) == 0) {
+ pde_store(pde, pa | PG_PS | PG_M | PG_A |
+ PG_U | PG_RW | PG_V);
+ pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
+ atomic_add_long(&pmap_pde_mappings, 1);
+ } else {
+ /* Continue on if the PDE is already valid. */
+ pdpg->wire_count--;
+ KASSERT(pdpg->wire_count > 0,
+ ("pmap_object_init_pt: missing reference "
+ "to page directory page, va: 0x%lx", addr));
+ }
+ addr += NBPDR;
+ }
+ PMAP_UNLOCK(pmap);
+ }
+}
+
+/*
+ * Routine: pmap_change_wiring
+ * Function: Change the wiring attribute for a map/virtual-address
+ * pair.
+ * In/out conditions:
+ * The mapping must already exist in the pmap.
+ */
+void
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+{
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ boolean_t pv_lists_locked;
+
+ pv_lists_locked = FALSE;
+
+ /*
+ * Wiring is not a hardware characteristic so there is no need to
+ * invalidate TLB.
+ */
+retry:
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, va);
+ if ((*pde & PG_PS) != 0) {
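+		/*
+		 * Demote the 2MB page only when the requested wiring
+		 * differs from the current one, i.e., when "!wired" and
+		 * "PG_W is clear" disagree.
+		 */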
+ if (!wired != ((*pde & PG_W) == 0)) {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto retry;
+ }
+ }
+ if (!pmap_demote_pde(pmap, pde, va))
+ panic("pmap_change_wiring: demotion failed");
+ } else
+ goto out;
+ }
+ pte = pmap_pde_to_pte(pde, va);
+ if (wired && (*pte & PG_W) == 0) {
+ pmap->pm_stats.wired_count++;
+ atomic_set_long(pte, PG_W);
+ } else if (!wired && (*pte & PG_W) != 0) {
+ pmap->pm_stats.wired_count--;
+ atomic_clear_long(pte, PG_W);
+ }
+out:
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Copy the range specified by src_addr/len
+ * from the source map to the range dst_addr/len
+ * in the destination map.
+ *
+ * This routine is only advisory and need not do anything.
+ */
+
+void
+pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
+ vm_offset_t src_addr)
+{
+ struct rwlock *lock;
+ vm_page_t free;
+ vm_offset_t addr;
+ vm_offset_t end_addr = src_addr + len;
+ vm_offset_t va_next;
+
+ if (dst_addr != src_addr)
+ return;
+
+ lock = NULL;
+ rw_rlock(&pvh_global_lock);
+ if (dst_pmap < src_pmap) {
+ PMAP_LOCK(dst_pmap);
+ PMAP_LOCK(src_pmap);
+ } else {
+ PMAP_LOCK(src_pmap);
+ PMAP_LOCK(dst_pmap);
+ }
+ for (addr = src_addr; addr < end_addr; addr = va_next) {
+ pt_entry_t *src_pte, *dst_pte;
+ vm_page_t dstmpde, dstmpte, srcmpte;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t srcptepaddr, *pde;
+
+ KASSERT(addr < UPT_MIN_ADDRESS,
+ ("pmap_copy: invalid to pmap_copy page tables"));
+
+ pml4e = pmap_pml4e(src_pmap, addr);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (addr + NBPML4) & ~PML4MASK;
+ if (va_next < addr)
+ va_next = end_addr;
+ continue;
+ }
+
+ pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (addr + NBPDP) & ~PDPMASK;
+ if (va_next < addr)
+ va_next = end_addr;
+ continue;
+ }
+
+ va_next = (addr + NBPDR) & ~PDRMASK;
+ if (va_next < addr)
+ va_next = end_addr;
+
+ pde = pmap_pdpe_to_pde(pdpe, addr);
+ srcptepaddr = *pde;
+ if (srcptepaddr == 0)
+ continue;
+
+ if (srcptepaddr & PG_PS) {
+ dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
+ if (dstmpde == NULL)
+ break;
+ pde = (pd_entry_t *)
+ PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
+ pde = &pde[pmap_pde_index(addr)];
+ if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
+ pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
+ PG_PS_FRAME, &lock))) {
+ *pde = srcptepaddr & ~PG_W;
+ pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
+ } else
+ dstmpde->wire_count--;
+ continue;
+ }
+
+ srcptepaddr &= PG_FRAME;
+ srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
+ KASSERT(srcmpte->wire_count > 0,
+ ("pmap_copy: source page table page is unused"));
+
+ if (va_next > end_addr)
+ va_next = end_addr;
+
+ src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
+ src_pte = &src_pte[pmap_pte_index(addr)];
+ dstmpte = NULL;
+ while (addr < va_next) {
+ pt_entry_t ptetemp;
+ ptetemp = *src_pte;
+ /*
+ * Only mappings of managed pages are copied.
+ */
+ if ((ptetemp & PG_MANAGED) != 0) {
+ if (dstmpte != NULL &&
+ dstmpte->pindex == pmap_pde_pindex(addr))
+ dstmpte->wire_count++;
+ else if ((dstmpte = pmap_allocpte(dst_pmap,
+ addr, NULL)) == NULL)
+ goto out;
+ dst_pte = (pt_entry_t *)
+ PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
+ dst_pte = &dst_pte[pmap_pte_index(addr)];
+ if (*dst_pte == 0 &&
+ pmap_try_insert_pv_entry(dst_pmap, addr,
+ PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
+ &lock)) {
+ /*
+ * Clear the wired, modified, and
+ * accessed (referenced) bits
+ * during the copy.
+ */
+ *dst_pte = ptetemp & ~(PG_W | PG_M |
+ PG_A);
+ pmap_resident_count_inc(dst_pmap, 1);
+ } else {
+ free = NULL;
+ if (pmap_unwire_ptp(dst_pmap, addr,
+ dstmpte, &free)) {
+ pmap_invalidate_page(dst_pmap,
+ addr);
+ pmap_free_zero_pages(free);
+ }
+ goto out;
+ }
+ if (dstmpte->wire_count >= srcmpte->wire_count)
+ break;
+ }
+ addr += PAGE_SIZE;
+ src_pte++;
+ }
+ }
+out:
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(src_pmap);
+ PMAP_UNLOCK(dst_pmap);
+}
+
+/*
+ * pmap_zero_page zeros the specified hardware page through its
+ * permanent direct map address.
+ */
+void
+pmap_zero_page(vm_page_t m)
+{
+ vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+
+ pagezero((void *)va);
+}
+
+/*
+ * pmap_zero_page_area zeros the requested portion of the specified
+ * hardware page through its direct map address.
+ *
+ * off and size may not cover an area beyond a single hardware page.
+ */
+void
+pmap_zero_page_area(vm_page_t m, int off, int size)
+{
+ vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+
+ if (off == 0 && size == PAGE_SIZE)
+ pagezero((void *)va);
+ else
+ bzero((char *)va + off, size);
+}
+
+/*
+ * pmap_zero_page_idle zeros the specified hardware page through
+ * its direct map address.  This
+ * is intended to be called from the vm_pagezero process only and
+ * outside of Giant.
+ */
+void
+pmap_zero_page_idle(vm_page_t m)
+{
+ vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+
+ pagezero((void *)va);
+}
+
+/*
+ * pmap_copy_page copies the specified (machine independent)
+ * page through the direct map, one machine dependent page at
+ * a time.
+ */
+void
+pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
+{
+ vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
+ vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
+
+ pagecopy((void *)src, (void *)dst);
+}
+
+int unmapped_buf_allowed = 1;
+
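+/*
+ * Copy "xfersize" bytes from the pages ma[], starting at byte offset
+ * "a_offset", to the pages mb[], starting at "b_offset", going through
+ * the direct map and splitting each copy at page boundaries.
+ */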
+void
+pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
+ vm_offset_t b_offset, int xfersize)
+{
+ void *a_cp, *b_cp;
+ vm_offset_t a_pg_offset, b_pg_offset;
+ int cnt;
+
+ while (xfersize > 0) {
+ a_pg_offset = a_offset & PAGE_MASK;
+ cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
+ a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
+ phys_addr) + a_pg_offset;
+ b_pg_offset = b_offset & PAGE_MASK;
+ cnt = min(cnt, PAGE_SIZE - b_pg_offset);
+ b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
+ phys_addr) + b_pg_offset;
+ bcopy(a_cp, b_cp, cnt);
+ a_offset += cnt;
+ b_offset += cnt;
+ xfersize -= cnt;
+ }
+}
+
+/*
+ * Returns true if the pmap's pv is one of the first
+ * 16 pvs linked to from this page. This count may
+ * be changed upwards or downwards in the future; it
+ * is only necessary that true be returned for a small
+ * subset of pmaps for proper page aging.
+ */
+boolean_t
+pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
+{
+ struct md_page *pvh;
+ struct rwlock *lock;
+ pv_entry_t pv;
+ int loops = 0;
+ boolean_t rv;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_page_exists_quick: page %p is not managed", m));
+ rv = FALSE;
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ if (PV_PMAP(pv) == pmap) {
+ rv = TRUE;
+ break;
+ }
+ loops++;
+ if (loops >= 16)
+ break;
+ }
+ if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ if (PV_PMAP(pv) == pmap) {
+ rv = TRUE;
+ break;
+ }
+ loops++;
+ if (loops >= 16)
+ break;
+ }
+ }
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
+ return (rv);
+}
+
+/*
+ * pmap_page_wired_mappings:
+ *
+ * Return the number of managed mappings to the given physical page
+ * that are wired.
+ */
+int
+pmap_page_wired_mappings(vm_page_t m)
+{
+ int count;
+
+ count = 0;
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ return (count);
+ rw_wlock(&pvh_global_lock);
+ count = pmap_pvh_wired_mappings(&m->md, count);
+ if ((m->flags & PG_FICTITIOUS) == 0) {
+ count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
+ count);
+ }
+ rw_wunlock(&pvh_global_lock);
+ return (count);
+}
+
+/*
+ * pmap_pvh_wired_mappings:
+ *
+ * Return the updated number "count" of managed mappings that are wired.
+ */
+static int
+pmap_pvh_wired_mappings(struct md_page *pvh, int count)
+{
+ pmap_t pmap;
+ pt_entry_t *pte;
+ pv_entry_t pv;
+
+ rw_assert(&pvh_global_lock, RA_WLOCKED);
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
+ if ((*pte & PG_W) != 0)
+ count++;
+ PMAP_UNLOCK(pmap);
+ }
+ return (count);
+}
+
+/*
+ * Returns TRUE if the given page is mapped individually or as part of
+ * a 2mpage. Otherwise, returns FALSE.
+ */
+boolean_t
+pmap_page_is_mapped(vm_page_t m)
+{
+ struct rwlock *lock;
+ boolean_t rv;
+
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ return (FALSE);
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+ rv = !TAILQ_EMPTY(&m->md.pv_list) ||
+ ((m->flags & PG_FICTITIOUS) == 0 &&
+ !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
+ return (rv);
+}
+
+/*
+ * Remove all pages from specified address space
+ * this aids process exit speeds. Also, this code
+ * is special cased for current process only, but
+ * can have the more generic (and slightly slower)
+ * mode enabled. This is much faster than pmap_remove
+ * in the case of running down an entire address space.
+ */
+void
+pmap_remove_pages(pmap_t pmap)
+{
+ pd_entry_t ptepde;
+ pt_entry_t *pte, tpte;
+ vm_page_t free = NULL;
+ vm_page_t m, mpte, mt;
+ pv_entry_t pv;
+ struct md_page *pvh;
+ struct pv_chunk *pc, *npc;
+ struct rwlock *lock;
+ int64_t bit;
+ uint64_t inuse, bitmask;
+ int allfree, field, freed, idx;
+
+ if (pmap != PCPU_GET(curpmap)) {
+ printf("warning: pmap_remove_pages called with non-current pmap\n");
+ return;
+ }
+ lock = NULL;
+ rw_rlock(&pvh_global_lock);
+ PMAP_LOCK(pmap);
+ TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+ allfree = 1;
+ freed = 0;
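+		/*
+		 * Scan each word of the chunk's allocation bitmap: the
+		 * complement of pc_map[] against pc_freemask[] yields the
+		 * in-use pv entries, which bsfq() then visits one bit at
+		 * a time.
+		 */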
+ for (field = 0; field < _NPCM; field++) {
+ inuse = ~pc->pc_map[field] & pc_freemask[field];
+ while (inuse != 0) {
+ bit = bsfq(inuse);
+ bitmask = 1UL << bit;
+ idx = field * 64 + bit;
+ pv = &pc->pc_pventry[idx];
+ inuse &= ~bitmask;
+
+ pte = pmap_pdpe(pmap, pv->pv_va);
+ ptepde = *pte;
+ pte = pmap_pdpe_to_pde(pte, pv->pv_va);
+ tpte = *pte;
+ if ((tpte & (PG_PS | PG_V)) == PG_V) {
+ ptepde = tpte;
+ pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
+ PG_FRAME);
+ pte = &pte[pmap_pte_index(pv->pv_va)];
+ tpte = *pte & ~PG_PTE_PAT;
+ }
+ if ((tpte & PG_V) == 0) {
+ panic("bad pte va %lx pte %lx",
+ pv->pv_va, tpte);
+ }
+
+				/*
+				 * We cannot remove wired pages from a
+				 * process's mapping at this time.
+				 */
+ if (tpte & PG_W) {
+ allfree = 0;
+ continue;
+ }
+
+ m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+ KASSERT(m->phys_addr == (tpte & PG_FRAME),
+ ("vm_page_t %p phys_addr mismatch %016jx %016jx",
+ m, (uintmax_t)m->phys_addr,
+ (uintmax_t)tpte));
+
+ KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
+ m < &vm_page_array[vm_page_array_size],
+ ("pmap_remove_pages: bad tpte %#jx",
+ (uintmax_t)tpte));
+
+ pte_clear(pte);
+
+ /*
+ * Update the vm_page_t clean/reference bits.
+ */
+ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+ if ((tpte & PG_PS) != 0) {
+ for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
+ vm_page_dirty(mt);
+ } else
+ vm_page_dirty(m);
+ }
+
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+
+ /* Mark free */
+ pc->pc_map[field] |= bitmask;
+ if ((tpte & PG_PS) != 0) {
+ pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
+ pvh = pa_to_pvh(tpte & PG_PS_FRAME);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ if (TAILQ_EMPTY(&pvh->pv_list)) {
+ for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
+ if ((mt->aflags & PGA_WRITEABLE) != 0 &&
+ TAILQ_EMPTY(&mt->md.pv_list))
+ vm_page_aflag_clear(mt, PGA_WRITEABLE);
+ }
+ mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
+ if (mpte != NULL) {
+ pmap_remove_pt_page(pmap, mpte);
+ pmap_resident_count_dec(pmap, 1);
+ KASSERT(mpte->wire_count == NPTEPG,
+ ("pmap_remove_pages: pte page wire count error"));
+ mpte->wire_count = 0;
+ pmap_add_delayed_free_list(mpte, &free, FALSE);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ }
+ } else {
+ pmap_resident_count_dec(pmap, 1);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ if ((m->aflags & PGA_WRITEABLE) != 0 &&
+ TAILQ_EMPTY(&m->md.pv_list) &&
+ (m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ if (TAILQ_EMPTY(&pvh->pv_list))
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ }
+ }
+ pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
+ freed++;
+ }
+ }
+ PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+ PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+ PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
+ if (allfree) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ free_pv_chunk(pc);
+ }
+ }
+ if (lock != NULL)
+ rw_wunlock(lock);
+ pmap_invalidate_all(pmap);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ pmap_free_zero_pages(free);
+}
+
+/*
+ * pmap_is_modified:
+ *
+ * Return whether or not the specified physical page was modified
+ * in any physical maps.
+ */
+boolean_t
+pmap_is_modified(vm_page_t m)
+{
+ boolean_t rv;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_is_modified: page %p is not managed", m));
+
+ /*
+ * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
+ * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
+ * is clear, no PTEs can have PG_M set.
+ */
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if ((m->oflags & VPO_BUSY) == 0 &&
+ (m->aflags & PGA_WRITEABLE) == 0)
+ return (FALSE);
+ rw_wlock(&pvh_global_lock);
+ rv = pmap_is_modified_pvh(&m->md) ||
+ ((m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
+ rw_wunlock(&pvh_global_lock);
+ return (rv);
+}
+
+/*
+ * Returns TRUE if any of the given mappings were used to modify
+ * physical memory. Otherwise, returns FALSE. Both page and 2mpage
+ * mappings are supported.
+ */
+static boolean_t
+pmap_is_modified_pvh(struct md_page *pvh)
+{
+ pv_entry_t pv;
+ pt_entry_t *pte;
+ pmap_t pmap;
+ boolean_t rv;
+
+ rw_assert(&pvh_global_lock, RA_WLOCKED);
+ rv = FALSE;
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
+ rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * pmap_is_prefaultable:
+ *
+ * Return whether or not the specified virtual address is eligible
+ * for prefault.
+ */
+boolean_t
+pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
+{
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ boolean_t rv;
+
+ rv = FALSE;
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, addr);
+ if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
+ pte = pmap_pde_to_pte(pde, addr);
+ rv = (*pte & PG_V) == 0;
+ }
+ PMAP_UNLOCK(pmap);
+ return (rv);
+}
+
+/*
+ * pmap_is_referenced:
+ *
+ * Return whether or not the specified physical page was referenced
+ * in any physical maps.
+ */
+boolean_t
+pmap_is_referenced(vm_page_t m)
+{
+ boolean_t rv;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_is_referenced: page %p is not managed", m));
+ rw_wlock(&pvh_global_lock);
+ rv = pmap_is_referenced_pvh(&m->md) ||
+ ((m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
+ rw_wunlock(&pvh_global_lock);
+ return (rv);
+}
+
+/*
+ * Returns TRUE if any of the given mappings were referenced and FALSE
+ * otherwise. Both page and 2mpage mappings are supported.
+ */
+static boolean_t
+pmap_is_referenced_pvh(struct md_page *pvh)
+{
+ pv_entry_t pv;
+ pt_entry_t *pte;
+ pmap_t pmap;
+ boolean_t rv;
+
+ rw_assert(&pvh_global_lock, RA_WLOCKED);
+ rv = FALSE;
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
+ rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * Clear the write and modified bits in each of the given page's mappings.
+ */
+void
+pmap_remove_write(vm_page_t m)
+{
+ struct md_page *pvh;
+ pmap_t pmap;
+ pv_entry_t next_pv, pv;
+ pd_entry_t *pde;
+ pt_entry_t oldpte, *pte;
+ vm_offset_t va;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_remove_write: page %p is not managed", m));
+
+ /*
+ * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
+ * another thread while the object is locked. Thus, if PGA_WRITEABLE
+ * is clear, no page table entries need updating.
+ */
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if ((m->oflags & VPO_BUSY) == 0 &&
+ (m->aflags & PGA_WRITEABLE) == 0)
+ return;
+ rw_wlock(&pvh_global_lock);
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ goto small_mappings;
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ if ((*pde & PG_RW) != 0)
+ (void)pmap_demote_pde(pmap, pde, va);
+ PMAP_UNLOCK(pmap);
+ }
+small_mappings:
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0,
+ ("pmap_remove_write: found a 2mpage in page %p's pv list",
+ m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+retry:
+ oldpte = *pte;
+ if (oldpte & PG_RW) {
+ if (!atomic_cmpset_long(pte, oldpte, oldpte &
+ ~(PG_RW | PG_M)))
+ goto retry;
+ if ((oldpte & PG_M) != 0)
+ vm_page_dirty(m);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ }
+ PMAP_UNLOCK(pmap);
+ }
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ rw_wunlock(&pvh_global_lock);
+}
+
+/*
+ * pmap_ts_referenced:
+ *
+ * Return a count of reference bits for a page, clearing those bits.
+ * It is not necessary for every reference bit to be cleared, but it
+ * is necessary that 0 only be returned when there are truly no
+ * reference bits set.
+ *
+ * XXX: The exact number of bits to check and clear is a matter that
+ * should be tested and standardized at some point in the future for
+ * optimal aging of shared pages.
+ */
+int
+pmap_ts_referenced(vm_page_t m)
+{
+ struct md_page *pvh;
+ pv_entry_t pv, pvf, pvn;
+ pmap_t pmap;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t *pte;
+ vm_offset_t va;
+ int rtval = 0;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_ts_referenced: page %p is not managed", m));
+ rw_wlock(&pvh_global_lock);
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ goto small_mappings;
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ oldpde = *pde;
+ if ((oldpde & PG_A) != 0) {
+ if (pmap_demote_pde(pmap, pde, va)) {
+ if ((oldpde & PG_W) == 0) {
+ /*
+ * Remove the mapping to a single page
+ * so that a subsequent access may
+ * repromote. Since the underlying
+ * page table page is fully populated,
+ * this removal never frees a page
+ * table page.
+ */
+ va += VM_PAGE_TO_PHYS(m) - (oldpde &
+ PG_PS_FRAME);
+ pmap_remove_page(pmap, va, pde, NULL);
+ rtval++;
+ if (rtval > 4) {
+ PMAP_UNLOCK(pmap);
+ goto out;
+ }
+ }
+ }
+ }
+ PMAP_UNLOCK(pmap);
+ }
+small_mappings:
+ if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+ pvf = pv;
+ do {
+ pvn = TAILQ_NEXT(pv, pv_next);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
+ " found a 2mpage in page %p's pv list", m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if ((*pte & PG_A) != 0) {
+ atomic_clear_long(pte, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ rtval++;
+ if (rtval > 4)
+ pvn = NULL;
+ }
+ PMAP_UNLOCK(pmap);
+ } while ((pv = pvn) != NULL && pv != pvf);
+ }
+out:
+ rw_wunlock(&pvh_global_lock);
+ return (rtval);
+}
+
+/*
+ * Clear the modify bits on the specified physical page.
+ */
+void
+pmap_clear_modify(vm_page_t m)
+{
+ struct md_page *pvh;
+ pmap_t pmap;
+ pv_entry_t next_pv, pv;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t oldpte, *pte;
+ vm_offset_t va;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_clear_modify: page %p is not managed", m));
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ KASSERT((m->oflags & VPO_BUSY) == 0,
+ ("pmap_clear_modify: page %p is busy", m));
+
+ /*
+ * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
+ * If the object containing the page is locked and the page is not
+ * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
+ */
+ if ((m->aflags & PGA_WRITEABLE) == 0)
+ return;
+ rw_wlock(&pvh_global_lock);
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ goto small_mappings;
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ oldpde = *pde;
+ if ((oldpde & PG_RW) != 0) {
+ if (pmap_demote_pde(pmap, pde, va)) {
+ if ((oldpde & PG_W) == 0) {
+ /*
+ * Write protect the mapping to a
+ * single page so that a subsequent
+ * write access may repromote.
+ */
+ va += VM_PAGE_TO_PHYS(m) - (oldpde &
+ PG_PS_FRAME);
+ pte = pmap_pde_to_pte(pde, va);
+ oldpte = *pte;
+ if ((oldpte & PG_V) != 0) {
+ while (!atomic_cmpset_long(pte,
+ oldpte,
+ oldpte & ~(PG_M | PG_RW)))
+ oldpte = *pte;
+ vm_page_dirty(m);
+ pmap_invalidate_page(pmap, va);
+ }
+ }
+ }
+ }
+ PMAP_UNLOCK(pmap);
+ }
+small_mappings:
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
+ " a 2mpage in page %p's pv list", m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+ atomic_clear_long(pte, PG_M);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ }
+ PMAP_UNLOCK(pmap);
+ }
+ rw_wunlock(&pvh_global_lock);
+}
+
+/*
+ * pmap_clear_reference:
+ *
+ * Clear the reference bit on the specified physical page.
+ */
+void
+pmap_clear_reference(vm_page_t m)
+{
+ struct md_page *pvh;
+ pmap_t pmap;
+ pv_entry_t next_pv, pv;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t *pte;
+ vm_offset_t va;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_clear_reference: page %p is not managed", m));
+ rw_wlock(&pvh_global_lock);
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ goto small_mappings;
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ va = pv->pv_va;
+ pde = pmap_pde(pmap, va);
+ oldpde = *pde;
+ if ((oldpde & PG_A) != 0) {
+ if (pmap_demote_pde(pmap, pde, va)) {
+ /*
+ * Remove the mapping to a single page so
+ * that a subsequent access may repromote.
+ * Since the underlying page table page is
+ * fully populated, this removal never frees
+ * a page table page.
+ */
+ va += VM_PAGE_TO_PHYS(m) - (oldpde &
+ PG_PS_FRAME);
+ pmap_remove_page(pmap, va, pde, NULL);
+ }
+ }
+ PMAP_UNLOCK(pmap);
+ }
+small_mappings:
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
+ " a 2mpage in page %p's pv list", m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if (*pte & PG_A) {
+ atomic_clear_long(pte, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ }
+ PMAP_UNLOCK(pmap);
+ }
+ rw_wunlock(&pvh_global_lock);
+}
+
+/*
+ * Miscellaneous support routines follow
+ */
+
+/* Adjust the cache mode for a 4KB page mapped via a PTE. */
+static __inline void
+pmap_pte_attr(pt_entry_t *pte, int cache_bits)
+{
+ u_int opte, npte;
+
+ /*
+ * The cache mode bits are all in the low 32-bits of the
+ * PTE, so we can just spin on updating the low 32-bits.
+ */
+ do {
+ opte = *(u_int *)pte;
+ npte = opte & ~PG_PTE_CACHE;
+ npte |= cache_bits;
+ } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
+}
+
+/* Adjust the cache mode for a 2MB page mapped via a PDE. */
+static __inline void
+pmap_pde_attr(pd_entry_t *pde, int cache_bits)
+{
+ u_int opde, npde;
+
+ /*
+ * The cache mode bits are all in the low 32-bits of the
+ * PDE, so we can just spin on updating the low 32-bits.
+ */
+ do {
+ opde = *(u_int *)pde;
+ npde = opde & ~PG_PDE_CACHE;
+ npde |= cache_bits;
+ } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
+}
+
+/*
+ * Map a set of physical memory pages into the kernel virtual
+ * address space. Return a pointer to where it is mapped. This
+ * routine is intended to be used for mapping device memory,
+ * NOT real memory.
+ */
+void *
+pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
+{
+ vm_offset_t va, offset;
+ vm_size_t tmpsize;
+
+ /*
+ * If the specified range of physical addresses fits within the direct
+ * map window, use the direct map.
+ */
+ if (pa < dmaplimit && pa + size < dmaplimit) {
+ va = PHYS_TO_DMAP(pa);
+ if (!pmap_change_attr(va, size, mode))
+ return ((void *)va);
+ }
+ offset = pa & PAGE_MASK;
+ size = round_page(offset + size);
+ va = kmem_alloc_nofault(kernel_map, size);
+ if (!va)
+ panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
+ pa = trunc_page(pa);
+ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
+ pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
+ pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
+ pmap_invalidate_cache_range(va, va + tmpsize);
+ return ((void *)(va + offset));
+}
+
+void *
+pmap_mapdev(vm_paddr_t pa, vm_size_t size)
+{
+
+ return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
+}
+
+void *
+pmap_mapbios(vm_paddr_t pa, vm_size_t size)
+{
+
+ return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
+}
+
+void
+pmap_unmapdev(vm_offset_t va, vm_size_t size)
+{
+ vm_offset_t base, offset;
+
+	/* If pmap_mapdev() returned a direct map address, do nothing. */
+ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
+ return;
+ base = trunc_page(va);
+ offset = va & PAGE_MASK;
+ size = round_page(offset + size);
+ kmem_free(kernel_map, base, size);
+}
+
+/*
+ * Tries to demote a 1GB page mapping.
+ */
+static boolean_t
+pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
+{
+ pdp_entry_t newpdpe, oldpdpe;
+ pd_entry_t *firstpde, newpde, *pde;
+ vm_paddr_t mpdepa;
+ vm_page_t mpde;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ oldpdpe = *pdpe;
+ KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
+ ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
+ if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
+ CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
+ " in pmap %p", va, pmap);
+ return (FALSE);
+ }
+ mpdepa = VM_PAGE_TO_PHYS(mpde);
+ firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
+ newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
+ KASSERT((oldpdpe & PG_A) != 0,
+ ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
+ KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
+ ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
+ newpde = oldpdpe;
+
+ /*
+ * Initialize the page directory page.
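+	 * Each of the NPDEPG (512) entries inherits the old 1GB
+	 * mapping's attributes and maps a successive 2MB slice
+	 * ("newpde" advances by NBPDR per entry).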
+ */
+ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
+ *pde = newpde;
+ newpde += NBPDR;
+ }
+
+ /*
+ * Demote the mapping.
+ */
+ *pdpe = newpdpe;
+
+ /*
+ * Invalidate a stale recursive mapping of the page directory page.
+ */
+ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
+
+ pmap_pdpe_demotions++;
+ CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
+ " in pmap %p", va, pmap);
+ return (TRUE);
+}
+
+/*
+ * Sets the memory attribute for the specified page.
+ */
+void
+pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
+{
+
+ m->md.pat_mode = ma;
+
+ /*
+ * If "m" is a normal page, update its direct mapping. This update
+ * can be relied upon to perform any cache operations that are
+ * required for data coherence.
+ */
+ if ((m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
+ m->md.pat_mode))
+ panic("memory attribute change on the direct map failed");
+}
+
+/*
+ * Changes the specified virtual address range's memory type to that given by
+ * the parameter "mode". The specified virtual address range must be
+ * completely contained within either the direct map or the kernel map. If
+ * the virtual address range is contained within the kernel map, then the
+ * memory type for each of the corresponding ranges of the direct map is also
+ * changed. (The corresponding ranges of the direct map are those ranges that
+ * map the same physical pages as the specified virtual address range.) These
+ * changes to the direct map are necessary because Intel describes the
+ * behavior of their processors as "undefined" if two or more mappings to the
+ * same physical page have different memory types.
+ *
+ * Returns zero if the change completed successfully, and either EINVAL or
+ * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
+ * of the virtual address range was not mapped, and ENOMEM is returned if
+ * there was insufficient memory available to complete the change. In the
+ * latter case, the memory type may have been changed on some part of the
+ * virtual address range or the direct map.
+ */
+int
+pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
+{
+ int error;
+
+ PMAP_LOCK(kernel_pmap);
+ error = pmap_change_attr_locked(va, size, mode);
+ PMAP_UNLOCK(kernel_pmap);
+ return (error);
+}
+
+static int
+pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
+{
+ vm_offset_t base, offset, tmpva;
+ vm_paddr_t pa_start, pa_end;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ int cache_bits_pte, cache_bits_pde, error;
+ boolean_t changed;
+
+ PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
+ base = trunc_page(va);
+ offset = va & PAGE_MASK;
+ size = round_page(offset + size);
+
+ /*
+ * Only supported on kernel virtual addresses, including the direct
+ * map but excluding the recursive map.
+ */
+ if (base < DMAP_MIN_ADDRESS)
+ return (EINVAL);
+
+ cache_bits_pde = pmap_cache_bits(mode, 1);
+ cache_bits_pte = pmap_cache_bits(mode, 0);
+ changed = FALSE;
+
+ /*
+ * Pages that aren't mapped aren't supported. Also break down 2MB pages
+ * into 4KB pages if required.
+ */
+ for (tmpva = base; tmpva < base + size; ) {
+ pdpe = pmap_pdpe(kernel_pmap, tmpva);
+ if (*pdpe == 0)
+ return (EINVAL);
+ if (*pdpe & PG_PS) {
+ /*
+ * If the current 1GB page already has the required
+ * memory type, then we need not demote this page. Just
+ * increment tmpva to the next 1GB page frame.
+ */
+ if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
+ tmpva = trunc_1gpage(tmpva) + NBPDP;
+ continue;
+ }
+
+ /*
+ * If the current offset aligns with a 1GB page frame
+ * and there is at least 1GB left within the range, then
+ * we need not break down this page into 2MB pages.
+ */
+ if ((tmpva & PDPMASK) == 0 &&
+ tmpva + PDPMASK < base + size) {
+ tmpva += NBPDP;
+ continue;
+ }
+ if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
+ return (ENOMEM);
+ }
+ pde = pmap_pdpe_to_pde(pdpe, tmpva);
+ if (*pde == 0)
+ return (EINVAL);
+ if (*pde & PG_PS) {
+ /*
+ * If the current 2MB page already has the required
+ * memory type, then we need not demote this page. Just
+ * increment tmpva to the next 2MB page frame.
+ */
+ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
+ tmpva = trunc_2mpage(tmpva) + NBPDR;
+ continue;
+ }
+
+ /*
+ * If the current offset aligns with a 2MB page frame
+ * and there is at least 2MB left within the range, then
+ * we need not break down this page into 4KB pages.
+ */
+ if ((tmpva & PDRMASK) == 0 &&
+ tmpva + PDRMASK < base + size) {
+ tmpva += NBPDR;
+ continue;
+ }
+ if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
+ return (ENOMEM);
+ }
+ pte = pmap_pde_to_pte(pde, tmpva);
+ if (*pte == 0)
+ return (EINVAL);
+ tmpva += PAGE_SIZE;
+ }
+ error = 0;
+
+ /*
+ * Ok, all the pages exist, so run through them updating their
+ * cache mode if required.
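+	 * Contiguous ranges of physical addresses are accumulated in
+	 * [pa_start, pa_end) so that the matching direct map region is
+	 * updated with one recursive call per run rather than one call
+	 * per page.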
+ */
+ pa_start = pa_end = 0;
+ for (tmpva = base; tmpva < base + size; ) {
+ pdpe = pmap_pdpe(kernel_pmap, tmpva);
+ if (*pdpe & PG_PS) {
+ if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pdpe, cache_bits_pde);
+ changed = TRUE;
+ }
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (pa_start == pa_end) {
+ /* Start physical address run. */
+ pa_start = *pdpe & PG_PS_FRAME;
+ pa_end = pa_start + NBPDP;
+ } else if (pa_end == (*pdpe & PG_PS_FRAME))
+ pa_end += NBPDP;
+ else {
+ /* Run ended, update direct map. */
+ error = pmap_change_attr_locked(
+ PHYS_TO_DMAP(pa_start),
+ pa_end - pa_start, mode);
+ if (error != 0)
+ break;
+ /* Start physical address run. */
+ pa_start = *pdpe & PG_PS_FRAME;
+ pa_end = pa_start + NBPDP;
+ }
+ }
+ tmpva = trunc_1gpage(tmpva) + NBPDP;
+ continue;
+ }
+ pde = pmap_pdpe_to_pde(pdpe, tmpva);
+ if (*pde & PG_PS) {
+ if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pde, cache_bits_pde);
+ changed = TRUE;
+ }
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (pa_start == pa_end) {
+ /* Start physical address run. */
+ pa_start = *pde & PG_PS_FRAME;
+ pa_end = pa_start + NBPDR;
+ } else if (pa_end == (*pde & PG_PS_FRAME))
+ pa_end += NBPDR;
+ else {
+ /* Run ended, update direct map. */
+ error = pmap_change_attr_locked(
+ PHYS_TO_DMAP(pa_start),
+ pa_end - pa_start, mode);
+ if (error != 0)
+ break;
+ /* Start physical address run. */
+ pa_start = *pde & PG_PS_FRAME;
+ pa_end = pa_start + NBPDR;
+ }
+ }
+ tmpva = trunc_2mpage(tmpva) + NBPDR;
+ } else {
+ pte = pmap_pde_to_pte(pde, tmpva);
+ if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
+ pmap_pte_attr(pte, cache_bits_pte);
+ changed = TRUE;
+ }
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (pa_start == pa_end) {
+ /* Start physical address run. */
+ pa_start = *pte & PG_FRAME;
+ pa_end = pa_start + PAGE_SIZE;
+ } else if (pa_end == (*pte & PG_FRAME))
+ pa_end += PAGE_SIZE;
+ else {
+ /* Run ended, update direct map. */
+ error = pmap_change_attr_locked(
+ PHYS_TO_DMAP(pa_start),
+ pa_end - pa_start, mode);
+ if (error != 0)
+ break;
+ /* Start physical address run. */
+ pa_start = *pte & PG_FRAME;
+ pa_end = pa_start + PAGE_SIZE;
+ }
+ }
+ tmpva += PAGE_SIZE;
+ }
+ }
+ if (error == 0 && pa_start != pa_end)
+ error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+ pa_end - pa_start, mode);
+
+ /*
+ * Flush the TLB and, if required, the CPU caches, so that no stale
+ * translations or data with the old memory type remain.
+ */
+ if (changed) {
+ pmap_invalidate_range(kernel_pmap, base, tmpva);
+ pmap_invalidate_cache_range(base, tmpva);
+ }
+ return (error);
+}
+
+/*
+ * Demotes any mapping within the direct map region that covers more than the
+ * specified range of physical addresses. This range's size must be a power
+ * of two and its starting address must be a multiple of its size. Since the
+ * demotion does not change any attributes of the mapping, a TLB invalidation
+ * is not mandatory. The caller may, however, request a TLB invalidation.
+ */
+void
+pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
+{
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+ vm_offset_t va;
+ boolean_t changed;
+
+ if (len == 0)
+ return;
+ KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
+ KASSERT((base & (len - 1)) == 0,
+ ("pmap_demote_DMAP: base is not a multiple of len"));
+ if (len < NBPDP && base < dmaplimit) {
+ va = PHYS_TO_DMAP(base);
+ changed = FALSE;
+ PMAP_LOCK(kernel_pmap);
+ pdpe = pmap_pdpe(kernel_pmap, va);
+ if ((*pdpe & PG_V) == 0)
+ panic("pmap_demote_DMAP: invalid PDPE");
+ if ((*pdpe & PG_PS) != 0) {
+ if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
+ panic("pmap_demote_DMAP: PDPE failed");
+ changed = TRUE;
+ }
+ if (len < NBPDR) {
+ pde = pmap_pdpe_to_pde(pdpe, va);
+ if ((*pde & PG_V) == 0)
+ panic("pmap_demote_DMAP: invalid PDE");
+ if ((*pde & PG_PS) != 0) {
+ if (!pmap_demote_pde(kernel_pmap, pde, va))
+ panic("pmap_demote_DMAP: PDE failed");
+ changed = TRUE;
+ }
+ }
+ if (changed && invalidate)
+ pmap_invalidate_page(kernel_pmap, va);
+ PMAP_UNLOCK(kernel_pmap);
+ }
+}
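+
+/*
+ * Usage sketch (illustrative, not part of this change): a caller that
+ * needs 4KB granularity over one page of the direct map, with a TLB
+ * flush, would do
+ *
+ *	pmap_demote_DMAP(pa, PAGE_SIZE, TRUE);
+ *
+ * with pa page-aligned, satisfying the power-of-two size and alignment
+ * requirements asserted above.
+ */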
+
+/*
+ * Perform the pmap work for mincore().
+ */
+int
+pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
+{
+ pd_entry_t *pdep;
+ pt_entry_t pte;
+ vm_paddr_t pa;
+ int val;
+
+ PMAP_LOCK(pmap);
+retry:
+ pdep = pmap_pde(pmap, addr);
+ if (pdep != NULL && (*pdep & PG_V)) {
+ if (*pdep & PG_PS) {
+ pte = *pdep;
+ /* Compute the physical address of the 4KB page. */
+ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
+ PG_FRAME;
+ val = MINCORE_SUPER;
+ } else {
+ pte = *pmap_pde_to_pte(pdep, addr);
+ pa = pte & PG_FRAME;
+ val = 0;
+ }
+ } else {
+ pte = 0;
+ pa = 0;
+ val = 0;
+ }
+ if ((pte & PG_V) != 0) {
+ val |= MINCORE_INCORE;
+ if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
+ if ((pte & PG_A) != 0)
+ val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
+ }
+ if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
+ (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
+ (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
+ /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
+ if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
+ goto retry;
+ } else
+ PA_UNLOCK_COND(*locked_pa);
+ PMAP_UNLOCK(pmap);
+ return (val);
+}
+
+void
+pmap_activate(struct thread *td)
+{
+ pmap_t pmap, oldpmap;
+ u_int cpuid;
+ u_int64_t cr3;
+
+ critical_enter();
+ pmap = vmspace_pmap(td->td_proc->p_vmspace);
+ oldpmap = PCPU_GET(curpmap);
+ cpuid = PCPU_GET(cpuid);
+#ifdef SMP
+ CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
+ CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+#else
+ CPU_CLR(cpuid, &oldpmap->pm_active);
+ CPU_SET(cpuid, &pmap->pm_active);
+#endif
+ cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
+ td->td_pcb->pcb_cr3 = cr3;
+ load_cr3(cr3);
+ PCPU_SET(curpmap, pmap);
+ critical_exit();
+}
+
+void
+pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
+{
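+ /*
+ * Nothing to do: on amd64 the instruction cache is kept coherent
+ * with data stores by hardware, so no explicit sync is required.
+ */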
+}
+
+/*
+ * Increase the starting virtual address of the given mapping if a
+ * different alignment might result in more superpage mappings.
+ */
+void
+pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr, vm_size_t size)
+{
+ vm_offset_t superpage_offset;
+
+ if (size < NBPDR)
+ return;
+ if (object != NULL && (object->flags & OBJ_COLORED) != 0)
+ offset += ptoa(object->pg_color);
+ superpage_offset = offset & PDRMASK;
+ if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
+ (*addr & PDRMASK) == superpage_offset)
+ return;
+ if ((*addr & PDRMASK) < superpage_offset)
+ *addr = (*addr & ~PDRMASK) + superpage_offset;
+ else
+ *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
+}
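+
+/*
+ * Worked example (illustrative): with 2MB superpages (NBPDR == 2MB,
+ * PDRMASK == NBPDR - 1), an object offset whose residue within a
+ * superpage is 0x1000 and a candidate *addr of 0x200000 give
+ *
+ *	superpage_offset = 0x1000;
+ *	(*addr & PDRMASK) == 0, which is below superpage_offset, so
+ *	*addr = (0x200000 & ~PDRMASK) + 0x1000 == 0x201000;
+ *
+ * after which addr and offset are congruent modulo NBPDR, the
+ * precondition for promoting the mapping to superpages.
+ */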
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(pte, pmap_print_pte)
+{
+ pmap_t pmap;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ vm_offset_t va;
+
+ if (have_addr) {
+ va = (vm_offset_t)addr;
+ pmap = PCPU_GET(curpmap); /* XXX */
+ } else {
+ db_printf("show pte addr\n");
+ return;
+ }
+ pml4 = pmap_pml4e(pmap, va);
+ db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
+ if ((*pml4 & PG_V) == 0) {
+ db_printf("\n");
+ return;
+ }
+ pdp = pmap_pml4e_to_pdpe(pml4, va);
+ db_printf(" pdpe %#016lx", *pdp);
+ if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
+ db_printf("\n");
+ return;
+ }
+ pde = pmap_pdpe_to_pde(pdp, va);
+ db_printf(" pde %#016lx", *pde);
+ if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
+ db_printf("\n");
+ return;
+ }
+ pte = pmap_pde_to_pte(pde, va);
+ db_printf(" pte %#016lx\n", *pte);
+}
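+
+/*
+ * For reference (illustrative, mirroring the walk above), the four
+ * page-table indices are extracted from a canonical VA as
+ *
+ *	pml4 index: (va >> PML4SHIFT) & 0x1ff
+ *	pdp index:  (va >> PDPSHIFT) & 0x1ff
+ *	pd index:   (va >> PDRSHIFT) & 0x1ff
+ *	pt index:   (va >> PAGE_SHIFT) & 0x1ff
+ *
+ * with each level holding 512 eight-byte entries.
+ */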
+#endif
diff --git a/sys/amd64/amd64/prof_machdep.c b/sys/amd64/amd64/prof_machdep.c
new file mode 100644
index 0000000..273c833
--- /dev/null
+++ b/sys/amd64/amd64/prof_machdep.c
@@ -0,0 +1,391 @@
+/*-
+ * Copyright (c) 1996 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef GUPROF
+#if 0
+#include "opt_i586_guprof.h"
+#include "opt_perfmon.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
+#include <sys/gmon.h>
+#include <sys/kernel.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/clock.h>
+#if 0
+#include <machine/perfmon.h>
+#endif
+#include <machine/timerreg.h>
+
+#define CPUTIME_CLOCK_UNINITIALIZED 0
+#define CPUTIME_CLOCK_I8254 1
+#define CPUTIME_CLOCK_TSC 2
+#define CPUTIME_CLOCK_I586_PMC 3
+#define CPUTIME_CLOCK_I8254_SHIFT 7
+
+int cputime_bias = 1; /* initialize for locality of reference */
+
+static int cputime_clock = CPUTIME_CLOCK_UNINITIALIZED;
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+static u_int cputime_clock_pmc_conf = I586_PMC_GUPROF;
+static int cputime_clock_pmc_init;
+static struct gmonparam saved_gmp;
+#endif
+static int cputime_prof_active;
+#endif /* GUPROF */
+
+#ifdef __GNUCLIKE_ASM
+__asm(" \n\
+GM_STATE = 0 \n\
+GMON_PROF_OFF = 3 \n\
+ \n\
+ .text \n\
+ .p2align 4,0x90 \n\
+ .globl __mcount \n\
+ .type __mcount,@function \n\
+__mcount: \n\
+ # \n\
+ # Check that we are profiling. Do it early for speed. \n\
+ # \n\
+ cmpl $GMON_PROF_OFF,_gmonparam+GM_STATE \n\
+ je .mcount_exit \n\
+ # \n\
+ # __mcount is the same as [.]mcount, except that the caller \n\
+ # has not changed the stack other than to call here, so the \n\
+ # caller's raddr is above our raddr. \n\
+ # \n\
+ pushq %rax \n\
+ pushq %rdx \n\
+ pushq %rcx \n\
+ pushq %rsi \n\
+ pushq %rdi \n\
+ pushq %r8 \n\
+ pushq %r9 \n\
+ movq 7*8+8(%rsp),%rdi \n\
+ jmp .got_frompc \n\
+ \n\
+ .p2align 4,0x90 \n\
+ .globl .mcount \n\
+.mcount: \n\
+ cmpl $GMON_PROF_OFF,_gmonparam+GM_STATE \n\
+ je .mcount_exit \n\
+ # \n\
+ # The caller's stack frame has already been built, so \n\
+ # %rbp is the caller's frame pointer. The caller's \n\
+ # raddr is in the caller's frame following the caller's \n\
+ # caller's frame pointer. \n\
+ # \n\
+ pushq %rax \n\
+ pushq %rdx \n\
+ pushq %rcx \n\
+ pushq %rsi \n\
+ pushq %rdi \n\
+ pushq %r8 \n\
+ pushq %r9 \n\
+ movq 8(%rbp),%rdi \n\
+.got_frompc: \n\
+ # \n\
+ # Our raddr is the caller's pc. \n\
+ # \n\
+ movq 7*8(%rsp),%rsi \n\
+ \n\
+ pushfq \n\
+ cli \n\
+ call mcount \n\
+ popfq \n\
+ popq %r9 \n\
+ popq %r8 \n\
+ popq %rdi \n\
+ popq %rsi \n\
+ popq %rcx \n\
+ popq %rdx \n\
+ popq %rax \n\
+.mcount_exit: \n\
+ ret $0 \n\
+");
+#else /* !__GNUCLIKE_ASM */
+#error "this file needs to be ported to your compiler"
+#endif /* __GNUCLIKE_ASM */
+
+#ifdef GUPROF
+/*
+ * [.]mexitcount saves the return register(s), loads selfpc and calls
+ * mexitcount(selfpc) to do the work. Someday it should be in a machine
+ * dependent file together with cputime(), __mcount and [.]mcount. cputime()
+ * can't just be put in machdep.c because it has to be compiled without -pg.
+ */
+#ifdef __GNUCLIKE_ASM
+__asm(" \n\
+ .text \n\
+# \n\
+# Dummy label to be seen when gprof -u hides [.]mexitcount. \n\
+# \n\
+ .p2align 4,0x90 \n\
+ .globl __mexitcount \n\
+ .type __mexitcount,@function \n\
+__mexitcount: \n\
+ nop \n\
+ \n\
+GMON_PROF_HIRES = 4 \n\
+ \n\
+ .p2align 4,0x90 \n\
+ .globl .mexitcount \n\
+.mexitcount: \n\
+ cmpl $GMON_PROF_HIRES,_gmonparam+GM_STATE \n\
+ jne .mexitcount_exit \n\
+ pushq %rax \n\
+ pushq %rdx \n\
+ pushq %rcx \n\
+ pushq %rsi \n\
+ pushq %rdi \n\
+ pushq %r8 \n\
+ pushq %r9 \n\
+ movq 7*8(%rsp),%rdi \n\
+ pushfq \n\
+ cli \n\
+ call mexitcount \n\
+ popfq \n\
+ popq %r9 \n\
+ popq %r8 \n\
+ popq %rdi \n\
+ popq %rsi \n\
+ popq %rcx \n\
+ popq %rdx \n\
+ popq %rax \n\
+.mexitcount_exit: \n\
+ ret $0 \n\
+");
+#endif /* __GNUCLIKE_ASM */
+
+/*
+ * Return the time elapsed since the last call. The units are machine-
+ * dependent.
+ */
+int
+cputime(void)
+{
+ u_int count;
+ int delta;
+#if defined(PERFMON) && defined(I586_PMC_GUPROF) && !defined(SMP)
+ u_quad_t event_count;
+#endif
+ u_char high, low;
+ static u_int prev_count;
+
+ if (cputime_clock == CPUTIME_CLOCK_TSC) {
+ /*
+ * Scale the TSC a little to make cputime()'s frequency
+ * fit in an int, assuming that the TSC frequency fits
+ * in a u_int. Use a fixed scale since dynamic scaling
+ * would be slower and we can't really use the low bit
+ * of precision.
+ */
+ count = (u_int)rdtsc() & ~1u;
+ delta = (int)(count - prev_count) >> 1;
+ prev_count = count;
+ return (delta);
+ }
+#if defined(PERFMON) && defined(I586_PMC_GUPROF) && !defined(SMP)
+ if (cputime_clock == CPUTIME_CLOCK_I586_PMC) {
+ /*
+ * XXX perfmon_read() should be inlined so that the
+ * perfmon module doesn't need to be compiled with
+ * profiling disabled and so that it is fast.
+ */
+ perfmon_read(0, &event_count);
+
+ count = (u_int)event_count;
+ delta = (int)(count - prev_count);
+ prev_count = count;
+ return (delta);
+ }
+#endif /* PERFMON && I586_PMC_GUPROF && !SMP */
+
+ /*
+ * Read the current value of the 8254 timer counter 0.
+ */
+ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
+ low = inb(TIMER_CNTR0);
+ high = inb(TIMER_CNTR0);
+ count = ((high << 8) | low) << CPUTIME_CLOCK_I8254_SHIFT;
+
+ /*
+ * The timer counts down from TIMER_CNTR0_MAX to 0 and then resets.
+ * While profiling is enabled, this routine is called at least twice
+ * per timer reset (for mcounting and mexitcounting hardclock()),
+ * so at most one reset has occurred since the last call, and one
+ * has occurred iff the current count is larger than the previous
+ * count. This allows counter underflow to be detected faster
+ * than in microtime().
+ */
+ delta = prev_count - count;
+ prev_count = count;
+ if ((int) delta <= 0)
+ return (delta + (i8254_max_count << CPUTIME_CLOCK_I8254_SHIFT));
+ return (delta);
+}
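+
+/*
+ * The wrap handling above, in outline (illustrative; the i8254 is a
+ * free-running down-counter):
+ *
+ *	delta = prev - cur;
+ *	if (delta <= 0)
+ *		delta += max_count;	// at most one wrap since last call
+ *
+ * which is valid because this routine runs at least twice per counter
+ * period while profiling is active.
+ */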
+
+static int
+sysctl_machdep_cputime_clock(SYSCTL_HANDLER_ARGS)
+{
+ int clock;
+ int error;
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+ int event;
+ struct pmc pmc;
+#endif
+
+ clock = cputime_clock;
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+ if (clock == CPUTIME_CLOCK_I586_PMC) {
+ pmc.pmc_val = cputime_clock_pmc_conf;
+ clock += pmc.pmc_event;
+ }
+#endif
+ error = sysctl_handle_opaque(oidp, &clock, sizeof clock, req);
+ if (error == 0 && req->newptr != NULL) {
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+ if (clock >= CPUTIME_CLOCK_I586_PMC) {
+ event = clock - CPUTIME_CLOCK_I586_PMC;
+ if (event >= 256)
+ return (EINVAL);
+ pmc.pmc_num = 0;
+ pmc.pmc_event = event;
+ pmc.pmc_unit = 0;
+ pmc.pmc_flags = PMCF_E | PMCF_OS | PMCF_USR;
+ pmc.pmc_mask = 0;
+ cputime_clock_pmc_conf = pmc.pmc_val;
+ cputime_clock = CPUTIME_CLOCK_I586_PMC;
+ } else
+#endif
+ {
+ if (clock < 0 || clock >= CPUTIME_CLOCK_I586_PMC)
+ return (EINVAL);
+ cputime_clock = clock;
+ }
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, cputime_clock, CTLTYPE_INT | CTLFLAG_RW,
+ 0, sizeof(u_int), sysctl_machdep_cputime_clock, "I", "");
+
+/*
+ * The start and stop routines need not be here since we turn off profiling
+ * before calling them. They are here for convenience.
+ */
+
+void
+startguprof(struct gmonparam *gp)
+{
+ uint64_t freq;
+
+ freq = atomic_load_acq_64(&tsc_freq);
+ if (cputime_clock == CPUTIME_CLOCK_UNINITIALIZED) {
+ if (freq != 0 && mp_ncpus == 1)
+ cputime_clock = CPUTIME_CLOCK_TSC;
+ else
+ cputime_clock = CPUTIME_CLOCK_I8254;
+ }
+ if (cputime_clock == CPUTIME_CLOCK_TSC) {
+ gp->profrate = freq >> 1;
+ cputime_prof_active = 1;
+ } else
+ gp->profrate = i8254_freq << CPUTIME_CLOCK_I8254_SHIFT;
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+ if (cputime_clock == CPUTIME_CLOCK_I586_PMC) {
+ if (perfmon_avail() &&
+ perfmon_setup(0, cputime_clock_pmc_conf) == 0) {
+ if (perfmon_start(0) != 0)
+ perfmon_fini(0);
+ else {
+ /* XXX 1 event == 1 us. */
+ gp->profrate = 1000000;
+
+ saved_gmp = *gp;
+
+ /* Zap overheads. They are invalid. */
+ gp->cputime_overhead = 0;
+ gp->mcount_overhead = 0;
+ gp->mcount_post_overhead = 0;
+ gp->mcount_pre_overhead = 0;
+ gp->mexitcount_overhead = 0;
+ gp->mexitcount_post_overhead = 0;
+ gp->mexitcount_pre_overhead = 0;
+
+ cputime_clock_pmc_init = TRUE;
+ }
+ }
+ }
+#endif /* PERFMON && I586_PMC_GUPROF */
+ cputime_bias = 0;
+ cputime();
+}
+
+void
+stopguprof(struct gmonparam *gp)
+{
+#if defined(PERFMON) && defined(I586_PMC_GUPROF)
+ if (cputime_clock_pmc_init) {
+ *gp = saved_gmp;
+ perfmon_fini(0);
+ cputime_clock_pmc_init = FALSE;
+ }
+#endif
+ if (cputime_clock == CPUTIME_CLOCK_TSC)
+ cputime_prof_active = 0;
+}
+
+/* If the cpu frequency changed while profiling, report a warning. */
+static void
+tsc_freq_changed(void *arg, const struct cf_level *level, int status)
+{
+
+ /*
+ * If there was an error during the transition or
+ * TSC is P-state invariant, don't do anything.
+ */
+ if (status != 0 || tsc_is_invariant)
+ return;
+ if (cputime_prof_active && cputime_clock == CPUTIME_CLOCK_TSC)
+ printf("warning: cpu freq changed while profiling active\n");
+}
+
+EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
+ EVENTHANDLER_PRI_ANY);
+
+#endif /* GUPROF */
diff --git a/sys/amd64/amd64/ptrace_machdep.c b/sys/amd64/amd64/ptrace_machdep.c
new file mode 100644
index 0000000..9fa1917
--- /dev/null
+++ b/sys/amd64/amd64/ptrace_machdep.c
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2011 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/ptrace.h>
+#include <sys/sysent.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+
+static int
+cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data)
+{
+ char *savefpu;
+ int error;
+
+ if (!use_xsave)
+ return (EOPNOTSUPP);
+
+ switch (req) {
+ case PT_GETXSTATE:
+ fpugetregs(td);
+ savefpu = (char *)(get_pcb_user_save_td(td) + 1);
+ error = copyout(savefpu, addr,
+ cpu_max_ext_state_size - sizeof(struct savefpu));
+ break;
+
+ case PT_SETXSTATE:
+ if (data > cpu_max_ext_state_size - sizeof(struct savefpu)) {
+ error = EINVAL;
+ break;
+ }
+ savefpu = malloc(data, M_TEMP, M_WAITOK);
+ error = copyin(addr, savefpu, data);
+ if (error == 0) {
+ fpugetregs(td);
+ error = fpusetxstate(td, savefpu, data);
+ }
+ free(savefpu, M_TEMP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD32
+#define PT_I386_GETXMMREGS (PT_FIRSTMACH + 0)
+#define PT_I386_SETXMMREGS (PT_FIRSTMACH + 1)
+#define PT_I386_GETXSTATE (PT_FIRSTMACH + 2)
+#define PT_I386_SETXSTATE (PT_FIRSTMACH + 3)
+
+static int
+cpu32_ptrace(struct thread *td, int req, void *addr, int data)
+{
+ struct savefpu *fpstate;
+ int error;
+
+ switch (req) {
+ case PT_I386_GETXMMREGS:
+ fpugetregs(td);
+ error = copyout(get_pcb_user_save_td(td), addr,
+ sizeof(*fpstate));
+ break;
+
+ case PT_I386_SETXMMREGS:
+ fpugetregs(td);
+ fpstate = get_pcb_user_save_td(td);
+ error = copyin(addr, fpstate, sizeof(*fpstate));
+ fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
+ break;
+
+ case PT_I386_GETXSTATE:
+ error = cpu_ptrace_xstate(td, PT_GETXSTATE, addr, data);
+ break;
+
+ case PT_I386_SETXSTATE:
+ error = cpu_ptrace_xstate(td, PT_SETXSTATE, addr, data);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+#endif
+
+int
+cpu_ptrace(struct thread *td, int req, void *addr, int data)
+{
+ int error;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ return (cpu32_ptrace(td, req, addr, data));
+#endif
+
+ /* Support old values of PT_GETXSTATE and PT_SETXSTATE. */
+ if (req == PT_FIRSTMACH + 0)
+ req = PT_GETXSTATE;
+ if (req == PT_FIRSTMACH + 1)
+ req = PT_SETXSTATE;
+
+ switch (req) {
+ case PT_GETXSTATE:
+ case PT_SETXSTATE:
+ error = cpu_ptrace_xstate(td, req, addr, data);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
diff --git a/sys/amd64/amd64/sigtramp.S b/sys/amd64/amd64/sigtramp.S
new file mode 100644
index 0000000..a05ea85
--- /dev/null
+++ b/sys/amd64/amd64/sigtramp.S
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm <peter@freeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/syscall.h>
+
+#include <machine/asmacros.h>
+
+#include "assym.s"
+
+ .text
+/**********************************************************************
+ *
+ * Signal trampoline, copied to top of user stack
+ *
+ */
+NON_GPROF_ENTRY(sigcode)
+ call *SIGF_HANDLER(%rsp) /* call signal handler */
+ lea SIGF_UC(%rsp),%rdi /* get ucontext_t */
+ pushq $0 /* junk to fake return addr. */
+ movq $SYS_sigreturn,%rax
+ syscall /* enter kernel with args */
+0: hlt /* trap privileged instruction */
+ jmp 0b
+
+ ALIGN_TEXT
+esigcode:
+
+ .data
+ .globl szsigcode
+szsigcode:
+ .long esigcode-sigcode
diff --git a/sys/amd64/amd64/stack_machdep.c b/sys/amd64/amd64/stack_machdep.c
new file mode 100644
index 0000000..57908e2
--- /dev/null
+++ b/sys/amd64/amd64/stack_machdep.c
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/stack.h>
+
+#include <machine/pcb.h>
+#include <machine/stack.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+static void
+stack_capture(struct stack *st, register_t rbp)
+{
+ struct amd64_frame *frame;
+ vm_offset_t callpc;
+
+ stack_zero(st);
+ frame = (struct amd64_frame *)rbp;
+ while (1) {
+ if (!INKERNEL((long)frame))
+ break;
+ callpc = frame->f_retaddr;
+ if (!INKERNEL(callpc))
+ break;
+ if (stack_put(st, callpc) == -1)
+ break;
+ if (frame->f_frame <= frame ||
+ (vm_offset_t)frame->f_frame >=
+ (vm_offset_t)rbp + KSTACK_PAGES * PAGE_SIZE)
+ break;
+ frame = frame->f_frame;
+ }
+}
+
+void
+stack_save_td(struct stack *st, struct thread *td)
+{
+ register_t rbp;
+
+ if (TD_IS_SWAPPED(td))
+ panic("stack_save_td: swapped");
+ if (TD_IS_RUNNING(td))
+ panic("stack_save_td: running");
+
+ rbp = td->td_pcb->pcb_rbp;
+ stack_capture(st, rbp);
+}
+
+void
+stack_save(struct stack *st)
+{
+ register_t rbp;
+
+ __asm __volatile("movq %%rbp,%0" : "=r" (rbp));
+ stack_capture(st, rbp);
+}
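+
+/*
+ * The walk in stack_capture() relies on the standard amd64 frame layout
+ * (illustrative sketch of the layout assumed via <machine/stack.h>):
+ *
+ *	struct amd64_frame {
+ *		struct amd64_frame *f_frame;	// saved %rbp
+ *		long f_retaddr;			// return address
+ *	};
+ *
+ * Each iteration records f_retaddr and steps to f_frame until the chain
+ * leaves kernel addresses or the kernel stack bounds.
+ */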
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
new file mode 100644
index 0000000..fed852c
--- /dev/null
+++ b/sys/amd64/amd64/support.S
@@ -0,0 +1,732 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1993 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <machine/asmacros.h>
+#include <machine/intr_machdep.h>
+#include <machine/pmap.h>
+
+#include "assym.s"
+
+ .text
+
+/*
+ * bcopy family
+ * void bzero(void *buf, u_int len)
+ */
+
+ENTRY(bzero)
+ movq %rsi,%rcx
+ xorl %eax,%eax
+ shrq $3,%rcx
+ cld
+ rep
+ stosq
+ movq %rsi,%rcx
+ andq $7,%rcx
+ rep
+ stosb
+ ret
+END(bzero)
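+
+/*
+ * C equivalent of bzero above (illustrative only): clear eight bytes at
+ * a time (rep stosq), then finish the sub-word tail (rep stosb).
+ *
+ *	void
+ *	bzero(void *buf, size_t len)
+ *	{
+ *		uint64_t *p = buf;
+ *		char *c;
+ *
+ *		while (len >= 8) {
+ *			*p++ = 0;
+ *			len -= 8;
+ *		}
+ *		for (c = (char *)p; len != 0; len--)
+ *			*c++ = 0;
+ *	}
+ */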
+
+/* Address: %rdi */
+ENTRY(pagezero)
+ movq $-PAGE_SIZE,%rdx
+ subq %rdx,%rdi
+ xorl %eax,%eax
+1:
+ movnti %rax,(%rdi,%rdx)
+ movnti %rax,8(%rdi,%rdx)
+ movnti %rax,16(%rdi,%rdx)
+ movnti %rax,24(%rdi,%rdx)
+ addq $32,%rdx
+ jne 1b
+ sfence
+ ret
+END(pagezero)
+
+ENTRY(bcmp)
+ movq %rdx,%rcx
+ shrq $3,%rcx
+ cld /* compare forwards */
+ repe
+ cmpsq
+ jne 1f
+
+ movq %rdx,%rcx
+ andq $7,%rcx
+ repe
+ cmpsb
+1:
+ setne %al
+ movsbl %al,%eax
+ ret
+END(bcmp)
+
+/*
+ * bcopy(src, dst, cnt)
+ * rdi, rsi, rdx
+ * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
+ */
+ENTRY(bcopy)
+ xchgq %rsi,%rdi
+ movq %rdx,%rcx
+
+ movq %rdi,%rax
+ subq %rsi,%rax
+ cmpq %rcx,%rax /* overlapping && src < dst? */
+ jb 1f
+
+ shrq $3,%rcx /* copy by 64-bit words */
+ cld /* nope, copy forwards */
+ rep
+ movsq
+ movq %rdx,%rcx
+ andq $7,%rcx /* any bytes left? */
+ rep
+ movsb
+ ret
+
+ /* ALIGN_TEXT */
+1:
+ addq %rcx,%rdi /* copy backwards */
+ addq %rcx,%rsi
+ decq %rdi
+ decq %rsi
+ andq $7,%rcx /* any fractional bytes? */
+ std
+ rep
+ movsb
+ movq %rdx,%rcx /* copy remainder by 64-bit words */
+ shrq $3,%rcx
+ subq $7,%rsi
+ subq $7,%rdi
+ rep
+ movsq
+ cld
+ ret
+END(bcopy)
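+
+/*
+ * The overlap test above, in C terms (illustrative): copy forwards
+ * unless the destination lies inside the source range,
+ *
+ *	if ((uintptr_t)dst - (uintptr_t)src >= len)
+ *		copy forwards
+ *	else
+ *		copy backwards, starting from the last byte
+ *
+ * so that overlapping source bytes are read before being overwritten.
+ */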
+
+/*
+ * Note: memcpy does not support overlapping copies
+ */
+ENTRY(memcpy)
+ movq %rdx,%rcx
+ shrq $3,%rcx /* copy by 64-bit words */
+ cld /* copy forwards */
+ rep
+ movsq
+ movq %rdx,%rcx
+ andq $7,%rcx /* any bytes left? */
+ rep
+ movsb
+ ret
+END(memcpy)
+
+/*
+ * pagecopy(%rdi=from, %rsi=to)
+ */
+ENTRY(pagecopy)
+ movq $-PAGE_SIZE,%rax
+ movq %rax,%rdx
+ subq %rax,%rdi
+ subq %rax,%rsi
+1:
+ prefetchnta (%rdi,%rax)
+ addq $64,%rax
+ jne 1b
+2:
+ movq (%rdi,%rdx),%rax
+ movnti %rax,(%rsi,%rdx)
+ movq 8(%rdi,%rdx),%rax
+ movnti %rax,8(%rsi,%rdx)
+ movq 16(%rdi,%rdx),%rax
+ movnti %rax,16(%rsi,%rdx)
+ movq 24(%rdi,%rdx),%rax
+ movnti %rax,24(%rsi,%rdx)
+ addq $32,%rdx
+ jne 2b
+ sfence
+ ret
+END(pagecopy)
+
+/* fillw(pat, base, cnt) */
+/* %rdi,%rsi, %rdx */
+ENTRY(fillw)
+ movq %rdi,%rax
+ movq %rsi,%rdi
+ movq %rdx,%rcx
+ cld
+ rep
+ stosw
+ ret
+END(fillw)
+
+/*****************************************************************************/
+/* copyout and fubyte family */
+/*****************************************************************************/
+/*
+ * Access user memory from inside the kernel. These routines should be
+ * the only places that do this.
+ *
+ * These routines set curpcb->onfault for the time they execute. When a
+ * protection violation occurs inside the functions, the trap handler
+ * returns to *curpcb->onfault instead of the function.
+ */
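+
+/*
+ * In C terms the recovery protocol is (illustrative):
+ *
+ *	pcb->pcb_onfault = handler;	// arm recovery
+ *	... touch user memory ...
+ *	pcb->pcb_onfault = NULL;	// disarm on success
+ *
+ * The trap handler resumes execution at 'handler', which returns EFAULT
+ * to the caller.
+ */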
+
+/*
+ * copyout(from_kernel, to_user, len) - MP SAFE
+ * %rdi, %rsi, %rdx
+ */
+ENTRY(copyout)
+ movq PCPU(CURPCB),%rax
+ movq $copyout_fault,PCB_ONFAULT(%rax)
+ testq %rdx,%rdx /* anything to do? */
+ jz done_copyout
+
+ /*
+ * Check explicitly for non-user addresses. If 486 write protection
+ * is being used, this check is essential because we are in kernel
+ * mode so the h/w does not provide any protection against writing
+ * kernel addresses.
+ */
+
+ /*
+ * First, prevent address wrapping.
+ */
+ movq %rsi,%rax
+ addq %rdx,%rax
+ jc copyout_fault
+/*
+ * XXX STOP USING VM_MAXUSER_ADDRESS.
+ * It is an end address, not a max, so every time it is used correctly it
+ * looks like there is an off by one error, and of course it caused an off
+ * by one error in several places.
+ */
+ movq $VM_MAXUSER_ADDRESS,%rcx
+ cmpq %rcx,%rax
+ ja copyout_fault
+
+ xchgq %rdi,%rsi
+ /* bcopy(%rsi, %rdi, %rdx) */
+ movq %rdx,%rcx
+
+ shrq $3,%rcx
+ cld
+ rep
+ movsq
+ movb %dl,%cl
+ andb $7,%cl
+ rep
+ movsb
+
+done_copyout:
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rdx
+ movq %rax,PCB_ONFAULT(%rdx)
+ ret
+
+ ALIGN_TEXT
+copyout_fault:
+ movq PCPU(CURPCB),%rdx
+ movq $0,PCB_ONFAULT(%rdx)
+ movq $EFAULT,%rax
+ ret
+END(copyout)
+
+/*
+ * copyin(from_user, to_kernel, len) - MP SAFE
+ * %rdi, %rsi, %rdx
+ */
+ENTRY(copyin)
+ movq PCPU(CURPCB),%rax
+ movq $copyin_fault,PCB_ONFAULT(%rax)
+ testq %rdx,%rdx /* anything to do? */
+ jz done_copyin
+
+ /*
+ * make sure address is valid
+ */
+ movq %rdi,%rax
+ addq %rdx,%rax
+ jc copyin_fault
+ movq $VM_MAXUSER_ADDRESS,%rcx
+ cmpq %rcx,%rax
+ ja copyin_fault
+
+ xchgq %rdi,%rsi
+ movq %rdx,%rcx
+ movb %cl,%al
+ shrq $3,%rcx /* copy longword-wise */
+ cld
+ rep
+ movsq
+ movb %al,%cl
+ andb $7,%cl /* copy remaining bytes */
+ rep
+ movsb
+
+done_copyin:
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rdx
+ movq %rax,PCB_ONFAULT(%rdx)
+ ret
+
+ ALIGN_TEXT
+copyin_fault:
+ movq PCPU(CURPCB),%rdx
+ movq $0,PCB_ONFAULT(%rdx)
+ movq $EFAULT,%rax
+ ret
+END(copyin)
+
+/*
+ * casuword32. Compare and set user integer. Returns -1 or the current value.
+ * dst = %rdi, old = %rsi, new = %rdx
+ */
+ENTRY(casuword32)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rdi /* verify address is valid */
+ ja fusufault
+
+ movl %esi,%eax /* old */
+#ifdef SMP
+ lock
+#endif
+ cmpxchgl %edx,(%rdi) /* new = %edx */
+
+ /*
+ * The old value is in %eax. If the store succeeded it will be the
+ * value we expected (old) from before the store, otherwise it will
+ * be the current value.
+ */
+
+ movq PCPU(CURPCB),%rcx
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(casuword32)
+
+/*
+ * casuword. Compare and set user word. Returns -1 or the current value.
+ * dst = %rdi, old = %rsi, new = %rdx
+ */
+ENTRY(casuword)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-8,%rax
+ cmpq %rax,%rdi /* verify address is valid */
+ ja fusufault
+
+ movq %rsi,%rax /* old */
+#ifdef SMP
+ lock
+#endif
+ cmpxchgq %rdx,(%rdi) /* new = %rdx */
+
+ /*
+ * The old value is in %rax. If the store succeeded it will be the
+ * value we expected (old) from before the store, otherwise it will
+ * be the current value.
+ */
+
+ movq PCPU(CURPCB),%rcx
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(casuword)
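+
+/*
+ * Both casu* routines implement the usual compare-and-swap contract
+ * (illustrative C, fault handling omitted):
+ *
+ *	old = *dst;
+ *	if (old == expected)
+ *		*dst = new;
+ *	return (old);
+ *
+ * The caller detects success by comparing the return value with the
+ * expected value; -1 is returned on a fault.
+ */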
+
+/*
+ * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
+ * byte from user memory. All these functions are MPSAFE.
+ * addr = %rdi
+ */
+
+ALTENTRY(fuword64)
+ENTRY(fuword)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-8,%rax
+ cmpq %rax,%rdi /* verify address is valid */
+ ja fusufault
+
+ movq (%rdi),%rax
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(fuword64)
+END(fuword)
+
+ENTRY(fuword32)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rdi /* verify address is valid */
+ ja fusufault
+
+ movl (%rdi),%eax
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(fuword32)
+
+/*
+ * fuswintr() and suswintr() are specialized variants of fuword16() and
+ * suword16(), respectively. They are called from the profiling code,
+ * potentially at interrupt time. If they fail, that's okay; good things
+ * will happen later. They always fail for now, until the trap code is
+ * able to deal with this.
+ */
+ALTENTRY(suswintr)
+ENTRY(fuswintr)
+ movq $-1,%rax
+ ret
+END(suswintr)
+END(fuswintr)
+
+ENTRY(fuword16)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-2,%rax
+ cmpq %rax,%rdi
+ ja fusufault
+
+ movzwl (%rdi),%eax
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(fuword16)
+
+ENTRY(fubyte)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-1,%rax
+ cmpq %rax,%rdi
+ ja fusufault
+
+ movzbl (%rdi),%eax
+ movq $0,PCB_ONFAULT(%rcx)
+ ret
+END(fubyte)
+
+ ALIGN_TEXT
+fusufault:
+ movq PCPU(CURPCB),%rcx
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%rcx)
+ decq %rax
+ ret
+
+/*
+ * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to
+ * user memory. All these functions are MPSAFE.
+ * addr = %rdi, value = %rsi
+ */
+ALTENTRY(suword64)
+ENTRY(suword)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-8,%rax
+ cmpq %rax,%rdi /* verify address validity */
+ ja fusufault
+
+ movq %rsi,(%rdi)
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rcx
+ movq %rax,PCB_ONFAULT(%rcx)
+ ret
+END(suword64)
+END(suword)
+
+ENTRY(suword32)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rdi /* verify address validity */
+ ja fusufault
+
+ movl %esi,(%rdi)
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rcx
+ movq %rax,PCB_ONFAULT(%rcx)
+ ret
+END(suword32)
+
+ENTRY(suword16)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-2,%rax
+ cmpq %rax,%rdi /* verify address validity */
+ ja fusufault
+
+ movw %si,(%rdi)
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rcx /* restore trashed register */
+ movq %rax,PCB_ONFAULT(%rcx)
+ ret
+END(suword16)
+
+ENTRY(subyte)
+ movq PCPU(CURPCB),%rcx
+ movq $fusufault,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS-1,%rax
+ cmpq %rax,%rdi /* verify address validity */
+ ja fusufault
+
+ movl %esi,%eax
+ movb %al,(%rdi)
+ xorl %eax,%eax
+ movq PCPU(CURPCB),%rcx /* restore trashed register */
+ movq %rax,PCB_ONFAULT(%rcx)
+ ret
+END(subyte)
+
+/*
+ * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
+ * %rdi, %rsi, %rdx, %rcx
+ *
+ * Copy a string from 'from' to 'to', stopping when a NUL character is
+ * reached. Return ENAMETOOLONG if the string is longer than maxlen, and
+ * EFAULT on protection violations. If lencopied is non-NULL, return
+ * the actual length in *lencopied.
+ */
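+
+/*
+ * Illustrative C equivalent of the copy loop below (bounds clamping and
+ * fault handling omitted):
+ *
+ *	for (i = 0; i < maxlen; i++) {
+ *		if ((to[i] = from[i]) == '\0') {
+ *			if (lencopied != NULL)
+ *				*lencopied = i + 1;
+ *			return (0);
+ *		}
+ *	}
+ *	return (ENAMETOOLONG);
+ */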
+ENTRY(copyinstr)
+ movq %rdx,%r8 /* %r8 = maxlen */
+ movq %rcx,%r9 /* %r9 = *len */
+ xchgq %rdi,%rsi /* %rdi = from, %rsi = to */
+ movq PCPU(CURPCB),%rcx
+ movq $cpystrflt,PCB_ONFAULT(%rcx)
+
+ movq $VM_MAXUSER_ADDRESS,%rax
+
+ /* make sure 'from' is within bounds */
+ subq %rsi,%rax
+ jbe cpystrflt
+
+ /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
+ cmpq %rdx,%rax
+ jae 1f
+ movq %rax,%rdx
+ movq %rax,%r8
+1:
+ incq %rdx
+ cld
+
+2:
+ decq %rdx
+ jz 3f
+
+ lodsb
+ stosb
+ orb %al,%al
+ jnz 2b
+
+ /* Success -- 0 byte reached */
+ decq %rdx
+ xorl %eax,%eax
+ jmp cpystrflt_x
+3:
+ /* rdx is zero - return ENAMETOOLONG or EFAULT */
+ movq $VM_MAXUSER_ADDRESS,%rax
+ cmpq %rax,%rsi
+ jae cpystrflt
+4:
+ movq $ENAMETOOLONG,%rax
+ jmp cpystrflt_x
+
+cpystrflt:
+ movq $EFAULT,%rax
+
+cpystrflt_x:
+ /* set *lencopied and return %eax */
+ movq PCPU(CURPCB),%rcx
+ movq $0,PCB_ONFAULT(%rcx)
+
+ testq %r9,%r9
+ jz 1f
+ subq %rdx,%r8
+ movq %r8,(%r9)
+1:
+ ret
+END(copyinstr)
+
+/*
+ * copystr(from, to, maxlen, int *lencopied) - MP SAFE
+ * %rdi, %rsi, %rdx, %rcx
+ */
+ENTRY(copystr)
+ movq %rdx,%r8 /* %r8 = maxlen */
+
+ xchgq %rdi,%rsi
+ incq %rdx
+ cld
+1:
+ decq %rdx
+ jz 4f
+ lodsb
+ stosb
+ orb %al,%al
+ jnz 1b
+
+ /* Success -- 0 byte reached */
+ decq %rdx
+ xorl %eax,%eax
+ jmp 6f
+4:
+ /* rdx is zero -- return ENAMETOOLONG */
+ movq $ENAMETOOLONG,%rax
+
+6:
+
+ testq %rcx,%rcx
+ jz 7f
+ /* set *lencopied and return %rax */
+ subq %rdx,%r8
+ movq %r8,(%rcx)
+7:
+ ret
+END(copystr)
+
+/*
+ * Handling of special amd64 registers and descriptor tables etc
+ * %rdi
+ */
+/* void lgdt(struct region_descriptor *rdp); */
+ENTRY(lgdt)
+ /* reload the descriptor table */
+ lgdt (%rdi)
+
+ /* flush the prefetch q */
+ jmp 1f
+ nop
+1:
+ movl $KDSEL,%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs /* Beware, use wrmsr to set 64 bit base */
+ movl %eax,%gs
+ movl %eax,%ss
+
+ /* reload code selector by turning return into intersegmental return */
+ popq %rax
+ pushq $KCSEL
+ pushq %rax
+ MEXITCOUNT
+ lretq
+END(lgdt)
+
+/*****************************************************************************/
+/* setjump, longjump */
+/*****************************************************************************/
+
+ENTRY(setjmp)
+ movq %rbx,0(%rdi) /* save rbx */
+ movq %rsp,8(%rdi) /* save rsp */
+ movq %rbp,16(%rdi) /* save rbp */
+ movq %r12,24(%rdi) /* save r12 */
+ movq %r13,32(%rdi) /* save r13 */
+ movq %r14,40(%rdi) /* save r14 */
+ movq %r15,48(%rdi) /* save r15 */
+ movq 0(%rsp),%rdx /* get rta */
+ movq %rdx,56(%rdi) /* save rip */
+ xorl %eax,%eax /* return(0); */
+ ret
+END(setjmp)
+
+ENTRY(longjmp)
+ movq 0(%rdi),%rbx /* restore rbx */
+ movq 8(%rdi),%rsp /* restore rsp */
+ movq 16(%rdi),%rbp /* restore rbp */
+ movq 24(%rdi),%r12 /* restore r12 */
+ movq 32(%rdi),%r13 /* restore r13 */
+ movq 40(%rdi),%r14 /* restore r14 */
+ movq 48(%rdi),%r15 /* restore r15 */
+ movq 56(%rdi),%rdx /* get rta */
+ movq %rdx,0(%rsp) /* put in return frame */
+ xorl %eax,%eax /* return(1); */
+ incl %eax
+ ret
+END(longjmp)
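+
+/*
+ * Usage sketch (the standard idiom; the jmp_buf is the eight-register
+ * save area filled in by setjmp above):
+ *
+ *	if (setjmp(&jb) == 0) {
+ *		// normal path; may later call longjmp(&jb)
+ *	} else {
+ *		// resumed here by longjmp, with return value 1
+ *	}
+ */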
+
+/*
+ * Support for reading MSRs in the safe manner.
+ */
+ENTRY(rdmsr_safe)
+/* int rdmsr_safe(u_int msr, uint64_t *data) */
+ movq PCPU(CURPCB),%r8
+ movq $msr_onfault,PCB_ONFAULT(%r8)
+ movl %edi,%ecx
+ rdmsr /* Read MSR selected by %ecx. Returns
+ high 32 bits in %edx, low in %eax */
+ salq $32,%rdx /* shift high half into place */
+ movl %eax,%eax /* zero-extend %eax -> %rax */
+ orq %rdx,%rax
+ movq %rax,(%rsi)
+ xorq %rax,%rax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+END(rdmsr_safe)
+
+/*
+ * Support for writing MSRs in the safe manner.
+ */
+ENTRY(wrmsr_safe)
+/* int wrmsr_safe(u_int msr, uint64_t data) */
+ movq PCPU(CURPCB),%r8
+ movq $msr_onfault,PCB_ONFAULT(%r8)
+ movl %edi,%ecx
+ movl %esi,%eax
+ sarq $32,%rsi
+ movl %esi,%edx
+ wrmsr /* Write MSR selected by %ecx. Takes
+ high 32 bits in %edx, low in %eax. */
+ xorq %rax,%rax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+END(wrmsr_safe)
+
+/*
+ * MSR operations fault handler
+ */
+ ALIGN_TEXT
+msr_onfault:
+ movq $0,PCB_ONFAULT(%r8)
+ movl $EFAULT,%eax
+ ret
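+
+/*
+ * For reference, the 64-bit composition performed by rdmsr_safe above,
+ * in C terms (illustrative):
+ *
+ *	*data = ((uint64_t)hi << 32) | lo;	// hi = %edx, lo = %eax
+ *
+ * and the inverse split done by wrmsr_safe:
+ *
+ *	lo = data & 0xffffffff;
+ *	hi = data >> 32;
+ */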
diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c
new file mode 100644
index 0000000..2f136ab
--- /dev/null
+++ b/sys/amd64/amd64/sys_machdep.c
@@ -0,0 +1,753 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h> /* for kernel_map */
+#include <vm/vm_extern.h>
+
+#include <machine/frame.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/specialreg.h>
+#include <machine/sysarch.h>
+#include <machine/tss.h>
+#include <machine/vmparam.h>
+
+#include <security/audit/audit.h>
+
+#define MAX_LD 8192
+
+int max_ldt_segment = 1024;
+SYSCTL_INT(_machdep, OID_AUTO, max_ldt_segment, CTLFLAG_RDTUN,
+ &max_ldt_segment, 0,
+ "Maximum number of allowed LDT segments in the single address space");
+
+static void
+max_ldt_segment_init(void *arg __unused)
+{
+
+ TUNABLE_INT_FETCH("machdep.max_ldt_segment", &max_ldt_segment);
+ if (max_ldt_segment <= 0)
+ max_ldt_segment = 1;
+ if (max_ldt_segment > MAX_LD)
+ max_ldt_segment = MAX_LD;
+}
+SYSINIT(maxldt, SI_SUB_VM_CONF, SI_ORDER_ANY, max_ldt_segment_init, NULL);
+
+#ifdef notyet
+#ifdef SMP
+static void set_user_ldt_rv(struct vmspace *vmsp);
+#endif
+#endif
+static void user_ldt_derefl(struct proc_ldt *pldt);
+
+#ifndef _SYS_SYSPROTO_H_
+struct sysarch_args {
+ int op;
+ char *parms;
+};
+#endif
+
+int
+sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space)
+{
+ struct i386_ldt_args *largs, la;
+ struct user_segment_descriptor *lp;
+ int error = 0;
+
+ /*
+ * XXXKIB check that the BSM generation code knows to encode
+ * the op argument.
+ */
+ AUDIT_ARG_CMD(uap->op);
+ if (uap_space == UIO_USERSPACE) {
+ error = copyin(uap->parms, &la, sizeof(struct i386_ldt_args));
+ if (error != 0)
+ return (error);
+ largs = &la;
+ } else
+ largs = (struct i386_ldt_args *)uap->parms;
+
+ switch (uap->op) {
+ case I386_GET_LDT:
+ error = amd64_get_ldt(td, largs);
+ break;
+ case I386_SET_LDT:
+ if (largs->descs != NULL && largs->num > max_ldt_segment)
+ return (EINVAL);
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ if (largs->descs != NULL) {
+ lp = malloc(largs->num *
+     sizeof(struct user_segment_descriptor), M_TEMP, M_WAITOK);
+ error = copyin(largs->descs, lp, largs->num *
+ sizeof(struct user_segment_descriptor));
+ if (error == 0)
+ error = amd64_set_ldt(td, largs, lp);
+ free(lp, M_TEMP);
+ } else {
+ error = amd64_set_ldt(td, largs, NULL);
+ }
+ break;
+ }
+ return (error);
+}
+
+void
+update_gdt_gsbase(struct thread *td, uint32_t base)
+{
+ struct user_segment_descriptor *sd;
+
+ if (td != curthread)
+ return;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ critical_enter();
+ sd = PCPU_GET(gs32p);
+ sd->sd_lobase = base & 0xffffff;
+ sd->sd_hibase = (base >> 24) & 0xff;
+ critical_exit();
+}
+
+void
+update_gdt_fsbase(struct thread *td, uint32_t base)
+{
+ struct user_segment_descriptor *sd;
+
+ if (td != curthread)
+ return;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ critical_enter();
+ sd = PCPU_GET(fs32p);
+ sd->sd_lobase = base & 0xffffff;
+ sd->sd_hibase = (base >> 24) & 0xff;
+ critical_exit();
+}
+
+int
+sysarch(struct thread *td, struct sysarch_args *uap)
+{
+ int error = 0;
+ struct pcb *pcb = curthread->td_pcb;
+ uint32_t i386base;
+ uint64_t a64base;
+ struct i386_ioperm_args iargs;
+ struct i386_get_xfpustate i386xfpu;
+ struct amd64_get_xfpustate a64xfpu;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * When adding new operations, add a new case statement here to
+ * explicitly indicate whether or not the operation is safe to
+ * perform in capability mode.
+ */
+ if (IN_CAPABILITY_MODE(td)) {
+ switch (uap->op) {
+ case I386_GET_LDT:
+ case I386_SET_LDT:
+ case I386_GET_IOPERM:
+ case I386_GET_FSBASE:
+ case I386_SET_FSBASE:
+ case I386_GET_GSBASE:
+ case I386_SET_GSBASE:
+ case I386_GET_XFPUSTATE:
+ case AMD64_GET_FSBASE:
+ case AMD64_SET_FSBASE:
+ case AMD64_GET_GSBASE:
+ case AMD64_SET_GSBASE:
+ case AMD64_GET_XFPUSTATE:
+ break;
+
+ case I386_SET_IOPERM:
+ default:
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_SYSCALL, 0, 0);
+#endif
+ return (ECAPMODE);
+ }
+ }
+#endif
+
+ if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT)
+ return (sysarch_ldt(td, uap, UIO_USERSPACE));
+ /*
+ * XXXKIB check that the BSM generation code knows to encode
+ * the op argument.
+ */
+ AUDIT_ARG_CMD(uap->op);
+ switch (uap->op) {
+ case I386_GET_IOPERM:
+ case I386_SET_IOPERM:
+ if ((error = copyin(uap->parms, &iargs,
+ sizeof(struct i386_ioperm_args))) != 0)
+ return (error);
+ break;
+ case I386_GET_XFPUSTATE:
+ if ((error = copyin(uap->parms, &i386xfpu,
+ sizeof(struct i386_get_xfpustate))) != 0)
+ return (error);
+ a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr;
+ a64xfpu.len = i386xfpu.len;
+ break;
+ case AMD64_GET_XFPUSTATE:
+ if ((error = copyin(uap->parms, &a64xfpu,
+ sizeof(struct amd64_get_xfpustate))) != 0)
+ return (error);
+ break;
+ default:
+ break;
+ }
+
+ switch (uap->op) {
+ case I386_GET_IOPERM:
+ error = amd64_get_ioperm(td, &iargs);
+ if (error == 0)
+ error = copyout(&iargs, uap->parms,
+ sizeof(struct i386_ioperm_args));
+ break;
+ case I386_SET_IOPERM:
+ error = amd64_set_ioperm(td, &iargs);
+ break;
+ case I386_GET_FSBASE:
+ i386base = pcb->pcb_fsbase;
+ error = copyout(&i386base, uap->parms, sizeof(i386base));
+ break;
+ case I386_SET_FSBASE:
+ error = copyin(uap->parms, &i386base, sizeof(i386base));
+ if (!error) {
+ pcb->pcb_fsbase = i386base;
+ td->td_frame->tf_fs = _ufssel;
+ update_gdt_fsbase(td, i386base);
+ }
+ break;
+ case I386_GET_GSBASE:
+ i386base = pcb->pcb_gsbase;
+ error = copyout(&i386base, uap->parms, sizeof(i386base));
+ break;
+ case I386_SET_GSBASE:
+ error = copyin(uap->parms, &i386base, sizeof(i386base));
+ if (!error) {
+ pcb->pcb_gsbase = i386base;
+ td->td_frame->tf_gs = _ugssel;
+ update_gdt_gsbase(td, i386base);
+ }
+ break;
+ case AMD64_GET_FSBASE:
+ error = copyout(&pcb->pcb_fsbase, uap->parms, sizeof(pcb->pcb_fsbase));
+ break;
+
+ case AMD64_SET_FSBASE:
+ error = copyin(uap->parms, &a64base, sizeof(a64base));
+ if (!error) {
+ if (a64base < VM_MAXUSER_ADDRESS) {
+ pcb->pcb_fsbase = a64base;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ td->td_frame->tf_fs = _ufssel;
+ } else
+ error = EINVAL;
+ }
+ break;
+
+ case AMD64_GET_GSBASE:
+ error = copyout(&pcb->pcb_gsbase, uap->parms, sizeof(pcb->pcb_gsbase));
+ break;
+
+ case AMD64_SET_GSBASE:
+ error = copyin(uap->parms, &a64base, sizeof(a64base));
+ if (!error) {
+ if (a64base < VM_MAXUSER_ADDRESS) {
+ pcb->pcb_gsbase = a64base;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ td->td_frame->tf_gs = _ugssel;
+ } else
+ error = EINVAL;
+ }
+ break;
+
+ case I386_GET_XFPUSTATE:
+ case AMD64_GET_XFPUSTATE:
+ if (a64xfpu.len > cpu_max_ext_state_size -
+ sizeof(struct savefpu))
+ return (EINVAL);
+ fpugetregs(td);
+ error = copyout((char *)(get_pcb_user_save_td(td) + 1),
+ a64xfpu.addr, a64xfpu.len);
+ return (error);
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+int
+amd64_set_ioperm(struct thread *td, struct i386_ioperm_args *uap)
+{
+ int i, error;
+ char *iomap;
+ struct amd64tss *tssp;
+ struct system_segment_descriptor *tss_sd;
+ u_long *addr;
+ struct pcb *pcb;
+
+ if ((error = priv_check(td, PRIV_IO)) != 0)
+ return (error);
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
+ if (uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
+ return (EINVAL);
+
+ /*
+ * XXX
+ * While this is restricted to root, we should probably figure out
+ * whether any other driver is using this i/o address, so as not to
+ * cause confusion. This probably requires a global 'usage registry'.
+ */
+ pcb = td->td_pcb;
+ if (pcb->pcb_tssp == NULL) {
+ tssp = (struct amd64tss *)kmem_alloc(kernel_map,
+ ctob(IOPAGES+1));
+ if (tssp == NULL)
+ return (ENOMEM);
+ iomap = (char *)&tssp[1];
+ addr = (u_long *)iomap;
+ for (i = 0; i < (ctob(IOPAGES) + 1) / sizeof(u_long); i++)
+ *addr++ = ~0;
+ critical_enter();
+ /* Takes care of tss_rsp0. */
+ memcpy(tssp, &common_tss[PCPU_GET(cpuid)],
+ sizeof(struct amd64tss));
+ tssp->tss_iobase = sizeof(*tssp);
+ pcb->pcb_tssp = tssp;
+ tss_sd = PCPU_GET(tss);
+ tss_sd->sd_lobase = (u_long)tssp & 0xffffff;
+ tss_sd->sd_hibase = ((u_long)tssp >> 24) & 0xfffffffffful;
+ tss_sd->sd_type = SDT_SYSTSS;
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+ PCPU_SET(tssp, tssp);
+ critical_exit();
+ } else
+ iomap = (char *)&pcb->pcb_tssp[1];
+ for (i = uap->start; i < uap->start + uap->length; i++) {
+ if (uap->enable)
+ iomap[i >> 3] &= ~(1 << (i & 7));
+ else
+ iomap[i >> 3] |= (1 << (i & 7));
+ }
+ return (error);
+}
+
+int
+amd64_get_ioperm(struct thread *td, struct i386_ioperm_args *uap)
+{
+ int i, state;
+ char *iomap;
+
+ if (uap->start >= IOPAGES * PAGE_SIZE * NBBY)
+ return (EINVAL);
+ if (td->td_pcb->pcb_tssp == NULL) {
+ uap->length = 0;
+ goto done;
+ }
+
+ iomap = (char *)&td->td_pcb->pcb_tssp[1];
+
+ i = uap->start;
+ state = (iomap[i >> 3] >> (i & 7)) & 1;
+ uap->enable = !state;
+ uap->length = 1;
+
+ for (i = uap->start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) {
+ if (state != ((iomap[i >> 3] >> (i & 7)) & 1))
+ break;
+ uap->length++;
+ }
+
+done:
+ return (0);
+}
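+
+/*
+ * The I/O permission bitmap convention used above (illustrative): port
+ * i is accessible when its bit is clear,
+ *
+ *	enabled = ((iomap[i >> 3] >> (i & 7)) & 1) == 0;
+ *
+ * so amd64_set_ioperm() clears bits to grant access and sets them to
+ * revoke it, matching the hardware TSS bitmap semantics.
+ */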
+
+/*
+ * Update the GDT entry pointing to the LDT to point to the LDT of the
+ * current process.
+ */
+void
+set_user_ldt(struct mdproc *mdp)
+{
+
+ critical_enter();
+ *PCPU_GET(ldt) = mdp->md_ldt_sd;
+ lldt(GSEL(GUSERLDT_SEL, SEL_KPL));
+ critical_exit();
+}
+
+#ifdef notyet
+#ifdef SMP
+static void
+set_user_ldt_rv(struct vmspace *vmsp)
+{
+ struct thread *td;
+
+ td = curthread;
+ if (vmsp != td->td_proc->p_vmspace)
+ return;
+
+ set_user_ldt(&td->td_proc->p_md);
+}
+#endif
+#endif
+
+struct proc_ldt *
+user_ldt_alloc(struct proc *p, int force)
+{
+ struct proc_ldt *pldt, *new_ldt;
+ struct mdproc *mdp;
+ struct soft_segment_descriptor sldt;
+
+ mtx_assert(&dt_lock, MA_OWNED);
+ mdp = &p->p_md;
+ if (!force && mdp->md_ldt != NULL)
+ return (mdp->md_ldt);
+ mtx_unlock(&dt_lock);
+ new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
+ new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map,
+ max_ldt_segment * sizeof(struct user_segment_descriptor));
+ if (new_ldt->ldt_base == NULL) {
+ free(new_ldt, M_SUBPROC);
+ mtx_lock(&dt_lock);
+ return (NULL);
+ }
+ new_ldt->ldt_refcnt = 1;
+ sldt.ssd_base = (uint64_t)new_ldt->ldt_base;
+ sldt.ssd_limit = max_ldt_segment *
+ sizeof(struct user_segment_descriptor) - 1;
+ sldt.ssd_type = SDT_SYSLDT;
+ sldt.ssd_dpl = SEL_KPL;
+ sldt.ssd_p = 1;
+ sldt.ssd_long = 0;
+ sldt.ssd_def32 = 0;
+ sldt.ssd_gran = 0;
+ mtx_lock(&dt_lock);
+ pldt = mdp->md_ldt;
+ if (pldt != NULL && !force) {
+ kmem_free(kernel_map, (vm_offset_t)new_ldt->ldt_base,
+ max_ldt_segment * sizeof(struct user_segment_descriptor));
+ free(new_ldt, M_SUBPROC);
+ return (pldt);
+ }
+
+ if (pldt != NULL) {
+ bcopy(pldt->ldt_base, new_ldt->ldt_base, max_ldt_segment *
+ sizeof(struct user_segment_descriptor));
+ user_ldt_derefl(pldt);
+ }
+ ssdtosyssd(&sldt, &p->p_md.md_ldt_sd);
+ atomic_store_rel_ptr((volatile uintptr_t *)&mdp->md_ldt,
+ (uintptr_t)new_ldt);
+ if (p == curproc)
+ set_user_ldt(mdp);
+
+ return (mdp->md_ldt);
+}
+
+void
+user_ldt_free(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+ struct mdproc *mdp = &p->p_md;
+ struct proc_ldt *pldt;
+
+ mtx_assert(&dt_lock, MA_OWNED);
+ if ((pldt = mdp->md_ldt) == NULL) {
+ mtx_unlock(&dt_lock);
+ return;
+ }
+
+ mdp->md_ldt = NULL;
+ bzero(&mdp->md_ldt_sd, sizeof(mdp->md_ldt_sd));
+ if (td == curthread)
+ lldt(GSEL(GNULL_SEL, SEL_KPL));
+ user_ldt_deref(pldt);
+}
+
+static void
+user_ldt_derefl(struct proc_ldt *pldt)
+{
+
+ if (--pldt->ldt_refcnt == 0) {
+ kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base,
+ max_ldt_segment * sizeof(struct user_segment_descriptor));
+ free(pldt, M_SUBPROC);
+ }
+}
+
+void
+user_ldt_deref(struct proc_ldt *pldt)
+{
+
+ mtx_assert(&dt_lock, MA_OWNED);
+ user_ldt_derefl(pldt);
+ mtx_unlock(&dt_lock);
+}
+
+/*
+ * Note for the authors of compat layers (linux, etc): copyout() in
+ * the function below is not a problem since it presents data in
+ * arch-specific format (i.e. i386-specific in this case), not in
+ * the OS-specific one.
+ */
+int
+amd64_get_ldt(struct thread *td, struct i386_ldt_args *uap)
+{
+ int error = 0;
+ struct proc_ldt *pldt;
+ int num;
+ struct user_segment_descriptor *lp;
+
+#ifdef DEBUG
+ printf("amd64_get_ldt: start=%d num=%d descs=%p\n",
+ uap->start, uap->num, (void *)uap->descs);
+#endif
+
+ if ((pldt = td->td_proc->p_md.md_ldt) != NULL) {
+ lp = &((struct user_segment_descriptor *)(pldt->ldt_base))
+ [uap->start];
+ num = min(uap->num, max_ldt_segment);
+ } else
+ return (EINVAL);
+
+ if ((uap->start > (unsigned int)max_ldt_segment) ||
+ ((unsigned int)num > (unsigned int)max_ldt_segment) ||
+ ((unsigned int)(uap->start + num) > (unsigned int)max_ldt_segment))
+ return(EINVAL);
+
+ error = copyout(lp, uap->descs, num *
+ sizeof(struct user_segment_descriptor));
+ if (!error)
+ td->td_retval[0] = num;
+
+ return(error);
+}
+
+int
+amd64_set_ldt(struct thread *td, struct i386_ldt_args *uap,
+    struct user_segment_descriptor *descs)
+{
+ int error = 0, i;
+ int largest_ld;
+ struct mdproc *mdp = &td->td_proc->p_md;
+ struct proc_ldt *pldt;
+ struct user_segment_descriptor *dp;
+ struct proc *p;
+
+#ifdef DEBUG
+ printf("amd64_set_ldt: start=%d num=%d descs=%p\n",
+ uap->start, uap->num, (void *)uap->descs);
+#endif
+
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ p = td->td_proc;
+ if (descs == NULL) {
+ /* Free descriptors */
+ if (uap->start == 0 && uap->num == 0)
+ uap->num = max_ldt_segment;
+ if (uap->num == 0)
+ return (EINVAL);
+ if ((pldt = mdp->md_ldt) == NULL ||
+ uap->start >= max_ldt_segment)
+ return (0);
+ largest_ld = uap->start + uap->num;
+ if (largest_ld > max_ldt_segment)
+ largest_ld = max_ldt_segment;
+ i = largest_ld - uap->start;
+ mtx_lock(&dt_lock);
+ bzero(&((struct user_segment_descriptor *)(pldt->ldt_base))
+ [uap->start], sizeof(struct user_segment_descriptor) * i);
+ mtx_unlock(&dt_lock);
+ return (0);
+ }
+
+ if (!(uap->start == LDT_AUTO_ALLOC && uap->num == 1)) {
+ /* verify range of descriptors to modify */
+ largest_ld = uap->start + uap->num;
+ if (uap->start >= max_ldt_segment ||
+ largest_ld > max_ldt_segment)
+ return (EINVAL);
+ }
+
+ /* Check descriptors for access violations */
+ for (i = 0; i < uap->num; i++) {
+ dp = &descs[i];
+
+ switch (dp->sd_type) {
+ case SDT_SYSNULL: /* system null */
+ dp->sd_p = 0;
+ break;
+ case SDT_SYS286TSS:
+ case SDT_SYSLDT:
+ case SDT_SYS286BSY:
+ case SDT_SYS286CGT:
+ case SDT_SYSTASKGT:
+ case SDT_SYS286IGT:
+ case SDT_SYS286TGT:
+ case SDT_SYSNULL2:
+ case SDT_SYSTSS:
+ case SDT_SYSNULL3:
+ case SDT_SYSBSY:
+ case SDT_SYSCGT:
+ case SDT_SYSNULL4:
+ case SDT_SYSIGT:
+ case SDT_SYSTGT:
+ /*
+ * I can't think of any reason to allow a user proc
+ * to create a segment of these types. They are
+ * for OS use only.
+ */
+ return (EACCES);
+ /*NOTREACHED*/
+
+ /* memory segment types */
+ case SDT_MEMEC: /* memory execute only conforming */
+ case SDT_MEMEAC: /* memory execute only accessed conforming */
+ case SDT_MEMERC: /* memory execute read conforming */
+ case SDT_MEMERAC: /* memory execute read accessed conforming */
+ /* Must be "present" if executable and conforming. */
+ if (dp->sd_p == 0)
+ return (EACCES);
+ break;
+ case SDT_MEMRO: /* memory read only */
+ case SDT_MEMROA: /* memory read only accessed */
+ case SDT_MEMRW: /* memory read write */
+ case SDT_MEMRWA: /* memory read write accessed */
+ case SDT_MEMROD: /* memory read only expand dwn limit */
+ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */
+ case SDT_MEMRWD: /* memory read write expand dwn limit */
+		case SDT_MEMRWDA: /* memory read write expand dwn lim accessed */
+ case SDT_MEME: /* memory execute only */
+ case SDT_MEMEA: /* memory execute only accessed */
+ case SDT_MEMER: /* memory execute read */
+ case SDT_MEMERA: /* memory execute read accessed */
+ break;
+ default:
+			return (EINVAL);
+ /*NOTREACHED*/
+ }
+
+ /* Only user (ring-3) descriptors may be present. */
+ if ((dp->sd_p != 0) && (dp->sd_dpl != SEL_UPL))
+ return (EACCES);
+ }
+
+ if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) {
+ /* Allocate a free slot */
+ mtx_lock(&dt_lock);
+ pldt = user_ldt_alloc(p, 0);
+ if (pldt == NULL) {
+ mtx_unlock(&dt_lock);
+ return (ENOMEM);
+ }
+
+ /*
+		 * Start scanning a bit up to leave room for NVidia and
+		 * Wine, which still use the "Blat" method of allocation.
+ */
+ i = 16;
+ dp = &((struct user_segment_descriptor *)(pldt->ldt_base))[i];
+ for (; i < max_ldt_segment; ++i, ++dp) {
+ if (dp->sd_type == SDT_SYSNULL)
+ break;
+ }
+ if (i >= max_ldt_segment) {
+ mtx_unlock(&dt_lock);
+ return (ENOSPC);
+ }
+ uap->start = i;
+ error = amd64_set_ldt_data(td, i, 1, descs);
+ mtx_unlock(&dt_lock);
+ } else {
+ largest_ld = uap->start + uap->num;
+ if (largest_ld > max_ldt_segment)
+ return (EINVAL);
+ mtx_lock(&dt_lock);
+ if (user_ldt_alloc(p, 0) != NULL) {
+ error = amd64_set_ldt_data(td, uap->start, uap->num,
+ descs);
+ }
+ mtx_unlock(&dt_lock);
+ }
+ if (error == 0)
+ td->td_retval[0] = uap->start;
+ return (error);
+}
+
+int
+amd64_set_ldt_data(struct thread *td, int start, int num,
+ struct user_segment_descriptor *descs)
+{
+ struct mdproc *mdp = &td->td_proc->p_md;
+ struct proc_ldt *pldt = mdp->md_ldt;
+
+ mtx_assert(&dt_lock, MA_OWNED);
+
+ /* Fill in range */
+ bcopy(descs,
+ &((struct user_segment_descriptor *)(pldt->ldt_base))[start],
+ num * sizeof(struct user_segment_descriptor));
+ return (0);
+}
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
new file mode 100644
index 0000000..6fcca81
--- /dev/null
+++ b/sys/amd64/amd64/trap.c
@@ -0,0 +1,1006 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * AMD64 Trap and System call handling
+ */
+
+#include "opt_clock.h"
+#include "opt_cpu.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_isa.h"
+#include "opt_kdb.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/ptrace.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/uio.h>
+#include <sys/vmmeter.h>
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DEFINE( , , page_fault, all);
+PMC_SOFT_DEFINE( , , page_fault, read);
+PMC_SOFT_DEFINE( , , page_fault, write);
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+#include <machine/tss.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+/*
+ * This is a hook which is initialised by the dtrace module
+ * to handle traps which might occur during DTrace probe
+ * execution.
+ */
+dtrace_trap_func_t dtrace_trap_func;
+
+dtrace_doubletrap_func_t dtrace_doubletrap_func;
+
+/*
+ * This is a hook which is initialised by the systrace module
+ * when it is loaded. This keeps the DTrace syscall provider
+ * implementation opaque.
+ */
+systrace_probe_func_t systrace_probe_func;
+
+/*
+ * These hooks are necessary for the pid, usdt and fasttrap providers.
+ */
+dtrace_fasttrap_probe_ptr_t dtrace_fasttrap_probe_ptr;
+dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr;
+dtrace_return_probe_ptr_t dtrace_return_probe_ptr;
+#endif
+
+extern void trap(struct trapframe *frame);
+extern void syscall(struct trapframe *frame);
+void dblfault_handler(struct trapframe *frame);
+
+static int trap_pfault(struct trapframe *, int);
+static void trap_fatal(struct trapframe *, vm_offset_t);
+
+#define MAX_TRAP_MSG 33
+static char *trap_msg[] = {
+ "", /* 0 unused */
+ "privileged instruction fault", /* 1 T_PRIVINFLT */
+ "", /* 2 unused */
+ "breakpoint instruction fault", /* 3 T_BPTFLT */
+ "", /* 4 unused */
+ "", /* 5 unused */
+ "arithmetic trap", /* 6 T_ARITHTRAP */
+ "", /* 7 unused */
+ "", /* 8 unused */
+ "general protection fault", /* 9 T_PROTFLT */
+ "trace trap", /* 10 T_TRCTRAP */
+ "", /* 11 unused */
+ "page fault", /* 12 T_PAGEFLT */
+ "", /* 13 unused */
+ "alignment fault", /* 14 T_ALIGNFLT */
+ "", /* 15 unused */
+ "", /* 16 unused */
+ "", /* 17 unused */
+ "integer divide fault", /* 18 T_DIVIDE */
+ "non-maskable interrupt trap", /* 19 T_NMI */
+ "overflow trap", /* 20 T_OFLOW */
+ "FPU bounds check fault", /* 21 T_BOUND */
+ "FPU device not available", /* 22 T_DNA */
+ "double fault", /* 23 T_DOUBLEFLT */
+ "FPU operand fetch fault", /* 24 T_FPOPFLT */
+ "invalid TSS fault", /* 25 T_TSSFLT */
+ "segment not present fault", /* 26 T_SEGNPFLT */
+ "stack fault", /* 27 T_STKFLT */
+ "machine check trap", /* 28 T_MCHK */
+ "SIMD floating-point exception", /* 29 T_XMMFLT */
+ "reserved (unknown) fault", /* 30 T_RESERVED */
+ "", /* 31 unused (reserved) */
+ "DTrace pid return trap", /* 32 T_DTRACE_RET */
+ "DTrace fasttrap probe trap", /* 33 T_DTRACE_PROBE */
+};
+
+#ifdef KDB
+static int kdb_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
+ &kdb_on_nmi, 0, "Go to KDB on NMI");
+TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
+#endif
+static int panic_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
+ &panic_on_nmi, 0, "Panic on NMI");
+TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
+static int prot_fault_translation;
+SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
+ &prot_fault_translation, 0,
+ "Select signal to deliver on protection fault");
+static int uprintf_signal;
+SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
+ &uprintf_signal, 0,
+ "Print debugging information on trap signal to ctty");
+
+/*
+ * Exception, fault, and trap interface to the FreeBSD kernel.
+ * This common code is called from assembly language IDT gate entry
+ * routines that prepare a suitable stack frame, and restore this
+ * frame after the exception has been processed.
+ */
+
+void
+trap(struct trapframe *frame)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int i = 0, ucode = 0, code;
+ u_int type;
+ register_t addr = 0;
+ ksiginfo_t ksi;
+
+ PCPU_INC(cnt.v_trap);
+ type = frame->tf_trapno;
+
+#ifdef SMP
+ /* Handler for NMI IPIs used for stopping CPUs. */
+ if (type == T_NMI) {
+ if (ipi_nmi_handler() == 0)
+ goto out;
+ }
+#endif /* SMP */
+
+#ifdef KDB
+ if (kdb_active) {
+ kdb_reenter();
+ goto out;
+ }
+#endif
+
+ if (type == T_RESERVED) {
+ trap_fatal(frame, 0);
+ goto out;
+ }
+
+#ifdef HWPMC_HOOKS
+ /*
+ * CPU PMCs interrupt using an NMI. If the PMC module is
+ * active, pass the 'rip' value to the PMC module's interrupt
+ * handler. A return value of '1' from the handler means that
+ * the NMI was handled by it and we can return immediately.
+ */
+ if (type == T_NMI && pmc_intr &&
+ (*pmc_intr)(PCPU_GET(cpuid), frame))
+ goto out;
+#endif
+
+ if (type == T_MCHK) {
+ mca_intr();
+ goto out;
+ }
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * A trap can occur while DTrace executes a probe. Before
+ * executing the probe, DTrace blocks re-scheduling and sets
+ * a flag in its per-cpu flags to indicate that it doesn't
+ * want to fault. On returning from the probe, the no-fault
+ * flag is cleared and finally re-scheduling is enabled.
+ *
+ * If the DTrace kernel module has registered a trap handler,
+ * call it and if it returns non-zero, assume that it has
+ * handled the trap and modified the trap frame so that this
+ * function can return normally.
+ */
+ if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
+ type == T_BPTFLT) {
+ struct reg regs;
+
+ fill_frame_regs(frame, &regs);
+ if (type == T_DTRACE_PROBE &&
+ dtrace_fasttrap_probe_ptr != NULL &&
+ dtrace_fasttrap_probe_ptr(&regs) == 0)
+ goto out;
+ else if (type == T_BPTFLT &&
+ dtrace_pid_probe_ptr != NULL &&
+ dtrace_pid_probe_ptr(&regs) == 0)
+ goto out;
+ else if (type == T_DTRACE_RET &&
+ dtrace_return_probe_ptr != NULL &&
+ dtrace_return_probe_ptr(&regs) == 0)
+ goto out;
+ }
+ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
+ goto out;
+#endif
+
+ if ((frame->tf_rflags & PSL_I) == 0) {
+ /*
+ * Buggy application or kernel code has disabled
+ * interrupts and then trapped. Enabling interrupts
+ * now is wrong, but it is better than running with
+ * interrupts disabled until they are accidentally
+ * enabled later.
+ */
+ if (ISPL(frame->tf_cs) == SEL_UPL)
+ uprintf(
+ "pid %ld (%s): trap %d with interrupts disabled\n",
+ (long)curproc->p_pid, curthread->td_name, type);
+ else if (type != T_NMI && type != T_BPTFLT &&
+ type != T_TRCTRAP) {
+ /*
+ * XXX not quite right, since this may be for a
+ * multiple fault in user mode.
+ */
+ printf("kernel trap %d with interrupts disabled\n",
+ type);
+
+ /*
+ * We shouldn't enable interrupts while holding a
+ * spin lock.
+ */
+ if (td->td_md.md_spinlock_count == 0)
+ enable_intr();
+ }
+ }
+
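+	/*
+	 * The error code pushed by the processor for this trap; the entry
+	 * stubs push a dummy value for traps that do not supply one.
+	 */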
+ code = frame->tf_err;
+
+ if (ISPL(frame->tf_cs) == SEL_UPL) {
+ /* user trap */
+
+ td->td_pticks = 0;
+ td->td_frame = frame;
+ addr = frame->tf_rip;
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+
+ switch (type) {
+ case T_PRIVINFLT: /* privileged instruction fault */
+ i = SIGILL;
+ ucode = ILL_PRVOPC;
+ break;
+
+ case T_BPTFLT: /* bpt instruction fault */
+ case T_TRCTRAP: /* trace trap */
+ enable_intr();
+ frame->tf_rflags &= ~PSL_T;
+ i = SIGTRAP;
+ ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
+ break;
+
+ case T_ARITHTRAP: /* arithmetic trap */
+ ucode = fputrap_x87();
+ if (ucode == -1)
+ goto userout;
+ i = SIGFPE;
+ break;
+
+ case T_PROTFLT: /* general protection fault */
+ i = SIGBUS;
+ ucode = BUS_OBJERR;
+ break;
+ case T_STKFLT: /* stack fault */
+ case T_SEGNPFLT: /* segment not present fault */
+ i = SIGBUS;
+ ucode = BUS_ADRERR;
+ break;
+ case T_TSSFLT: /* invalid TSS fault */
+ i = SIGBUS;
+ ucode = BUS_OBJERR;
+ break;
+ case T_DOUBLEFLT: /* double fault */
+ default:
+ i = SIGBUS;
+ ucode = BUS_OBJERR;
+ break;
+
+ case T_PAGEFLT: /* page fault */
+ addr = frame->tf_addr;
+ i = trap_pfault(frame, TRUE);
+ if (i == -1)
+ goto userout;
+ if (i == 0)
+ goto user;
+
+ if (i == SIGSEGV)
+ ucode = SEGV_MAPERR;
+ else {
+ if (prot_fault_translation == 0) {
+ /*
+ * Autodetect.
+ * This check also covers the images
+ * without the ABI-tag ELF note.
+ */
+				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
+				    p->p_osrel >= P_OSREL_SIGSEGV) {
+ i = SIGSEGV;
+ ucode = SEGV_ACCERR;
+ } else {
+ i = SIGBUS;
+ ucode = BUS_PAGE_FAULT;
+ }
+ } else if (prot_fault_translation == 1) {
+ /*
+ * Always compat mode.
+ */
+ i = SIGBUS;
+ ucode = BUS_PAGE_FAULT;
+ } else {
+ /*
+ * Always SIGSEGV mode.
+ */
+ i = SIGSEGV;
+ ucode = SEGV_ACCERR;
+ }
+ }
+ break;
+
+ case T_DIVIDE: /* integer divide fault */
+ ucode = FPE_INTDIV;
+ i = SIGFPE;
+ break;
+
+#ifdef DEV_ISA
+ case T_NMI:
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) {
+#ifdef KDB
+ /*
+ * NMI can be hooked up to a pushbutton
+ * for debugging.
+ */
+ if (kdb_on_nmi) {
+					printf("NMI ... going to debugger\n");
+ kdb_trap(type, 0, frame);
+ }
+#endif /* KDB */
+ goto userout;
+ } else if (panic_on_nmi)
+ panic("NMI indicates hardware failure");
+ break;
+#endif /* DEV_ISA */
+
+ case T_OFLOW: /* integer overflow fault */
+ ucode = FPE_INTOVF;
+ i = SIGFPE;
+ break;
+
+ case T_BOUND: /* bounds check fault */
+ ucode = FPE_FLTSUB;
+ i = SIGFPE;
+ break;
+
+ case T_DNA:
+ /* transparent fault (due to context switch "late") */
+ KASSERT(PCB_USER_FPU(td->td_pcb),
+ ("kernel FPU ctx has leaked"));
+ fpudna();
+ goto userout;
+
+ case T_FPOPFLT: /* FPU operand fetch fault */
+ ucode = ILL_COPROC;
+ i = SIGILL;
+ break;
+
+ case T_XMMFLT: /* SIMD floating-point exception */
+ ucode = fputrap_sse();
+ if (ucode == -1)
+ goto userout;
+ i = SIGFPE;
+ break;
+ }
+ } else {
+ /* kernel trap */
+
+ KASSERT(cold || td->td_ucred != NULL,
+ ("kernel trap doesn't have ucred"));
+ switch (type) {
+ case T_PAGEFLT: /* page fault */
+ (void) trap_pfault(frame, FALSE);
+ goto out;
+
+ case T_DNA:
+ KASSERT(!PCB_USER_FPU(td->td_pcb),
+ ("Unregistered use of FPU in kernel"));
+ fpudna();
+ goto out;
+
+ case T_ARITHTRAP: /* arithmetic trap */
+ case T_XMMFLT: /* SIMD floating-point exception */
+ case T_FPOPFLT: /* FPU operand fetch fault */
+ /*
+			 * XXXKIB for now disable any FPU traps in the kernel;
+			 * handler registration seems to be overkill.
+ */
+ trap_fatal(frame, 0);
+ goto out;
+
+ case T_STKFLT: /* stack fault */
+ break;
+
+ case T_PROTFLT: /* general protection fault */
+ case T_SEGNPFLT: /* segment not present fault */
+ if (td->td_intr_nesting_level != 0)
+ break;
+
+ /*
+ * Invalid segment selectors and out of bounds
+ * %rip's and %rsp's can be set up in user mode.
+ * This causes a fault in kernel mode when the
+ * kernel tries to return to user mode. We want
+ * to get this fault so that we can fix the
+ * problem here and not have to check all the
+ * selectors and pointers when the user changes
+ * them.
+ */
+ if (frame->tf_rip == (long)doreti_iret) {
+ frame->tf_rip = (long)doreti_iret_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_ds) {
+ frame->tf_rip = (long)ds_load_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_es) {
+ frame->tf_rip = (long)es_load_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_fs) {
+ frame->tf_rip = (long)fs_load_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_gs) {
+ frame->tf_rip = (long)gs_load_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_gsbase) {
+ frame->tf_rip = (long)gsbase_load_fault;
+ goto out;
+ }
+ if (frame->tf_rip == (long)ld_fsbase) {
+ frame->tf_rip = (long)fsbase_load_fault;
+ goto out;
+ }
+ if (curpcb->pcb_onfault != NULL) {
+ frame->tf_rip = (long)curpcb->pcb_onfault;
+ goto out;
+ }
+ break;
+
+ case T_TSSFLT:
+ /*
+ * PSL_NT can be set in user mode and isn't cleared
+ * automatically when the kernel is entered. This
+ * causes a TSS fault when the kernel attempts to
+ * `iret' because the TSS link is uninitialized. We
+ * want to get this fault so that we can fix the
+ * problem here and not every time the kernel is
+ * entered.
+ */
+ if (frame->tf_rflags & PSL_NT) {
+ frame->tf_rflags &= ~PSL_NT;
+ goto out;
+ }
+ break;
+
+ case T_TRCTRAP: /* trace trap */
+ /*
+ * Ignore debug register trace traps due to
+ * accesses in the user's address space, which
+ * can happen under several conditions such as
+ * if a user sets a watchpoint on a buffer and
+ * then passes that buffer to a system call.
+ * We still want to get TRCTRAPS for addresses
+ * in kernel space because that is useful when
+ * debugging the kernel.
+ */
+ if (user_dbreg_trap()) {
+ /*
+ * Reset breakpoint bits because the
+				 * processor doesn't clear them itself.
+ */
+ /* XXX check upper bits here */
+ load_dr6(rdr6() & 0xfffffff0);
+ goto out;
+ }
+ /*
+ * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
+ */
+ case T_BPTFLT:
+ /*
+ * If KDB is enabled, let it handle the debugger trap.
+ * Otherwise, debugger traps "can't happen".
+ */
+#ifdef KDB
+ if (kdb_trap(type, 0, frame))
+ goto out;
+#endif
+ break;
+
+#ifdef DEV_ISA
+ case T_NMI:
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) {
+#ifdef KDB
+ /*
+ * NMI can be hooked up to a pushbutton
+ * for debugging.
+ */
+ if (kdb_on_nmi) {
+					printf("NMI ... going to debugger\n");
+ kdb_trap(type, 0, frame);
+ }
+#endif /* KDB */
+ goto out;
+ } else if (panic_on_nmi == 0)
+ goto out;
+ /* FALLTHROUGH */
+#endif /* DEV_ISA */
+ }
+
+ trap_fatal(frame, 0);
+ goto out;
+ }
+
+ /* Translate fault for emulators (e.g. Linux) */
+ if (*p->p_sysent->sv_transtrap)
+ i = (*p->p_sysent->sv_transtrap)(i, type);
+
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = i;
+ ksi.ksi_code = ucode;
+ ksi.ksi_trapno = type;
+ ksi.ksi_addr = (void *)addr;
+ if (uprintf_signal) {
+ uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
+ "addr 0x%lx rip 0x%lx "
+ "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
+ p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
+ frame->tf_rip,
+ fubyte((void *)(frame->tf_rip + 0)),
+ fubyte((void *)(frame->tf_rip + 1)),
+ fubyte((void *)(frame->tf_rip + 2)),
+ fubyte((void *)(frame->tf_rip + 3)),
+ fubyte((void *)(frame->tf_rip + 4)),
+ fubyte((void *)(frame->tf_rip + 5)),
+ fubyte((void *)(frame->tf_rip + 6)),
+ fubyte((void *)(frame->tf_rip + 7)));
+ }
+ trapsignal(td, &ksi);
+
+user:
+ userret(td, frame);
+ KASSERT(PCB_USER_FPU(td->td_pcb),
+ ("Return from trap with kernel FPU ctx leaked"));
+userout:
+out:
+ return;
+}
+
+static int
+trap_pfault(frame, usermode)
+ struct trapframe *frame;
+ int usermode;
+{
+ vm_offset_t va;
+ struct vmspace *vm;
+ vm_map_t map;
+ int rv = 0;
+ vm_prot_t ftype;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ vm_offset_t eva = frame->tf_addr;
+
+ if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
+ /*
+ * Due to both processor errata and lazy TLB invalidation when
+ * access restrictions are removed from virtual pages, memory
+ * accesses that are allowed by the physical mapping layer may
+ * nonetheless cause one spurious page fault per virtual page.
+ * When the thread is executing a "no faulting" section that
+ * is bracketed by vm_fault_{disable,enable}_pagefaults(),
+ * every page fault is treated as a spurious page fault,
+ * unless it accesses the same virtual address as the most
+ * recent page fault within the same "no faulting" section.
+ */
+ if (td->td_md.md_spurflt_addr != eva ||
+ (td->td_pflags & TDP_RESETSPUR) != 0) {
+ /*
+ * Do nothing to the TLB. A stale TLB entry is
+ * flushed automatically by a page fault.
+ */
+ td->td_md.md_spurflt_addr = eva;
+ td->td_pflags &= ~TDP_RESETSPUR;
+ return (0);
+ }
+ } else {
+ /*
+ * If we get a page fault while in a critical section, then
+ * it is most likely a fatal kernel page fault. The kernel
+ * is already going to panic trying to get a sleep lock to
+ * do the VM lookup, so just consider it a fatal trap so the
+ * kernel can print out a useful trap message and even get
+ * to the debugger.
+ *
+ * If we get a page fault while holding a non-sleepable
+ * lock, then it is most likely a fatal kernel page fault.
+ * If WITNESS is enabled, then it's going to whine about
+ * bogus LORs with various VM locks, so just skip to the
+ * fatal trap handling directly.
+ */
+ if (td->td_critnest != 0 ||
+ WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+ "Kernel page fault") != 0) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+ }
+ va = trunc_page(eva);
+ if (va >= VM_MIN_KERNEL_ADDRESS) {
+ /*
+ * Don't allow user-mode faults in kernel address space.
+ */
+ if (usermode)
+ goto nogo;
+
+ map = kernel_map;
+ } else {
+ /*
+ * This is a fault on non-kernel virtual memory. If either
+ * p or p->p_vmspace is NULL, then the fault is fatal.
+ */
+ if (p == NULL || (vm = p->p_vmspace) == NULL)
+ goto nogo;
+
+ map = &vm->vm_map;
+
+ /*
+ * When accessing a usermode address, kernel must be
+ * ready to accept the page fault, and provide a
+ * handling routine. Since accessing the address
+ * without the handler is a bug, do not try to handle
+ * it normally, and panic immediately.
+ */
+ if (!usermode && (td->td_intr_nesting_level != 0 ||
+ curpcb->pcb_onfault == NULL)) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+ }
+
+ /*
+ * PGEX_I is defined only if the execute disable bit capability is
+ * supported and enabled.
+ */
+ if (frame->tf_err & PGEX_W)
+ ftype = VM_PROT_WRITE;
+ else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
+ ftype = VM_PROT_EXECUTE;
+ else
+ ftype = VM_PROT_READ;
+
+ if (map != kernel_map) {
+ /*
+ * Keep swapout from messing with us during this
+ * critical time.
+ */
+ PROC_LOCK(p);
+ ++p->p_lock;
+ PROC_UNLOCK(p);
+
+ /* Fault in the user page: */
+ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
+
+ PROC_LOCK(p);
+ --p->p_lock;
+ PROC_UNLOCK(p);
+ } else {
+ /*
+ * Don't have to worry about process locking or stacks in the
+ * kernel.
+ */
+ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
+ }
+ if (rv == KERN_SUCCESS) {
+#ifdef HWPMC_HOOKS
+ if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
+ PMC_SOFT_CALL_TF( , , page_fault, all, frame);
+ if (ftype == VM_PROT_READ)
+ PMC_SOFT_CALL_TF( , , page_fault, read,
+ frame);
+ else
+ PMC_SOFT_CALL_TF( , , page_fault, write,
+ frame);
+ }
+#endif
+ return (0);
+ }
+nogo:
+ if (!usermode) {
+ if (td->td_intr_nesting_level == 0 &&
+ curpcb->pcb_onfault != NULL) {
+ frame->tf_rip = (long)curpcb->pcb_onfault;
+ return (0);
+ }
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+ return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+}
+
+static void
+trap_fatal(frame, eva)
+ struct trapframe *frame;
+ vm_offset_t eva;
+{
+ int code, ss;
+ u_int type;
+ long esp;
+ struct soft_segment_descriptor softseg;
+ char *msg;
+
+ code = frame->tf_err;
+ type = frame->tf_trapno;
+ sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
+ &softseg);
+
+ if (type <= MAX_TRAP_MSG)
+ msg = trap_msg[type];
+ else
+ msg = "UNKNOWN";
+ printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
+ ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
+#ifdef SMP
+ /* two separate prints in case of a trap on an unmapped page */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
+#endif
+ if (type == T_PAGEFLT) {
+ printf("fault virtual address = 0x%lx\n", eva);
+ printf("fault code = %s %s %s, %s\n",
+ code & PGEX_U ? "user" : "supervisor",
+ code & PGEX_W ? "write" : "read",
+ code & PGEX_I ? "instruction" : "data",
+ code & PGEX_P ? "protection violation" : "page not present");
+ }
+ printf("instruction pointer = 0x%lx:0x%lx\n",
+ frame->tf_cs & 0xffff, frame->tf_rip);
+ if (ISPL(frame->tf_cs) == SEL_UPL) {
+ ss = frame->tf_ss & 0xffff;
+ esp = frame->tf_rsp;
+ } else {
+ ss = GSEL(GDATA_SEL, SEL_KPL);
+ esp = (long)&frame->tf_rsp;
+ }
+ printf("stack pointer = 0x%x:0x%lx\n", ss, esp);
+ printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp);
+ printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n",
+ softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
+ printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
+ softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
+ softseg.ssd_gran);
+ printf("processor eflags = ");
+ if (frame->tf_rflags & PSL_T)
+ printf("trace trap, ");
+ if (frame->tf_rflags & PSL_I)
+ printf("interrupt enabled, ");
+ if (frame->tf_rflags & PSL_NT)
+ printf("nested task, ");
+ if (frame->tf_rflags & PSL_RF)
+ printf("resume, ");
+ printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
+ printf("current process = ");
+ if (curproc) {
+ printf("%lu (%s)\n",
+ (u_long)curproc->p_pid, curthread->td_name ?
+ curthread->td_name : "");
+ } else {
+ printf("Idle\n");
+ }
+
+#ifdef KDB
+ if (debugger_on_panic || kdb_active)
+ if (kdb_trap(type, 0, frame))
+ return;
+#endif
+ printf("trap number = %d\n", type);
+ if (type <= MAX_TRAP_MSG)
+ panic("%s", trap_msg[type]);
+ else
+ panic("unknown/reserved trap");
+}
+
+/*
+ * Double fault handler. Called when a fault occurs while writing
+ * a frame for a trap/exception onto the stack. This usually occurs
+ * when the stack overflows (such is the case with infinite recursion,
+ * for example).
+ */
+void
+dblfault_handler(struct trapframe *frame)
+{
+#ifdef KDTRACE_HOOKS
+ if (dtrace_doubletrap_func != NULL)
+ (*dtrace_doubletrap_func)();
+#endif
+ printf("\nFatal double fault\n");
+ printf("rip = 0x%lx\n", frame->tf_rip);
+ printf("rsp = 0x%lx\n", frame->tf_rsp);
+ printf("rbp = 0x%lx\n", frame->tf_rbp);
+#ifdef SMP
+ /* two separate prints in case of a trap on an unmapped page */
+ printf("cpuid = %d; ", PCPU_GET(cpuid));
+ printf("apic id = %02x\n", PCPU_GET(apic_id));
+#endif
+ panic("double fault");
+}
+
+int
+cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
+{
+ struct proc *p;
+ struct trapframe *frame;
+ register_t *argp;
+ caddr_t params;
+ int reg, regcnt, error;
+
+ p = td->td_proc;
+ frame = td->td_frame;
+ reg = 0;
+ regcnt = 6;
+
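+	/*
+	 * On amd64 the first six system call arguments arrive in registers
+	 * (%rdi, %rsi, %rdx, %r10, %r8 and %r9); any further arguments
+	 * spill onto the user stack just above the return address.
+	 */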
+ params = (caddr_t)frame->tf_rsp + sizeof(register_t);
+ sa->code = frame->tf_rax;
+
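+	/*
+	 * For the indirect syscall(2) and __syscall(2) entry points the
+	 * actual syscall number is in the first argument register, so
+	 * shift the remaining register arguments down by one.
+	 */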
+ if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
+ sa->code = frame->tf_rdi;
+ reg++;
+ regcnt--;
+ }
+ if (p->p_sysent->sv_mask)
+ sa->code &= p->p_sysent->sv_mask;
+
+ if (sa->code >= p->p_sysent->sv_size)
+ sa->callp = &p->p_sysent->sv_table[0];
+ else
+ sa->callp = &p->p_sysent->sv_table[sa->code];
+
+ sa->narg = sa->callp->sy_narg;
+ KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
+ ("Too many syscall arguments!"));
+ error = 0;
+ argp = &frame->tf_rdi;
+ argp += reg;
+ bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
+ if (sa->narg > regcnt) {
+ KASSERT(params != NULL, ("copyin args with no params!"));
+ error = copyin(params, &sa->args[regcnt],
+ (sa->narg - regcnt) * sizeof(sa->args[0]));
+ }
+
+ if (error == 0) {
+ td->td_retval[0] = 0;
+ td->td_retval[1] = frame->tf_rdx;
+ }
+
+ return (error);
+}
+
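+/*
+ * Pull in the MI syscallenter()/syscallret() implementations so that
+ * they can be inlined into amd64_syscall() below.
+ */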
+#include "../../kern/subr_syscall.c"
+
+/*
+ * System call handler for native binaries. The trap frame is already
+ * set up by the assembler trampoline and a pointer to it is saved in
+ * td_frame.
+ */
+void
+amd64_syscall(struct thread *td, int traced)
+{
+ struct syscall_args sa;
+ int error;
+ ksiginfo_t ksi;
+
+#ifdef DIAGNOSTIC
+ if (ISPL(td->td_frame->tf_cs) != SEL_UPL) {
+ panic("syscall");
+ /* NOT REACHED */
+ }
+#endif
+ error = syscallenter(td, &sa);
+
+ /*
+ * Traced syscall.
+ */
+ if (__predict_false(traced)) {
+ td->td_frame->tf_rflags &= ~PSL_T;
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGTRAP;
+ ksi.ksi_code = TRAP_TRACE;
+ ksi.ksi_addr = (void *)td->td_frame->tf_rip;
+ trapsignal(td, &ksi);
+ }
+
+ KASSERT(PCB_USER_FPU(td->td_pcb),
+	    ("System call %s returning with kernel FPU ctx leaked",
+ syscallname(td->td_proc, sa.code)));
+ KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
+ ("System call %s returning with mangled pcb_save",
+ syscallname(td->td_proc, sa.code)));
+
+ syscallret(td, error, &sa);
+
+ /*
+ * If the user-supplied value of %rip is not a canonical
+ * address, then some CPUs will trigger a ring 0 #GP during
+ * the sysret instruction. However, the fault handler would
+ * execute in ring 0 with the user's %gs and %rsp which would
+ * not be safe. Instead, use the full return path which
+ * catches the problem safely.
+ */
+ if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+}
diff --git a/sys/amd64/amd64/uio_machdep.c b/sys/amd64/amd64/uio_machdep.c
new file mode 100644
index 0000000..2d24c7c
--- /dev/null
+++ b/sys/amd64/amd64/uio_machdep.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2004 Alan L. Cox <alc@cs.rice.edu>
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/vmparam.h>
+
+/*
+ * Implement uiomove(9) from physical memory using the direct map to
+ * avoid the creation and destruction of ephemeral mappings.
+ */
+int
+uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
+{
+ struct thread *td = curthread;
+ struct iovec *iov;
+ void *cp;
+ vm_offset_t page_offset;
+ size_t cnt;
+ int error = 0;
+ int save = 0;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomove_fromphys: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ ("uiomove_fromphys proc"));
+ save = td->td_pflags & TDP_DEADLKTREAT;
+ td->td_pflags |= TDP_DEADLKTREAT;
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
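+		/* Clip the copy so that it does not cross a page boundary. */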
+ page_offset = offset & PAGE_MASK;
+ cnt = min(cnt, PAGE_SIZE - page_offset);
+ cp = (char *)PHYS_TO_DMAP(ma[offset >> PAGE_SHIFT]->phys_addr) +
+ page_offset;
+ switch (uio->uio_segflg) {
+ case UIO_USERSPACE:
+ maybe_yield();
+ if (uio->uio_rw == UIO_READ)
+ error = copyout(cp, iov->iov_base, cnt);
+ else
+ error = copyin(iov->iov_base, cp, cnt);
+ if (error)
+ goto out;
+ break;
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ offset += cnt;
+ n -= cnt;
+ }
+out:
+ if (save == 0)
+ td->td_pflags &= ~TDP_DEADLKTREAT;
+ return (error);
+}
diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c
new file mode 100644
index 0000000..c4ca677
--- /dev/null
+++ b/sys/amd64/amd64/uma_machdep.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+
+void *
+uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+{
+ vm_page_t m;
+ vm_paddr_t pa;
+ void *va;
+ int pflags;
+
+ *flags = UMA_SLAB_PRIV;
+ pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
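+	/* Retry until the allocation succeeds; only M_NOWAIT callers fail. */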
+ for (;;) {
+ m = vm_page_alloc(NULL, 0, pflags);
+ if (m == NULL) {
+ if (wait & M_NOWAIT)
+ return (NULL);
+ else
+ VM_WAIT;
+ } else
+ break;
+ }
+ pa = m->phys_addr;
+ if ((wait & M_NODUMP) == 0)
+ dump_add_page(pa);
+ va = (void *)PHYS_TO_DMAP(pa);
+ if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
+ pagezero(va);
+ return (va);
+}
+
+void
+uma_small_free(void *mem, int size, u_int8_t flags)
+{
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ pa = DMAP_TO_PHYS((vm_offset_t)mem);
+ dump_drop_page(pa);
+ m = PHYS_TO_VM_PAGE(pa);
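+	/* Unwire and free the page, updating the global wired-page count. */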
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+}
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
new file mode 100644
index 0000000..9883715
--- /dev/null
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -0,0 +1,750 @@
+/*-
+ * Copyright (c) 1982, 1986 The Regents of the University of California.
+ * Copyright (c) 1989, 1990 William Jolitz
+ * Copyright (c) 1994 John Dyson
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
+ * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_isa.h"
+#include "opt_cpu.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <machine/tss.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+
+#include <x86/isa/isa.h>
+
+static void cpu_reset_real(void);
+#ifdef SMP
+static void cpu_reset_proxy(void);
+static u_int cpu_reset_proxyid;
+static volatile u_int cpu_reset_proxy_active;
+#endif
+
+CTASSERT((struct thread **)OFFSETOF_CURTHREAD ==
+ &((struct pcpu *)NULL)->pc_curthread);
+CTASSERT((struct pcb **)OFFSETOF_CURPCB == &((struct pcpu *)NULL)->pc_curpcb);
+
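+/*
+ * The per-thread FPU/XSAVE save area lives at the very top of the kernel
+ * stack, with the pcb placed immediately below it; see get_pcb_td().
+ */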
+struct savefpu *
+get_pcb_user_save_td(struct thread *td)
+{
+ vm_offset_t p;
+
+ p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
+ cpu_max_ext_state_size;
+ KASSERT((p % 64) == 0, ("Unaligned pcb_user_save area"));
+ return ((struct savefpu *)p);
+}
+
+struct savefpu *
+get_pcb_user_save_pcb(struct pcb *pcb)
+{
+ vm_offset_t p;
+
+ p = (vm_offset_t)(pcb + 1);
+ return ((struct savefpu *)p);
+}
+
+struct pcb *
+get_pcb_td(struct thread *td)
+{
+ vm_offset_t p;
+
+ p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
+ cpu_max_ext_state_size - sizeof(struct pcb);
+ return ((struct pcb *)p);
+}
+
+void *
+alloc_fpusave(int flags)
+{
+ struct pcb *res;
+ struct savefpu_ymm *sf;
+
+ res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
+ if (use_xsave) {
+ sf = (struct savefpu_ymm *)res;
+ bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
+ sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
+ }
+ return (res);
+}
+
+/*
+ * Finish a fork operation, with process p2 nearly set up.
+ * Copy and update the pcb, and set up the stack so that the child is
+ * ready to run and return to user mode.
+ */
+void
+cpu_fork(td1, p2, td2, flags)
+ register struct thread *td1;
+ register struct proc *p2;
+ struct thread *td2;
+ int flags;
+{
+ register struct proc *p1;
+ struct pcb *pcb2;
+ struct mdproc *mdp1, *mdp2;
+ struct proc_ldt *pldt;
+ pmap_t pmap2;
+
+ p1 = td1->td_proc;
+ if ((flags & RFPROC) == 0) {
+ if ((flags & RFMEM) == 0) {
+ /* unshare user LDT */
+ mdp1 = &p1->p_md;
+ mtx_lock(&dt_lock);
+ if ((pldt = mdp1->md_ldt) != NULL &&
+ pldt->ldt_refcnt > 1 &&
+ user_ldt_alloc(p1, 1) == NULL)
+ panic("could not copy LDT");
+ mtx_unlock(&dt_lock);
+ }
+ return;
+ }
+
+ /* Ensure that td1's pcb is up to date. */
+ fpuexit(td1);
+
+ /* Point the pcb to the top of the stack */
+ pcb2 = get_pcb_td(td2);
+ td2->td_pcb = pcb2;
+
+ /* Copy td1's pcb */
+ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
+
+ /* Properly initialize pcb_save */
+ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
+ bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
+ cpu_max_ext_state_size);
+
+ /* Point mdproc and then copy over td1's contents */
+ mdp2 = &p2->p_md;
+ bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
+
+ /*
+ * Create a new fresh stack for the new process.
+ * Copy the trap frame for the return to user mode as if from a
+ * syscall. This copies most of the user mode register values.
+ */
+ td2->td_frame = (struct trapframe *)td2->td_pcb - 1;
+ bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
+
+ td2->td_frame->tf_rax = 0; /* Child returns zero */
+ td2->td_frame->tf_rflags &= ~PSL_C; /* success */
+ td2->td_frame->tf_rdx = 1;
+
+ /*
+ * If the parent process has the trap bit set (i.e. a debugger had
+ * single stepped the process to the system call), we need to clear
+ * the trap flag from the new frame unless the debugger had set PF_FORK
+ * on the parent. Otherwise, the child will receive a (likely
+ * unexpected) SIGTRAP when it executes the first instruction after
+ * returning to userland.
+ */
+ if ((p1->p_pfsflags & PF_FORK) == 0)
+ td2->td_frame->tf_rflags &= ~PSL_T;
+
+ /*
+ * Set registers for trampoline to user mode. Leave space for the
+ * return address on stack. These are the kernel mode register values.
+ */
+ pmap2 = vmspace_pmap(p2->p_vmspace);
+ pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+ pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */
+ pcb2->pcb_rbp = 0;
+ pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
+ pcb2->pcb_rbx = (register_t)td2; /* fork_trampoline argument */
+ pcb2->pcb_rip = (register_t)fork_trampoline;
+ /*-
+ * pcb2->pcb_dr*: cloned above.
+ * pcb2->pcb_savefpu: cloned above.
+ * pcb2->pcb_flags: cloned above.
+ * pcb2->pcb_onfault: cloned above (always NULL here?).
+ * pcb2->pcb_[fg]sbase: cloned above
+ */
+
+ /* Setup to release spin count in fork_exit(). */
+ td2->td_md.md_spinlock_count = 1;
+ td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+
+	/* As on i386, do not copy the io permission bitmap. */
+ pcb2->pcb_tssp = NULL;
+
+ /* New segment registers. */
+ set_pcb_flags(pcb2, PCB_FULL_IRET);
+
+ /* Copy the LDT, if necessary. */
+ mdp1 = &td1->td_proc->p_md;
+ mdp2 = &p2->p_md;
+ mtx_lock(&dt_lock);
+ if (mdp1->md_ldt != NULL) {
+ if (flags & RFMEM) {
+ mdp1->md_ldt->ldt_refcnt++;
+ mdp2->md_ldt = mdp1->md_ldt;
+			bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd,
+			    sizeof(struct system_segment_descriptor));
+ } else {
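+			/*
+			 * md_ldt was copied from the parent above; clear it
+			 * so that user_ldt_alloc() allocates a fresh LDT
+			 * instead of returning the parent's.
+			 */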
+ mdp2->md_ldt = NULL;
+ mdp2->md_ldt = user_ldt_alloc(p2, 0);
+ if (mdp2->md_ldt == NULL)
+ panic("could not copy LDT");
+ amd64_set_ldt_data(td2, 0, max_ldt_segment,
+ (struct user_segment_descriptor *)
+ mdp1->md_ldt->ldt_base);
+ }
+ } else
+ mdp2->md_ldt = NULL;
+ mtx_unlock(&dt_lock);
+
+ /*
+ * Now, cpu_switch() can schedule the new process.
+ * pcb_rsp is loaded pointing to the cpu_switch() stack frame
+ * containing the return address when exiting cpu_switch.
+ * This will normally be to fork_trampoline(), which will have
+	 * %rbx loaded with the new thread's pointer.  fork_trampoline()
+ * will set up a stack to call fork_return(p, frame); to complete
+ * the return to user-mode.
+ */
+}
+
+/*
+ * Intercept the return address from a freshly forked process that has NOT
+ * been scheduled yet.
+ *
+ * This is needed to make kernel threads stay in kernel mode.
+ */
+void
+cpu_set_fork_handler(td, func, arg)
+ struct thread *td;
+ void (*func)(void *);
+ void *arg;
+{
+ /*
+ * Note that the trap frame follows the args, so the function
+ * is really called like this: func(arg, frame);
+ */
+ td->td_pcb->pcb_r12 = (long) func; /* function */
+ td->td_pcb->pcb_rbx = (long) arg; /* first arg */
+}
+
+void
+cpu_exit(struct thread *td)
+{
+
+ /*
+ * If this process has a custom LDT, release it.
+ */
+ mtx_lock(&dt_lock);
+ if (td->td_proc->p_md.md_ldt != 0)
+ user_ldt_free(td);
+ else
+ mtx_unlock(&dt_lock);
+}
+
+void
+cpu_thread_exit(struct thread *td)
+{
+ struct pcb *pcb;
+
+ critical_enter();
+ if (td == PCPU_GET(fpcurthread))
+ fpudrop();
+ critical_exit();
+
+ pcb = td->td_pcb;
+
+ /* Disable any hardware breakpoints. */
+ if (pcb->pcb_flags & PCB_DBREGS) {
+ reset_dbregs();
+ clear_pcb_flags(pcb, PCB_DBREGS);
+ }
+}
+
+void
+cpu_thread_clean(struct thread *td)
+{
+ struct pcb *pcb;
+
+ pcb = td->td_pcb;
+
+ /*
+ * Clean TSS/iomap
+ */
+ if (pcb->pcb_tssp != NULL) {
+ kmem_free(kernel_map, (vm_offset_t)pcb->pcb_tssp,
+ ctob(IOPAGES + 1));
+ pcb->pcb_tssp = NULL;
+ }
+}
+
+void
+cpu_thread_swapin(struct thread *td)
+{
+}
+
+void
+cpu_thread_swapout(struct thread *td)
+{
+}
+
+void
+cpu_thread_alloc(struct thread *td)
+{
+ struct pcb *pcb;
+ struct xstate_hdr *xhdr;
+
+ td->td_pcb = pcb = get_pcb_td(td);
+ td->td_frame = (struct trapframe *)pcb - 1;
+ pcb->pcb_save = get_pcb_user_save_pcb(pcb);
+ if (use_xsave) {
+ xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
+ bzero(xhdr, sizeof(*xhdr));
+ xhdr->xstate_bv = xsave_mask;
+ }
+}
+
+void
+cpu_thread_free(struct thread *td)
+{
+
+ cpu_thread_clean(td);
+}
+
+void
+cpu_set_syscall_retval(struct thread *td, int error)
+{
+
+ switch (error) {
+ case 0:
+ td->td_frame->tf_rax = td->td_retval[0];
+ td->td_frame->tf_rdx = td->td_retval[1];
+ td->td_frame->tf_rflags &= ~PSL_C;
+ break;
+
+ case ERESTART:
+ /*
+		 * Reconstruct pc; we know that 'syscall' is 2 bytes,
+ * lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
+ * We saved this in tf_err.
+ * %r10 (which was holding the value of %rcx) is restored
+ * for the next iteration.
+ * %r10 restore is only required for freebsd/amd64 processes,
+ * but shall be innocent for any ia32 ABI.
+ */
+ td->td_frame->tf_rip -= td->td_frame->tf_err;
+ td->td_frame->tf_r10 = td->td_frame->tf_rcx;
+ break;
+
+ case EJUSTRETURN:
+ break;
+
+ default:
+ if (td->td_proc->p_sysent->sv_errsize) {
+ if (error >= td->td_proc->p_sysent->sv_errsize)
+ error = -1; /* XXX */
+ else
+ error = td->td_proc->p_sysent->sv_errtbl[error];
+ }
+ td->td_frame->tf_rax = error;
+ td->td_frame->tf_rflags |= PSL_C;
+ break;
+ }
+}
+
+/*
+ * Initialize machine state (pcb and trap frame) for a new thread about to
+ * upcall.  Put enough state in the new thread's PCB to get it to go back to
+ * userret(), where we can intercept it again to set the return (upcall)
+ * address and stack, along with those from upcalls that come from other
+ * sources, such as those generated in thread_userret() itself.
+ */
+void
+cpu_set_upcall(struct thread *td, struct thread *td0)
+{
+ struct pcb *pcb2;
+
+ /* Point the pcb to the top of the stack. */
+ pcb2 = td->td_pcb;
+
+ /*
+ * Copy the upcall pcb. This loads kernel regs.
+ * Those not loaded individually below get their default
+ * values here.
+ */
+ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
+ clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
+ bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
+ cpu_max_ext_state_size);
+ set_pcb_flags(pcb2, PCB_FULL_IRET);
+
+ /*
+ * Create a new fresh stack for the new thread.
+ */
+ bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
+
+	/*
+	 * If the current thread has the trap bit set (i.e. a debugger had
+ * single stepped the process to the system call), we need to clear
+ * the trap flag from the new frame. Otherwise, the new thread will
+ * receive a (likely unexpected) SIGTRAP when it executes the first
+ * instruction after returning to userland.
+ */
+ td->td_frame->tf_rflags &= ~PSL_T;
+
+ /*
+ * Set registers for trampoline to user mode. Leave space for the
+ * return address on stack. These are the kernel mode register values.
+ */
+ pcb2->pcb_r12 = (register_t)fork_return; /* trampoline arg */
+ pcb2->pcb_rbp = 0;
+ pcb2->pcb_rsp = (register_t)td->td_frame - sizeof(void *); /* trampoline arg */
+ pcb2->pcb_rbx = (register_t)td; /* trampoline arg */
+ pcb2->pcb_rip = (register_t)fork_trampoline;
+ /*
+ * If we didn't copy the pcb, we'd need to do the following registers:
+ * pcb2->pcb_cr3: cloned above.
+ * pcb2->pcb_dr*: cloned above.
+ * pcb2->pcb_savefpu: cloned above.
+ * pcb2->pcb_onfault: cloned above (always NULL here?).
+ * pcb2->pcb_[fg]sbase: cloned above
+ */
+
+ /* Setup to release spin count in fork_exit(). */
+ td->td_md.md_spinlock_count = 1;
+ td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+}
+
+/*
+ * Set the machine state for performing an upcall that has to
+ * be done in thread_userret() so that those upcalls generated
+ * in thread_userret() itself can be done as well.
+ */
+void
+cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg,
+ stack_t *stack)
+{
+
+ /*
+ * Do any extra cleaning that needs to be done.
+ * The thread may have optional components
+ * that are not present in a fresh thread.
+ * This may be a recycled thread so make it look
+ * as though it's newly allocated.
+ */
+ cpu_thread_clean(td);
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ /*
+ * Set the trap frame to point at the beginning of the uts
+ * function.
+ */
+ td->td_frame->tf_rbp = 0;
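+		/*
+		 * Align the 32-bit stack to 16 bytes and leave room for a
+		 * fake return address, with the argument stored just above.
+		 */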
+ td->td_frame->tf_rsp =
+ (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
+ td->td_frame->tf_rip = (uintptr_t)entry;
+
+ /*
+ * Pass the address of the mailbox for this kse to the uts
+ * function as a parameter on the stack.
+ */
+ suword32((void *)(td->td_frame->tf_rsp + sizeof(int32_t)),
+ (uint32_t)(uintptr_t)arg);
+
+ return;
+ }
+#endif
+
+ /*
+ * Set the trap frame to point at the beginning of the uts
+ * function.
+ */
+ td->td_frame->tf_rbp = 0;
+ td->td_frame->tf_rsp =
+ ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f;
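+	/*
+	 * Reserve space for a fake return address so the entry point sees
+	 * the stack alignment the ABI mandates after a call instruction.
+	 */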
+ td->td_frame->tf_rsp -= 8;
+ td->td_frame->tf_rip = (register_t)entry;
+ td->td_frame->tf_ds = _udatasel;
+ td->td_frame->tf_es = _udatasel;
+ td->td_frame->tf_fs = _ufssel;
+ td->td_frame->tf_gs = _ugssel;
+ td->td_frame->tf_flags = TF_HASSEGS;
+
+ /*
+ * Pass the address of the mailbox for this kse to the uts
+ * function as a parameter on the stack.
+ */
+ td->td_frame->tf_rdi = (register_t)arg;
+}
+
+int
+cpu_set_user_tls(struct thread *td, void *tls_base)
+{
+ struct pcb *pcb;
+
+ if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+
+ pcb = td->td_pcb;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
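+	/* 32-bit processes address TLS via %gs; 64-bit processes use %fs. */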
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ pcb->pcb_gsbase = (register_t)tls_base;
+ return (0);
+ }
+#endif
+ pcb->pcb_fsbase = (register_t)tls_base;
+ return (0);
+}
+
+#ifdef SMP
+static void
+cpu_reset_proxy(void)
+{
+ cpuset_t tcrp;
+
+ cpu_reset_proxy_active = 1;
+ while (cpu_reset_proxy_active == 1)
+ ia32_pause(); /* Wait for other cpu to see that we've started */
+
+ CPU_SETOF(cpu_reset_proxyid, &tcrp);
+ stop_cpus(tcrp);
+ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
+ DELAY(1000000);
+ cpu_reset_real();
+}
+#endif
+
+void
+cpu_reset(void)
+{
+#ifdef SMP
+ cpuset_t map;
+ u_int cnt;
+
+ if (smp_active) {
+ map = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &map);
+ CPU_NAND(&map, &stopped_cpus);
+ if (!CPU_EMPTY(&map)) {
+ printf("cpu_reset: Stopping other CPUs\n");
+ stop_cpus(map);
+ }
+
+ if (PCPU_GET(cpuid) != 0) {
+ cpu_reset_proxyid = PCPU_GET(cpuid);
+ cpustop_restartfunc = cpu_reset_proxy;
+ cpu_reset_proxy_active = 0;
+ printf("cpu_reset: Restarting BSP\n");
+
+ /* Restart CPU #0. */
+ CPU_SETOF(0, &started_cpus);
+ wmb();
+
+ cnt = 0;
+ while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+ ia32_pause();
+ cnt++; /* Wait for BSP to announce restart */
+ }
+ if (cpu_reset_proxy_active == 0)
+ printf("cpu_reset: Failed to restart BSP\n");
+ enable_intr();
+ cpu_reset_proxy_active = 2;
+
+ while (1)
+ ia32_pause();
+ /* NOTREACHED */
+ }
+
+ DELAY(1000000);
+ }
+#endif
+ cpu_reset_real();
+ /* NOTREACHED */
+}
+
+static void
+cpu_reset_real()
+{
+ struct region_descriptor null_idt;
+ int b;
+
+ disable_intr();
+
+ /*
+ * Attempt to do a CPU reset via the keyboard controller,
+ * do not turn off GateA20, as any machine that fails
+ * to do the reset here would then end up in no man's land.
+ */
+ outb(IO_KBD + 4, 0xFE);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+
+ /*
+ * Attempt to force a reset via the Reset Control register at
+ * I/O port 0xcf9. Bit 2 forces a system reset when it
+ * transitions from 0 to 1. Bit 1 selects the type of reset
+ * to attempt: 0 selects a "soft" reset, and 1 selects a
+ * "hard" reset. We try a "hard" reset. The first write sets
+ * bit 1 to select a "hard" reset and clears bit 2. The
+ * second write forces a 0 -> 1 transition in bit 2 to trigger
+ * a reset.
+ */
+ outb(0xcf9, 0x2);
+ outb(0xcf9, 0x6);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+
+ /*
+ * Attempt to force a reset via the Fast A20 and Init register
+ * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
+ * Bit 0 asserts INIT# when set to 1. We are careful to only
+ * preserve bit 1 while setting bit 0. We also must clear bit
+ * 0 before setting it if it isn't already clear.
+ */
+ b = inb(0x92);
+ if (b != 0xff) {
+ if ((b & 0x1) != 0)
+ outb(0x92, b & 0xfe);
+ outb(0x92, b | 0x1);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+ }
+
+ printf("No known reset method worked, attempting CPU shutdown\n");
+ DELAY(1000000); /* wait 1 sec for printf to complete */
+
+ /* Wipe the IDT. */
+ null_idt.rd_limit = 0;
+ null_idt.rd_base = 0;
+ lidt(&null_idt);
+
+ /* "good night, sweet prince .... <THUNK!>" */
+ breakpoint();
+
+ /* NOTREACHED */
+	while (1);
+}
+
+/*
+ * Allocate an sf_buf for the given vm_page. On this machine, however, there
+ * is no sf_buf object. Instead, an opaque pointer to the given vm_page is
+ * returned.
+ */
+struct sf_buf *
+sf_buf_alloc(struct vm_page *m, int pri)
+{
+
+ return ((struct sf_buf *)m);
+}
+
+/*
+ * Free the sf_buf. In fact, do nothing because there are no resources
+ * associated with the sf_buf.
+ */
+void
+sf_buf_free(struct sf_buf *sf)
+{
+}
+
+/*
+ * Software interrupt handler for queued VM system processing.
+ */
+void
+swi_vm(void *dummy)
+{
+ if (busdma_swi_pending != 0)
+ busdma_swi();
+}
+
+/*
+ * Tell whether this address is in some physical memory region.
+ * Currently used by the kernel coredump code in order to avoid
+ * dumping the ``ISA memory hole'' which could cause indefinite hangs,
+ * or other unpredictable behaviour.
+ */
+
+int
+is_physical_memory(vm_paddr_t addr)
+{
+
+#ifdef DEV_ISA
+ /* The ISA ``memory hole''. */
+ if (addr >= 0xa0000 && addr < 0x100000)
+ return 0;
+#endif
+
+ /*
+ * stuff other tests for known memory-mapped devices (PCI?)
+ * here
+ */
+
+ return 1;
+}
diff --git a/sys/amd64/compile/.cvsignore b/sys/amd64/compile/.cvsignore
new file mode 100644
index 0000000..232298e
--- /dev/null
+++ b/sys/amd64/compile/.cvsignore
@@ -0,0 +1 @@
+[A-Za-z0-9]*
diff --git a/sys/amd64/conf/.cvsignore b/sys/amd64/conf/.cvsignore
new file mode 100644
index 0000000..232298e
--- /dev/null
+++ b/sys/amd64/conf/.cvsignore
@@ -0,0 +1 @@
+[A-Za-z0-9]*
diff --git a/sys/amd64/conf/DEFAULTS b/sys/amd64/conf/DEFAULTS
new file mode 100644
index 0000000..2c221cb
--- /dev/null
+++ b/sys/amd64/conf/DEFAULTS
@@ -0,0 +1,24 @@
+#
+# DEFAULTS -- Default kernel configuration file for FreeBSD/amd64
+#
+# $FreeBSD$
+
+machine amd64
+
+# Bus support.
+device isa
+
+# Pseudo devices.
+device mem # Memory and kernel memory devices
+device io # I/O device
+
+# UART chips on this platform
+device uart_ns8250
+
+# Default partitioning schemes
+options GEOM_PART_BSD
+options GEOM_PART_EBR
+options GEOM_PART_EBR_COMPAT
+options GEOM_PART_MBR
+
+options NEW_PCIB
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC
new file mode 100644
index 0000000..1043b2b
--- /dev/null
+++ b/sys/amd64/conf/GENERIC
@@ -0,0 +1,340 @@
+#
+# GENERIC -- Generic kernel configuration file for FreeBSD/amd64
+#
+# For more information on this file, please read the config(5) manual page,
+# and/or the handbook section on Kernel Configuration Files:
+#
+# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
+#
+# The handbook is also available locally in /usr/share/doc/handbook
+# if you've installed the doc distribution, otherwise always see the
+# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
+# latest information.
+#
+# An exhaustive list of options and more detailed explanations of the
+# device lines is also present in the ../../conf/NOTES and NOTES files.
+# If you are in doubt as to the purpose or necessity of a line, check first
+# in NOTES.
+#
+# $FreeBSD$
+
+cpu HAMMER
+ident GENERIC
+
+makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols
+makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support
+
+options SCHED_ULE # ULE scheduler
+options PREEMPTION # Enable kernel thread preemption
+options INET # InterNETworking
+options INET6 # IPv6 communications protocols
+options TCP_OFFLOAD # TCP offload
+options SCTP # Stream Control Transmission Protocol
+options FFS # Berkeley Fast Filesystem
+options SOFTUPDATES # Enable FFS soft updates support
+options UFS_ACL # Support for access control lists
+options UFS_DIRHASH # Improve performance on big directories
+options UFS_GJOURNAL # Enable gjournal-based UFS journaling
+options QUOTA # Enable disk quotas for UFS
+options MD_ROOT # MD is a potential root device
+options NFSCL # New Network Filesystem Client
+options NFSD # New Network Filesystem Server
+options NFSLOCKD # Network Lock Manager
+options NFS_ROOT # NFS usable as /, requires NFSCL
+options MSDOSFS # MSDOS Filesystem
+options CD9660 # ISO 9660 Filesystem
+options PROCFS # Process filesystem (requires PSEUDOFS)
+options PSEUDOFS # Pseudo-filesystem framework
+options GEOM_PART_GPT # GUID Partition Tables.
+options GEOM_RAID # Soft RAID functionality.
+options GEOM_LABEL # Provides labelization
+options COMPAT_FREEBSD32 # Compatible with i386 binaries
+options COMPAT_FREEBSD4 # Compatible with FreeBSD4
+options COMPAT_FREEBSD5 # Compatible with FreeBSD5
+options COMPAT_FREEBSD6 # Compatible with FreeBSD6
+options COMPAT_FREEBSD7 # Compatible with FreeBSD7
+options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI
+options KTRACE # ktrace(1) support
+options STACK # stack(9) support
+options SYSVSHM # SYSV-style shared memory
+options SYSVMSG # SYSV-style message queues
+options SYSVSEM # SYSV-style semaphores
+options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
+options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed.
+options KBD_INSTALL_CDEV # install a CDEV entry in /dev
+options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4)
+options AUDIT # Security event auditing
+options CAPABILITY_MODE # Capsicum capability mode
+options CAPABILITIES # Capsicum capabilities
+options MAC # TrustedBSD MAC Framework
+options KDTRACE_FRAME # Ensure frames are compiled in
+options KDTRACE_HOOKS # Kernel DTrace hooks
+options DDB_CTF # Kernel ELF linker loads CTF data
+options INCLUDE_CONFIG_FILE # Include this file in kernel
+
+# Debugging support. Always need this:
+options KDB # Enable kernel debugger support.
+# For minimum debugger support (stable branch) use:
+#options KDB_TRACE # Print a stack trace for a panic.
+# For full debugger support use this instead:
+options DDB # Support DDB.
+options GDB # Support remote GDB.
+options DEADLKRES # Enable the deadlock resolver
+options INVARIANTS # Enable calls of extra sanity checking
+options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS
+options WITNESS # Enable checks to detect deadlocks and cycles
+options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed
+options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones
+
+# Make an SMP-capable kernel by default
+options SMP # Symmetric MultiProcessor Kernel
+
+# CPU frequency control
+device cpufreq
+
+# Bus support.
+device acpi
+device pci
+
+# Floppy drives
+device fdc
+
+# ATA controllers
+device ahci # AHCI-compatible SATA controllers
+device ata # Legacy ATA/SATA controllers
+options ATA_STATIC_ID # Static device numbering
+device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA
+device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA
+
+# SCSI Controllers
+device ahc # AHA2940 and onboard AIC7xxx devices
+options AHC_REG_PRETTY_PRINT # Print register bitfields in debug
+ # output. Adds ~128k to driver.
+device ahd # AHA39320/29320 and onboard AIC79xx devices
+options AHD_REG_PRETTY_PRINT # Print register bitfields in debug
+ # output. Adds ~215k to driver.
+device esp # AMD Am53C974 (Tekram DC-390(T))
+device hptiop # Highpoint RocketRaid 3xxx series
+device isp # Qlogic family
+#device ispfw # Firmware for QLogic HBAs- normally a module
+device mpt # LSI-Logic MPT-Fusion
+device mps # LSI-Logic MPT-Fusion 2
+#device ncr # NCR/Symbios Logic
+device sym # NCR/Symbios Logic (newer chipsets + those of `ncr')
+device trm # Tekram DC395U/UW/F DC315U adapters
+
+device adv # Advansys SCSI adapters
+device adw # Advansys wide SCSI adapters
+device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60.
+device bt # Buslogic/Mylex MultiMaster SCSI adapters
+device isci # Intel C600 SAS controller
+
+# ATA/SCSI peripherals
+device scbus # SCSI bus (required for ATA/SCSI)
+device ch # SCSI media changers
+device da # Direct Access (disks)
+device sa # Sequential Access (tape etc)
+device cd # CD
+device pass # Passthrough device (direct ATA/SCSI access)
+device ses # Enclosure Services (SES and SAF-TE)
+#device ctl # CAM Target Layer
+
+# RAID controllers interfaced to the SCSI subsystem
+device amr # AMI MegaRAID
+device arcmsr # Areca SATA II RAID
+#XXX it is not 64-bit clean, -scottl
+#device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID
+device ciss # Compaq Smart RAID 5*
+device dpt # DPT Smartcache III, IV - See NOTES for options
+device hptmv # Highpoint RocketRAID 182x
+device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx
+device hpt27xx # Highpoint RocketRAID 27xx
+device iir # Intel Integrated RAID
+device ips # IBM (Adaptec) ServeRAID
+device mly # Mylex AcceleRAID/eXtremeRAID
+device twa # 3ware 9000 series PATA/SATA RAID
+device tws # LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
+
+# RAID controllers
+device aac # Adaptec FSA RAID
+device aacp # SCSI passthrough for aac (requires CAM)
+device ida # Compaq Smart RAID
+device mfi # LSI MegaRAID SAS
+device mlx # Mylex DAC960 family
+#XXX pointer/int warnings
+#device pst # Promise Supertrak SX6000
+device twe # 3ware ATA RAID
+
+# atkbdc0 controls both the keyboard and the PS/2 mouse
+device atkbdc # AT keyboard controller
+device atkbd # AT keyboard
+device psm # PS/2 mouse
+
+device kbdmux # keyboard multiplexer
+
+device vga # VGA video card driver
+options VESA # Add support for VESA BIOS Extensions (VBE)
+
+device splash # Splash screen and screen saver support
+
+# syscons is the default console driver, resembling an SCO console
+device sc
+options SC_PIXEL_MODE # add support for the raster text mode
+
+device agp # support several AGP chipsets
+
+# PCCARD (PCMCIA) support
+# PCMCIA and cardbus bridge support
+device cbb # cardbus (yenta) bridge
+device pccard # PC Card (16-bit) bus
+device cardbus # CardBus (32-bit) bus
+
+# Serial (COM) ports
+device uart # Generic UART driver
+
+# Parallel port
+device ppc
+device ppbus # Parallel port bus (required)
+device lpt # Printer
+device ppi # Parallel port interface device
+#device vpo # Requires scbus and da
+
+device puc # Multi I/O cards and multi-channel UARTs
+
+# PCI Ethernet NICs.
+device bxe # Broadcom BCM57710/BCM57711/BCM57711E 10Gb Ethernet
+device de # DEC/Intel DC21x4x (``Tulip'')
+device em # Intel PRO/1000 Gigabit Ethernet Family
+device igb # Intel PRO/1000 PCIE Server Gigabit Family
+device ixgbe # Intel PRO/10GbE PCIE Ethernet Family
+device le # AMD Am7900 LANCE and Am79C9xx PCnet
+device ti # Alteon Networks Tigon I/II gigabit Ethernet
+device txp # 3Com 3cR990 (``Typhoon'')
+device vx # 3Com 3c590, 3c595 (``Vortex'')
+
+# PCI Ethernet NICs that use the common MII bus controller code.
+# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs!
+device miibus # MII bus support
+device ae # Attansic/Atheros L2 FastEthernet
+device age # Attansic/Atheros L1 Gigabit Ethernet
+device alc # Atheros AR8131/AR8132 Ethernet
+device ale # Atheros AR8121/AR8113/AR8114 Ethernet
+device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet
+device bfe # Broadcom BCM440x 10/100 Ethernet
+device bge # Broadcom BCM570xx Gigabit Ethernet
+device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn
+device dc # DEC/Intel 21143 and various workalikes
+device et # Agere ET1310 10/100/Gigabit Ethernet
+device fxp # Intel EtherExpress PRO/100B (82557, 82558)
+device gem # Sun GEM/Sun ERI/Apple GMAC
+device hme # Sun HME (Happy Meal Ethernet)
+device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet
+device lge # Level 1 LXT1001 gigabit Ethernet
+device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet
+device nfe # nVidia nForce MCP on-board Ethernet
+device nge # NatSemi DP83820 gigabit Ethernet
+#device nve # nVidia nForce MCP on-board Ethernet Networking
+device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le')
+device re # RealTek 8139C+/8169/8169S/8110S
+device rl # RealTek 8129/8139
+device sf # Adaptec AIC-6915 (``Starfire'')
+device sge # Silicon Integrated Systems SiS190/191
+device sis # Silicon Integrated Systems SiS 900/SiS 7016
+device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet
+device ste # Sundance ST201 (D-Link DFE-550TX)
+device stge # Sundance/Tamarack TC9021 gigabit Ethernet
+device tl # Texas Instruments ThunderLAN
+device tx # SMC EtherPower II (83c170 ``EPIC'')
+device vge # VIA VT612x gigabit Ethernet
+device vr # VIA Rhine, Rhine II
+device wb # Winbond W89C840F
+device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'')
+
+# ISA Ethernet NICs. pccard NICs included.
+device cs # Crystal Semiconductor CS89x0 NIC
+# 'device ed' requires 'device miibus'
+device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards
+device ex # Intel EtherExpress Pro/10 and Pro/10+
+device ep # Etherlink III based cards
+device fe # Fujitsu MB8696x based cards
+device sn # SMC's 9000 series of Ethernet chips
+device xe # Xircom pccard Ethernet
+
+# Wireless NIC cards
+device wlan # 802.11 support
+options IEEE80211_DEBUG # enable debug msgs
+options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's
+options IEEE80211_SUPPORT_MESH # enable 802.11s draft support
+device wlan_wep # 802.11 WEP support
+device wlan_ccmp # 802.11 CCMP support
+device wlan_tkip # 802.11 TKIP support
+device wlan_amrr # AMRR transmit rate control algorithm
+device an # Aironet 4500/4800 802.11 wireless NICs.
+device ath # Atheros NICs
+device ath_pci # Atheros pci/cardbus glue
+device ath_hal # pci/cardbus chip support
+options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors
+options AH_AR5416_INTERRUPT_MITIGATION # AR5416 interrupt mitigation
+options ATH_ENABLE_11N # Enable 802.11n support for AR5416 and later
+device ath_rate_sample # SampleRate tx rate control for ath
+#device bwi # Broadcom BCM430x/BCM431x wireless NICs.
+#device bwn # Broadcom BCM43xx wireless NICs.
+device ipw # Intel 2100 wireless NICs.
+device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs.
+device iwn # Intel 4965/1000/5000/6000 wireless NICs.
+device malo # Marvell Libertas wireless NICs.
+device mwl # Marvell 88W8363 802.11n wireless NICs.
+device ral # Ralink Technology RT2500 wireless NICs.
+device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs.
+device wpi # Intel 3945ABG wireless NICs.
+
+# Pseudo devices.
+device loop # Network loopback
+device random # Entropy device
+options PADLOCK_RNG # VIA Padlock RNG
+options RDRAND_RNG # Intel Bull Mountain RNG
+device ether # Ethernet support
+device vlan # 802.1Q VLAN support
+device tun # Packet tunnel.
+device md # Memory "disks"
+device gif # IPv6 and IPv4 tunneling
+device faith # IPv6-to-IPv4 relaying (translation)
+device firmware # firmware assist module
+
+# The `bpf' device enables the Berkeley Packet Filter.
+# Be aware of the administrative consequences of enabling this!
+# Note that 'bpf' is required for DHCP.
+device bpf # Berkeley packet filter
+
+# USB support
+options USB_DEBUG # enable debug msgs
+device uhci # UHCI PCI->USB interface
+device ohci # OHCI PCI->USB interface
+device ehci # EHCI PCI->USB interface (USB 2.0)
+device xhci # XHCI PCI->USB interface (USB 3.0)
+device usb # USB Bus (required)
+device ukbd # Keyboard
+device umass # Disks/Mass storage - Requires scbus and da
+
+# Sound support
+device sound # Generic sound driver (required)
+device snd_cmi # CMedia CMI8338/CMI8738
+device snd_csa # Crystal Semiconductor CS461x/428x
+device snd_emu10kx # Creative SoundBlaster Live! and Audigy
+device snd_es137x # Ensoniq AudioPCI ES137x
+device snd_hda # Intel High Definition Audio
+device snd_ich # Intel, NVidia and other ICH AC'97 Audio
+device snd_via8233 # VIA VT8233x Audio
+
+# MMC/SD
+device mmc # MMC/SD bus
+device mmcsd # MMC/SD memory card
+device sdhci # Generic PCI SD Host Controller
+
+# VirtIO support
+device virtio # Generic VirtIO bus (required)
+device virtio_pci # VirtIO PCI device
+device vtnet # VirtIO Ethernet device
+device virtio_blk # VirtIO Block device
+device virtio_scsi # VirtIO SCSI device
+device virtio_balloon # VirtIO Memory Balloon device
diff --git a/sys/amd64/conf/GENERIC.hints b/sys/amd64/conf/GENERIC.hints
new file mode 100644
index 0000000..eacbbe8
--- /dev/null
+++ b/sys/amd64/conf/GENERIC.hints
@@ -0,0 +1,33 @@
+# $FreeBSD$
+hint.fdc.0.at="isa"
+hint.fdc.0.port="0x3F0"
+hint.fdc.0.irq="6"
+hint.fdc.0.drq="2"
+hint.fd.0.at="fdc0"
+hint.fd.0.drive="0"
+hint.fd.1.at="fdc0"
+hint.fd.1.drive="1"
+hint.atkbdc.0.at="isa"
+hint.atkbdc.0.port="0x060"
+hint.atkbd.0.at="atkbdc"
+hint.atkbd.0.irq="1"
+hint.psm.0.at="atkbdc"
+hint.psm.0.irq="12"
+hint.sc.0.at="isa"
+hint.sc.0.flags="0x100"
+hint.uart.0.at="isa"
+hint.uart.0.port="0x3F8"
+hint.uart.0.flags="0x10"
+hint.uart.0.irq="4"
+hint.uart.1.at="isa"
+hint.uart.1.port="0x2F8"
+hint.uart.1.irq="3"
+hint.ppc.0.at="isa"
+hint.ppc.0.irq="7"
+hint.atrtc.0.at="isa"
+hint.atrtc.0.port="0x70"
+hint.atrtc.0.irq="8"
+hint.attimer.0.at="isa"
+hint.attimer.0.port="0x40"
+hint.attimer.0.irq="0"
+hint.wbwd.0.at="isa"
diff --git a/sys/amd64/conf/Makefile b/sys/amd64/conf/Makefile
new file mode 100644
index 0000000..1d2513f
--- /dev/null
+++ b/sys/amd64/conf/Makefile
@@ -0,0 +1,5 @@
+# $FreeBSD$
+
+TARGET=amd64
+
+.include "${.CURDIR}/../../conf/makeLINT.mk"
diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES
new file mode 100644
index 0000000..7a41464
--- /dev/null
+++ b/sys/amd64/conf/NOTES
@@ -0,0 +1,626 @@
+#
+# NOTES -- Lines that can be cut/pasted into kernel and hints configs.
+#
+# This file contains machine dependent kernel configuration notes. For
+# machine independent notes, look in /sys/conf/NOTES.
+#
+# $FreeBSD$
+#
+
+#
+# We want LINT to cover profiling as well.
+profile 2
+
+#
+# Enable the kernel DTrace hooks which are required to load the DTrace
+# kernel modules.
+#
+options KDTRACE_HOOKS
+
+
+#####################################################################
+# SMP OPTIONS:
+#
+# Notes:
+#
+# IPI_PREEMPTION instructs the kernel to preempt threads running on other
+# CPUs if needed. Relies on the PREEMPTION option.
+
+# Optional:
+options IPI_PREEMPTION
+device atpic # Optional legacy pic support
+device mptable # Optional MPSPEC mptable support
+
+#
+# Watchdog routines.
+#
+options MP_WATCHDOG
+
+# Debugging options.
+#
+options COUNT_XINVLTLB_HITS # Counters for TLB events
+options COUNT_IPIS # Per-CPU IPI interrupt counters
+
+
+
+#####################################################################
+# CPU OPTIONS
+
+#
+# You must specify at least one CPU (the one you intend to run on);
+# deleting the specification for CPUs you don't need to use may make
+# parts of the system run faster.
+#
+cpu HAMMER # aka K8, aka Opteron & Athlon64
+
+#
+# Options for CPU features.
+#
+
+#
+# PERFMON causes the driver for Pentium/Pentium Pro performance counters
+# to be compiled. See perfmon(4) for more information.
+#
+#XXX#options PERFMON
+
+
+#####################################################################
+# NETWORKING OPTIONS
+
+#
+# DEVICE_POLLING adds support for mixed interrupt-polling handling
+# of network device drivers, which has significant benefits in terms
+# of robustness to overloads and responsiveness, as well as permitting
+# accurate scheduling of the CPU time between kernel network processing
+# and other activities. The drawback is a moderate (up to 1/HZ seconds)
+# potential increase in response times.
+# It is strongly recommended to use HZ=1000 or 2000 with DEVICE_POLLING
+# to achieve smoother behaviour.
+# Additionally, you can enable/disable polling at runtime with the help
+# of the ifconfig(8) utility, and select the CPU fraction reserved to
+# userland with the sysctl variable kern.polling.user_frac
+# (default 50, range 0..100).
+#
+# Not all device drivers support this mode of operation at the time of
+# this writing. See polling(4) for more details.
+
+options DEVICE_POLLING
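+# Example runtime usage (a sketch; the interface name is illustrative):
+#	ifconfig fxp0 polling			# enable polling on fxp0
+#	ifconfig fxp0 -polling			# revert to interrupt mode
+#	sysctl kern.polling.user_frac=30	# reserve 30% of CPU time for userland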
+
+# BPF_JITTER adds support for BPF just-in-time compiler.
+
+options BPF_JITTER
+
+# OpenFabrics Enterprise Distribution (Infiniband).
+options OFED
+options OFED_DEBUG_INIT
+
+# Sockets Direct Protocol
+options SDP
+options SDP_DEBUG
+
+# IP over Infiniband
+options IPOIB
+options IPOIB_DEBUG
+options IPOIB_CM
+
+
+#####################################################################
+# CLOCK OPTIONS
+
+# Provide read/write access to the memory in the clock chip.
+device nvram # Access to rtc cmos via /dev/nvram
+
+
+#####################################################################
+# MISCELLANEOUS DEVICES AND OPTIONS
+
+device speaker #Play IBM BASIC-style noises out your speaker
+hint.speaker.0.at="isa"
+hint.speaker.0.port="0x61"
+device gzip #Exec gzipped a.out's. REQUIRES COMPAT_AOUT!
+
+
+#####################################################################
+# HARDWARE BUS CONFIGURATION
+
+#
+# ISA bus
+#
+device isa
+
+#
+# Options for `isa':
+#
+# AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A
+# interrupt controller. This saves about 0.7-1.25 usec for each interrupt.
+# This option breaks suspend/resume on some portables.
+#
+# AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A
+# interrupt controller. This saves about 0.7-1.25 usec for each interrupt.
+# Automatic EOI is documented not to work for the slave with the
+# original i8259A, but it works for some clones and some integrated
+# versions.
+#
+# MAXMEM specifies the amount of RAM on the machine; if this is not
+# specified, FreeBSD will first read the amount of memory from the CMOS
+# RAM, so the amount of memory will initially be limited to 64MB or 16MB
+# depending on the BIOS. If the BIOS reports 64MB, a memory probe will
+# then attempt to detect the installed amount of RAM. If this probe
+# fails to detect >64MB RAM you will have to use the MAXMEM option.
+# The amount is in kilobytes, so for a machine with 128MB of RAM, it would
+# be 131072 (128 * 1024).
+#
+# BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to
+# reset the CPU for reboot. This is needed on some systems with broken
+# keyboard controllers.
+
+options AUTO_EOI_1
+#options AUTO_EOI_2
+
+options MAXMEM=(128*1024)
+#options BROKEN_KEYBOARD_RESET
+
+#
+# PCI bus & PCI options:
+#
+device pci
+
+#
+# AGP GART support
+device agp
+
+#
+# AGP debugging.
+#
+options AGP_DEBUG
+
+
+#####################################################################
+# HARDWARE DEVICE CONFIGURATION
+
+# To include support for VGA VESA video modes
+options VESA
+
+# Turn on extra debugging checks and output for VESA support.
+options VESA_DEBUG
+
+device dpms # DPMS suspend & resume via VESA BIOS
+
+# x86 real mode BIOS emulator, required by atkbdc/dpms/vesa
+options X86BIOS
+
+#
+# Optional devices:
+#
+
+# PS/2 mouse
+device psm
+hint.psm.0.at="atkbdc"
+hint.psm.0.irq="12"
+
+# Options for psm:
+options PSM_HOOKRESUME #hook the system resume event, useful
+ #for some laptops
+options PSM_RESETAFTERSUSPEND #reset the device at the resume event
+
+# The keyboard controller; it controls the keyboard and the PS/2 mouse.
+device atkbdc
+hint.atkbdc.0.at="isa"
+hint.atkbdc.0.port="0x060"
+
+# The AT keyboard
+device atkbd
+hint.atkbd.0.at="atkbdc"
+hint.atkbd.0.irq="1"
+
+# Options for atkbd:
+options ATKBD_DFLT_KEYMAP # specify the built-in keymap
+makeoptions ATKBD_DFLT_KEYMAP=jp.106
+
+# `flags' for atkbd:
+# 0x01 Force detection of keyboard, else we always assume a keyboard
+# 0x02 Don't reset keyboard, useful for some newer ThinkPads
+# 0x03 Force detection and avoid reset, might help with certain
+# docking stations
+# 0x04 Old-style (XT) keyboard support, useful for older ThinkPads
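+# For example (an illustrative hint, following the syntax used in
+# GENERIC.hints), forcing keyboard detection would be:
+#	hint.atkbd.0.flags="0x1"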
+
+# Video card driver for VGA adapters.
+device vga
+hint.vga.0.at="isa"
+
+# Options for vga:
+# Try the following option if the mouse pointer is not drawn correctly
+# or font does not seem to be loaded properly. May cause flicker on
+# some systems.
+options VGA_ALT_SEQACCESS
+
+# If you can dispense with some vga driver features, you may want to
+# use the following options to save some memory.
+#options VGA_NO_FONT_LOADING # don't save/load font
+#options VGA_NO_MODE_CHANGE # don't change video modes
+
+# Older video cards may require this option for proper operation.
+options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs
+
+# The following option probably won't work with the LCD displays.
+options VGA_WIDTH90 # support 90 column modes
+
+# Debugging.
+options VGA_DEBUG
+
+# Linear framebuffer driver for S3 VESA 1.2 cards. Works on top of VESA.
+device s3pci
+
+# 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create
+# the /dev/3dfx0 device to work with glide implementations. This should get
+# linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as
+# the tdfx DRI module from XFree86 and is completely unrelated.
+#
+# To enable Linuxulator support, one must also include COMPAT_LINUX in the
+# config as well. The other option is to load both as modules.
+
+device tdfx # Enable 3Dfx Voodoo support
+#XXX#device tdfx_linux # Enable Linuxulator support
+
+#
+# ACPI support using the Intel ACPI Component Architecture reference
+# implementation.
+#
+# ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer
+# kernel environment variables to select initial debugging levels for the
+# Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER
+# defined when it is built).
+
+device acpi
+options ACPI_DEBUG
+
+# The cpufreq(4) driver provides support for non-ACPI CPU frequency control
+device cpufreq
+
+# Direct Rendering modules for 3D acceleration.
+device drm # DRM core module required by DRM drivers
+device i915drm # Intel i830 through i915
+device mach64drm # ATI Rage Pro, Rage Mobility P/M, Rage XL
+device mgadrm # AGP Matrox G200, G400, G450, G550
+device r128drm # ATI Rage 128
+device radeondrm # ATI Radeon
+device savagedrm # S3 Savage3D, Savage4
+device sisdrm # SiS 300/305, 540, 630
+device tdfxdrm # 3dfx Voodoo 3/4/5 and Banshee
+device viadrm # VIA
+options DRM_DEBUG # Include debug printfs (slow)
+
+#
+# Network interfaces:
+#
+
+# ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503
+# HP PC Lan+, various PC Card devices
+# (requires miibus)
+# ipw: Intel PRO/Wireless 2100 IEEE 802.11 adapter
+# Requires the ipw firmware module
+# iwi: Intel PRO/Wireless 2200BG/2225BG/2915ABG IEEE 802.11 adapters
+# Requires the iwi firmware module
+# iwn: Intel Wireless WiFi Link 4965/1000/5000/6000 802.11 network adapters
+# Requires the iwn firmware module
+# mlx4ib: Mellanox ConnectX HCA InfiniBand
+# mlxen: Mellanox ConnectX HCA Ethernet
+# mthca: Mellanox HCA InfiniBand
+# nfe: nVidia nForce MCP on-board Ethernet Networking (BSD open source)
+# nve: nVidia nForce MCP on-board Ethernet Networking
+# sfxge: Solarflare SFC9000 family 10Gb Ethernet adapters
+# wpi: Intel 3945ABG Wireless LAN controller
+# Requires the wpi firmware module
+
+device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards
+options ED_3C503
+options ED_HPP
+options ED_SIC
+device ipw # Intel 2100 wireless NICs.
+device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs.
+device iwn # Intel 4965/1000/5000/6000 wireless NICs.
+device mlx4ib # Mellanox ConnectX HCA InfiniBand
+device mlxen # Mellanox ConnectX HCA Ethernet
+device mthca # Mellanox HCA InfiniBand
+device nfe # nVidia nForce MCP on-board Ethernet
+device nve # nVidia nForce MCP on-board Ethernet Networking
+device sfxge # Solarflare SFC9000 10Gb Ethernet
+device wpi # Intel 3945ABG wireless NICs.
+
+# IEEE 802.11 adapter firmware modules
+
+# Intel PRO/Wireless 2100 firmware:
+# ipwfw: BSS/IBSS/monitor mode firmware
+# ipwbssfw: BSS mode firmware
+# ipwibssfw: IBSS mode firmware
+# ipwmonitorfw: Monitor mode firmware
+# Intel PRO/Wireless 2200BG/2225BG/2915ABG firmware:
+# iwifw: BSS/IBSS/monitor mode firmware
+# iwibssfw: BSS mode firmware
+# iwiibssfw: IBSS mode firmware
+# iwimonitorfw: Monitor mode firmware
+# Intel Wireless WiFi Link 4965/1000/5000/6000 series firmware:
+# iwnfw: Single module to support the 4965/1000/5000/5150/6000
+# iwn4965fw: Specific module for the 4965 only
+# iwn1000fw: Specific module for the 1000 only
+# iwn5000fw: Specific module for the 5000 only
+# iwn5150fw: Specific module for the 5150 only
+# iwn6000fw: Specific module for the 6000 only
+# iwn6050fw: Specific module for the 6050 only
+# wpifw: Intel 3945ABG Wireless LAN Controller firmware
+
+device iwifw
+device iwibssfw
+device iwiibssfw
+device iwimonitorfw
+device ipwfw
+device ipwbssfw
+device ipwibssfw
+device ipwmonitorfw
+device iwnfw
+device iwn4965fw
+device iwn1000fw
+device iwn5000fw
+device iwn5150fw
+device iwn6000fw
+device iwn6050fw
+device wpifw
+
+#
+#XXX this stores pointers in a 32bit field that is defined by the hardware
+#device pst
+
+#
+# Areca 11xx and 12xx series of SATA II RAID controllers.
+# CAM is required.
+#
+device arcmsr # Areca SATA II RAID
+
+#
+# 3ware 9000 series PATA/SATA RAID controller driver and options.
+# The driver is implemented as a SIM, and so needs the CAM infrastructure.
+#
+options TWA_DEBUG # 0-10; 10 prints the most messages.
+options TWA_FLASH_FIRMWARE # firmware image bundled when defined.
+device twa # 3ware 9000 series PATA/SATA RAID
+
+#
+# SCSI host adapters:
+#
+# ncv: NCR 53C500 based SCSI host adapters.
+# nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters.
+# stg: TMC 18C30, 18C50 based SCSI host adapters.
+
+device ncv
+device nsp
+device stg
+
+#
+# Adaptec FSA RAID controllers, including integrated DELL controllers,
+# the Dell PERC 2/QC and the HP NetRAID-4M
+device aac
+device aacp # SCSI Passthrough interface (optional, CAM required)
+
+#
+# Highpoint RocketRAID 27xx.
+device hpt27xx
+
+#
+# Highpoint RocketRAID 182x.
+device hptmv
+
+#
+# Highpoint RocketRAID. Supports RR172x, RR222x, RR2240, RR232x, RR2340,
+# RR2210, RR174x, RR2522, RR231x, RR230x.
+device hptrr
+
+#
+# Highpoint RocketRaid 3xxx series SATA RAID
+device hptiop
+
+#
+# IBM (now Adaptec) ServeRAID controllers
+device ips
+
+#
+# Intel C600 (Patsburg) integrated SAS controller
+device isci
+options ISCI_LOGGING # enable debugging in isci HAL
+
+#
+# NVM Express (NVMe) support
+device nvme # base NVMe driver
+device nvd # expose NVMe namespaces as disks, depends on nvme
+
+#
+# SafeNet crypto driver: can be moved to the MI NOTES as soon as
+# it's tested on a big-endian machine
+#
+device safe # SafeNet 1141
+options SAFE_DEBUG # enable debugging support: hw.safe.debug
+options SAFE_RNDTEST # enable rndtest support
+
+#
+# VirtIO support
+#
+# The virtio entry provides a generic bus for use by the device drivers.
+# It must be combined with an interface that communicates with the host.
+# Multiple such interfaces are defined by the VirtIO specification. FreeBSD
+# only has support for PCI. Therefore, virtio_pci must be statically
+# compiled in or loaded as a module for the device drivers to function.
+#
+device virtio # Generic VirtIO bus (required)
+device virtio_pci # VirtIO PCI Interface
+device vtnet # VirtIO Ethernet device
+device virtio_blk # VirtIO Block device
+device virtio_scsi # VirtIO SCSI device
+device virtio_balloon # VirtIO Memory Balloon device
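+# As a sketch of the alternative, the same drivers can be loaded as
+# modules from loader.conf(5); the variable names below follow the usual
+# <module>_load convention and are illustrative:
+#	virtio_load="YES"
+#	virtio_pci_load="YES"
+#	virtio_blk_load="YES"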
+
+#####################################################################
+
+#
+# Miscellaneous hardware:
+#
+# ipmi: Intelligent Platform Management Interface
+# pbio: Parallel (8255 PPI) basic I/O (mode 0) port (e.g. Advantech PCL-724)
+# smbios: DMI/SMBIOS entry point
+# vpd: Vital Product Data kernel interface
+# asmc: Apple System Management Controller
+# si: Specialix International SI/XIO or SX intelligent serial card
+# tpm: Trusted Platform Module
+
+# Notes on the Specialix SI/XIO driver:
+# The host card is memory-mapped, not I/O-mapped.
+# The Rev 1 host cards use a 64K chunk, on a 32K boundary.
+# The Rev 2 host cards use a 32K chunk, on a 32K boundary.
+# The cards can use an IRQ of 11, 12 or 15.
+
+device ipmi
+device pbio
+hint.pbio.0.at="isa"
+hint.pbio.0.port="0x360"
+device smbios
+device vpd
+device asmc
+#device si
+device tpm
+
+#
+# Laptop/Notebook options:
+#
+
+
+#
+# I2C Bus
+#
+
+#
+# Hardware watchdog timers:
+#
+# ichwd: Intel ICH watchdog timer
+# amdsbwd: AMD SB7xx watchdog timer
+# viawd: VIA south bridge watchdog timer
+# wbwd: Winbond watchdog timer
+#
+device ichwd
+device amdsbwd
+device viawd
+device wbwd
+
+#
+# Temperature sensors:
+#
+# coretemp: on-die sensor on Intel Core and newer CPUs
+# amdtemp: on-die sensor on AMD K8/K10/K11 CPUs
+#
+device coretemp
+device amdtemp
+
+#
+# CPU control pseudo-device. Provides access to MSRs, CPUID info and
+# microcode update feature.
+#
+device cpuctl
+
+#
+# System Management Bus (SMB)
+#
+options ENABLE_ALART # Control alarm on Intel intpm driver
+
+#
+# Number of initial kernel page table pages used for early bootstrap.
+# This number should include enough pages to map the kernel and any
+# modules or other data loaded with the kernel by the loader. Each
+# page table page maps 2MB.
+#
+options NKPT=31
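+# As a worked example: NKPT=31 provides 31 * 2MB = 62MB of bootstrap
+# mappings, which must cover the kernel image plus any modules or data
+# preloaded by the loader.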
+
+
+#####################################################################
+# ABI Emulation
+
+#XXX keep these here for now and reactivate when support for emulating
+#XXX these 32 bit binaries is added.
+
+# Enable 32-bit runtime support for FreeBSD/i386 binaries.
+options COMPAT_FREEBSD32
+
+# Enable iBCS2 runtime support for SCO and ISC binaries
+#XXX#options IBCS2
+
+# Emulate spx device for client side of SVR3 local X interface
+#XXX#options SPX_HACK
+
+# Enable Linux ABI emulation
+#XXX#options COMPAT_LINUX
+
+# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_FREEBSD32)
+options COMPAT_LINUX32
+
+# Enable the linux-like proc filesystem support (requires COMPAT_LINUX32
+# and PSEUDOFS)
+options LINPROCFS
+
+# Enable the linux-like sys filesystem support (requires COMPAT_LINUX32
+# and PSEUDOFS)
+options LINSYSFS
+
+#
+# SysVR4 ABI emulation
+#
+# The svr4 ABI emulator can be statically compiled into the kernel or loaded as
+# a KLD module.
+# The STREAMS network emulation code can also be compiled statically or as a
+# module. If loaded as a module, it must be loaded before the svr4 module
+# (the /usr/sbin/svr4 script does this for you). If compiling statically,
+# the `streams' device must be configured into any kernel which also
+# specifies COMPAT_SVR4. It is possible to have a statically-configured
+# STREAMS device and a dynamically loadable svr4 emulator; the /usr/sbin/svr4
+# script understands that it doesn't need to load the `streams' module under
+# those circumstances.
+# Caveat: At this time, `options KTRACE' is required for the svr4 emulator
+# (whether static or dynamic).
+#
+#XXX#options COMPAT_SVR4 # build emulator statically
+#XXX#options DEBUG_SVR4 # enable verbose debugging
+#XXX#device streams # STREAMS network driver (required for svr4).
+
+
+#####################################################################
+# VM OPTIONS
+
+# KSTACK_PAGES is the number of memory pages to assign to the kernel
+# stack of each thread.
+
+options KSTACK_PAGES=5
+
+# Enable detailed accounting by the PV entry allocator.
+
+options PV_STATS
+
+#####################################################################
+
+# More undocumented options for linting.
+# Note that documenting these is not considered an affront.
+
+options FB_INSTALL_CDEV # install a CDEV entry in /dev
+
+options KBDIO_DEBUG=2
+options KBD_MAXRETRY=4
+options KBD_MAXWAIT=6
+options KBD_RESETDELAY=201
+
+options PSM_DEBUG=1
+
+options TIMER_FREQ=((14318182+6)/12)
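+# The expression above divides the 14.31818 MHz ISA reference clock by 12,
+# with the +6 rounding to the nearest integer: (14318182+6)/12 = 1193182 Hz,
+# the input frequency of the i8254 timer.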
+
+options VM_KMEM_SIZE
+options VM_KMEM_SIZE_MAX
+options VM_KMEM_SIZE_SCALE
+
+# Enable NDIS binary driver support
+options NDISAPI
+device ndis
+
+# Linux-specific pseudo devices support
+device lindev
diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM
new file mode 100644
index 0000000..ee745ec
--- /dev/null
+++ b/sys/amd64/conf/XENHVM
@@ -0,0 +1,22 @@
+#
+# XENHVM -- Xen HVM kernel configuration file for FreeBSD/amd64
+#
+# $FreeBSD$
+#
+include GENERIC
+ident XENHVM
+
+#
+# Adaptive locks rely on a lock-free pointer read to determine the run state
+# of the thread holding a lock when under contention; under a virtualisation
+# system, the thread run state may not accurately reflect whether the thread
+# (or rather its host VCPU) is actually executing. As such, disable this
+# optimisation.
+#
+options NO_ADAPTIVE_MUTEXES
+options NO_ADAPTIVE_RWLOCKS
+options NO_ADAPTIVE_SX
+
+# Xen HVM support
+options XENHVM
+device xenpci
diff --git a/sys/amd64/ia32/ia32_exception.S b/sys/amd64/ia32/ia32_exception.S
new file mode 100644
index 0000000..fe1a676
--- /dev/null
+++ b/sys/amd64/ia32/ia32_exception.S
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "assym.s"
+
+ .text
+/*
+ * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80)
+ *
+ * This is a SDT_SYSIDT entry point (unlike the i386 port) so that we
+ * can do a swapgs before enabling interrupts. This is critical because
+ * if we took an interrupt before swapgs, the interrupt code would see
+ * that it originated in supervisor mode and skip the swapgs.
+ */
+ SUPERALIGN_TEXT
+IDTVEC(int0x80_syscall)
+ swapgs
+ pushq $2 /* sizeof "int 0x80" */
+ subq $TF_ERR,%rsp /* skip over tf_trapno */
+ movq %rdi,TF_RDI(%rsp)
+ movq PCPU(CURPCB),%rdi
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ sti
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rdi
+ call ia32_syscall
+ MEXITCOUNT
+ jmp doreti
diff --git a/sys/amd64/ia32/ia32_misc.c b/sys/amd64/ia32/ia32_misc.c
new file mode 100644
index 0000000..5a8a721
--- /dev/null
+++ b/sys/amd64/ia32/ia32_misc.c
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+
+#include <machine/cpu.h>
+#include <machine/sysarch.h>
+
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+
+int
+freebsd32_sysarch(struct thread *td, struct freebsd32_sysarch_args *uap)
+{
+ struct sysarch_args uap1;
+ struct i386_ldt_args uapl;
+ struct i386_ldt_args32 uapl32;
+ int error;
+
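+ /*
+ * The LDT operations take a pointer to an argument structure whose
+ * layout differs between 32-bit and 64-bit processes; repack the
+ * 32-bit form into the native one before calling sysarch_ldt().
+ */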
+ if (uap->op == I386_SET_LDT || uap->op == I386_GET_LDT) {
+ if ((error = copyin(uap->parms, &uapl32, sizeof(uapl32))) != 0)
+ return (error);
+ uap1.op = uap->op;
+ uap1.parms = (char *)&uapl;
+ uapl.start = uapl32.start;
+ uapl.descs = (struct user_segment_descriptor *)(uintptr_t)
+ uapl32.descs;
+ uapl.num = uapl32.num;
+ return (sysarch_ldt(td, &uap1, UIO_SYSSPACE));
+ } else {
+ uap1.op = uap->op;
+ uap1.parms = uap->parms;
+ return (sysarch(td, &uap1));
+ }
+}
+
+#ifdef COMPAT_43
+int
+ofreebsd32_getpagesize(struct thread *td,
+ struct ofreebsd32_getpagesize_args *uap)
+{
+
+ td->td_retval[0] = IA32_PAGE_SIZE;
+ return (0);
+}
+#endif
diff --git a/sys/amd64/ia32/ia32_reg.c b/sys/amd64/ia32/ia32_reg.c
new file mode 100644
index 0000000..5bc18f1
--- /dev/null
+++ b/sys/amd64/ia32/ia32_reg.c
@@ -0,0 +1,235 @@
+/*-
+ * Copyright (c) 2005 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <machine/fpu.h>
+#include <machine/psl.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/frame.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/cpufunc.h>
+
+#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
+#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+
+int
+fill_regs32(struct thread *td, struct reg32 *regs)
+{
+ struct pcb *pcb;
+ struct trapframe *tp;
+
+ tp = td->td_frame;
+ pcb = td->td_pcb;
+ if (tp->tf_flags & TF_HASSEGS) {
+ regs->r_gs = tp->tf_gs;
+ regs->r_fs = tp->tf_fs;
+ regs->r_es = tp->tf_es;
+ regs->r_ds = tp->tf_ds;
+ } else {
+ regs->r_gs = _ugssel;
+ regs->r_fs = _ufssel;
+ regs->r_es = _udatasel;
+ regs->r_ds = _udatasel;
+ }
+ regs->r_edi = tp->tf_rdi;
+ regs->r_esi = tp->tf_rsi;
+ regs->r_ebp = tp->tf_rbp;
+ regs->r_ebx = tp->tf_rbx;
+ regs->r_edx = tp->tf_rdx;
+ regs->r_ecx = tp->tf_rcx;
+ regs->r_eax = tp->tf_rax;
+ regs->r_eip = tp->tf_rip;
+ regs->r_cs = tp->tf_cs;
+ regs->r_eflags = tp->tf_rflags;
+ regs->r_esp = tp->tf_rsp;
+ regs->r_ss = tp->tf_ss;
+ return (0);
+}
+
+int
+set_regs32(struct thread *td, struct reg32 *regs)
+{
+ struct pcb *pcb;
+ struct trapframe *tp;
+
+ tp = td->td_frame;
+ if (!EFL_SECURE(regs->r_eflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
+ return (EINVAL);
+ pcb = td->td_pcb;
+ tp->tf_gs = regs->r_gs;
+ tp->tf_fs = regs->r_fs;
+ tp->tf_es = regs->r_es;
+ tp->tf_ds = regs->r_ds;
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ tp->tf_flags = TF_HASSEGS;
+ tp->tf_rdi = regs->r_edi;
+ tp->tf_rsi = regs->r_esi;
+ tp->tf_rbp = regs->r_ebp;
+ tp->tf_rbx = regs->r_ebx;
+ tp->tf_rdx = regs->r_edx;
+ tp->tf_rcx = regs->r_ecx;
+ tp->tf_rax = regs->r_eax;
+ tp->tf_rip = regs->r_eip;
+ tp->tf_cs = regs->r_cs;
+ tp->tf_rflags = regs->r_eflags;
+ tp->tf_rsp = regs->r_esp;
+ tp->tf_ss = regs->r_ss;
+ return (0);
+}
+
+int
+fill_fpregs32(struct thread *td, struct fpreg32 *regs)
+{
+ struct savefpu *sv_fpu;
+ struct save87 *sv_87;
+ struct env87 *penv_87;
+ struct envxmm *penv_xmm;
+ int i;
+
+ bzero(regs, sizeof(*regs));
+ sv_87 = (struct save87 *)regs;
+ penv_87 = &sv_87->sv_env;
+ fpugetregs(td);
+ sv_fpu = get_pcb_user_save_td(td);
+ penv_xmm = &sv_fpu->sv_env;
+
+ /* FPU control/status */
+ penv_87->en_cw = penv_xmm->en_cw;
+ penv_87->en_sw = penv_xmm->en_sw;
+ penv_87->en_tw = penv_xmm->en_tw;
+ /*
+ * XXX for en_fip/fcs/foo/fos, check if the fxsave format
+ * uses the old-style layout for 32 bit user apps. If so,
+ * read the ip and operand segment registers from there.
+ * For now, use the process's %cs/%ds.
+ */
+ penv_87->en_fip = penv_xmm->en_rip;
+ penv_87->en_fcs = td->td_frame->tf_cs;
+ penv_87->en_opcode = penv_xmm->en_opcode;
+ penv_87->en_foo = penv_xmm->en_rdp;
+ /* Entry into the kernel always sets TF_HASSEGS */
+ penv_87->en_fos = td->td_frame->tf_ds;
+
+ /* FPU registers */
+ for (i = 0; i < 8; ++i)
+ sv_87->sv_ac[i] = sv_fpu->sv_fp[i].fp_acc;
+
+ return (0);
+}
+
+int
+set_fpregs32(struct thread *td, struct fpreg32 *regs)
+{
+ struct save87 *sv_87 = (struct save87 *)regs;
+ struct env87 *penv_87 = &sv_87->sv_env;
+ struct savefpu *sv_fpu = get_pcb_user_save_td(td);
+ struct envxmm *penv_xmm = &sv_fpu->sv_env;
+ int i;
+
+ /* FPU control/status */
+ penv_xmm->en_cw = penv_87->en_cw;
+ penv_xmm->en_sw = penv_87->en_sw;
+ penv_xmm->en_tw = penv_87->en_tw;
+ penv_xmm->en_rip = penv_87->en_fip;
+ /* penv_87->en_fcs and en_fos ignored, see above */
+ penv_xmm->en_opcode = penv_87->en_opcode;
+ penv_xmm->en_rdp = penv_87->en_foo;
+
+ /* FPU registers */
+ for (i = 0; i < 8; ++i)
+ sv_fpu->sv_fp[i].fp_acc = sv_87->sv_ac[i];
+ for (i = 8; i < 16; ++i)
+ bzero(&sv_fpu->sv_fp[i].fp_acc, sizeof(sv_fpu->sv_fp[i].fp_acc));
+ fpuuserinited(td);
+
+ return (0);
+}
+
+int
+fill_dbregs32(struct thread *td, struct dbreg32 *regs)
+{
+ struct dbreg dr;
+ int err, i;
+
+ err = fill_dbregs(td, &dr);
+ for (i = 0; i < 8; i++)
+ regs->dr[i] = dr.dr[i];
+ return (err);
+}
+
+int
+set_dbregs32(struct thread *td, struct dbreg32 *regs)
+{
+ struct dbreg dr;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ dr.dr[i] = regs->dr[i];
+ for (i = 8; i < 16; i++)
+ dr.dr[i] = 0;
+ return (set_dbregs(td, &dr));
+}
diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c
new file mode 100644
index 0000000..09ec7ab
--- /dev/null
+++ b/sys/amd64/ia32/ia32_signal.c
@@ -0,0 +1,1006 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/ia32/ia32_signal.h>
+#include <machine/psl.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/frame.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/cpufunc.h>
+
+#ifdef COMPAT_FREEBSD4
+static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *);
+#endif
+
+#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
+#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+
+static void
+ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp,
+ char *xfpusave, size_t xfpusave_len)
+{
+ size_t max_len, len;
+
+ /*
+ * XXX Format of 64bit and 32bit FXSAVE areas differs. FXSAVE
+ * in 32bit mode saves %cs and %ds, while on 64bit it saves
+ * 64bit instruction and data pointers. Ignore the difference
+ * for now, it should be irrelevant for most applications.
+ */
+ mcp->mc_ownedfp = fpugetregs(td);
+ bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate,
+ sizeof(mcp->mc_fpstate));
+ mcp->mc_fpformat = fpuformat();
+ if (!use_xsave || xfpusave_len == 0)
+ return;
+ max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
+ len = xfpusave_len;
+ if (len > max_len) {
+ len = max_len;
+ /* Zero the tail of the buffer that cannot be filled. */
+ bzero(xfpusave + max_len, xfpusave_len - max_len);
+ }
+ mcp->mc_flags |= _MC_HASFPXSTATE;
+ mcp->mc_xfpustate_len = len;
+ bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
+}
+
+static int
+ia32_set_fpcontext(struct thread *td, const struct ia32_mcontext *mcp,
+ char *xfpustate, size_t xfpustate_len)
+{
+ int error;
+
+ if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
+ return (0);
+ else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
+ return (EINVAL);
+ else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
+ /* We don't care what state is left in the FPU or PCB. */
+ fpstate_drop(td);
+ error = 0;
+ } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
+ mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
+ error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
+ xfpustate, xfpustate_len);
+ } else
+ return (EINVAL);
+ return (error);
+}
+
+/*
+ * Get machine context.
+ */
+static int
+ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags)
+{
+ struct pcb *pcb;
+ struct trapframe *tp;
+
+ pcb = td->td_pcb;
+ tp = td->td_frame;
+
+ PROC_LOCK(curthread->td_proc);
+ mcp->mc_onstack = sigonstack(tp->tf_rsp);
+ PROC_UNLOCK(curthread->td_proc);
+ /* Entry into kernel always sets TF_HASSEGS */
+ mcp->mc_gs = tp->tf_gs;
+ mcp->mc_fs = tp->tf_fs;
+ mcp->mc_es = tp->tf_es;
+ mcp->mc_ds = tp->tf_ds;
+ mcp->mc_edi = tp->tf_rdi;
+ mcp->mc_esi = tp->tf_rsi;
+ mcp->mc_ebp = tp->tf_rbp;
+ mcp->mc_isp = tp->tf_rsp;
+ mcp->mc_eflags = tp->tf_rflags;
+ if (flags & GET_MC_CLEAR_RET) {
+ mcp->mc_eax = 0;
+ mcp->mc_edx = 0;
+ mcp->mc_eflags &= ~PSL_C;
+ } else {
+ mcp->mc_eax = tp->tf_rax;
+ mcp->mc_edx = tp->tf_rdx;
+ }
+ mcp->mc_ebx = tp->tf_rbx;
+ mcp->mc_ecx = tp->tf_rcx;
+ mcp->mc_eip = tp->tf_rip;
+ mcp->mc_cs = tp->tf_cs;
+ mcp->mc_esp = tp->tf_rsp;
+ mcp->mc_ss = tp->tf_ss;
+ mcp->mc_len = sizeof(*mcp);
+ mcp->mc_flags = tp->tf_flags;
+ ia32_get_fpcontext(td, mcp, NULL, 0);
+ mcp->mc_fsbase = pcb->pcb_fsbase;
+ mcp->mc_gsbase = pcb->pcb_gsbase;
+ mcp->mc_xfpustate = 0;
+ mcp->mc_xfpustate_len = 0;
+ bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+ return (0);
+}
+
+/*
+ * Set machine context.
+ *
+ * However, we don't set any but the user modifiable flags, and we won't
+ * touch the cs selector.
+ */
+static int
+ia32_set_mcontext(struct thread *td, const struct ia32_mcontext *mcp)
+{
+ struct trapframe *tp;
+ char *xfpustate;
+ long rflags;
+ int ret;
+
+ tp = td->td_frame;
+ if (mcp->mc_len != sizeof(*mcp))
+ return (EINVAL);
+ rflags = (mcp->mc_eflags & PSL_USERCHANGE) |
+ (tp->tf_rflags & ~PSL_USERCHANGE);
+ if (mcp->mc_flags & _MC_IA32_HASFPXSTATE) {
+ if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
+ sizeof(struct savefpu))
+ return (EINVAL);
+ xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+ ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate,
+ mcp->mc_xfpustate_len);
+ if (ret != 0)
+ return (ret);
+ } else
+ xfpustate = NULL;
+ ret = ia32_set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
+ if (ret != 0)
+ return (ret);
+ tp->tf_gs = mcp->mc_gs;
+ tp->tf_fs = mcp->mc_fs;
+ tp->tf_es = mcp->mc_es;
+ tp->tf_ds = mcp->mc_ds;
+ tp->tf_flags = TF_HASSEGS;
+ tp->tf_rdi = mcp->mc_edi;
+ tp->tf_rsi = mcp->mc_esi;
+ tp->tf_rbp = mcp->mc_ebp;
+ tp->tf_rbx = mcp->mc_ebx;
+ tp->tf_rdx = mcp->mc_edx;
+ tp->tf_rcx = mcp->mc_ecx;
+ tp->tf_rax = mcp->mc_eax;
+ /* trapno, err */
+ tp->tf_rip = mcp->mc_eip;
+ tp->tf_rflags = rflags;
+ tp->tf_rsp = mcp->mc_esp;
+ tp->tf_ss = mcp->mc_ss;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ return (0);
+}
+
+/*
+ * The first two fields of a ucontext_t are the signal mask and
+ * the machine context. The next field is uc_link; we want to
+ * avoid destroying the link when copying out contexts.
+ */
+#define UC_COPY_SIZE offsetof(struct ia32_ucontext, uc_link)
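+/*
+ * A sketch of why this size is safe: given the layout described above,
+ *
+ *	struct ia32_ucontext {
+ *		sigset_t		uc_sigmask;
+ *		struct ia32_mcontext	uc_mcontext;
+ *		uint32_t		uc_link;
+ *		...
+ *	};
+ *
+ * copyout(&uc, ucp, UC_COPY_SIZE) writes only the bytes before uc_link,
+ * leaving the user's link pointer intact.  The member types shown are
+ * illustrative, not the exact declarations.
+ */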
+
+int
+freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
+{
+ struct ia32_ucontext uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ bzero(&uc.__spare__, sizeof(uc.__spare__));
+ ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
+ }
+ return (ret);
+}
+
+int
+freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
+{
+ struct ia32_ucontext uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = ia32_set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK,
+ &uc.uc_sigmask, NULL, 0);
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
+
+int
+freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
+{
+ struct ia32_ucontext uc;
+ int ret;
+
+ if (uap->oucp == NULL || uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = ia32_set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK,
+ &uc.uc_sigmask, NULL, 0);
+ }
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
+
+/*
+ * Send a signal to a process.
+ *
+ * The stack is set up so that the sigcode stored at its top calls the
+ * handler routine, followed by a call into the sigreturn routine below.
+ * After sigreturn resets the signal mask, the stack, and the frame
+ * pointer, it returns to the user-specified pc and psl.
+ */
+
+#ifdef COMPAT_43
+static void
+ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct ia32_sigframe3 sf, *fp;
+ struct proc *p;
+ struct thread *td;
+ struct sigacts *psp;
+ struct trapframe *regs;
+ int sig;
+ int oonstack;
+
+ td = curthread;
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sig = ksi->ksi_signo;
+ psp = p->p_sigacts;
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
+ /* Allocate space for the signal handler context. */
+ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig)) {
+ fp = (struct ia32_sigframe3 *)(td->td_sigstk.ss_sp +
+ td->td_sigstk.ss_size - sizeof(sf));
+ td->td_sigstk.ss_flags |= SS_ONSTACK;
+ } else
+ fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1;
+
+ /* Translate the signal if appropriate. */
+ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ /* Build the argument list for the signal handler. */
+ sf.sf_signum = sig;
+ sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
+ if (SIGISMEMBER(psp->ps_siginfo, sig)) {
+ /* Signal handler installed with SA_SIGINFO. */
+ sf.sf_arg2 = (register_t)&fp->sf_siginfo;
+ sf.sf_siginfo.si_signo = sig;
+ sf.sf_siginfo.si_code = ksi->ksi_code;
+ sf.sf_ah = (uintptr_t)catcher;
+ } else {
+ /* Old FreeBSD-style arguments. */
+ sf.sf_arg2 = ksi->ksi_code;
+ sf.sf_addr = (register_t)ksi->ksi_addr;
+ sf.sf_ah = (uintptr_t)catcher;
+ }
+ mtx_unlock(&psp->ps_mtx);
+ PROC_UNLOCK(p);
+
+ /* Save most if not all of trap frame. */
+ sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax;
+ sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx;
+ sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx;
+ sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx;
+ sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi;
+ sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi;
+ sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
+ sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
+ sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
+ sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
+ sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
+ sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs;
+ sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp;
+
+ /* Build the signal context to be used by osigreturn(). */
+ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
+ SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
+ sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp;
+ sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp;
+ sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip;
+ sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags;
+ sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
+ sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
+
+ /*
+ * Copy the sigframe out to the user's stack.
+ */
+ if (copyout(&sf, fp, sizeof(*fp)) != 0) {
+#ifdef DEBUG
+ printf("process %ld has trashed its stack\n", (long)p->p_pid);
+#endif
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ regs->tf_rsp = (uintptr_t)fp;
+ regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucode32sel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _udatasel;
+ regs->tf_ss = _udatasel;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
+#endif
+
+#ifdef COMPAT_FREEBSD4
+static void
+freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct ia32_sigframe4 sf, *sfp;
+ struct siginfo32 siginfo;
+ struct proc *p;
+ struct thread *td;
+ struct sigacts *psp;
+ struct trapframe *regs;
+ int oonstack;
+ int sig;
+
+ td = curthread;
+ p = td->td_proc;
+ siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sig = siginfo.si_signo;
+ psp = p->p_sigacts;
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
+ /* Save user context. */
+ bzero(&sf, sizeof(sf));
+ sf.sf_uc.uc_sigmask = *mask;
+ sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
+ sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
+ sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
+ sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
+ sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
+ sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
+ sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
+ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
+ sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
+ sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
+ sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
+ sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
+ sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
+ sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
+ sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
+ sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
+ sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
+ sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
+ sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
+ sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
+ sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
+ sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
+ bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
+ sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
+ bzero(sf.sf_uc.uc_mcontext.__spare__,
+ sizeof(sf.sf_uc.uc_mcontext.__spare__));
+ bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
+
+ /* Allocate space for the signal handler context. */
+ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig)) {
+ sfp = (struct ia32_sigframe4 *)(td->td_sigstk.ss_sp +
+ td->td_sigstk.ss_size - sizeof(sf));
+ } else
+ sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1;
+ PROC_UNLOCK(p);
+
+ /* Translate the signal if appropriate. */
+ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ /* Build the argument list for the signal handler. */
+ sf.sf_signum = sig;
+ sf.sf_ucontext = (register_t)&sfp->sf_uc;
+ bzero(&sf.sf_si, sizeof(sf.sf_si));
+ if (SIGISMEMBER(psp->ps_siginfo, sig)) {
+ /* Signal handler installed with SA_SIGINFO. */
+ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
+ sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
+
+ /* Fill in POSIX parts */
+ sf.sf_si = siginfo;
+ sf.sf_si.si_signo = sig;
+ } else {
+ /* Old FreeBSD-style arguments. */
+ sf.sf_siginfo = siginfo.si_code;
+ sf.sf_addr = (u_int32_t)siginfo.si_addr;
+ sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
+ }
+ mtx_unlock(&psp->ps_mtx);
+
+ /*
+ * Copy the sigframe out to the user's stack.
+ */
+ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
+#ifdef DEBUG
+ printf("process %ld has trashed its stack\n", (long)p->p_pid);
+#endif
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ regs->tf_rsp = (uintptr_t)sfp;
+ regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode -
+ sz_freebsd4_ia32_sigcode;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucode32sel;
+ regs->tf_ss = _udatasel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ /* leave user %fs and %gs untouched */
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
+#endif /* COMPAT_FREEBSD4 */
+
+void
+ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct ia32_sigframe sf, *sfp;
+ struct siginfo32 siginfo;
+ struct proc *p;
+ struct thread *td;
+ struct sigacts *psp;
+ char *sp;
+ struct trapframe *regs;
+ char *xfpusave;
+ size_t xfpusave_len;
+ int oonstack;
+ int sig;
+
+ siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
+ td = curthread;
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sig = siginfo.si_signo;
+ psp = p->p_sigacts;
+#ifdef COMPAT_FREEBSD4
+ if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
+ freebsd4_ia32_sendsig(catcher, ksi, mask);
+ return;
+ }
+#endif
+#ifdef COMPAT_43
+ if (SIGISMEMBER(psp->ps_osigset, sig)) {
+ ia32_osendsig(catcher, ksi, mask);
+ return;
+ }
+#endif
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
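+	/*
+	 * With XSAVE in use, the extended FPU state does not fit in the
+	 * fixed-size mcontext; stage it in a temporary buffer that is
+	 * copied out next to the sigframe below.
+	 */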
+ if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
+ xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
+ xfpusave = __builtin_alloca(xfpusave_len);
+ } else {
+ xfpusave_len = 0;
+ xfpusave = NULL;
+ }
+
+ /* Save user context. */
+ bzero(&sf, sizeof(sf));
+ sf.sf_uc.uc_sigmask = *mask;
+ sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
+ sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
+ sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
+ sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
+ sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
+ sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
+ sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
+ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
+ sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
+ sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
+ sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
+ sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
+ sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
+ sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
+ sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
+ sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
+ sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
+ sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
+ sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
+ sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
+ sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
+ sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
+ sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
+ ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
+ fpstate_drop(td);
+ sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase;
+ sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase;
+ bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
+
+ /* Allocate space for the signal handler context. */
+ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig))
+ sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
+ else
+ sp = (char *)regs->tf_rsp;
+ if (xfpusave != NULL) {
+ sp -= xfpusave_len;
+ sp = (char *)((unsigned long)sp & ~0x3Ful);
+ sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
+ }
+ sp -= sizeof(sf);
+ /* Align to 16 bytes. */
+ sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF);
+ PROC_UNLOCK(p);
+
+ /* Translate the signal if appropriate. */
+ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ /* Build the argument list for the signal handler. */
+ sf.sf_signum = sig;
+ sf.sf_ucontext = (register_t)&sfp->sf_uc;
+ bzero(&sf.sf_si, sizeof(sf.sf_si));
+ if (SIGISMEMBER(psp->ps_siginfo, sig)) {
+ /* Signal handler installed with SA_SIGINFO. */
+ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
+ sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
+
+ /* Fill in POSIX parts */
+ sf.sf_si = siginfo;
+ sf.sf_si.si_signo = sig;
+ } else {
+ /* Old FreeBSD-style arguments. */
+ sf.sf_siginfo = siginfo.si_code;
+ sf.sf_addr = (u_int32_t)siginfo.si_addr;
+ sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
+ }
+ mtx_unlock(&psp->ps_mtx);
+
+ /*
+ * Copy the sigframe out to the user's stack.
+ */
+ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
+ (xfpusave != NULL && copyout(xfpusave,
+ PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len)
+ != 0)) {
+#ifdef DEBUG
+ printf("process %ld has trashed its stack\n", (long)p->p_pid);
+#endif
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ regs->tf_rsp = (uintptr_t)sfp;
+ regs->tf_rip = p->p_sysent->sv_sigcode_base;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucode32sel;
+ regs->tf_ss = _udatasel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ /* XXXKIB leave user %fs and %gs untouched */
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
+
+/*
+ * System call to clean up state after a signal
+ * has been taken. Reset the signal mask and
+ * stack state from the context left by sendsig (above).
+ * Return to previous pc and psl as specified by
+ * context left by sendsig. Check carefully to
+ * make sure that the user has not modified the
+ * state to gain improper privileges.
+ */
+
+#ifdef COMPAT_43
+int
+ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap)
+{
+ struct ia32_sigcontext3 sc, *scp;
+ struct trapframe *regs;
+ int eflags, error;
+ ksiginfo_t ksi;
+
+ regs = td->td_frame;
+ error = copyin(uap->sigcntxp, &sc, sizeof(sc));
+ if (error != 0)
+ return (error);
+ scp = &sc;
+ eflags = scp->sc_eflags;
+ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ return (EINVAL);
+ }
+ if (!CS_SECURE(scp->sc_cs)) {
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+ return (EINVAL);
+ }
+ regs->tf_ds = scp->sc_ds;
+ regs->tf_es = scp->sc_es;
+ regs->tf_fs = scp->sc_fs;
+ regs->tf_gs = scp->sc_gs;
+
+ regs->tf_rax = scp->sc_eax;
+ regs->tf_rbx = scp->sc_ebx;
+ regs->tf_rcx = scp->sc_ecx;
+ regs->tf_rdx = scp->sc_edx;
+ regs->tf_rsi = scp->sc_esi;
+ regs->tf_rdi = scp->sc_edi;
+ regs->tf_cs = scp->sc_cs;
+ regs->tf_ss = scp->sc_ss;
+ regs->tf_rbp = scp->sc_ebp;
+ regs->tf_rsp = scp->sc_esp;
+ regs->tf_rip = scp->sc_eip;
+ regs->tf_rflags = eflags;
+
+ if (scp->sc_onstack & 1)
+ td->td_sigstk.ss_flags |= SS_ONSTACK;
+ else
+ td->td_sigstk.ss_flags &= ~SS_ONSTACK;
+
+ kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
+ SIGPROCMASK_OLD);
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ return (EJUSTRETURN);
+}
+#endif
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * MPSAFE
+ */
+int
+freebsd4_freebsd32_sigreturn(struct thread *td,
+    struct freebsd4_freebsd32_sigreturn_args /* {
+	const struct freebsd4_freebsd32_ucontext *sigcntxp;
+    } */ *uap)
+{
+ struct ia32_ucontext4 uc;
+ struct trapframe *regs;
+ struct ia32_ucontext4 *ucp;
+ int cs, eflags, error;
+ ksiginfo_t ksi;
+
+ error = copyin(uap->sigcntxp, &uc, sizeof(uc));
+ if (error != 0)
+ return (error);
+ ucp = &uc;
+ regs = td->td_frame;
+ eflags = ucp->uc_mcontext.mc_eflags;
+ /*
+ * Don't allow users to change privileged or reserved flags.
+ */
+ /*
+ * XXX do allow users to change the privileged flag PSL_RF.
+ * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
+ * should sometimes set it there too. tf_eflags is kept in
+ * the signal context during signal handling and there is no
+ * other place to remember it, so the PSL_RF bit may be
+ * corrupted by the signal handler without us knowing.
+ * Corruption of the PSL_RF bit at worst causes one more or
+ * one less debugger trap, so allowing it is fairly harmless.
+ */
+ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n",
+ td->td_proc->p_pid, td->td_name, eflags);
+ return (EINVAL);
+ }
+
+ /*
+ * Don't allow users to load a valid privileged %cs. Let the
+ * hardware check for invalid selectors, excess privilege in
+ * other selectors, invalid %eip's and invalid %esp's.
+ */
+ cs = ucp->uc_mcontext.mc_cs;
+ if (!CS_SECURE(cs)) {
+ uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
+ td->td_proc->p_pid, td->td_name, cs);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+ return (EINVAL);
+ }
+
+ regs->tf_rdi = ucp->uc_mcontext.mc_edi;
+ regs->tf_rsi = ucp->uc_mcontext.mc_esi;
+ regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
+ regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
+ regs->tf_rdx = ucp->uc_mcontext.mc_edx;
+ regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
+ regs->tf_rax = ucp->uc_mcontext.mc_eax;
+ regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
+ regs->tf_err = ucp->uc_mcontext.mc_err;
+ regs->tf_rip = ucp->uc_mcontext.mc_eip;
+ regs->tf_cs = cs;
+ regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
+ regs->tf_rsp = ucp->uc_mcontext.mc_esp;
+ regs->tf_ss = ucp->uc_mcontext.mc_ss;
+ regs->tf_ds = ucp->uc_mcontext.mc_ds;
+ regs->tf_es = ucp->uc_mcontext.mc_es;
+ regs->tf_fs = ucp->uc_mcontext.mc_fs;
+ regs->tf_gs = ucp->uc_mcontext.mc_gs;
+
+ kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ return (EJUSTRETURN);
+}
+#endif /* COMPAT_FREEBSD4 */
+
+/*
+ * MPSAFE
+ */
+int
+freebsd32_sigreturn(struct thread *td,
+    struct freebsd32_sigreturn_args /* {
+	const struct freebsd32_ucontext *sigcntxp;
+    } */ *uap)
+{
+ struct ia32_ucontext uc;
+ struct trapframe *regs;
+ struct ia32_ucontext *ucp;
+ char *xfpustate;
+ size_t xfpustate_len;
+ int cs, eflags, error, ret;
+ ksiginfo_t ksi;
+
+ error = copyin(uap->sigcntxp, &uc, sizeof(uc));
+ if (error != 0)
+ return (error);
+ ucp = &uc;
+ regs = td->td_frame;
+ eflags = ucp->uc_mcontext.mc_eflags;
+ /*
+ * Don't allow users to change privileged or reserved flags.
+ */
+ /*
+ * XXX do allow users to change the privileged flag PSL_RF.
+ * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
+ * should sometimes set it there too. tf_eflags is kept in
+ * the signal context during signal handling and there is no
+ * other place to remember it, so the PSL_RF bit may be
+ * corrupted by the signal handler without us knowing.
+ * Corruption of the PSL_RF bit at worst causes one more or
+ * one less debugger trap, so allowing it is fairly harmless.
+ */
+ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n",
+ td->td_proc->p_pid, td->td_name, eflags);
+ return (EINVAL);
+ }
+
+ /*
+ * Don't allow users to load a valid privileged %cs. Let the
+ * hardware check for invalid selectors, excess privilege in
+ * other selectors, invalid %eip's and invalid %esp's.
+ */
+ cs = ucp->uc_mcontext.mc_cs;
+ if (!CS_SECURE(cs)) {
+ uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
+ td->td_proc->p_pid, td->td_name, cs);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+ return (EINVAL);
+ }
+
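+	/*
+	 * If the context carries extended FPU state, bound its length
+	 * by what the CPU can hold before copying it in from user space.
+	 */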
+ if ((ucp->uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
+ xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
+ if (xfpustate_len > cpu_max_ext_state_size -
+ sizeof(struct savefpu)) {
+			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
+ td->td_proc->p_pid, td->td_name, xfpustate_len);
+ return (EINVAL);
+ }
+ xfpustate = __builtin_alloca(xfpustate_len);
+ error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate),
+ xfpustate, xfpustate_len);
+ if (error != 0) {
+ uprintf(
+ "pid %d (%s): sigreturn copying xfpustate failed\n",
+ td->td_proc->p_pid, td->td_name);
+ return (error);
+ }
+ } else {
+ xfpustate = NULL;
+ xfpustate_len = 0;
+ }
+ ret = ia32_set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
+ xfpustate_len);
+ if (ret != 0) {
+ uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
+ td->td_proc->p_pid, td->td_name, ret);
+ return (ret);
+ }
+
+ regs->tf_rdi = ucp->uc_mcontext.mc_edi;
+ regs->tf_rsi = ucp->uc_mcontext.mc_esi;
+ regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
+ regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
+ regs->tf_rdx = ucp->uc_mcontext.mc_edx;
+ regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
+ regs->tf_rax = ucp->uc_mcontext.mc_eax;
+ regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
+ regs->tf_err = ucp->uc_mcontext.mc_err;
+ regs->tf_rip = ucp->uc_mcontext.mc_eip;
+ regs->tf_cs = cs;
+ regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
+ regs->tf_rsp = ucp->uc_mcontext.mc_esp;
+ regs->tf_ss = ucp->uc_mcontext.mc_ss;
+ regs->tf_ds = ucp->uc_mcontext.mc_ds;
+ regs->tf_es = ucp->uc_mcontext.mc_es;
+ regs->tf_fs = ucp->uc_mcontext.mc_fs;
+ regs->tf_gs = ucp->uc_mcontext.mc_gs;
+ regs->tf_flags = TF_HASSEGS;
+
+ kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ return (EJUSTRETURN);
+}
+
+/*
+ * Clear registers on exec
+ */
+void
+ia32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
+{
+ struct trapframe *regs = td->td_frame;
+ struct pcb *pcb = td->td_pcb;
+
+ mtx_lock(&dt_lock);
+ if (td->td_proc->p_md.md_ldt != NULL)
+ user_ldt_free(td);
+ else
+ mtx_unlock(&dt_lock);
+#ifdef COMPAT_43
+ setup_lcall_gate();
+#endif
+
+ pcb->pcb_fsbase = 0;
+ pcb->pcb_gsbase = 0;
+ pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__;
+
+ bzero((char *)regs, sizeof(struct trapframe));
+ regs->tf_rip = imgp->entry_addr;
+ regs->tf_rsp = stack;
+ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
+ regs->tf_ss = _udatasel;
+ regs->tf_cs = _ucode32sel;
+ regs->tf_rbx = imgp->ps_strings;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _ufssel;
+ regs->tf_gs = _ugssel;
+ regs->tf_flags = TF_HASSEGS;
+
+ fpstate_drop(td);
+
+ /* Return via doreti so that we can change to a different %cs */
+ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
+ clear_pcb_flags(pcb, PCB_GS32BIT);
+ td->td_retval[1] = 0;
+}
diff --git a/sys/amd64/ia32/ia32_sigtramp.S b/sys/amd64/ia32/ia32_sigtramp.S
new file mode 100644
index 0000000..3541988
--- /dev/null
+++ b/sys/amd64/ia32/ia32_sigtramp.S
@@ -0,0 +1,161 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+
+#include <machine/asmacros.h>
+#include <sys/syscall.h>
+
+#include "ia32_assym.h"
+
+ .text
+ .code32
+/*
+ * Signal trampoline, copied to the top of the user stack.
+ * XXX may need to be MD to match backend sendsig handoff protocol
+ */
+ ALIGN_TEXT
+ .globl ia32_sigcode
+ia32_sigcode:
+ calll *IA32_SIGF_HANDLER(%esp)
+ leal IA32_SIGF_UC(%esp),%eax /* get ucontext */
+ pushl %eax
+ movl $SYS_sigreturn,%eax
+ pushl %eax /* junk to fake return addr. */
+ int $0x80 /* enter kernel with args */
+ /* on stack */
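+	/* sigreturn(2) does not return on success; spin if it somehow does. */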
+1:
+ jmp 1b
+
+#ifdef COMPAT_FREEBSD4
+ ALIGN_TEXT
+freebsd4_ia32_sigcode:
+ calll *IA32_SIGF_HANDLER(%esp)
+	leal	IA32_SIGF_UC4(%esp),%eax	/* get ucontext */
+ pushl %eax
+ movl $344,%eax /* 4.x SYS_sigreturn */
+ pushl %eax /* junk to fake return addr. */
+ int $0x80 /* enter kernel with args */
+ /* on stack */
+1:
+ jmp 1b
+#endif
+
+#ifdef COMPAT_43
+ ALIGN_TEXT
+ia32_osigcode:
+	calll	*IA32_SIGF_HANDLER(%esp)	/* call signal handler */
+ leal IA32_SIGF_SC(%esp),%eax /* get sigcontext */
+ pushl %eax
+ movl $103,%eax /* 3.x SYS_sigreturn */
+ pushl %eax /* junk to fake return addr. */
+ int $0x80 /* enter kernel with args */
+1:
+ jmp 1b
+
+/*
+ * The lcall $7,$0 emulator cannot use the call gate that does an
+ * inter-privilege transition. The reason is that the call gate
+ * does not disable interrupts, and, before the swapgs is
+ * executed, we would have a window where the ring 0 code is
+ * executed with the wrong gsbase.
+ *
+ * Instead, reflect the lcall $7,$0 back to ring 3 trampoline
+ * which sets up the frame for int $0x80.
+ */
+ ALIGN_TEXT
+lcall_tramp:
+ .code64
+ /*
+	 * Here we are in 64-bit mode and need to return to 32-bit mode.
+	 * First, convert the call frame from 64- to 32-bit format.
+ */
+ pushq %rax
+ movl 16(%rsp),%eax
+ movl %eax,20(%rsp) /* ret %cs */
+ movl 8(%rsp),%eax
+ movl %eax,16(%rsp) /* ret %rip -> %eip */
+ popq %rax
+ addq $8,%rsp
+ /* Now return to 32bit */
+ pushq $0x33 /* _ucode32sel UPL */
+ callq 1f
+1:
+ addq $2f-1b,(%rsp)
+ lretq
+2:
+ /* Back in 32bit mode */
+ .code32
+ cmpl $SYS_vfork,%eax
+ je 4f
+ pushl %ebp
+ movl %esp,%ebp
+ pushl 0x24(%ebp) /* arg 6 */
+ pushl 0x20(%ebp)
+ pushl 0x1c(%ebp)
+ pushl 0x18(%ebp)
+ pushl 0x14(%ebp)
+ pushl 0x10(%ebp) /* arg 1 */
+ pushl 0xc(%ebp) /* gap */
+ int $0x80
+ leavel
+3:
+ lretl
+4:
+ /*
+ * vfork handling is special and relies on the libc stub saving
+ * the return ip in %ecx. If vfork failed, then there is no
+	 * child which could corrupt the frame created by the call gate.
+ */
+ int $0x80
+ jb 3b
+ addl $8,%esp
+ jmpl *%ecx
+#endif
+
+ ALIGN_TEXT
+esigcode:
+
+ .data
+ .globl sz_ia32_sigcode
+sz_ia32_sigcode:
+ .long esigcode-ia32_sigcode
+#ifdef COMPAT_FREEBSD4
+ .globl sz_freebsd4_ia32_sigcode
+sz_freebsd4_ia32_sigcode:
+ .long esigcode-freebsd4_ia32_sigcode
+#endif
+#ifdef COMPAT_43
+ .globl sz_ia32_osigcode
+sz_ia32_osigcode:
+ .long esigcode-ia32_osigcode
+ .globl sz_lcall_tramp
+sz_lcall_tramp:
+ .long esigcode-lcall_tramp
+#endif
diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c
new file mode 100644
index 0000000..0cdec6f
--- /dev/null
+++ b/sys/amd64/ia32/ia32_syscall.c
@@ -0,0 +1,255 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * 386 Trap and System call handling
+ */
+
+#include "opt_clock.h"
+#include "opt_compat.h"
+#include "opt_cpu.h"
+#include "opt_isa.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/systm.h>
+#include <sys/pioctl.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/ptrace.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/uio.h>
+#include <sys/vmmeter.h>
+#include <security/audit/audit.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/ia32/ia32_signal.h>
+#include <machine/psl.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/sysarch.h>
+#include <machine/frame.h>
+#include <machine/pcb.h>
+#include <machine/cpufunc.h>
+
+#define IDTVEC(name) __CONCAT(X,name)
+
+extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
+
+void ia32_syscall(struct trapframe *frame); /* Called from asm code */
+
+void
+ia32_set_syscall_retval(struct thread *td, int error)
+{
+
+ cpu_set_syscall_retval(td, error);
+}
+
+int
+ia32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
+{
+ struct proc *p;
+ struct trapframe *frame;
+ caddr_t params;
+ u_int32_t args[8];
+ int error, i;
+
+ p = td->td_proc;
+ frame = td->td_frame;
+
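+	/* 32-bit arguments start just above the return address on the stack. */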
+ params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t);
+ sa->code = frame->tf_rax;
+
+ /*
+	 * The syscall number may be passed indirectly via syscall(2)
+	 * or __syscall(2); fetch the actual number in that case.
+ */
+ if (sa->code == SYS_syscall) {
+ /*
+ * Code is first argument, followed by actual args.
+ */
+ sa->code = fuword32(params);
+ params += sizeof(int);
+ } else if (sa->code == SYS___syscall) {
+ /*
+ * Like syscall, but code is a quad, so as to maintain
+ * quad alignment for the rest of the arguments.
+ * We use a 32-bit fetch in case params is not
+ * aligned.
+ */
+ sa->code = fuword32(params);
+ params += sizeof(quad_t);
+ }
+ if (p->p_sysent->sv_mask)
+ sa->code &= p->p_sysent->sv_mask;
+ if (sa->code >= p->p_sysent->sv_size)
+ sa->callp = &p->p_sysent->sv_table[0];
+ else
+ sa->callp = &p->p_sysent->sv_table[sa->code];
+ sa->narg = sa->callp->sy_narg;
+
+ if (params != NULL && sa->narg != 0)
+ error = copyin(params, (caddr_t)args,
+ (u_int)(sa->narg * sizeof(int)));
+ else
+ error = 0;
+
+ for (i = 0; i < sa->narg; i++)
+ sa->args[i] = args[i];
+
+ if (error == 0) {
+ td->td_retval[0] = 0;
+ td->td_retval[1] = frame->tf_rdx;
+ }
+
+ return (error);
+}
+
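+/*
+ * Pull in the MI syscall layer; syscallenter() and syscallret() used
+ * by ia32_syscall() below are defined there.
+ */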
+#include "../../kern/subr_syscall.c"
+
+void
+ia32_syscall(struct trapframe *frame)
+{
+ struct thread *td;
+ struct syscall_args sa;
+ register_t orig_tf_rflags;
+ int error;
+ ksiginfo_t ksi;
+
+ orig_tf_rflags = frame->tf_rflags;
+ td = curthread;
+ td->td_frame = frame;
+
+ error = syscallenter(td, &sa);
+
+ /*
+ * Traced syscall.
+ */
+ if (orig_tf_rflags & PSL_T) {
+ frame->tf_rflags &= ~PSL_T;
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGTRAP;
+ ksi.ksi_code = TRAP_TRACE;
+ ksi.ksi_addr = (void *)frame->tf_rip;
+ trapsignal(td, &ksi);
+ }
+
+ syscallret(td, error, &sa);
+}
+
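+/*
+ * 32-bit system calls enter through int $0x80; the IDT vector is
+ * installed late in boot (SI_SUB_EXEC) and reset by the matching
+ * SYSUNINIT.
+ */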
+static void
+ia32_syscall_enable(void *dummy)
+{
+
+ setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
+}
+
+static void
+ia32_syscall_disable(void *dummy)
+{
+
+ setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
+
+SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
+SYSUNINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_disable, NULL);
+
+#ifdef COMPAT_43
+int
+setup_lcall_gate(void)
+{
+ struct i386_ldt_args uap;
+ struct user_segment_descriptor descs[2];
+ struct gate_descriptor *ssd;
+ uint32_t lcall_addr;
+ int error;
+
+ bzero(&uap, sizeof(uap));
+ uap.start = 0;
+ uap.num = 2;
+
+ /*
+	 * This is the easiest way to carve out space for a system
+	 * descriptor in the LDT; the descriptor type is manually
+	 * adjusted to a call gate below.
+ */
+ bzero(&descs[0], sizeof(descs));
+ descs[0].sd_type = SDT_SYSNULL;
+ descs[1].sd_type = SDT_SYSNULL;
+ error = amd64_set_ldt(curthread, &uap, descs);
+ if (error != 0)
+ return (error);
+
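+	/* The trampoline is the tail of the sigcode copied below ps_strings. */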
+ lcall_addr = curproc->p_sysent->sv_psstrings - sz_lcall_tramp;
+ mtx_lock(&dt_lock);
+ ssd = (struct gate_descriptor *)(curproc->p_md.md_ldt->ldt_base);
+ bzero(ssd, sizeof(*ssd));
+ ssd->gd_looffset = lcall_addr;
+ ssd->gd_hioffset = lcall_addr >> 16;
+ ssd->gd_selector = _ucodesel;
+ ssd->gd_type = SDT_SYSCGT;
+ ssd->gd_dpl = SEL_UPL;
+ ssd->gd_p = 1;
+ mtx_unlock(&dt_lock);
+
+ return (0);
+}
+#endif
diff --git a/sys/amd64/include/_align.h b/sys/amd64/include/_align.h
new file mode 100644
index 0000000..28c4669
--- /dev/null
+++ b/sys/amd64/include/_align.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/_align.h>
diff --git a/sys/amd64/include/_bus.h b/sys/amd64/include/_bus.h
new file mode 100644
index 0000000..a8cbf48
--- /dev/null
+++ b/sys/amd64/include/_bus.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2005 M. Warner Losh.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef AMD64_INCLUDE__BUS_H
+#define AMD64_INCLUDE__BUS_H
+
+/*
+ * Bus address and size types
+ */
+typedef uint64_t bus_addr_t;
+typedef uint64_t bus_size_t;
+
+/*
+ * Access methods for bus resources and address space.
+ */
+typedef uint64_t bus_space_tag_t;
+typedef uint64_t bus_space_handle_t;
+
+#endif /* AMD64_INCLUDE__BUS_H */
diff --git a/sys/amd64/include/_inttypes.h b/sys/amd64/include/_inttypes.h
new file mode 100644
index 0000000..40107cd
--- /dev/null
+++ b/sys/amd64/include/_inttypes.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/_inttypes.h>
diff --git a/sys/amd64/include/_limits.h b/sys/amd64/include/_limits.h
new file mode 100644
index 0000000..3c7365b
--- /dev/null
+++ b/sys/amd64/include/_limits.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/_limits.h>
diff --git a/sys/amd64/include/_stdint.h b/sys/amd64/include/_stdint.h
new file mode 100644
index 0000000..db1affc
--- /dev/null
+++ b/sys/amd64/include/_stdint.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/_stdint.h>
diff --git a/sys/amd64/include/_types.h b/sys/amd64/include/_types.h
new file mode 100644
index 0000000..2680367
--- /dev/null
+++ b/sys/amd64/include/_types.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/_types.h>
diff --git a/sys/amd64/include/acpica_machdep.h b/sys/amd64/include/acpica_machdep.h
new file mode 100644
index 0000000..9943af7
--- /dev/null
+++ b/sys/amd64/include/acpica_machdep.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2002 Mitsuru IWASAKI
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/******************************************************************************
+ *
+ * Name: acpica_machdep.h - arch-specific defines, etc.
+ * $Revision$
+ *
+ *****************************************************************************/
+
+#ifndef __ACPICA_MACHDEP_H__
+#define __ACPICA_MACHDEP_H__
+
+#ifdef _KERNEL
+/*
+ * Calling conventions:
+ *
+ * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
+ * ACPI_EXTERNAL_XFACE - External ACPI interfaces
+ * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
+ * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
+ */
+#define ACPI_SYSTEM_XFACE
+#define ACPI_EXTERNAL_XFACE
+#define ACPI_INTERNAL_XFACE
+#define ACPI_INTERNAL_VAR_XFACE
+
+/* Asm macros */
+
+#define ACPI_ASM_MACROS
+#define BREAKPOINT3
+#define ACPI_DISABLE_IRQS() disable_intr()
+#define ACPI_ENABLE_IRQS() enable_intr()
+
+#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+
+/* Section 5.2.9.1: global lock acquire/release functions */
+extern int acpi_acquire_global_lock(uint32_t *lock);
+extern int acpi_release_global_lock(uint32_t *lock);
+#define ACPI_ACQUIRE_GLOBAL_LOCK(GLptr, Acq) do { \
+ (Acq) = acpi_acquire_global_lock(&((GLptr)->GlobalLock)); \
+} while (0)
+#define ACPI_RELEASE_GLOBAL_LOCK(GLptr, Acq) do { \
+ (Acq) = acpi_release_global_lock(&((GLptr)->GlobalLock)); \
+} while (0)
+
+#endif /* _KERNEL */
+
+#define ACPI_MACHINE_WIDTH 64
+#define COMPILER_DEPENDENT_INT64 long
+#define COMPILER_DEPENDENT_UINT64 unsigned long
+
+void acpi_SetDefaultIntrModel(int model);
+void acpi_cpu_c1(void);
+void *acpi_map_table(vm_paddr_t pa, const char *sig);
+void acpi_unmap_table(void *table);
+vm_paddr_t acpi_find_table(const char *sig);
+
+#endif /* __ACPICA_MACHDEP_H__ */
diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h
new file mode 100644
index 0000000..ae2f5b9
--- /dev/null
+++ b/sys/amd64/include/apicvar.h
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_APICVAR_H_
+#define _MACHINE_APICVAR_H_
+
+#include <machine/segments.h>
+
+/*
+ * Local && I/O APIC variable definitions.
+ */
+
+/*
+ * Layout of local APIC interrupt vectors:
+ *
+ * 0xff (255) +-------------+
+ * | | 15 (Spurious / IPIs / Local Interrupts)
+ * 0xf0 (240) +-------------+
+ * | | 14 (I/O Interrupts / Timer)
+ * 0xe0 (224) +-------------+
+ * | | 13 (I/O Interrupts)
+ * 0xd0 (208) +-------------+
+ * | | 12 (I/O Interrupts)
+ * 0xc0 (192) +-------------+
+ * | | 11 (I/O Interrupts)
+ * 0xb0 (176) +-------------+
+ * | | 10 (I/O Interrupts)
+ * 0xa0 (160) +-------------+
+ * | | 9 (I/O Interrupts)
+ * 0x90 (144) +-------------+
+ * | | 8 (I/O Interrupts / System Calls)
+ * 0x80 (128) +-------------+
+ * | | 7 (I/O Interrupts)
+ * 0x70 (112) +-------------+
+ * | | 6 (I/O Interrupts)
+ * 0x60 (96) +-------------+
+ * | | 5 (I/O Interrupts)
+ * 0x50 (80) +-------------+
+ * | | 4 (I/O Interrupts)
+ * 0x40 (64) +-------------+
+ * | | 3 (I/O Interrupts)
+ * 0x30 (48) +-------------+
+ * | | 2 (ATPIC Interrupts)
+ * 0x20 (32) +-------------+
+ * | | 1 (Exceptions, traps, faults, etc.)
+ * 0x10 (16) +-------------+
+ * | | 0 (Exceptions, traps, faults, etc.)
+ * 0x00 (0) +-------------+
+ *
+ * Note: 0x80 needs to be handled specially and not allocated to an
+ * I/O device!
+ */
+
+#define MAX_APIC_ID 0xfe
+#define APIC_ID_ALL 0xff
+
+/* I/O Interrupts are used for external devices such as ISA, PCI, etc. */
+#define APIC_IO_INTS (IDT_IO_INTS + 16)
+#define APIC_NUM_IOINTS 191
+
+/* The timer interrupt is used for clock handling and drives hardclock, etc. */
+#define APIC_TIMER_INT (APIC_IO_INTS + APIC_NUM_IOINTS)
+
+/*
+ ********************* !!! WARNING !!! ******************************
+ * Each local APIC has an interrupt receive FIFO that is two entries deep
+ * for each interrupt priority class (upper 4 bits of the interrupt vector).
+ * Once the FIFO is full, the APIC can no longer receive interrupts for that
+ * class, and IPIs sent from other CPUs will be blocked.
+ * To avoid deadlocks there should be no more than two IPI interrupts
+ * pending at the same time.
+ * Currently this is guaranteed by dividing the IPIs into two groups, each
+ * of which has at most one IPI interrupt pending. The first group is
+ * protected by smp_ipi_mtx and waits for completion of the IPI (only one
+ * IPI user at a time). The second group uses a single interrupt and a
+ * bitmap to avoid redundant IPI interrupts.
+ */
+
+/* Interrupts for local APIC LVT entries other than the timer. */
+#define APIC_LOCAL_INTS 240
+#define APIC_ERROR_INT APIC_LOCAL_INTS
+#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
+
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
+#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
+#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
+#define IPI_INVLPG (APIC_IPI_INTS + 2)
+#define IPI_INVLRNG (APIC_IPI_INTS + 3)
+#define IPI_INVLCACHE (APIC_IPI_INTS + 4)
+/* Vector to handle bitmap based IPIs */
+#define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 6)
+
+/* IPIs handled by IPI_BITMAP_VECTOR (XXX ups: is there a better place?) */
+#define IPI_AST 0 /* Generate software trap. */
+#define IPI_PREEMPT 1
+#define IPI_HARDCLOCK 2
+#define IPI_BITMAP_LAST IPI_HARDCLOCK
+#define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
+
+#define IPI_STOP (APIC_IPI_INTS + 7) /* Stop CPU until restarted. */
+#define IPI_SUSPEND (APIC_IPI_INTS + 8) /* Suspend CPU until restarted. */
+#define IPI_STOP_HARD (APIC_IPI_INTS + 9) /* Stop CPU with a NMI. */
+
+/*
+ * The spurious interrupt can share the priority class with the IPIs since
+ * it is not a normal interrupt (it does not use the APIC's interrupt FIFO).
+ */
+#define APIC_SPURIOUS_INT 255
+
+#define LVT_LINT0 0
+#define LVT_LINT1 1
+#define LVT_TIMER 2
+#define LVT_ERROR 3
+#define LVT_PMC 4
+#define LVT_THERMAL 5
+#define LVT_CMCI 6
+#define LVT_MAX LVT_CMCI
+
+#ifndef LOCORE
+
+#define APIC_IPI_DEST_SELF -1
+#define APIC_IPI_DEST_ALL -2
+#define APIC_IPI_DEST_OTHERS -3
+
+#define APIC_BUS_UNKNOWN -1
+#define APIC_BUS_ISA 0
+#define APIC_BUS_EISA 1
+#define APIC_BUS_PCI 2
+#define APIC_BUS_MAX APIC_BUS_PCI
+
+/*
+ * An APIC enumerator is a pseudo-bus driver that enumerates APICs,
+ * including CPUs and I/O APICs.
+ */
+struct apic_enumerator {
+ const char *apic_name;
+ int (*apic_probe)(void);
+ int (*apic_probe_cpus)(void);
+ int (*apic_setup_local)(void);
+ int (*apic_setup_io)(void);
+ SLIST_ENTRY(apic_enumerator) apic_next;
+};
+
+inthand_t
+ IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
+ IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
+ IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+ IDTVEC(spuriousint), IDTVEC(timerint);
+
+extern vm_paddr_t lapic_paddr;
+extern int apic_cpuids[];
+
+u_int apic_alloc_vector(u_int apic_id, u_int irq);
+u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count,
+ u_int align);
+void apic_disable_vector(u_int apic_id, u_int vector);
+void apic_enable_vector(u_int apic_id, u_int vector);
+void apic_free_vector(u_int apic_id, u_int vector, u_int irq);
+u_int apic_idt_to_irq(u_int apic_id, u_int vector);
+void apic_register_enumerator(struct apic_enumerator *enumerator);
+u_int apic_cpuid(u_int apic_id);
+void *ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase);
+int ioapic_disable_pin(void *cookie, u_int pin);
+int ioapic_get_vector(void *cookie, u_int pin);
+void ioapic_register(void *cookie);
+int ioapic_remap_vector(void *cookie, u_int pin, int vector);
+int ioapic_set_bus(void *cookie, u_int pin, int bus_type);
+int ioapic_set_extint(void *cookie, u_int pin);
+int ioapic_set_nmi(void *cookie, u_int pin);
+int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol);
+int ioapic_set_triggermode(void *cookie, u_int pin,
+ enum intr_trigger trigger);
+int ioapic_set_smi(void *cookie, u_int pin);
+void lapic_create(u_int apic_id, int boot_cpu);
+void lapic_disable(void);
+void lapic_disable_pmc(void);
+void lapic_dump(const char *str);
+void lapic_enable_cmc(void);
+int lapic_enable_pmc(void);
+void lapic_eoi(void);
+int lapic_id(void);
+void lapic_init(vm_paddr_t addr);
+int lapic_intr_pending(u_int vector);
+void lapic_ipi_raw(register_t icrlo, u_int dest);
+void lapic_ipi_vectored(u_int vector, int dest);
+int lapic_ipi_wait(int delay);
+void lapic_handle_cmc(void);
+void lapic_handle_error(void);
+void lapic_handle_intr(int vector, struct trapframe *frame);
+void lapic_handle_timer(struct trapframe *frame);
+void lapic_reenable_pmc(void);
+void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id);
+int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked);
+int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode);
+int lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
+ enum intr_polarity pol);
+int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+ enum intr_trigger trigger);
+void lapic_set_tpr(u_int vector);
+void lapic_setup(int boot);
+
+#endif /* !LOCORE */
+#endif /* _MACHINE_APICVAR_H_ */
diff --git a/sys/amd64/include/apm_bios.h b/sys/amd64/include/apm_bios.h
new file mode 100644
index 0000000..9cc0eee
--- /dev/null
+++ b/sys/amd64/include/apm_bios.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/apm_bios.h>
diff --git a/sys/amd64/include/asm.h b/sys/amd64/include/asm.h
new file mode 100644
index 0000000..7efd642
--- /dev/null
+++ b/sys/amd64/include/asm.h
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)DEFS.h 5.1 (Berkeley) 4/23/90
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_ASM_H_
+#define _MACHINE_ASM_H_
+
+#include <sys/cdefs.h>
+
+#ifdef PIC
+#define PIC_PLT(x) x@PLT
+#define PIC_GOT(x) x@GOTPCREL(%rip)
+#else
+#define PIC_PLT(x) x
+#define PIC_GOT(x) x
+#endif
+
+/*
+ * CNAME and HIDENAME manage the relationship between symbol names in C
+ * and the equivalent assembly language names. CNAME is given a name as
+ * it would be used in a C program. It expands to the equivalent assembly
+ * language name. HIDENAME is given an assembly-language name, and expands
+ * to a possibly-modified form that will be invisible to C programs.
+ */
+#define CNAME(csym) csym
+#define HIDENAME(asmsym) .asmsym
+
+#define _START_ENTRY .text; .p2align 4,0x90
+
+#define _ENTRY(x) _START_ENTRY; \
+ .globl CNAME(x); .type CNAME(x),@function; CNAME(x):
+
+#ifdef PROF
+#define ALTENTRY(x) _ENTRY(x); \
+ pushq %rbp; movq %rsp,%rbp; \
+ call PIC_PLT(HIDENAME(mcount)); \
+ popq %rbp; \
+ jmp 9f
+#define ENTRY(x) _ENTRY(x); \
+ pushq %rbp; movq %rsp,%rbp; \
+ call PIC_PLT(HIDENAME(mcount)); \
+ popq %rbp; \
+ 9:
+#else
+#define ALTENTRY(x) _ENTRY(x)
+#define ENTRY(x) _ENTRY(x)
+#endif
+
+#define END(x) .size x, . - x
+
+#define RCSID(x) .text; .asciz x
+
+#undef __FBSDID
+#if !defined(lint) && !defined(STRIP_FBSDID)
+#define __FBSDID(s) .ident s
+#else
+#define __FBSDID(s) /* nothing */
+#endif /* not lint and not STRIP_FBSDID */
+
+#endif /* !_MACHINE_ASM_H_ */
diff --git a/sys/amd64/include/asmacros.h b/sys/amd64/include/asmacros.h
new file mode 100644
index 0000000..1fb592a
--- /dev/null
+++ b/sys/amd64/include/asmacros.h
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 1993 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_ASMACROS_H_
+#define _MACHINE_ASMACROS_H_
+
+#include <sys/cdefs.h>
+
+/* XXX too much duplication in various asm*.h's. */
+
+/*
+ * CNAME is used to manage the relationship between symbol names in C
+ * and the equivalent assembly language names. CNAME is given a name as
+ * it would be used in a C program. It expands to the equivalent assembly
+ * language name.
+ */
+#define CNAME(csym) csym
+
+#define ALIGN_DATA .p2align 3 /* 8 byte alignment, zero filled */
+#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */
+#define SUPERALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+#define GEN_ENTRY(name) ALIGN_TEXT; .globl CNAME(name); \
+ .type CNAME(name),@function; CNAME(name):
+#define NON_GPROF_ENTRY(name) GEN_ENTRY(name)
+#define NON_GPROF_RET .byte 0xc3 /* opcode for `ret' */
+
+#define END(name) .size name, . - name
+
+#ifdef GPROF
+/*
+ * __mcount is like [.]mcount except that it doesn't require its caller to set
+ * up a frame pointer. It must be called before pushing anything onto the
+ * stack. gcc should eventually generate code to call __mcount in most
+ * cases. This would make -pg in combination with -fomit-frame-pointer
+ * useful. gcc has a configuration variable PROFILE_BEFORE_PROLOGUE to
+ * allow profiling before setting up the frame pointer, but this is
+ * inadequate for good handling of special cases, e.g., -fpic works best
+ * with profiling after the prologue.
+ *
+ * [.]mexitcount is a new function to support non-statistical profiling if an
+ * accurate clock is available. For C sources, calls to it are generated
+ * by the FreeBSD extension `-mprofiler-epilogue' to gcc. It is best to
+ * call [.]mexitcount at the end of a function like the MEXITCOUNT macro does,
+ * but gcc currently generates calls to it at the start of the epilogue to
+ * avoid problems with -fpic.
+ *
+ * [.]mcount and __mcount may clobber the call-used registers and %eflags.
+ * [.]mexitcount may clobber %ecx and %eflags.
+ *
+ * Cross-jumping makes non-statistical profiling timing more complicated.
+ * It is handled in many cases by calling [.]mexitcount before jumping. It
+ * is handled for conditional jumps using CROSSJUMP() and CROSSJUMP_LABEL().
+ * It is handled for some fault-handling jumps by not sharing the exit
+ * routine.
+ *
+ * ALTENTRY() must be before a corresponding ENTRY() so that it can jump to
+ * the main entry point. Note that alt entries are counted twice. They
+ * have to be counted as ordinary entries for gprof to get the call times
+ * right for the ordinary entries.
+ *
+ * High local labels are used in macros to avoid clashes with local labels
+ * in functions.
+ *
+ * Ordinary `ret' is used instead of a macro `RET' because there are a lot
+ * of `ret's. 0xc3 is the opcode for `ret' (`#define ret ... ret' can't
+ * be used because this file is sometimes preprocessed in traditional mode).
+ * `ret' clobbers eflags but this doesn't matter.
+ */
+#define ALTENTRY(name) GEN_ENTRY(name) ; MCOUNT ; MEXITCOUNT ; jmp 9f
+#define CROSSJUMP(jtrue, label, jfalse) \
+ jfalse 8f; MEXITCOUNT; jmp __CONCAT(to,label); 8:
+#define CROSSJUMPTARGET(label) \
+ ALIGN_TEXT; __CONCAT(to,label): ; MCOUNT; jmp label
+#define ENTRY(name) GEN_ENTRY(name) ; 9: ; MCOUNT
+#define FAKE_MCOUNT(caller) pushq caller ; call __mcount ; popq %rcx
+#define MCOUNT call __mcount
+#define MCOUNT_LABEL(name) GEN_ENTRY(name) ; nop ; ALIGN_TEXT
+#ifdef GUPROF
+#define MEXITCOUNT call .mexitcount
+#define ret MEXITCOUNT ; NON_GPROF_RET
+#else
+#define MEXITCOUNT
+#endif
+
+#else /* !GPROF */
+/*
+ * ALTENTRY() has to align because it is before a corresponding ENTRY().
+ * ENTRY() has to align too because there may be no ALTENTRY() before it.
+ * If there is a previous ALTENTRY() then the alignment code for ENTRY()
+ * is empty.
+ */
+#define ALTENTRY(name) GEN_ENTRY(name)
+#define CROSSJUMP(jtrue, label, jfalse) jtrue label
+#define CROSSJUMPTARGET(label)
+#define ENTRY(name) GEN_ENTRY(name)
+#define FAKE_MCOUNT(caller)
+#define MCOUNT
+#define MCOUNT_LABEL(name)
+#define MEXITCOUNT
+#endif /* GPROF */
+
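+/*
+ * For illustration, the typical use of these macros in a .S source
+ * file; the function name and body here are hypothetical:
+ *
+ *	ENTRY(hypothetical_zero)	# zero %rsi bytes at (%rdi)
+ *		xorl	%eax,%eax
+ *		movq	%rsi,%rcx
+ *		rep stosb
+ *		ret
+ *	END(hypothetical_zero)
+ *
+ * Under GPROF, ENTRY() additionally emits the call to __mcount; in the
+ * !GPROF case it reduces to the alignment and symbol declaration alone.
+ */
+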
+#ifdef LOCORE
+/*
+ * Convenience macro for declaring interrupt entry points.
+ */
+#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
+ .type __CONCAT(X,name),@function; __CONCAT(X,name):
+
+/*
+ * Macros to create and destroy a trap frame.
+ */
+#define PUSH_FRAME \
+ subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \
+ testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
+	jz	1f ;		/* Yes, don't swapgs again */ \
+ swapgs ; \
+1: movq %rdi,TF_RDI(%rsp) ; \
+ movq %rsi,TF_RSI(%rsp) ; \
+ movq %rdx,TF_RDX(%rsp) ; \
+ movq %rcx,TF_RCX(%rsp) ; \
+ movq %r8,TF_R8(%rsp) ; \
+ movq %r9,TF_R9(%rsp) ; \
+ movq %rax,TF_RAX(%rsp) ; \
+ movq %rbx,TF_RBX(%rsp) ; \
+ movq %rbp,TF_RBP(%rsp) ; \
+ movq %r10,TF_R10(%rsp) ; \
+ movq %r11,TF_R11(%rsp) ; \
+ movq %r12,TF_R12(%rsp) ; \
+ movq %r13,TF_R13(%rsp) ; \
+ movq %r14,TF_R14(%rsp) ; \
+ movq %r15,TF_R15(%rsp) ; \
+ movw %fs,TF_FS(%rsp) ; \
+ movw %gs,TF_GS(%rsp) ; \
+ movw %es,TF_ES(%rsp) ; \
+ movw %ds,TF_DS(%rsp) ; \
+ movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \
+ cld
+
+#define POP_FRAME \
+ movq TF_RDI(%rsp),%rdi ; \
+ movq TF_RSI(%rsp),%rsi ; \
+ movq TF_RDX(%rsp),%rdx ; \
+ movq TF_RCX(%rsp),%rcx ; \
+ movq TF_R8(%rsp),%r8 ; \
+ movq TF_R9(%rsp),%r9 ; \
+ movq TF_RAX(%rsp),%rax ; \
+ movq TF_RBX(%rsp),%rbx ; \
+ movq TF_RBP(%rsp),%rbp ; \
+ movq TF_R10(%rsp),%r10 ; \
+ movq TF_R11(%rsp),%r11 ; \
+ movq TF_R12(%rsp),%r12 ; \
+ movq TF_R13(%rsp),%r13 ; \
+ movq TF_R14(%rsp),%r14 ; \
+ movq TF_R15(%rsp),%r15 ; \
+ testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
+ jz 1f ; /* keep kernel GS.base */ \
+ cli ; \
+ swapgs ; \
+1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
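+
+/*
+ * For illustration, the usual shape of an interrupt stub built from
+ * IDTVEC() and these frame macros; the vector and handler names here
+ * are hypothetical, and real handlers normally exit through the
+ * common doreti path rather than returning directly:
+ *
+ *	IDTVEC(hypothetical_intr)
+ *		PUSH_FRAME
+ *		FAKE_MCOUNT(TF_RIP(%rsp))
+ *		movq	%rsp,%rdi	# pass the trapframe to C code
+ *		call	hypothetical_handler
+ *		MEXITCOUNT
+ *		POP_FRAME
+ *		iretq
+ */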
+
+/*
+ * Access per-CPU data.
+ */
+#define PCPU(member) %gs:PC_ ## member
+#define PCPU_ADDR(member, reg) \
+ movq %gs:PC_PRVSPACE, reg ; \
+ addq $PC_ ## member, reg
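+
+/*
+ * For example, PCPU(CPUID) expands to %gs:PC_CPUID, so the current
+ * CPU number can be fetched with:
+ *
+ *	movl	PCPU(CPUID),%eax
+ *
+ * and PCPU_ADDR(CPUID, %rax) leaves the address of that field in %rax.
+ */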
+
+#endif /* LOCORE */
+
+#endif /* !_MACHINE_ASMACROS_H_ */
diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h
new file mode 100644
index 0000000..91c33e6
--- /dev/null
+++ b/sys/amd64/include/atomic.h
@@ -0,0 +1,483 @@
+/*-
+ * Copyright (c) 1998 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef _MACHINE_ATOMIC_H_
+#define _MACHINE_ATOMIC_H_
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+#define mb() __asm __volatile("mfence;" : : : "memory")
+#define wmb() __asm __volatile("sfence;" : : : "memory")
+#define rmb() __asm __volatile("lfence;" : : : "memory")
+
+/*
+ * Various simple operations on memory, each of which is atomic in the
+ * presence of interrupts and multiple processors.
+ *
+ * atomic_set_char(P, V) (*(u_char *)(P) |= (V))
+ * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V))
+ * atomic_add_char(P, V) (*(u_char *)(P) += (V))
+ * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V))
+ *
+ * atomic_set_short(P, V) (*(u_short *)(P) |= (V))
+ * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V))
+ * atomic_add_short(P, V) (*(u_short *)(P) += (V))
+ * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V))
+ *
+ * atomic_set_int(P, V) (*(u_int *)(P) |= (V))
+ * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V))
+ * atomic_add_int(P, V) (*(u_int *)(P) += (V))
+ * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V))
+ * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;)
+ *
+ * atomic_set_long(P, V) (*(u_long *)(P) |= (V))
+ * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V))
+ * atomic_add_long(P, V) (*(u_long *)(P) += (V))
+ * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V))
+ * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;)
+ */
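+
+/*
+ * For illustration, typical calls from C code; the flag word and bit
+ * values here are hypothetical:
+ *
+ *	volatile u_int flags = 0;
+ *
+ *	atomic_set_int(&flags, 0x01);		(set bit 0)
+ *	atomic_clear_int(&flags, 0x01);		(clear bit 0)
+ *	atomic_add_int(&flags, 1);
+ *	atomic_subtract_int(&flags, 1);
+ */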
+
+/*
+ * The above functions are expanded inline in the statically-linked
+ * kernel. Lock prefixes are generated if an SMP kernel is being
+ * built.
+ *
+ * Kernel modules call real functions which are built into the kernel.
+ * This allows kernel modules to be portable between UP and SMP systems.
+ */
+#if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM)
+#define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \
+void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \
+void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
+
+int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
+int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src);
+u_int atomic_fetchadd_int(volatile u_int *p, u_int v);
+u_long atomic_fetchadd_long(volatile u_long *p, u_long v);
+
+#define ATOMIC_LOAD(TYPE, LOP) \
+u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p)
+#define ATOMIC_STORE(TYPE) \
+void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
+
+#else /* !KLD_MODULE && __GNUCLIKE_ASM */
+
+/*
+ * For userland, always use lock prefixes so that the binaries will run
+ * on both SMP and !SMP systems.
+ */
+#if defined(SMP) || !defined(_KERNEL)
+#define MPLOCKED "lock ; "
+#else
+#define MPLOCKED
+#endif
+
+/*
+ * The assembly is marked volatile so the compiler cannot discard it as
+ * dead code.  GCC also reorders operations aggressively, so the barrier
+ * variants need a "memory" clobber to keep such reordering from crossing
+ * them.
+ */
+#define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \
+static __inline void \
+atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{ \
+ __asm __volatile(MPLOCKED OP \
+ : "=m" (*p) \
+ : CONS (V), "m" (*p) \
+ : "cc"); \
+} \
+ \
+static __inline void \
+atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{ \
+ __asm __volatile(MPLOCKED OP \
+ : "=m" (*p) \
+ : CONS (V), "m" (*p) \
+ : "memory", "cc"); \
+} \
+struct __hack
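+
+/*
+ * For example, ATOMIC_ASM(add, int, "addl %1,%0", "ir", v) expands to
+ * roughly the following on an SMP kernel (plus an atomic_add_barr_int()
+ * variant that also clobbers "memory" to act as a compiler barrier):
+ *
+ *	static __inline void
+ *	atomic_add_int(volatile u_int *p, u_int v)
+ *	{
+ *		__asm __volatile("lock ; addl %1,%0"
+ *		    : "=m" (*p)
+ *		    : "ir" (v), "m" (*p)
+ *		    : "cc");
+ *	}
+ */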
+
+/*
+ * Atomic compare and set, used by the mutex functions.
+ *
+ *	if (*dst == expect) *dst = src;
+ *
+ * The comparison and store are performed as one atomic operation.
+ * Returns 0 on failure, non-zero on success.
+ */
+
+static __inline int
+atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src)
+{
+ u_char res;
+
+ __asm __volatile(
+ " " MPLOCKED " "
+ " cmpxchgl %2,%1 ; "
+ " sete %0 ; "
+ "1: "
+ "# atomic_cmpset_int"
+ : "=a" (res), /* 0 */
+ "=m" (*dst) /* 1 */
+ : "r" (src), /* 2 */
+ "a" (expect), /* 3 */
+ "m" (*dst) /* 4 */
+ : "memory", "cc");
+
+ return (res);
+}
+
+static __inline int
+atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src)
+{
+ u_char res;
+
+ __asm __volatile(
+ " " MPLOCKED " "
+ " cmpxchgq %2,%1 ; "
+ " sete %0 ; "
+ "1: "
+ "# atomic_cmpset_long"
+ : "=a" (res), /* 0 */
+ "=m" (*dst) /* 1 */
+ : "r" (src), /* 2 */
+ "a" (expect), /* 3 */
+ "m" (*dst) /* 4 */
+ : "memory", "cc");
+
+ return (res);
+}
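+
+/*
+ * For illustration, a minimal spinlock sketch built on these
+ * primitives; the lock word and layout here are hypothetical:
+ *
+ *	static volatile u_int lock_word;
+ *
+ *	while (atomic_cmpset_acq_int(&lock_word, 0, 1) == 0)
+ *		;			(spin until the 0 -> 1 swap wins)
+ *	... critical section ...
+ *	atomic_store_rel_int(&lock_word, 0);
+ */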
+
+/*
+ * Atomically add the value of v to the integer pointed to by p and return
+ * the previous value of *p.
+ */
+static __inline u_int
+atomic_fetchadd_int(volatile u_int *p, u_int v)
+{
+
+ __asm __volatile(
+ " " MPLOCKED " "
+ " xaddl %0, %1 ; "
+ "# atomic_fetchadd_int"
+ : "+r" (v), /* 0 (result) */
+ "=m" (*p) /* 1 */
+ : "m" (*p) /* 2 */
+ : "cc");
+ return (v);
+}
+
+/*
+ * Atomically add the value of v to the long integer pointed to by p and return
+ * the previous value of *p.
+ */
+static __inline u_long
+atomic_fetchadd_long(volatile u_long *p, u_long v)
+{
+
+ __asm __volatile(
+ " " MPLOCKED " "
+ " xaddq %0, %1 ; "
+ "# atomic_fetchadd_long"
+ : "+r" (v), /* 0 (result) */
+ "=m" (*p) /* 1 */
+ : "m" (*p) /* 2 */
+ : "cc");
+ return (v);
+}
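+
+/*
+ * For illustration, a lock-free sequence-number allocator; the names
+ * here are hypothetical:
+ *
+ *	static volatile u_int next_seq;
+ *
+ *	u_int
+ *	seq_alloc(void)
+ *	{
+ *		return (atomic_fetchadd_int(&next_seq, 1));
+ *	}
+ *
+ * Racing callers each receive a distinct value, since xadd reads and
+ * advances the counter in one locked instruction.
+ */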
+
+/*
+ * We assume that a = b will do atomic loads and stores. Due to the
+ * IA32 memory model, a simple store guarantees release semantics.
+ *
+ * However, loads may pass stores, so for atomic_load_acq we have to
+ * ensure a Store/Load barrier to do the load in SMP kernels. We use
+ * "lock cmpxchg" as recommended by the AMD Software Optimization
+ * Guide, and not mfence. For UP kernels, however, the cache of the
+ * single processor is always consistent, so we only need to take care
+ * of the compiler.
+ */
+#define ATOMIC_STORE(TYPE) \
+static __inline void \
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{ \
+ __compiler_membar(); \
+ *p = v; \
+} \
+struct __hack
+
+#if defined(_KERNEL) && !defined(SMP)
+
+#define ATOMIC_LOAD(TYPE, LOP) \
+static __inline u_##TYPE \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
+{ \
+ u_##TYPE tmp; \
+ \
+ tmp = *p; \
+ __compiler_membar(); \
+ return (tmp); \
+} \
+struct __hack
+
+#else /* !(_KERNEL && !SMP) */
+
+#define ATOMIC_LOAD(TYPE, LOP) \
+static __inline u_##TYPE \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
+{ \
+ u_##TYPE res; \
+ \
+ __asm __volatile(MPLOCKED LOP \
+ : "=a" (res), /* 0 */ \
+ "=m" (*p) /* 1 */ \
+ : "m" (*p) /* 2 */ \
+ : "memory", "cc"); \
+ \
+ return (res); \
+} \
+struct __hack
+
+#endif /* _KERNEL && !SMP */
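+
+/*
+ * For illustration, the usual acquire/release pairing; "data" and
+ * "ready" are hypothetical:
+ *
+ *	producer:
+ *		data = compute();
+ *		atomic_store_rel_int(&ready, 1);
+ *
+ *	consumer:
+ *		while (atomic_load_acq_int(&ready) == 0)
+ *			;
+ *		consume(data);
+ *
+ * The release store keeps the write of data visible before ready is
+ * set; the acquire load keeps the read of data from moving above the
+ * flag check.
+ */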
+
+#endif /* KLD_MODULE || !__GNUCLIKE_ASM */
+
+ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v);
+ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v);
+ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v);
+ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v);
+
+ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v);
+ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v);
+ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v);
+ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v);
+
+ATOMIC_ASM(set, int, "orl %1,%0", "ir", v);
+ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v);
+ATOMIC_ASM(add, int, "addl %1,%0", "ir", v);
+ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v);
+
+ATOMIC_ASM(set, long, "orq %1,%0", "ir", v);
+ATOMIC_ASM(clear, long, "andq %1,%0", "ir", ~v);
+ATOMIC_ASM(add, long, "addq %1,%0", "ir", v);
+ATOMIC_ASM(subtract, long, "subq %1,%0", "ir", v);
+
+ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
+ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
+ATOMIC_LOAD(int, "cmpxchgl %0,%1");
+ATOMIC_LOAD(long, "cmpxchgq %0,%1");
+
+ATOMIC_STORE(char);
+ATOMIC_STORE(short);
+ATOMIC_STORE(int);
+ATOMIC_STORE(long);
+
+#undef ATOMIC_ASM
+#undef ATOMIC_LOAD
+#undef ATOMIC_STORE
+
+#ifndef WANT_FUNCTIONS
+
+/* Read the current value and store a zero in the destination. */
+#ifdef __GNUCLIKE_ASM
+
+static __inline u_int
+atomic_readandclear_int(volatile u_int *addr)
+{
+ u_int res;
+
+ res = 0;
+ __asm __volatile(
+ " xchgl %1,%0 ; "
+ "# atomic_readandclear_int"
+ : "+r" (res), /* 0 */
+ "=m" (*addr) /* 1 */
+ : "m" (*addr));
+
+ return (res);
+}
+
+static __inline u_long
+atomic_readandclear_long(volatile u_long *addr)
+{
+ u_long res;
+
+ res = 0;
+ __asm __volatile(
+ " xchgq %1,%0 ; "
+ "# atomic_readandclear_long"
+ : "+r" (res), /* 0 */
+ "=m" (*addr) /* 1 */
+ : "m" (*addr));
+
+ return (res);
+}
+
+#else /* !__GNUCLIKE_ASM */
+
+u_int atomic_readandclear_int(volatile u_int *addr);
+u_long atomic_readandclear_long(volatile u_long *addr);
+
+#endif /* __GNUCLIKE_ASM */
+
+#define atomic_set_acq_char atomic_set_barr_char
+#define atomic_set_rel_char atomic_set_barr_char
+#define atomic_clear_acq_char atomic_clear_barr_char
+#define atomic_clear_rel_char atomic_clear_barr_char
+#define atomic_add_acq_char atomic_add_barr_char
+#define atomic_add_rel_char atomic_add_barr_char
+#define atomic_subtract_acq_char atomic_subtract_barr_char
+#define atomic_subtract_rel_char atomic_subtract_barr_char
+
+#define atomic_set_acq_short atomic_set_barr_short
+#define atomic_set_rel_short atomic_set_barr_short
+#define atomic_clear_acq_short atomic_clear_barr_short
+#define atomic_clear_rel_short atomic_clear_barr_short
+#define atomic_add_acq_short atomic_add_barr_short
+#define atomic_add_rel_short atomic_add_barr_short
+#define atomic_subtract_acq_short atomic_subtract_barr_short
+#define atomic_subtract_rel_short atomic_subtract_barr_short
+
+#define atomic_set_acq_int atomic_set_barr_int
+#define atomic_set_rel_int atomic_set_barr_int
+#define atomic_clear_acq_int atomic_clear_barr_int
+#define atomic_clear_rel_int atomic_clear_barr_int
+#define atomic_add_acq_int atomic_add_barr_int
+#define atomic_add_rel_int atomic_add_barr_int
+#define atomic_subtract_acq_int atomic_subtract_barr_int
+#define atomic_subtract_rel_int atomic_subtract_barr_int
+#define atomic_cmpset_acq_int atomic_cmpset_int
+#define atomic_cmpset_rel_int atomic_cmpset_int
+
+#define atomic_set_acq_long atomic_set_barr_long
+#define atomic_set_rel_long atomic_set_barr_long
+#define atomic_clear_acq_long atomic_clear_barr_long
+#define atomic_clear_rel_long atomic_clear_barr_long
+#define atomic_add_acq_long atomic_add_barr_long
+#define atomic_add_rel_long atomic_add_barr_long
+#define atomic_subtract_acq_long atomic_subtract_barr_long
+#define atomic_subtract_rel_long atomic_subtract_barr_long
+#define atomic_cmpset_acq_long atomic_cmpset_long
+#define atomic_cmpset_rel_long atomic_cmpset_long
+
+/* Operations on 8-bit bytes. */
+#define atomic_set_8 atomic_set_char
+#define atomic_set_acq_8 atomic_set_acq_char
+#define atomic_set_rel_8 atomic_set_rel_char
+#define atomic_clear_8 atomic_clear_char
+#define atomic_clear_acq_8 atomic_clear_acq_char
+#define atomic_clear_rel_8 atomic_clear_rel_char
+#define atomic_add_8 atomic_add_char
+#define atomic_add_acq_8 atomic_add_acq_char
+#define atomic_add_rel_8 atomic_add_rel_char
+#define atomic_subtract_8 atomic_subtract_char
+#define atomic_subtract_acq_8 atomic_subtract_acq_char
+#define atomic_subtract_rel_8 atomic_subtract_rel_char
+#define atomic_load_acq_8 atomic_load_acq_char
+#define atomic_store_rel_8 atomic_store_rel_char
+
+/* Operations on 16-bit words. */
+#define atomic_set_16 atomic_set_short
+#define atomic_set_acq_16 atomic_set_acq_short
+#define atomic_set_rel_16 atomic_set_rel_short
+#define atomic_clear_16 atomic_clear_short
+#define atomic_clear_acq_16 atomic_clear_acq_short
+#define atomic_clear_rel_16 atomic_clear_rel_short
+#define atomic_add_16 atomic_add_short
+#define atomic_add_acq_16 atomic_add_acq_short
+#define atomic_add_rel_16 atomic_add_rel_short
+#define atomic_subtract_16 atomic_subtract_short
+#define atomic_subtract_acq_16 atomic_subtract_acq_short
+#define atomic_subtract_rel_16 atomic_subtract_rel_short
+#define atomic_load_acq_16 atomic_load_acq_short
+#define atomic_store_rel_16 atomic_store_rel_short
+
+/* Operations on 32-bit double words. */
+#define atomic_set_32 atomic_set_int
+#define atomic_set_acq_32 atomic_set_acq_int
+#define atomic_set_rel_32 atomic_set_rel_int
+#define atomic_clear_32 atomic_clear_int
+#define atomic_clear_acq_32 atomic_clear_acq_int
+#define atomic_clear_rel_32 atomic_clear_rel_int
+#define atomic_add_32 atomic_add_int
+#define atomic_add_acq_32 atomic_add_acq_int
+#define atomic_add_rel_32 atomic_add_rel_int
+#define atomic_subtract_32 atomic_subtract_int
+#define atomic_subtract_acq_32 atomic_subtract_acq_int
+#define atomic_subtract_rel_32 atomic_subtract_rel_int
+#define atomic_load_acq_32 atomic_load_acq_int
+#define atomic_store_rel_32 atomic_store_rel_int
+#define atomic_cmpset_32 atomic_cmpset_int
+#define atomic_cmpset_acq_32 atomic_cmpset_acq_int
+#define atomic_cmpset_rel_32 atomic_cmpset_rel_int
+#define atomic_readandclear_32 atomic_readandclear_int
+#define atomic_fetchadd_32 atomic_fetchadd_int
+
+/* Operations on 64-bit quad words. */
+#define atomic_set_64 atomic_set_long
+#define atomic_set_acq_64 atomic_set_acq_long
+#define atomic_set_rel_64 atomic_set_rel_long
+#define atomic_clear_64 atomic_clear_long
+#define atomic_clear_acq_64 atomic_clear_acq_long
+#define atomic_clear_rel_64 atomic_clear_rel_long
+#define atomic_add_64 atomic_add_long
+#define atomic_add_acq_64 atomic_add_acq_long
+#define atomic_add_rel_64 atomic_add_rel_long
+#define atomic_subtract_64 atomic_subtract_long
+#define atomic_subtract_acq_64 atomic_subtract_acq_long
+#define atomic_subtract_rel_64 atomic_subtract_rel_long
+#define atomic_load_acq_64 atomic_load_acq_long
+#define atomic_store_rel_64 atomic_store_rel_long
+#define atomic_cmpset_64 atomic_cmpset_long
+#define atomic_cmpset_acq_64 atomic_cmpset_acq_long
+#define atomic_cmpset_rel_64 atomic_cmpset_rel_long
+#define atomic_readandclear_64 atomic_readandclear_long
+
+/* Operations on pointers. */
+#define atomic_set_ptr atomic_set_long
+#define atomic_set_acq_ptr atomic_set_acq_long
+#define atomic_set_rel_ptr atomic_set_rel_long
+#define atomic_clear_ptr atomic_clear_long
+#define atomic_clear_acq_ptr atomic_clear_acq_long
+#define atomic_clear_rel_ptr atomic_clear_rel_long
+#define atomic_add_ptr atomic_add_long
+#define atomic_add_acq_ptr atomic_add_acq_long
+#define atomic_add_rel_ptr atomic_add_rel_long
+#define atomic_subtract_ptr atomic_subtract_long
+#define atomic_subtract_acq_ptr atomic_subtract_acq_long
+#define atomic_subtract_rel_ptr atomic_subtract_rel_long
+#define atomic_load_acq_ptr atomic_load_acq_long
+#define atomic_store_rel_ptr atomic_store_rel_long
+#define atomic_cmpset_ptr atomic_cmpset_long
+#define atomic_cmpset_acq_ptr atomic_cmpset_acq_long
+#define atomic_cmpset_rel_ptr atomic_cmpset_rel_long
+#define atomic_readandclear_ptr atomic_readandclear_long
+
+#endif /* !WANT_FUNCTIONS */
+
+#endif /* !_MACHINE_ATOMIC_H_ */
diff --git a/sys/amd64/include/bus.h b/sys/amd64/include/bus.h
new file mode 100644
index 0000000..f1af2cf
--- /dev/null
+++ b/sys/amd64/include/bus.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/bus.h>
diff --git a/sys/amd64/include/bus_dma.h b/sys/amd64/include/bus_dma.h
new file mode 100644
index 0000000..bc8bdf4
--- /dev/null
+++ b/sys/amd64/include/bus_dma.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2005 Scott Long
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AMD64_BUS_DMA_H_
+#define _AMD64_BUS_DMA_H_
+
+#include <sys/bus_dma.h>
+
+#endif /* _AMD64_BUS_DMA_H_ */
diff --git a/sys/amd64/include/clock.h b/sys/amd64/include/clock.h
new file mode 100644
index 0000000..d7f7d82
--- /dev/null
+++ b/sys/amd64/include/clock.h
@@ -0,0 +1,43 @@
+/*-
+ * Kernel interface to machine-dependent clock driver.
+ * Garrett Wollman, September 1994.
+ * This file is in the public domain.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_CLOCK_H_
+#define _MACHINE_CLOCK_H_
+
+#ifdef _KERNEL
+/*
+ * amd64 to clock driver interface.
+ * XXX large parts of the driver and its interface are misplaced.
+ */
+extern int clkintr_pending;
+extern u_int i8254_freq;
+extern int i8254_max_count;
+extern uint64_t tsc_freq;
+extern int tsc_is_invariant;
+extern int tsc_perf_stat;
+#ifdef SMP
+extern int smp_tsc;
+#endif
+
+void i8254_init(void);
+
+/*
+ * Driver to clock driver interface.
+ */
+
+void startrtclock(void);
+void init_TSC(void);
+
+#define HAS_TIMER_SPKR 1
+int timer_spkr_acquire(void);
+int timer_spkr_release(void);
+void timer_spkr_setfreq(int freq);
+
+#endif /* _KERNEL */
+
+#endif /* !_MACHINE_CLOCK_H_ */
diff --git a/sys/amd64/include/counter.h b/sys/amd64/include/counter.h
new file mode 100644
index 0000000..3ed5f76
--- /dev/null
+++ b/sys/amd64/include/counter.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __MACHINE_COUNTER_H__
+#define __MACHINE_COUNTER_H__
+
+#include <sys/pcpu.h>
+
+extern struct pcpu __pcpu[1];
+
+#define counter_enter() do {} while (0)
+#define counter_exit() do {} while (0)
+
+#define counter_u64_add_protected(c, i) counter_u64_add(c, i)
+
+static inline void
+counter_u64_add(counter_u64_t c, int64_t inc)
+{
+
+ __asm __volatile("addq\t%1,%%gs:(%0)"
+ :
+ : "r" ((char *)c - (char *)&__pcpu[0]), "r" (inc)
+ : "memory", "cc");
+}
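+
+/*
+ * The pointer arithmetic above converts the counter's address within
+ * CPU 0's pcpu region into an offset, which the %gs-relative addressing
+ * then applies to the current CPU's region.  The add is a single
+ * instruction and cannot be interrupted mid-update, so no lock prefix
+ * is needed.  For illustration (counter name hypothetical):
+ *
+ *	counter_u64_add(pkts_received, 1);
+ */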
+
+#endif /* ! __MACHINE_COUNTER_H__ */
diff --git a/sys/amd64/include/cpu.h b/sys/amd64/include/cpu.h
new file mode 100644
index 0000000..1c2871f
--- /dev/null
+++ b/sys/amd64/include/cpu.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)cpu.h 5.4 (Berkeley) 5/9/91
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_CPU_H_
+#define _MACHINE_CPU_H_
+
+/*
+ * Definitions unique to amd64 cpu support.
+ */
+#include <machine/psl.h>
+#include <machine/frame.h>
+#include <machine/segments.h>
+
+#define cpu_exec(p) /* nothing */
+#define cpu_swapin(p) /* nothing */
+#define cpu_getstack(td) ((td)->td_frame->tf_rsp)
+#define cpu_setstack(td, ap) ((td)->td_frame->tf_rsp = (ap))
+#define cpu_spinwait() ia32_pause()
+
+#define TRAPF_USERMODE(framep) \
+ (ISPL((framep)->tf_cs) == SEL_UPL)
+#define TRAPF_PC(framep) ((framep)->tf_rip)
+
+#ifdef _KERNEL
+extern char btext[];
+extern char etext[];
+
+void cpu_halt(void);
+void cpu_reset(void);
+void fork_trampoline(void);
+void swi_vm(void *);
+
+/*
+ * Return contents of in-cpu fast counter as a sort of "bogo-time"
+ * for random-harvesting purposes.
+ */
+static __inline u_int64_t
+get_cyclecount(void)
+{
+
+ return (rdtsc());
+}
+
+#endif
+
+#endif /* !_MACHINE_CPU_H_ */
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
new file mode 100644
index 0000000..881fcd2
--- /dev/null
+++ b/sys/amd64/include/cpufunc.h
@@ -0,0 +1,791 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1993 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Functions to provide access to special amd64 instructions.
+ * This is included in sys/systm.h, and that file should be
+ * used in preference to this one.
+ */
+
+#ifndef _MACHINE_CPUFUNC_H_
+#define _MACHINE_CPUFUNC_H_
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+struct region_descriptor;
+
+#define readb(va) (*(volatile uint8_t *) (va))
+#define readw(va) (*(volatile uint16_t *) (va))
+#define readl(va) (*(volatile uint32_t *) (va))
+#define readq(va) (*(volatile uint64_t *) (va))
+
+#define writeb(va, d) (*(volatile uint8_t *) (va) = (d))
+#define writew(va, d) (*(volatile uint16_t *) (va) = (d))
+#define writel(va, d) (*(volatile uint32_t *) (va) = (d))
+#define writeq(va, d) (*(volatile uint64_t *) (va) = (d))
+
+#if defined(__GNUCLIKE_ASM) && defined(__CC_SUPPORTS___INLINE)
+
+static __inline void
+breakpoint(void)
+{
+ __asm __volatile("int $3");
+}
+
+static __inline u_int
+bsfl(u_int mask)
+{
+ u_int result;
+
+ __asm __volatile("bsfl %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline u_long
+bsfq(u_long mask)
+{
+ u_long result;
+
+ __asm __volatile("bsfq %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline u_int
+bsrl(u_int mask)
+{
+ u_int result;
+
+ __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline u_long
+bsrq(u_long mask)
+{
+ u_long result;
+
+ __asm __volatile("bsrq %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline void
+clflush(u_long addr)
+{
+
+ __asm __volatile("clflush %0" : : "m" (*(char *)addr));
+}
+
+static __inline void
+clts(void)
+{
+
+ __asm __volatile("clts");
+}
+
+static __inline void
+disable_intr(void)
+{
+ __asm __volatile("cli" : : : "memory");
+}
+
+static __inline void
+do_cpuid(u_int ax, u_int *p)
+{
+ __asm __volatile("cpuid"
+ : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+ : "0" (ax));
+}
+
+static __inline void
+cpuid_count(u_int ax, u_int cx, u_int *p)
+{
+ __asm __volatile("cpuid"
+ : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+ : "0" (ax), "c" (cx));
+}
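+
+/*
+ * For illustration, fetching the 12-byte CPU vendor string with
+ * do_cpuid(); the buffer handling here is a sketch:
+ *
+ *	u_int regs[4];
+ *	char vendor[13];
+ *
+ *	do_cpuid(0, regs);
+ *	memcpy(vendor + 0, &regs[1], 4);	(%ebx)
+ *	memcpy(vendor + 4, &regs[3], 4);	(%edx)
+ *	memcpy(vendor + 8, &regs[2], 4);	(%ecx)
+ *	vendor[12] = '\0';
+ */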
+
+static __inline void
+enable_intr(void)
+{
+ __asm __volatile("sti");
+}
+
+#ifdef _KERNEL
+
+#define HAVE_INLINE_FFS
+#define ffs(x) __builtin_ffs(x)
+
+#define HAVE_INLINE_FFSL
+
+static __inline int
+ffsl(long mask)
+{
+ return (mask == 0 ? mask : (int)bsfq((u_long)mask) + 1);
+}
+
+#define HAVE_INLINE_FLS
+
+static __inline int
+fls(int mask)
+{
+ return (mask == 0 ? mask : (int)bsrl((u_int)mask) + 1);
+}
+
+#define HAVE_INLINE_FLSL
+
+static __inline int
+flsl(long mask)
+{
+ return (mask == 0 ? mask : (int)bsrq((u_long)mask) + 1);
+}
+
+#endif /* _KERNEL */
+
+static __inline void
+halt(void)
+{
+ __asm __volatile("hlt");
+}
+
+static __inline u_char
+inb(u_int port)
+{
+ u_char data;
+
+ __asm __volatile("inb %w1, %0" : "=a" (data) : "Nd" (port));
+ return (data);
+}
+
+static __inline u_int
+inl(u_int port)
+{
+ u_int data;
+
+ __asm __volatile("inl %w1, %0" : "=a" (data) : "Nd" (port));
+ return (data);
+}
+
+static __inline void
+insb(u_int port, void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; insb"
+ : "+D" (addr), "+c" (count)
+ : "d" (port)
+ : "memory");
+}
+
+static __inline void
+insw(u_int port, void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; insw"
+ : "+D" (addr), "+c" (count)
+ : "d" (port)
+ : "memory");
+}
+
+static __inline void
+insl(u_int port, void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; insl"
+ : "+D" (addr), "+c" (count)
+ : "d" (port)
+ : "memory");
+}
+
+static __inline void
+invd(void)
+{
+ __asm __volatile("invd");
+}
+
+static __inline u_short
+inw(u_int port)
+{
+ u_short data;
+
+ __asm __volatile("inw %w1, %0" : "=a" (data) : "Nd" (port));
+ return (data);
+}
+
+static __inline void
+outb(u_int port, u_char data)
+{
+ __asm __volatile("outb %0, %w1" : : "a" (data), "Nd" (port));
+}
+
+static __inline void
+outl(u_int port, u_int data)
+{
+ __asm __volatile("outl %0, %w1" : : "a" (data), "Nd" (port));
+}
+
+static __inline void
+outsb(u_int port, const void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; outsb"
+ : "+S" (addr), "+c" (count)
+ : "d" (port));
+}
+
+static __inline void
+outsw(u_int port, const void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; outsw"
+ : "+S" (addr), "+c" (count)
+ : "d" (port));
+}
+
+static __inline void
+outsl(u_int port, const void *addr, size_t count)
+{
+ __asm __volatile("cld; rep; outsl"
+ : "+S" (addr), "+c" (count)
+ : "d" (port));
+}
+
+static __inline void
+outw(u_int port, u_short data)
+{
+ __asm __volatile("outw %0, %w1" : : "a" (data), "Nd" (port));
+}
+
+static __inline u_long
+popcntq(u_long mask)
+{
+ u_long result;
+
+ __asm __volatile("popcntq %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline void
+lfence(void)
+{
+
+ __asm __volatile("lfence" : : : "memory");
+}
+
+static __inline void
+mfence(void)
+{
+
+ __asm __volatile("mfence" : : : "memory");
+}
+
+static __inline void
+ia32_pause(void)
+{
+ __asm __volatile("pause");
+}
+
+static __inline u_long
+read_rflags(void)
+{
+ u_long rf;
+
+ __asm __volatile("pushfq; popq %0" : "=r" (rf));
+ return (rf);
+}
+
+static __inline uint64_t
+rdmsr(u_int msr)
+{
+ uint32_t low, high;
+
+ __asm __volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline uint64_t
+rdpmc(u_int pmc)
+{
+ uint32_t low, high;
+
+ __asm __volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (pmc));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline uint64_t
+rdtsc(void)
+{
+ uint32_t low, high;
+
+ __asm __volatile("rdtsc" : "=a" (low), "=d" (high));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline uint32_t
+rdtsc32(void)
+{
+ uint32_t rv;
+
+ __asm __volatile("rdtsc" : "=a" (rv) : : "edx");
+ return (rv);
+}
+
+static __inline void
+wbinvd(void)
+{
+ __asm __volatile("wbinvd");
+}
+
+static __inline void
+write_rflags(u_long rf)
+{
+ __asm __volatile("pushq %0; popfq" : : "r" (rf));
+}
+
+static __inline void
+wrmsr(u_int msr, uint64_t newval)
+{
+ uint32_t low, high;
+
+ low = newval;
+ high = newval >> 32;
+ __asm __volatile("wrmsr" : : "a" (low), "d" (high), "c" (msr));
+}
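+
+/*
+ * For illustration, a read-modify-write of a machine-specific
+ * register; this must run at CPL 0, and msr and bit here are
+ * hypothetical placeholders:
+ *
+ *	uint64_t v;
+ *
+ *	v = rdmsr(msr);
+ *	wrmsr(msr, v | bit);
+ */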
+
+static __inline void
+load_cr0(u_long data)
+{
+
+ __asm __volatile("movq %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_long
+rcr0(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_long
+rcr2(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr2,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr3(u_long data)
+{
+
+ __asm __volatile("movq %0,%%cr3" : : "r" (data) : "memory");
+}
+
+static __inline u_long
+rcr3(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr4(u_long data)
+{
+ __asm __volatile("movq %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_long
+rcr4(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr4,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_long
+rxcr(u_int reg)
+{
+ u_int low, high;
+
+ __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline void
+load_xcr(u_int reg, u_long val)
+{
+ u_int low, high;
+
+ low = val;
+ high = val >> 32;
+ __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high));
+}
+
+/*
+ * Global TLB flush (except for entries for pages marked PG_G)
+ */
+static __inline void
+invltlb(void)
+{
+
+ load_cr3(rcr3());
+}
+
+/*
+ * TLB flush for an individual page (even if it has PG_G).
+ * Only works on 486+ CPUs (i386 does not have PG_G).
+ */
+static __inline void
+invlpg(u_long addr)
+{
+
+ __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
+}
+
+static __inline u_short
+rfs(void)
+{
+ u_short sel;
+ __asm __volatile("movw %%fs,%0" : "=rm" (sel));
+ return (sel);
+}
+
+static __inline u_short
+rgs(void)
+{
+ u_short sel;
+ __asm __volatile("movw %%gs,%0" : "=rm" (sel));
+ return (sel);
+}
+
+static __inline u_short
+rss(void)
+{
+ u_short sel;
+ __asm __volatile("movw %%ss,%0" : "=rm" (sel));
+ return (sel);
+}
+
+static __inline void
+load_ds(u_short sel)
+{
+ __asm __volatile("movw %0,%%ds" : : "rm" (sel));
+}
+
+static __inline void
+load_es(u_short sel)
+{
+ __asm __volatile("movw %0,%%es" : : "rm" (sel));
+}
+
+static __inline void
+cpu_monitor(const void *addr, u_long extensions, u_int hints)
+{
+
+ __asm __volatile("monitor"
+ : : "a" (addr), "c" (extensions), "d" (hints));
+}
+
+static __inline void
+cpu_mwait(u_long extensions, u_int hints)
+{
+
+ __asm __volatile("mwait" : : "a" (hints), "c" (extensions));
+}
+
+#ifdef _KERNEL
+/* This is defined in <machine/specialreg.h> but is too painful to get to */
+#ifndef MSR_FSBASE
+#define MSR_FSBASE 0xc0000100
+#endif
+static __inline void
+load_fs(u_short sel)
+{
+ /* Preserve the fsbase value across the selector load */
+ __asm __volatile("rdmsr; movw %0,%%fs; wrmsr"
+ : : "rm" (sel), "c" (MSR_FSBASE) : "eax", "edx");
+}
+
+#ifndef MSR_GSBASE
+#define MSR_GSBASE 0xc0000101
+#endif
+static __inline void
+load_gs(u_short sel)
+{
+ /*
+ * Preserve the gsbase value across the selector load.
+ * Note that we have to disable interrupts because the gsbase
+ * being trashed happens to be the kernel gsbase at the time.
+ */
+ __asm __volatile("pushfq; cli; rdmsr; movw %0,%%gs; wrmsr; popfq"
+ : : "rm" (sel), "c" (MSR_GSBASE) : "eax", "edx");
+}
+#else
+/* Usable by userland */
+static __inline void
+load_fs(u_short sel)
+{
+ __asm __volatile("movw %0,%%fs" : : "rm" (sel));
+}
+
+static __inline void
+load_gs(u_short sel)
+{
+ __asm __volatile("movw %0,%%gs" : : "rm" (sel));
+}
+#endif
+
+static __inline void
+lidt(struct region_descriptor *addr)
+{
+ __asm __volatile("lidt (%0)" : : "r" (addr));
+}
+
+static __inline void
+lldt(u_short sel)
+{
+ __asm __volatile("lldt %0" : : "r" (sel));
+}
+
+static __inline void
+ltr(u_short sel)
+{
+ __asm __volatile("ltr %0" : : "r" (sel));
+}
+
+static __inline uint64_t
+rdr0(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr0(uint64_t dr0)
+{
+ __asm __volatile("movq %0,%%dr0" : : "r" (dr0));
+}
+
+static __inline uint64_t
+rdr1(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr1,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr1(uint64_t dr1)
+{
+ __asm __volatile("movq %0,%%dr1" : : "r" (dr1));
+}
+
+static __inline uint64_t
+rdr2(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr2,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr2(uint64_t dr2)
+{
+ __asm __volatile("movq %0,%%dr2" : : "r" (dr2));
+}
+
+static __inline uint64_t
+rdr3(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr3(uint64_t dr3)
+{
+ __asm __volatile("movq %0,%%dr3" : : "r" (dr3));
+}
+
+static __inline uint64_t
+rdr4(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr4,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr4(uint64_t dr4)
+{
+ __asm __volatile("movq %0,%%dr4" : : "r" (dr4));
+}
+
+static __inline uint64_t
+rdr5(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr5,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr5(uint64_t dr5)
+{
+ __asm __volatile("movq %0,%%dr5" : : "r" (dr5));
+}
+
+static __inline uint64_t
+rdr6(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr6,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr6(uint64_t dr6)
+{
+ __asm __volatile("movq %0,%%dr6" : : "r" (dr6));
+}
+
+static __inline uint64_t
+rdr7(void)
+{
+ uint64_t data;
+ __asm __volatile("movq %%dr7,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_dr7(uint64_t dr7)
+{
+ __asm __volatile("movq %0,%%dr7" : : "r" (dr7));
+}
+
+static __inline register_t
+intr_disable(void)
+{
+ register_t rflags;
+
+ rflags = read_rflags();
+ disable_intr();
+ return (rflags);
+}
+
+static __inline void
+intr_restore(register_t rflags)
+{
+ write_rflags(rflags);
+}
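+
+/*
+ * For illustration, the usual pattern for a short section that must
+ * run with interrupts disabled:
+ *
+ *	register_t saved;
+ *
+ *	saved = intr_disable();
+ *	... code that must not be interrupted ...
+ *	intr_restore(saved);
+ *
+ * Since intr_restore() reloads the saved %rflags, interrupts are
+ * re-enabled only if they were enabled on entry.
+ */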
+
+#else /* !(__GNUCLIKE_ASM && __CC_SUPPORTS___INLINE) */
+
+void breakpoint(void);
+u_int bsfl(u_int mask);
+u_int bsrl(u_int mask);
+void clflush(u_long addr);
+void clts(void);
+void cpuid_count(u_int ax, u_int cx, u_int *p);
+void disable_intr(void);
+void do_cpuid(u_int ax, u_int *p);
+void enable_intr(void);
+void halt(void);
+void ia32_pause(void);
+u_char inb(u_int port);
+u_int inl(u_int port);
+void insb(u_int port, void *addr, size_t count);
+void insl(u_int port, void *addr, size_t count);
+void insw(u_int port, void *addr, size_t count);
+register_t intr_disable(void);
+void intr_restore(register_t rf);
+void invd(void);
+void invlpg(u_long addr);
+void invltlb(void);
+u_short inw(u_int port);
+void lidt(struct region_descriptor *addr);
+void lldt(u_short sel);
+void load_cr0(u_long cr0);
+void load_cr3(u_long cr3);
+void load_cr4(u_long cr4);
+void load_dr0(uint64_t dr0);
+void load_dr1(uint64_t dr1);
+void load_dr2(uint64_t dr2);
+void load_dr3(uint64_t dr3);
+void load_dr4(uint64_t dr4);
+void load_dr5(uint64_t dr5);
+void load_dr6(uint64_t dr6);
+void load_dr7(uint64_t dr7);
+void load_fs(u_short sel);
+void load_gs(u_short sel);
+void ltr(u_short sel);
+void outb(u_int port, u_char data);
+void outl(u_int port, u_int data);
+void outsb(u_int port, const void *addr, size_t count);
+void outsl(u_int port, const void *addr, size_t count);
+void outsw(u_int port, const void *addr, size_t count);
+void outw(u_int port, u_short data);
+u_long rcr0(void);
+u_long rcr2(void);
+u_long rcr3(void);
+u_long rcr4(void);
+uint64_t rdmsr(u_int msr);
+uint64_t rdpmc(u_int pmc);
+uint64_t rdr0(void);
+uint64_t rdr1(void);
+uint64_t rdr2(void);
+uint64_t rdr3(void);
+uint64_t rdr4(void);
+uint64_t rdr5(void);
+uint64_t rdr6(void);
+uint64_t rdr7(void);
+uint64_t rdtsc(void);
+u_long read_rflags(void);
+u_short rfs(void);
+u_short rgs(void);
+void wbinvd(void);
+void write_rflags(u_long rf);
+void wrmsr(u_int msr, uint64_t newval);
+
+#endif /* __GNUCLIKE_ASM && __CC_SUPPORTS___INLINE */
+
+void reset_dbregs(void);
+
+#ifdef _KERNEL
+int rdmsr_safe(u_int msr, uint64_t *val);
+int wrmsr_safe(u_int msr, uint64_t newval);
+#endif
+
+#endif /* !_MACHINE_CPUFUNC_H_ */
diff --git a/sys/amd64/include/cputypes.h b/sys/amd64/include/cputypes.h
new file mode 100644
index 0000000..eeec4e0
--- /dev/null
+++ b/sys/amd64/include/cputypes.h
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 1993 Christopher G. Demetriou
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_CPUTYPES_H_
+#define _MACHINE_CPUTYPES_H_
+
+/*
+ * Classes of processor.
+ */
+#define CPUCLASS_X86 0 /* X86 */
+#define CPUCLASS_K8 1 /* K8 AMD64 class */
+
+/*
+ * Kinds of processor.
+ */
+#define CPU_X86 0 /* Intel */
+#define CPU_CLAWHAMMER 1 /* AMD Clawhammer */
+#define CPU_SLEDGEHAMMER 2 /* AMD Sledgehammer */
+
+/*
+ * Vendors of processor.
+ */
+#define CPU_VENDOR_AMD 0x1022 /* AMD */
+#define CPU_VENDOR_IDT 0x111d /* Centaur/IDT/VIA */
+#define CPU_VENDOR_INTEL 0x8086 /* Intel */
+#define CPU_VENDOR_CENTAUR CPU_VENDOR_IDT
+
+#ifndef LOCORE
+extern int cpu;
+extern int cpu_class;
+#endif
+
+#endif /* !_MACHINE_CPUTYPES_H_ */
diff --git a/sys/amd64/include/db_machdep.h b/sys/amd64/include/db_machdep.h
new file mode 100644
index 0000000..29e385e
--- /dev/null
+++ b/sys/amd64/include/db_machdep.h
@@ -0,0 +1,94 @@
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_DB_MACHDEP_H_
+#define _MACHINE_DB_MACHDEP_H_
+
+#include <machine/frame.h>
+#include <machine/trap.h>
+
+typedef vm_offset_t db_addr_t; /* address - unsigned */
+typedef long db_expr_t; /* expression - signed */
+
+#define PC_REGS() ((db_addr_t)kdb_thrctx->pcb_rip)
+
+#define BKPT_INST 0xcc /* breakpoint instruction */
+#define BKPT_SIZE (1) /* size of breakpoint inst */
+#define BKPT_SET(inst) (BKPT_INST)
+
+#define BKPT_SKIP \
+do { \
+ kdb_frame->tf_rip += 1; \
+ kdb_thrctx->pcb_rip += 1; \
+} while(0)
+
+#define FIXUP_PC_AFTER_BREAK \
+do { \
+ kdb_frame->tf_rip -= 1; \
+ kdb_thrctx->pcb_rip -= 1; \
+} while(0)
+
+#define db_clear_single_step kdb_cpu_clear_singlestep
+#define db_set_single_step kdb_cpu_set_singlestep
+
+#define IS_BREAKPOINT_TRAP(type, code) ((type) == T_BPTFLT)
+/*
+ * Watchpoints are not supported. The debug exception type is in %dr6
+ * and not yet in the args to this macro.
+ */
+#define IS_WATCHPOINT_TRAP(type, code) 0
+
+#define I_CALL 0xe8
+#define I_CALLI 0xff
+#define I_RET 0xc3
+#define I_IRET 0xcf
+
+#define inst_trap_return(ins) (((ins)&0xff) == I_IRET)
+#define inst_return(ins) (((ins)&0xff) == I_RET)
+#define inst_call(ins) (((ins)&0xff) == I_CALL || \
+ (((ins)&0xff) == I_CALLI && \
+ ((ins)&0x3800) == 0x1000))
+#define inst_load(ins) 0
+#define inst_store(ins) 0
+
+/*
+ * There are no interesting addresses below _kstack = 0xefbfe000.  There
+ * are small absolute values for GUPROF, but we don't want to see them.
+ * Treat "negative" addresses below _kstack as non-small to allow for
+ * future reductions of _kstack and to avoid sign extension problems.
+ *
+ * There is one interesting symbol above -db_maxoff = 0xffff0000,
+ * namely _APTD = 0xfffff000. Accepting this would mess up the
+ * printing of small negative offsets. The next largest symbol is
+ * _APTmap = 0xffc00000. Accepting this is OK (unless db_maxoff is
+ * set to >= 0x400000 - (max stack offset)).
+ */
+#define DB_SMALL_VALUE_MAX 0x7fffffff
+#define DB_SMALL_VALUE_MIN (-0x400001)
+
+#endif /* !_MACHINE_DB_MACHDEP_H_ */
diff --git a/sys/amd64/include/elf.h b/sys/amd64/include/elf.h
new file mode 100644
index 0000000..f932377
--- /dev/null
+++ b/sys/amd64/include/elf.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/elf.h>
diff --git a/sys/amd64/include/endian.h b/sys/amd64/include/endian.h
new file mode 100644
index 0000000..2ad27a9
--- /dev/null
+++ b/sys/amd64/include/endian.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/endian.h>
diff --git a/sys/amd64/include/exec.h b/sys/amd64/include/exec.h
new file mode 100644
index 0000000..8d07887
--- /dev/null
+++ b/sys/amd64/include/exec.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)exec.h 8.1 (Berkeley) 6/11/93
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_EXEC_H_
+#define _MACHINE_EXEC_H_
+
+#define __LDPGSZ 4096
+
+#endif /* !_MACHINE_EXEC_H_ */
diff --git a/sys/amd64/include/float.h b/sys/amd64/include/float.h
new file mode 100644
index 0000000..4759963
--- /dev/null
+++ b/sys/amd64/include/float.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/float.h>
diff --git a/sys/amd64/include/floatingpoint.h b/sys/amd64/include/floatingpoint.h
new file mode 100644
index 0000000..cda9e46
--- /dev/null
+++ b/sys/amd64/include/floatingpoint.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 1993 Andrew Moore, Talke Studio
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#) floatingpoint.h 1.0 (Berkeley) 9/23/93
+ * $FreeBSD$
+ */
+
+#ifndef _FLOATINGPOINT_H_
+#define _FLOATINGPOINT_H_
+
+#include <sys/cdefs.h>
+#include <machine/ieeefp.h>
+
+#endif /* !_FLOATINGPOINT_H_ */
diff --git a/sys/amd64/include/fpu.h b/sys/amd64/include/fpu.h
new file mode 100644
index 0000000..1ce59d0
--- /dev/null
+++ b/sys/amd64/include/fpu.h
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)npx.h 5.3 (Berkeley) 1/18/91
+ * $FreeBSD$
+ */
+
+/*
+ * Floating Point Data Structures and Constants
+ * W. Jolitz 1/90
+ */
+
+#ifndef _MACHINE_FPU_H_
+#define _MACHINE_FPU_H_
+
+#include <x86/fpu.h>
+
+#ifdef _KERNEL
+
+struct fpu_kern_ctx;
+
+#define PCB_USER_FPU(pcb) (((pcb)->pcb_flags & PCB_KERNFPU) == 0)
+
+#define XSAVE_AREA_ALIGN 64
+
+void fpudna(void);
+void fpudrop(void);
+void fpuexit(struct thread *td);
+int fpuformat(void);
+int fpugetregs(struct thread *td);
+void fpuinit(void);
+void fpurestore(void *addr);
+void fpusave(void *addr);
+int fpusetregs(struct thread *td, struct savefpu *addr,
+ char *xfpustate, size_t xfpustate_size);
+int fpusetxstate(struct thread *td, char *xfpustate,
+ size_t xfpustate_size);
+int fputrap_sse(void);
+int fputrap_x87(void);
+void fpuuserinited(struct thread *td);
+struct fpu_kern_ctx *fpu_kern_alloc_ctx(u_int flags);
+void fpu_kern_free_ctx(struct fpu_kern_ctx *ctx);
+int fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx,
+ u_int flags);
+int fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx);
+int fpu_kern_thread(u_int flags);
+int is_fpu_kern_thread(u_int flags);
+
+struct savefpu *fpu_save_area_alloc(void);
+void fpu_save_area_free(struct savefpu *fsa);
+void fpu_save_area_reset(struct savefpu *fsa);
+
+/*
+ * Flags for fpu_kern_alloc_ctx(), fpu_kern_enter() and fpu_kern_thread().
+ */
+#define FPU_KERN_NORMAL 0x0000
+#define FPU_KERN_NOWAIT 0x0001
+
+#endif
+
+#endif /* !_MACHINE_FPU_H_ */
diff --git a/sys/amd64/include/frame.h b/sys/amd64/include/frame.h
new file mode 100644
index 0000000..0953be7
--- /dev/null
+++ b/sys/amd64/include/frame.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/frame.h>
diff --git a/sys/amd64/include/gdb_machdep.h b/sys/amd64/include/gdb_machdep.h
new file mode 100644
index 0000000..d8c25b9
--- /dev/null
+++ b/sys/amd64/include/gdb_machdep.h
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2004 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_GDB_MACHDEP_H_
+#define _MACHINE_GDB_MACHDEP_H_
+
+#define GDB_BUFSZ (GDB_NREGS * 16)
+#define GDB_NREGS 56
+#define GDB_REG_PC 16
+
+static __inline size_t
+gdb_cpu_regsz(int regnum)
+{
+ return ((regnum > 16 && regnum < 24) ? 4 : 8);
+}
+
+static __inline int
+gdb_cpu_query(void)
+{
+ return (0);
+}
+
+void *gdb_cpu_getreg(int, size_t *);
+void gdb_cpu_setreg(int, void *);
+int gdb_cpu_signal(int, int);
+
+#endif /* !_MACHINE_GDB_MACHDEP_H_ */
diff --git a/sys/amd64/include/ieeefp.h b/sys/amd64/include/ieeefp.h
new file mode 100644
index 0000000..a403660
--- /dev/null
+++ b/sys/amd64/include/ieeefp.h
@@ -0,0 +1,308 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1990 Andrew Moore, Talke Studio
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#) ieeefp.h 1.0 (Berkeley) 9/23/93
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_IEEEFP_H_
+#define _MACHINE_IEEEFP_H_
+
+/*
+ * Deprecated historical FPU control interface
+ *
+ * IEEE floating point type, constant and function definitions.
+ * XXX: {FP,SSE}*FLD and {FP,SSE}*OFF are undocumented pollution.
+ */
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+/*
+ * Rounding modes.
+ */
+typedef enum {
+ FP_RN=0, /* round to nearest */
+ FP_RM, /* round down towards minus infinity */
+ FP_RP, /* round up towards plus infinity */
+ FP_RZ /* truncate */
+} fp_rnd_t;
+
+/*
+ * Precision (i.e., rounding precision) modes.
+ */
+typedef enum {
+ FP_PS=0, /* 24 bit (single-precision) */
+ FP_PRS, /* reserved */
+ FP_PD, /* 53 bit (double-precision) */
+ FP_PE /* 64 bit (extended-precision) */
+} fp_prec_t;
+
+#define fp_except_t int
+
+/*
+ * Exception bit masks.
+ */
+#define FP_X_INV 0x01 /* invalid operation */
+#define FP_X_DNML 0x02 /* denormal */
+#define FP_X_DZ 0x04 /* zero divide */
+#define FP_X_OFL 0x08 /* overflow */
+#define FP_X_UFL 0x10 /* underflow */
+#define FP_X_IMP 0x20 /* (im)precision */
+#define FP_X_STK 0x40 /* stack fault */
+
+/*
+ * FPU control word bit-field masks.
+ */
+#define FP_MSKS_FLD 0x3f /* exception masks field */
+#define FP_PRC_FLD 0x300 /* precision control field */
+#define FP_RND_FLD 0xc00 /* rounding control field */
+
+/*
+ * FPU status word bit-field masks.
+ */
+#define FP_STKY_FLD 0x3f /* sticky flags field */
+
+/*
+ * SSE mxcsr register bit-field masks.
+ */
+#define SSE_STKY_FLD 0x3f /* exception flags */
+#define SSE_DAZ_FLD 0x40 /* Denormals are zero */
+#define SSE_MSKS_FLD 0x1f80 /* exception masks field */
+#define SSE_RND_FLD 0x6000 /* rounding control */
+#define SSE_FZ_FLD 0x8000 /* flush to zero on underflow */
+
+/*
+ * FPU control word bit-field offsets (shift counts).
+ */
+#define FP_MSKS_OFF 0 /* exception masks offset */
+#define FP_PRC_OFF 8 /* precision control offset */
+#define FP_RND_OFF 10 /* rounding control offset */
+
+/*
+ * FPU status word bit-field offsets (shift counts).
+ */
+#define FP_STKY_OFF 0 /* sticky flags offset */
+
+/*
+ * SSE mxcsr register bit-field offsets (shift counts).
+ */
+#define SSE_STKY_OFF 0 /* exception flags offset */
+#define SSE_DAZ_OFF 6 /* DAZ exception mask offset */
+#define SSE_MSKS_OFF 7 /* other exception masks offset */
+#define SSE_RND_OFF 13 /* rounding control offset */
+#define SSE_FZ_OFF 15 /* flush to zero offset */
+
+#ifdef __GNUCLIKE_ASM
+
+#define __fldcw(addr) __asm __volatile("fldcw %0" : : "m" (*(addr)))
+#define __fldenv(addr) __asm __volatile("fldenv %0" : : "m" (*(addr)))
+#define __fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr)))
+#define __fnstenv(addr) __asm __volatile("fnstenv %0" : "=m" (*(addr)))
+#define __fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr)))
+#define __ldmxcsr(addr) __asm __volatile("ldmxcsr %0" : : "m" (*(addr)))
+#define __stmxcsr(addr) __asm __volatile("stmxcsr %0" : "=m" (*(addr)))
+
+/*
+ * Load the control word. Be careful not to trap if there is a currently
+ * unmasked exception (ones that will become freshly unmasked are not a
+ * problem). This case must be handled by a save/restore of the
+ * environment or even of the full x87 state. Accessing the environment
+ * is very inefficient, so only do it when necessary.
+ */
+static __inline void
+__fnldcw(unsigned short _cw, unsigned short _newcw)
+{
+ struct {
+ unsigned _cw;
+ unsigned _other[6];
+ } _env;
+ unsigned short _sw;
+
+ if ((_cw & FP_MSKS_FLD) != FP_MSKS_FLD) {
+ __fnstsw(&_sw);
+ if (((_sw & ~_cw) & FP_STKY_FLD) != 0) {
+ __fnstenv(&_env);
+ _env._cw = _newcw;
+ __fldenv(&_env);
+ return;
+ }
+ }
+ __fldcw(&_newcw);
+}
+
+/*
+ * General notes about conflicting SSE vs FP status bits.
+ * This code assumes that software will not fiddle with the control
+ * bits of the SSE and x87 units in such a way as to get them out of sync and
+ * still expect this to work. Break this at your peril.
+ * Because I based this on the i386 port, the x87 state is used for
+ * the fpget*() functions, and is shadowed into the SSE state for
+ * the fpset*() functions. For dual source fpget*() functions, I
+ * merge the two together. I think.
+ */
+
+static __inline fp_rnd_t
+__fpgetround(void)
+{
+ unsigned short _cw;
+
+ __fnstcw(&_cw);
+ return ((fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF));
+}
+
+static __inline fp_rnd_t
+__fpsetround(fp_rnd_t _m)
+{
+ fp_rnd_t _p;
+ unsigned _mxcsr;
+ unsigned short _cw, _newcw;
+
+ __fnstcw(&_cw);
+ _p = (fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF);
+ _newcw = _cw & ~FP_RND_FLD;
+ _newcw |= (_m << FP_RND_OFF) & FP_RND_FLD;
+ __fnldcw(_cw, _newcw);
+ __stmxcsr(&_mxcsr);
+ _mxcsr &= ~SSE_RND_FLD;
+ _mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD;
+ __ldmxcsr(&_mxcsr);
+ return (_p);
+}
+
+/*
+ * Get or set the rounding precision for x87 arithmetic operations.
+ * There is no equivalent SSE mode or control.
+ */
+
+static __inline fp_prec_t
+__fpgetprec(void)
+{
+ unsigned short _cw;
+
+ __fnstcw(&_cw);
+ return ((fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF));
+}
+
+static __inline fp_prec_t
+__fpsetprec(fp_prec_t _m)
+{
+ fp_prec_t _p;
+ unsigned short _cw, _newcw;
+
+ __fnstcw(&_cw);
+ _p = (fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF);
+ _newcw = _cw & ~FP_PRC_FLD;
+ _newcw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
+ __fnldcw(_cw, _newcw);
+ return (_p);
+}
+
+/*
+ * Get or set the exception mask.
+ * Note that the x87 mask bits are inverted by the API -- a mask bit of 1
+ * means disable for x87 and SSE, but for fp*mask() it means enable.
+ */
+
+static __inline fp_except_t
+__fpgetmask(void)
+{
+ unsigned short _cw;
+
+ __fnstcw(&_cw);
+ return ((~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF);
+}
+
+static __inline fp_except_t
+__fpsetmask(fp_except_t _m)
+{
+ fp_except_t _p;
+ unsigned _mxcsr;
+ unsigned short _cw, _newcw;
+
+ __fnstcw(&_cw);
+ _p = (~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF;
+ _newcw = _cw & ~FP_MSKS_FLD;
+ _newcw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD;
+ __fnldcw(_cw, _newcw);
+ __stmxcsr(&_mxcsr);
+ /* XXX should we clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD ? */
+ _mxcsr &= ~SSE_MSKS_FLD;
+ _mxcsr |= (~_m << SSE_MSKS_OFF) & SSE_MSKS_FLD;
+ __ldmxcsr(&_mxcsr);
+ return (_p);
+}
+
+static __inline fp_except_t
+__fpgetsticky(void)
+{
+ unsigned _ex, _mxcsr;
+ unsigned short _sw;
+
+ __fnstsw(&_sw);
+ _ex = (_sw & FP_STKY_FLD) >> FP_STKY_OFF;
+ __stmxcsr(&_mxcsr);
+ _ex |= (_mxcsr & SSE_STKY_FLD) >> SSE_STKY_OFF;
+ return ((fp_except_t)_ex);
+}
+
+#endif /* __GNUCLIKE_ASM */
+
+#if !defined(__IEEEFP_NOINLINES__) && defined(__GNUCLIKE_ASM)
+
+#define fpgetmask() __fpgetmask()
+#define fpgetprec() __fpgetprec()
+#define fpgetround() __fpgetround()
+#define fpgetsticky() __fpgetsticky()
+#define fpsetmask(m) __fpsetmask(m)
+#define fpsetprec(m) __fpsetprec(m)
+#define fpsetround(m) __fpsetround(m)
+
+#else /* !(!__IEEEFP_NOINLINES__ && __GNUCLIKE_ASM) */
+
+/* Augment the userland declarations. */
+__BEGIN_DECLS
+extern fp_rnd_t fpgetround(void);
+extern fp_rnd_t fpsetround(fp_rnd_t);
+extern fp_except_t fpgetmask(void);
+extern fp_except_t fpsetmask(fp_except_t);
+extern fp_except_t fpgetsticky(void);
+extern fp_except_t fpsetsticky(fp_except_t);
+fp_prec_t fpgetprec(void);
+fp_prec_t fpsetprec(fp_prec_t);
+__END_DECLS
+
+#endif /* !__IEEEFP_NOINLINES__ && __GNUCLIKE_ASM */
+
+#endif /* !_MACHINE_IEEEFP_H_ */
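
The fpget*()/fpset*() pairs above follow a get/set-and-restore discipline; a
small sketch (the trunc_div() wrapper is hypothetical) that switches the
rounding mode around a single operation and then restores the caller's mode:

    #include <ieeefp.h>

    double
    trunc_div(double a, double b)
    {
            fp_rnd_t osave;
            double r;

            osave = fpsetround(FP_RZ);      /* truncate toward zero */
            r = a / b;
            fpsetround(osave);              /* restore caller's mode */
            return (r);
    }

Note that __fpsetround() updates both the x87 control word and the SSE mxcsr,
so the new mode applies to both instruction sets.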
diff --git a/sys/amd64/include/in_cksum.h b/sys/amd64/include/in_cksum.h
new file mode 100644
index 0000000..156035e
--- /dev/null
+++ b/sys/amd64/include/in_cksum.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from tahoe: in_cksum.c 1.2 86/01/05
+ * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
+ * from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_IN_CKSUM_H_
+#define _MACHINE_IN_CKSUM_H_ 1
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+#include <sys/cdefs.h>
+
+#define in_cksum(m, len) in_cksum_skip(m, len, 0)
+
+#if defined(IPVERSION) && (IPVERSION == 4)
+/*
+ * It is useful to have an Internet checksum routine which is inlineable
+ * and optimized specifically for the task of computing IP header checksums
+ * in the normal case (where there are no options and the header length is
+ * therefore always exactly five 32-bit words).
+ */
+#ifdef __CC_SUPPORTS___INLINE
+
+static __inline void
+in_cksum_update(struct ip *ip)
+{
+ int __tmpsum;
+ __tmpsum = (int)ntohs(ip->ip_sum) + 256;
+ ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16));
+}
+
+#else
+
+#define in_cksum_update(ip) \
+ do { \
+ int __tmpsum; \
+ __tmpsum = (int)ntohs(ip->ip_sum) + 256; \
+ ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16)); \
+ } while(0)
+
+#endif
+#endif
+
+#ifdef _KERNEL
+#if defined(IPVERSION) && (IPVERSION == 4)
+u_int in_cksum_hdr(const struct ip *ip);
+#endif
+u_short in_addword(u_short sum, u_short b);
+u_short in_pseudo(u_int sum, u_int b, u_int c);
+u_short in_cksum_skip(struct mbuf *m, int len, int skip);
+#endif
+
+#endif /* _MACHINE_IN_CKSUM_H_ */
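
To see why in_cksum_update() adds 256: ip_ttl occupies the high-order byte of
a 16-bit word in the IP header, so decrementing it (as a router does when
forwarding) lowers the header sum by 0x100, and the ones-complement checksum
must therefore rise by 0x100.  The (__tmpsum >> 16) term folds any carry back
into the low 16 bits: if ip_sum was 0xffab, __tmpsum becomes 0x100ab and the
stored result is 0x00ab + 0x1 = 0x00ac.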
diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h
new file mode 100644
index 0000000..8671605
--- /dev/null
+++ b/sys/amd64/include/intr_machdep.h
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __MACHINE_INTR_MACHDEP_H__
+#define __MACHINE_INTR_MACHDEP_H__
+
+#ifdef _KERNEL
+
+/*
+ * The maximum number of I/O interrupts we allow. This number is rather
+ * arbitrary as it is just the maximum IRQ resource value. The interrupt
+ * source for a given IRQ maps that I/O interrupt to a device interrupt
+ * source, whether it be a pin on an interrupt controller or an MSI interrupt.
+ * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device
+ * interrupts allocate IDT vectors on demand. Currently we have 191 IDT
+ * vectors available for device interrupts. On many systems with I/O APICs,
+ * a lot of the IRQs are not used, so this number can be much larger than
+ * 191 and still be safe since only interrupt sources in actual use will
+ * allocate IDT vectors.
+ *
+ * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs.
+ * IRQ values from 256 upward are used by MSI.  We leave 255 unused to avoid
+ * confusion since 255 is used in PCI to indicate an invalid IRQ.
+ */
+#define NUM_MSI_INTS 512
+#define FIRST_MSI_INT 256
+#define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS)
+
+/*
+ * Default base address for MSI messages on x86 platforms.
+ */
+#define MSI_INTEL_ADDR_BASE 0xfee00000
+
+/*
+ * - 1 ??? dummy counter.
+ * - 2 counters for each I/O interrupt.
+ * - 1 counter for each CPU for lapic timer.
+ * - 8 counters for each CPU for IPI counters for SMP.
+ */
+#ifdef SMP
+#define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + (1 + 8) * MAXCPU)
+#else
+#define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + 1)
+#endif
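
For scale: with NUM_IO_INTS at 768 as defined above and MAXCPU at its amd64
SMP default of 64 (see param.h later in this patch), INTRCNT_COUNT works out
to 1 + 768 * 2 + (1 + 8) * 64 = 2113 counters.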
+
+#ifndef LOCORE
+
+typedef void inthand_t(u_int cs, u_int ef, u_int esp, u_int ss);
+
+#define IDTVEC(name) __CONCAT(X,name)
+
+struct intsrc;
+
+/*
+ * Methods that a PIC provides to mask/unmask a given interrupt source,
+ * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
+ * return the vector associated with this source.
+ */
+struct pic {
+ void (*pic_enable_source)(struct intsrc *);
+ void (*pic_disable_source)(struct intsrc *, int);
+ void (*pic_eoi_source)(struct intsrc *);
+ void (*pic_enable_intr)(struct intsrc *);
+ void (*pic_disable_intr)(struct intsrc *);
+ int (*pic_vector)(struct intsrc *);
+ int (*pic_source_pending)(struct intsrc *);
+ void (*pic_suspend)(struct pic *);
+ void (*pic_resume)(struct pic *);
+ int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
+ enum intr_polarity);
+ int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
+ TAILQ_ENTRY(pic) pics;
+};
+
+/* Flags for pic_disable_source() */
+enum {
+ PIC_EOI,
+ PIC_NO_EOI,
+};
+
+/*
+ * An interrupt source. The upper-layer code uses the PIC methods to
+ * control a given source. The lower-layer PIC drivers can store additional
+ * private data in a given interrupt source such as an interrupt pin number
+ * or an I/O APIC pointer.
+ */
+struct intsrc {
+ struct pic *is_pic;
+ struct intr_event *is_event;
+ u_long *is_count;
+ u_long *is_straycount;
+ u_int is_index;
+ u_int is_handlers;
+};
+
+struct trapframe;
+
+/*
+ * The following data structure holds per-cpu data, and is placed just
+ * above the top of the space used for the NMI stack.
+ */
+struct nmi_pcpu {
+ register_t np_pcpu;
+ register_t __padding; /* pad to 16 bytes */
+};
+
+extern struct mtx icu_lock;
+extern int elcr_found;
+
+#ifndef DEV_ATPIC
+void atpic_reset(void);
+#endif
+/* XXX: The elcr_* prototypes probably belong somewhere else. */
+int elcr_probe(void);
+enum intr_trigger elcr_read_trigger(u_int irq);
+void elcr_resume(void);
+void elcr_write_trigger(u_int irq, enum intr_trigger trigger);
+#ifdef SMP
+void intr_add_cpu(u_int cpu);
+#endif
+int intr_add_handler(const char *name, int vector, driver_filter_t filter,
+ driver_intr_t handler, void *arg, enum intr_type flags,
+ void **cookiep);
+#ifdef SMP
+int intr_bind(u_int vector, u_char cpu);
+#endif
+int intr_config_intr(int vector, enum intr_trigger trig,
+ enum intr_polarity pol);
+int intr_describe(u_int vector, void *ih, const char *descr);
+void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
+u_int intr_next_cpu(void);
+struct intsrc *intr_lookup_source(int vector);
+int intr_register_pic(struct pic *pic);
+int intr_register_source(struct intsrc *isrc);
+int intr_remove_handler(void *cookie);
+void intr_resume(void);
+void intr_suspend(void);
+void intrcnt_add(const char *name, u_long **countp);
+void nexus_add_irq(u_long irq);
+int msi_alloc(device_t dev, int count, int maxcount, int *irqs);
+void msi_init(void);
+int msi_map(int irq, uint64_t *addr, uint32_t *data);
+int msi_release(int *irqs, int count);
+int msix_alloc(device_t dev, int *irq);
+int msix_release(int irq);
+
+#endif /* !LOCORE */
+#endif /* _KERNEL */
+#endif /* !__MACHINE_INTR_MACHDEP_H__ */
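
A hedged sketch of the shape of a PIC driver built on struct pic and
intr_register_pic() above; "mypic" and the method bodies are hypothetical,
and methods that are not needed may be left NULL:

    static void
    mypic_enable_source(struct intsrc *isrc)
    {
            /* Unmask the pin or message behind isrc. */
    }

    static int
    mypic_vector(struct intsrc *isrc)
    {
            return (0);     /* IDT vector backing this source. */
    }

    static struct pic mypic = {
            .pic_enable_source = mypic_enable_source,
            .pic_vector = mypic_vector,
            /* ... eoi_source, disable_source, etc. as required ... */
    };

    static void
    mypic_attach(void)
    {
            (void)intr_register_pic(&mypic);
    }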
diff --git a/sys/amd64/include/iodev.h b/sys/amd64/include/iodev.h
new file mode 100644
index 0000000..9f53cac
--- /dev/null
+++ b/sys/amd64/include/iodev.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2004 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef _MACHINE_IODEV_H_
+#define _MACHINE_IODEV_H_
+
+#ifdef _KERNEL
+#include <machine/cpufunc.h>
+
+#define iodev_read_1 inb
+#define iodev_read_2 inw
+#define iodev_read_4 inl
+#define iodev_write_1 outb
+#define iodev_write_2 outw
+#define iodev_write_4 outl
+
+int iodev_open(struct thread *td);
+int iodev_close(struct thread *td);
+int iodev_ioctl(u_long cmd, caddr_t data);
+
+#endif /* _KERNEL */
+#endif /* _MACHINE_IODEV_H_ */
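
Since the iodev_* names above are direct aliases for inb()/outb() and
friends, a consumer can poll a legacy port with them; port 0x64 (the i8042
status register) is used here purely as an example:

    static u_char
    kbdc_status(void)
    {
            return (iodev_read_1(0x64));
    }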
diff --git a/sys/amd64/include/kdb.h b/sys/amd64/include/kdb.h
new file mode 100644
index 0000000..56d2018
--- /dev/null
+++ b/sys/amd64/include/kdb.h
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2004 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_KDB_H_
+#define _MACHINE_KDB_H_
+
+#include <machine/frame.h>
+#include <machine/psl.h>
+
+#define KDB_STOPPEDPCB(pc) &stoppcbs[pc->pc_cpuid]
+
+static __inline void
+kdb_cpu_clear_singlestep(void)
+{
+ kdb_frame->tf_rflags &= ~PSL_T;
+}
+
+static __inline void
+kdb_cpu_set_singlestep(void)
+{
+ kdb_frame->tf_rflags |= PSL_T;
+}
+
+static __inline void
+kdb_cpu_sync_icache(unsigned char *addr, size_t size)
+{
+}
+
+static __inline void
+kdb_cpu_trap(int type, int code)
+{
+}
+
+#endif /* _MACHINE_KDB_H_ */
diff --git a/sys/amd64/include/limits.h b/sys/amd64/include/limits.h
new file mode 100644
index 0000000..35eea1f
--- /dev/null
+++ b/sys/amd64/include/limits.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)limits.h 8.3 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_LIMITS_H_
+#define _MACHINE_LIMITS_H_
+
+#include <sys/cdefs.h>
+
+#ifdef __CC_SUPPORTS_WARNING
+#warning "machine/limits.h is deprecated. Include sys/limits.h instead."
+#endif
+
+#include <sys/limits.h>
+
+#endif /* !_MACHINE_LIMITS_H_ */
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
new file mode 100644
index 0000000..5d7cb74
--- /dev/null
+++ b/sys/amd64/include/md_var.h
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 1995 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_MD_VAR_H_
+#define _MACHINE_MD_VAR_H_
+
+/*
+ * Miscellaneous machine-dependent declarations.
+ */
+
+extern long Maxmem;
+extern u_int basemem;
+extern int busdma_swi_pending;
+extern u_int cpu_exthigh;
+extern u_int cpu_feature;
+extern u_int cpu_feature2;
+extern u_int amd_feature;
+extern u_int amd_feature2;
+extern u_int amd_pminfo;
+extern u_int via_feature_rng;
+extern u_int via_feature_xcrypt;
+extern u_int cpu_clflush_line_size;
+extern u_int cpu_stdext_feature;
+extern u_int cpu_fxsr;
+extern u_int cpu_high;
+extern u_int cpu_id;
+extern u_int cpu_max_ext_state_size;
+extern u_int cpu_mxcsr_mask;
+extern u_int cpu_procinfo;
+extern u_int cpu_procinfo2;
+extern char cpu_vendor[];
+extern u_int cpu_vendor_id;
+extern char ctx_switch_xsave[];
+extern char kstack[];
+extern char sigcode[];
+extern int szsigcode;
+extern uint64_t *vm_page_dump;
+extern int vm_page_dump_size;
+extern int workaround_erratum383;
+extern int _udatasel;
+extern int _ucodesel;
+extern int _ucode32sel;
+extern int _ufssel;
+extern int _ugssel;
+extern int use_xsave;
+extern uint64_t xsave_mask;
+
+typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss);
+struct pcb;
+struct savefpu;
+struct thread;
+struct reg;
+struct fpreg;
+struct dbreg;
+struct dumperinfo;
+
+void *alloc_fpusave(int flags);
+void amd64_syscall(struct thread *td, int traced);
+void busdma_swi(void);
+void cpu_setregs(void);
+void ctx_fpusave(void *);
+void doreti_iret(void) __asm(__STRING(doreti_iret));
+void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault));
+void ld_ds(void) __asm(__STRING(ld_ds));
+void ld_es(void) __asm(__STRING(ld_es));
+void ld_fs(void) __asm(__STRING(ld_fs));
+void ld_gs(void) __asm(__STRING(ld_gs));
+void ld_fsbase(void) __asm(__STRING(ld_fsbase));
+void ld_gsbase(void) __asm(__STRING(ld_gsbase));
+void ds_load_fault(void) __asm(__STRING(ds_load_fault));
+void es_load_fault(void) __asm(__STRING(es_load_fault));
+void fs_load_fault(void) __asm(__STRING(fs_load_fault));
+void gs_load_fault(void) __asm(__STRING(gs_load_fault));
+void fsbase_load_fault(void) __asm(__STRING(fsbase_load_fault));
+void gsbase_load_fault(void) __asm(__STRING(gsbase_load_fault));
+void dump_add_page(vm_paddr_t);
+void dump_drop_page(vm_paddr_t);
+void initializecpu(void);
+void initializecpucache(void);
+void fillw(int /*u_short*/ pat, void *base, size_t cnt);
+void fpstate_drop(struct thread *td);
+int is_physical_memory(vm_paddr_t addr);
+int isa_nmi(int cd);
+void pagecopy(void *from, void *to);
+void pagezero(void *addr);
+void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int ist);
+int user_dbreg_trap(void);
+void minidumpsys(struct dumperinfo *);
+struct savefpu *get_pcb_user_save_td(struct thread *td);
+struct savefpu *get_pcb_user_save_pcb(struct pcb *pcb);
+struct pcb *get_pcb_td(struct thread *td);
+
+#endif /* !_MACHINE_MD_VAR_H_ */
diff --git a/sys/amd64/include/memdev.h b/sys/amd64/include/memdev.h
new file mode 100644
index 0000000..649b557
--- /dev/null
+++ b/sys/amd64/include/memdev.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2004 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_MEMDEV_H_
+#define _MACHINE_MEMDEV_H_
+
+#define CDEV_MINOR_MEM 0
+#define CDEV_MINOR_KMEM 1
+
+d_open_t memopen;
+d_read_t memrw;
+d_ioctl_t memioctl;
+d_mmap_t memmmap;
+
+#endif /* _MACHINE_MEMDEV_H_ */
diff --git a/sys/amd64/include/metadata.h b/sys/amd64/include/metadata.h
new file mode 100644
index 0000000..4c7ec9e
--- /dev/null
+++ b/sys/amd64/include/metadata.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_METADATA_H_
+#define _MACHINE_METADATA_H_
+
+#define MODINFOMD_SMAP 0x1001
+#define MODINFOMD_SMAP_XATTR 0x1002
+
+#endif /* !_MACHINE_METADATA_H_ */
diff --git a/sys/amd64/include/minidump.h b/sys/amd64/include/minidump.h
new file mode 100644
index 0000000..2ac529c
--- /dev/null
+++ b/sys/amd64/include/minidump.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2006 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_MINIDUMP_H_
+#define _MACHINE_MINIDUMP_H_ 1
+
+#define MINIDUMP_MAGIC "minidump FreeBSD/amd64"
+#define MINIDUMP_VERSION 2
+
+struct minidumphdr {
+ char magic[24];
+ uint32_t version;
+ uint32_t msgbufsize;
+ uint32_t bitmapsize;
+ uint32_t pmapsize;
+ uint64_t kernbase;
+ uint64_t dmapbase;
+ uint64_t dmapend;
+};
+
+#endif /* _MACHINE_MINIDUMP_H_ */
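
A sketch of how a dump reader might validate this header before trusting the
embedded sizes; minidump_check() is hypothetical, and the includes assume a
userland reader:

    #include <errno.h>
    #include <string.h>

    static int
    minidump_check(const struct minidumphdr *hdr)
    {
            if (strncmp(hdr->magic, MINIDUMP_MAGIC, sizeof(hdr->magic)) != 0)
                    return (EINVAL);
            if (hdr->version != MINIDUMP_VERSION)
                    return (EINVAL);
            return (0);
    }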
diff --git a/sys/amd64/include/mp_watchdog.h b/sys/amd64/include/mp_watchdog.h
new file mode 100644
index 0000000..bcec051
--- /dev/null
+++ b/sys/amd64/include/mp_watchdog.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2004 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_MP_WATCHDOG_H_
+#define _MACHINE_MP_WATCHDOG_H_
+
+void ap_watchdog(u_int cpuid);
+
+#endif /* !_MACHINE_MP_WATCHDOG_H_ */
diff --git a/sys/amd64/include/nexusvar.h b/sys/amd64/include/nexusvar.h
new file mode 100644
index 0000000..c39a686
--- /dev/null
+++ b/sys/amd64/include/nexusvar.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_NEXUSVAR_H_
+#define _MACHINE_NEXUSVAR_H_
+
+struct nexus_device {
+ struct resource_list nx_resources;
+};
+
+DECLARE_CLASS(nexus_driver);
+
+extern struct rman irq_rman, drq_rman, port_rman, mem_rman;
+
+void nexus_init_resources(void);
+
+#endif /* !_MACHINE_NEXUSVAR_H_ */
diff --git a/sys/amd64/include/npx.h b/sys/amd64/include/npx.h
new file mode 100644
index 0000000..ec70f1c
--- /dev/null
+++ b/sys/amd64/include/npx.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/fpu.h>
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
new file mode 100644
index 0000000..9ddcf68
--- /dev/null
+++ b/sys/amd64/include/param.h
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2002 David E. O'Brien. All rights reserved.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department and Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+
+#ifndef _AMD64_INCLUDE_PARAM_H_
+#define _AMD64_INCLUDE_PARAM_H_
+
+#include <machine/_align.h>
+
+/*
+ * Machine dependent constants for AMD64.
+ */
+
+
+#define __HAVE_ACPI
+#define __PCI_REROUTE_INTERRUPT
+
+#ifndef MACHINE
+#define MACHINE "amd64"
+#endif
+#ifndef MACHINE_ARCH
+#define MACHINE_ARCH "amd64"
+#endif
+#ifndef MACHINE_ARCH32
+#define MACHINE_ARCH32 "i386"
+#endif
+
+#if defined(SMP) || defined(KLD_MODULE)
+#ifndef MAXCPU
+#define MAXCPU 64
+#endif
+#else
+#define MAXCPU 1
+#endif
+
+#define ALIGNBYTES _ALIGNBYTES
+#define ALIGN(p) _ALIGN(p)
+/*
+ * ALIGNED_POINTER is a boolean macro that checks whether an address
+ * is valid to fetch data elements of type t from on this architecture.
+ * This does not reflect the optimal alignment, just the possibility
+ * (within reasonable limits).
+ */
+#define ALIGNED_POINTER(p, t) 1
+
+/*
+ * CACHE_LINE_SIZE is the compile-time maximum cache line size for an
+ * architecture. It should be used with appropriate caution.
+ */
+#define CACHE_LINE_SHIFT 7
+#define CACHE_LINE_SIZE (1 << CACHE_LINE_SHIFT)
+
+/* Size of the level 1 page table units */
+#define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t)))
+#define NPTEPGSHIFT 9 /* LOG2(NPTEPG) */
+#define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */
+#define PAGE_SIZE (1<<PAGE_SHIFT) /* bytes/page */
+#define PAGE_MASK (PAGE_SIZE-1)
+/* Size of the level 2 page directory units */
+#define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t)))
+#define NPDEPGSHIFT 9 /* LOG2(NPDEPG) */
+#define PDRSHIFT 21 /* LOG2(NBPDR) */
+#define NBPDR (1<<PDRSHIFT) /* bytes/page dir */
+#define PDRMASK (NBPDR-1)
+/* Size of the level 3 page directory pointer table units */
+#define NPDPEPG (PAGE_SIZE/(sizeof (pdp_entry_t)))
+#define NPDPEPGSHIFT 9 /* LOG2(NPDPEPG) */
+#define PDPSHIFT 30 /* LOG2(NBPDP) */
+#define NBPDP (1<<PDPSHIFT) /* bytes/page dir ptr table */
+#define PDPMASK (NBPDP-1)
+/* Size of the level 4 page-map level-4 table units */
+#define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t)))
+#define NPML4EPGSHIFT 9 /* LOG2(NPML4EPG) */
+#define PML4SHIFT 39 /* LOG2(NBPML4) */
+#define NBPML4 (1UL<<PML4SHIFT)/* bytes/page map lev4 table */
+#define PML4MASK (NBPML4-1)
+
+#define MAXPAGESIZES 3 /* maximum number of supported page sizes */
+
+#define IOPAGES 2 /* pages of i/o permission bitmap */
+
+#ifndef KSTACK_PAGES
+#define KSTACK_PAGES 4 /* pages of kstack (with pcb) */
+#endif
+#define KSTACK_GUARD_PAGES 1 /* pages of kstack guard; 0 disables */
+
+/*
+ * Mach derived conversion macros
+ */
+#define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK))
+#define trunc_page(x) ((unsigned long)(x) & ~(PAGE_MASK))
+#define trunc_2mpage(x) ((unsigned long)(x) & ~PDRMASK)
+#define round_2mpage(x) ((((unsigned long)(x)) + PDRMASK) & ~PDRMASK)
+#define trunc_1gpage(x) ((unsigned long)(x) & ~PDPMASK)
+
+#define atop(x) ((unsigned long)(x) >> PAGE_SHIFT)
+#define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT)
+
+#define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT)
+#define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT)
+
+#define pgtok(x) ((unsigned long)(x) * (PAGE_SIZE / 1024))
+
+#endif /* !_AMD64_INCLUDE_PARAM_H_ */
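
The shift and mask constants above decompose a canonical virtual address into
four 9-bit table indices plus a 12-bit page offset.  Hypothetical helper
macros, as a sketch of how the pieces relate:

    #define VA_PML4_IDX(va) (((va) >> PML4SHIFT) & (NPML4EPG - 1))
    #define VA_PDP_IDX(va)  (((va) >> PDPSHIFT) & (NPDPEPG - 1))
    #define VA_PD_IDX(va)   (((va) >> PDRSHIFT) & (NPDEPG - 1))
    #define VA_PT_IDX(va)   (((va) >> PAGE_SHIFT) & (NPTEPG - 1))
    #define VA_POFF(va)     ((va) & PAGE_MASK)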
diff --git a/sys/amd64/include/pc/bios.h b/sys/amd64/include/pc/bios.h
new file mode 100644
index 0000000..e7d568e
--- /dev/null
+++ b/sys/amd64/include/pc/bios.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 1997 Michael Smith
+ * Copyright (c) 1998 Jonathan Lemon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PC_BIOS_H_
+#define _MACHINE_PC_BIOS_H_
+
+/*
+ * Int 15:E820 'SMAP' structure
+ */
+#define SMAP_SIG 0x534D4150 /* 'SMAP' */
+
+#define SMAP_TYPE_MEMORY 1
+#define SMAP_TYPE_RESERVED 2
+#define SMAP_TYPE_ACPI_RECLAIM 3
+#define SMAP_TYPE_ACPI_NVS 4
+#define SMAP_TYPE_ACPI_ERROR 5
+
+#define SMAP_XATTR_ENABLED 0x00000001
+#define SMAP_XATTR_NON_VOLATILE 0x00000002
+#define SMAP_XATTR_MASK (SMAP_XATTR_ENABLED | SMAP_XATTR_NON_VOLATILE)
+
+struct bios_smap {
+ u_int64_t base;
+ u_int64_t length;
+ u_int32_t type;
+} __packed;
+
+/*
+ * System Management BIOS
+ */
+#define SMBIOS_START 0xf0000
+#define SMBIOS_STEP 0x10
+#define SMBIOS_OFF 0
+#define SMBIOS_LEN 4
+#define SMBIOS_SIG "_SM_"
+
+struct smbios_eps {
+ uint8_t anchor_string[4]; /* '_SM_' */
+ uint8_t checksum;
+ uint8_t length;
+ uint8_t major_version;
+ uint8_t minor_version;
+ uint16_t maximum_structure_size;
+ uint8_t entry_point_revision;
+ uint8_t formatted_area[5];
+ uint8_t intermediate_anchor_string[5]; /* '_DMI_' */
+ uint8_t intermediate_checksum;
+ uint16_t structure_table_length;
+ uint32_t structure_table_address;
+ uint16_t number_structures;
+ uint8_t BCD_revision;
+};
+
+struct smbios_structure_header {
+ uint8_t type;
+ uint8_t length;
+ uint16_t handle;
+};
+
+#ifdef _KERNEL
+#define BIOS_PADDRTOVADDR(x) ((x) + KERNBASE)
+#define BIOS_VADDRTOPADDR(x) ((x) - KERNBASE)
+
+struct bios_oem_signature {
+ char * anchor; /* search anchor string in BIOS memory */
+ size_t offset; /* offset from anchor (may be negative) */
+ size_t totlen; /* total length of BIOS string to copy */
+} __packed;
+
+struct bios_oem_range {
+ u_int from; /* shouldn't be below 0xe0000 */
+ u_int to; /* shouldn't be above 0xfffff */
+} __packed;
+
+struct bios_oem {
+ struct bios_oem_range range;
+ struct bios_oem_signature signature[];
+} __packed;
+
+int bios_oem_strings(struct bios_oem *oem, u_char *buffer, size_t maxlen);
+uint32_t bios_sigsearch(uint32_t start, u_char *sig, int siglen, int paralen,
+ int sigofs);
+#endif
+
+#endif /* _MACHINE_PC_BIOS_H_ */
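
A sketch of how the SMBIOS constants combine with the bios_sigsearch()
prototype above to locate the entry point (kernel context; the variable
names are hypothetical):

    uint32_t paddr;
    struct smbios_eps *eps;

    paddr = bios_sigsearch(SMBIOS_START, (u_char *)SMBIOS_SIG, SMBIOS_LEN,
        SMBIOS_STEP, SMBIOS_OFF);
    if (paddr != 0)
            eps = (struct smbios_eps *)BIOS_PADDRTOVADDR(paddr);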
diff --git a/sys/amd64/include/pc/display.h b/sys/amd64/include/pc/display.h
new file mode 100644
index 0000000..cd2d5ff
--- /dev/null
+++ b/sys/amd64/include/pc/display.h
@@ -0,0 +1,45 @@
+/*
+ * IBM PC display definitions
+ *
+ * $FreeBSD$
+ */
+
+/* Color attributes for foreground text */
+
+#define FG_BLACK 0
+#define FG_BLUE 1
+#define FG_GREEN 2
+#define FG_CYAN 3
+#define FG_RED 4
+#define FG_MAGENTA 5
+#define FG_BROWN 6
+#define FG_LIGHTGREY 7
+#define FG_DARKGREY 8
+#define FG_LIGHTBLUE 9
+#define FG_LIGHTGREEN 10
+#define FG_LIGHTCYAN 11
+#define FG_LIGHTRED 12
+#define FG_LIGHTMAGENTA 13
+#define FG_YELLOW 14
+#define FG_WHITE 15
+#define FG_BLINK 0x80
+
+/* Color attributes for text background */
+
+#define BG_BLACK 0x00
+#define BG_BLUE 0x10
+#define BG_GREEN 0x20
+#define BG_CYAN 0x30
+#define BG_RED 0x40
+#define BG_MAGENTA 0x50
+#define BG_BROWN 0x60
+#define BG_LIGHTGREY 0x70
+
+/* Monochrome attributes for foreground text */
+
+#define FG_UNDERLINE 0x01
+#define FG_INTENSE 0x08
+
+/* Monochrome attributes for text background */
+
+#define BG_INTENSE 0x10
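
A text-mode attribute byte is the bitwise OR of one FG_* value (the low
nibble), one BG_* value (bits 4-6) and, on color adapters, optionally
FG_BLINK; for example:

    #define ATTR_WHITE_ON_BLUE      (FG_WHITE | BG_BLUE)                /* 0x1f */
    #define ATTR_BLINKING_RED       (FG_LIGHTRED | BG_BLACK | FG_BLINK) /* 0x8c */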
diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h
new file mode 100644
index 0000000..22cbbe2
--- /dev/null
+++ b/sys/amd64/include/pcb.h
@@ -0,0 +1,149 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)pcb.h 5.10 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+#ifndef _AMD64_PCB_H_
+#define _AMD64_PCB_H_
+
+/*
+ * AMD64 process control block
+ */
+#include <machine/fpu.h>
+#include <machine/segments.h>
+
+struct pcb {
+ register_t pcb_r15;
+ register_t pcb_r14;
+ register_t pcb_r13;
+ register_t pcb_r12;
+ register_t pcb_rbp;
+ register_t pcb_rsp;
+ register_t pcb_rbx;
+ register_t pcb_rip;
+ register_t pcb_fsbase;
+ register_t pcb_gsbase;
+ register_t pcb_kgsbase;
+ register_t pcb_cr0;
+ register_t pcb_cr2;
+ register_t pcb_cr3;
+ register_t pcb_cr4;
+ register_t pcb_dr0;
+ register_t pcb_dr1;
+ register_t pcb_dr2;
+ register_t pcb_dr3;
+ register_t pcb_dr6;
+ register_t pcb_dr7;
+
+ struct region_descriptor pcb_gdt;
+ struct region_descriptor pcb_idt;
+ struct region_descriptor pcb_ldt;
+ uint16_t pcb_tr;
+
+ u_int pcb_flags;
+#define PCB_FULL_IRET 0x01 /* full iret is required */
+#define PCB_DBREGS 0x02 /* process using debug registers */
+#define PCB_KERNFPU 0x04 /* kernel uses fpu */
+#define PCB_FPUINITDONE 0x08 /* fpu state is initialized */
+#define PCB_USERFPUINITDONE 0x10 /* fpu user state is initialized */
+#define PCB_GS32BIT 0x20 /* linux gs switch */
+#define PCB_32BIT 0x40 /* process has 32 bit context (segs etc) */
+
+ uint16_t pcb_initial_fpucw;
+
+ /* copyin/out fault recovery */
+ caddr_t pcb_onfault;
+
+ /* 32-bit segment descriptor */
+ struct user_segment_descriptor pcb_gs32sd;
+
+ /* local tss, with i/o bitmap; NULL for common */
+ struct amd64tss *pcb_tssp;
+
+ /* model specific registers */
+ register_t pcb_efer;
+ register_t pcb_star;
+ register_t pcb_lstar;
+ register_t pcb_cstar;
+ register_t pcb_sfmask;
+ register_t pcb_xsmask;
+
+ /* fpu context for suspend/resume */
+ void *pcb_fpususpend;
+
+ struct savefpu *pcb_save;
+
+ uint64_t pcb_pad[3];
+};
+
+#ifdef _KERNEL
+struct trapframe;
+
+/*
+ * The pcb_flags field is only modified by the current thread, or by other
+ * threads when the current thread is stopped.  However, the current thread
+ * may change it from the interrupt context in cpu_switch(), or in the trap
+ * handler.  When we read-modify-write pcb_flags from C sources, the compiler
+ * may generate code that is not atomic with respect to the interrupt
+ * handler.  If a trap or
+ * interrupt happens and any flag is modified from the handler, it can be
+ * clobbered with the cached value later. Therefore, we implement setting
+ * and clearing flags with single-instruction functions, which do not race
+ * with possible modification of the flags from the trap or interrupt context,
+ * because traps and interrupts are executed only on instruction boundary.
+ */
+static __inline void
+set_pcb_flags(struct pcb *pcb, const u_int flags)
+{
+
+ __asm __volatile("orl %1,%0"
+ : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
+ : "cc");
+}
+
+static __inline void
+clear_pcb_flags(struct pcb *pcb, const u_int flags)
+{
+
+ __asm __volatile("andl %1,%0"
+ : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
+ : "cc");
+}
+
+void makectx(struct trapframe *, struct pcb *);
+int savectx(struct pcb *) __returns_twice;
+void resumectx(struct pcb *);
+
+#endif
+
+#endif /* _AMD64_PCB_H_ */
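
As a usage sketch of the helpers above: code on the syscall or trap path that
must return to userspace through the full iret sequence marks the pcb with a
single instruction, which cannot be torn by an interrupt, for example:

    set_pcb_flags(curthread->td_pcb, PCB_FULL_IRET);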
diff --git a/sys/amd64/include/pci_cfgreg.h b/sys/amd64/include/pci_cfgreg.h
new file mode 100644
index 0000000..717d5cc
--- /dev/null
+++ b/sys/amd64/include/pci_cfgreg.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/pci_cfgreg.h>
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
new file mode 100644
index 0000000..bb7d339
--- /dev/null
+++ b/sys/amd64/include/pcpu.h
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) Peter Wemm <peter@netplex.com.au>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PCPU_H_
+#define _MACHINE_PCPU_H_
+
+#ifndef _SYS_CDEFS_H_
+#error "sys/cdefs.h is a prerequisite for this file"
+#endif
+
+#if defined(XEN) || defined(XENHVM)
+#ifndef NR_VIRQS
+#define NR_VIRQS 24
+#endif
+#ifndef NR_IPIS
+#define NR_IPIS 2
+#endif
+#endif
+
+#ifdef XENHVM
+#define PCPU_XEN_FIELDS \
+ ; \
+ unsigned int pc_last_processed_l1i; \
+ unsigned int pc_last_processed_l2i
+#else
+#define PCPU_XEN_FIELDS
+#endif
+
+/*
+ * The SMP parts are set up in pmap.c and locore.s for the BSP, and
+ * mp_machdep.c sets up the data for the APs to "see" when they awake.
+ * The reason for doing it via a struct is so that an array of pointers
+ * to each CPU's data can be set up for things like "check curproc on
+ * all other processors".
+ */
+#define PCPU_MD_FIELDS \
+ char pc_monitorbuf[128] __aligned(128); /* cache line */ \
+ struct pcpu *pc_prvspace; /* Self-reference */ \
+ struct pmap *pc_curpmap; \
+ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \
+ struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \
+ register_t pc_rsp0; \
+ register_t pc_scratch_rsp; /* User %rsp in syscall */ \
+ u_int pc_apic_id; \
+ u_int pc_acpi_id; /* ACPI CPU id */ \
+ /* Pointer to the CPU %fs descriptor */ \
+ struct user_segment_descriptor *pc_fs32p; \
+ /* Pointer to the CPU %gs descriptor */ \
+ struct user_segment_descriptor *pc_gs32p; \
+ /* Pointer to the CPU LDT descriptor */ \
+ struct system_segment_descriptor *pc_ldt; \
+ /* Pointer to the CPU TSS descriptor */ \
+ struct system_segment_descriptor *pc_tss; \
+ u_int pc_cmci_mask /* MCx banks for CMCI */ \
+ PCPU_XEN_FIELDS; \
+ char __pad[293] /* pad so the struct size divides \
+ PAGE_SIZE evenly after cache alignment */
+
+#ifdef _KERNEL
+
+#ifdef lint
+
+extern struct pcpu *pcpup;
+
+#define PCPU_GET(member) (pcpup->pc_ ## member)
+#define PCPU_ADD(member, val) (pcpup->pc_ ## member += (val))
+#define PCPU_INC(member) PCPU_ADD(member, 1)
+#define PCPU_PTR(member) (&pcpup->pc_ ## member)
+#define PCPU_SET(member, val) (pcpup->pc_ ## member = (val))
+
+#elif defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)
+
+/*
+ * Evaluates to the byte offset of the per-cpu variable name.
+ */
+#define __pcpu_offset(name) \
+ __offsetof(struct pcpu, name)
+
+/*
+ * Evaluates to the type of the per-cpu variable name.
+ */
+#define __pcpu_type(name) \
+ __typeof(((struct pcpu *)0)->name)
+
+/*
+ * Evaluates to the address of the per-cpu variable name.
+ */
+#define __PCPU_PTR(name) __extension__ ({ \
+ __pcpu_type(name) *__p; \
+ \
+ __asm __volatile("movq %%gs:%1,%0; addq %2,%0" \
+ : "=r" (__p) \
+ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \
+ "i" (__pcpu_offset(name))); \
+ \
+ __p; \
+})
+
+/*
+ * Evaluates to the value of the per-cpu variable name.
+ */
+#define __PCPU_GET(name) __extension__ ({ \
+ __pcpu_type(name) __res; \
+ struct __s { \
+ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \
+ } __s; \
+ \
+ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \
+ sizeof(__res) == 4 || sizeof(__res) == 8) { \
+ __asm __volatile("mov %%gs:%1,%0" \
+ : "=r" (__s) \
+ : "m" (*(struct __s *)(__pcpu_offset(name)))); \
+ *(struct __s *)(void *)&__res = __s; \
+ } else { \
+ __res = *__PCPU_PTR(name); \
+ } \
+ __res; \
+})
+
+/*
+ * Adds the value to the per-cpu counter name. The implementation
+ * must be atomic with respect to interrupts.
+ */
+#define __PCPU_ADD(name, val) do { \
+ __pcpu_type(name) __val; \
+ struct __s { \
+ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \
+ } __s; \
+ \
+ __val = (val); \
+ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \
+ sizeof(__val) == 4 || sizeof(__val) == 8) { \
+ __s = *(struct __s *)(void *)&__val; \
+ __asm __volatile("add %1,%%gs:%0" \
+ : "=m" (*(struct __s *)(__pcpu_offset(name))) \
+ : "r" (__s)); \
+ } else \
+ *__PCPU_PTR(name) += __val; \
+} while (0)
+
+/*
+ * Increments the value of the per-cpu counter name. The implementation
+ * must be atomic with respect to interrupts.
+ */
+#define __PCPU_INC(name) do { \
+ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \
+ sizeof(__pcpu_type(name)) == 2 || \
+ sizeof(__pcpu_type(name)) == 4 || \
+ sizeof(__pcpu_type(name)) == 8); \
+ if (sizeof(__pcpu_type(name)) == 1) { \
+ __asm __volatile("incb %%gs:%0" \
+ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
+ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
+ } else if (sizeof(__pcpu_type(name)) == 2) { \
+ __asm __volatile("incw %%gs:%0" \
+ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
+ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
+ } else if (sizeof(__pcpu_type(name)) == 4) { \
+ __asm __volatile("incl %%gs:%0" \
+ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
+ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
+ } else if (sizeof(__pcpu_type(name)) == 8) { \
+ __asm __volatile("incq %%gs:%0" \
+ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
+ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
+ } \
+} while (0)
+
+/*
+ * Sets the value of the per-cpu variable name to value val.
+ */
+#define __PCPU_SET(name, val) { \
+ __pcpu_type(name) __val; \
+ struct __s { \
+ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \
+ } __s; \
+ \
+ __val = (val); \
+ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \
+ sizeof(__val) == 4 || sizeof(__val) == 8) { \
+ __s = *(struct __s *)(void *)&__val; \
+ __asm __volatile("mov %1,%%gs:%0" \
+ : "=m" (*(struct __s *)(__pcpu_offset(name))) \
+ : "r" (__s)); \
+ } else { \
+ *__PCPU_PTR(name) = __val; \
+ } \
+}
+
+#define PCPU_GET(member) __PCPU_GET(pc_ ## member)
+#define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val)
+#define PCPU_INC(member) __PCPU_INC(pc_ ## member)
+#define PCPU_PTR(member) __PCPU_PTR(pc_ ## member)
+#define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val)
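
As a usage sketch (the pc_ prefix is stripped by the wrapper macros; both
fields below come from PCPU_MD_FIELDS above, and kernel_pmap comes from
<machine/pmap.h>):

    u_int id;

    id = PCPU_GET(apic_id);          /* one %gs-relative mov */
    PCPU_SET(curpmap, kernel_pmap);  /* likewise a single store */

Because each access is a single %gs-relative instruction, no locking or
interrupt disabling is needed to keep it consistent on the local CPU.
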
+
+#define OFFSETOF_CURTHREAD 0
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnull-dereference"
+#endif
+static __inline __pure2 struct thread *
+__curthread(void)
+{
+ struct thread *td;
+
+ __asm("movq %%gs:%1,%0" : "=r" (td)
+ : "m" (*(char *)OFFSETOF_CURTHREAD));
+ return (td);
+}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#define curthread (__curthread())
+
+#define OFFSETOF_CURPCB 32
+static __inline __pure2 struct pcb *
+__curpcb(void)
+{
+ struct pcb *pcb;
+
+ __asm("movq %%gs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB));
+ return (pcb);
+}
+#define curpcb (__curpcb())
+
+#define IS_BSP() (PCPU_GET(cpuid) == 0)
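
For example (sketch), fetching the current thread costs one instruction and
composes with the other per-CPU accessors:

    struct thread *td = curthread;  /* movq %gs:OFFSETOF_CURTHREAD,%reg */

    if (IS_BSP())
        printf("%s running on the boot processor\n", td->td_name);
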
+
+#else /* !lint || defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */
+
+#error "this file needs to be ported to your compiler"
+
+#endif /* lint, etc. */
+
+#endif /* _KERNEL */
+
+#endif /* !_MACHINE_PCPU_H_ */
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
new file mode 100644
index 0000000..6d76ec3
--- /dev/null
+++ b/sys/amd64/include/pmap.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department and William Jolitz of UUNET Technologies Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Derived from hp300 version by Mike Hibler, this version by William
+ * Jolitz uses a recursive map [a pde points to the page directory] to
+ * map the page tables using the pagetables themselves. This is done to
+ * reduce the impact on kernel virtual memory for lots of sparse address
+ * space, and to reduce the cost of memory to each process.
+ *
+ * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90
+ * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PMAP_H_
+#define _MACHINE_PMAP_H_
+
+/*
+ * Page-directory and page-table entries follow this format.  A few of
+ * the fields are absent at some paging levels, and a few are honored
+ * only when the matching processor feature is enabled.
+ */
+ /* ---- Intel Nomenclature ---- */
+#define PG_V 0x001 /* P Valid */
+#define PG_RW 0x002 /* R/W Read/Write */
+#define PG_U 0x004 /* U/S User/Supervisor */
+#define PG_NC_PWT 0x008 /* PWT Write through */
+#define PG_NC_PCD 0x010 /* PCD Cache disable */
+#define PG_A 0x020 /* A Accessed */
+#define PG_M 0x040 /* D Dirty */
+#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
+#define PG_PTE_PAT 0x080 /* PAT PAT index */
+#define PG_G 0x100 /* G Global */
+#define PG_AVAIL1 0x200 /* / Available for system */
+#define PG_AVAIL2 0x400 /* < programmers use */
+#define PG_AVAIL3 0x800 /* \ */
+#define PG_PDE_PAT 0x1000 /* PAT PAT index */
+#define PG_NX (1ul<<63) /* No-execute */
+
+
+/* Our various interpretations of the above */
+#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
+#define PG_MANAGED PG_AVAIL2
+#define PG_FRAME (0x000ffffffffff000ul)
+#define PG_PS_FRAME (0x000fffffffe00000ul)
+#define PG_PROT (PG_RW|PG_U) /* all protection bits */
+#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
+
+/* Page level cache control fields used to determine the PAT type */
+#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
+#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
+
+/*
+ * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
+ * (PTE) page mappings have identical settings for the following fields:
+ */
+#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
+ PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
+
+/*
+ * Page Protection Exception bits
+ */
+
+#define PGEX_P 0x01 /* Protection violation vs. not present */
+#define PGEX_W 0x02 /* during a Write cycle */
+#define PGEX_U 0x04 /* access from User mode (UPL) */
+#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
+#define PGEX_I 0x10 /* during an instruction fetch */
+
+/*
+ * Pte related macros. This is complicated by having to deal with
+ * the sign extension of the 48th bit.
+ */
+#define KVADDR(l4, l3, l2, l1) ( \
+ ((unsigned long)-1 << 47) | \
+ ((unsigned long)(l4) << PML4SHIFT) | \
+ ((unsigned long)(l3) << PDPSHIFT) | \
+ ((unsigned long)(l2) << PDRSHIFT) | \
+ ((unsigned long)(l1) << PAGE_SHIFT))
+
+#define UVADDR(l4, l3, l2, l1) ( \
+ ((unsigned long)(l4) << PML4SHIFT) | \
+ ((unsigned long)(l3) << PDPSHIFT) | \
+ ((unsigned long)(l2) << PDRSHIFT) | \
+ ((unsigned long)(l1) << PAGE_SHIFT))
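
As a worked example (assuming the usual amd64 constants PML4SHIFT = 39 and
NPML4EPG = 512, so that PML4PML4I below evaluates to 256), the base of the
recursive page-table map expands to:

    /*
     * KVADDR(256, 0, 0, 0):
     *   ((unsigned long)-1  << 47) = 0xffff800000000000  (sign extension)
     * | ((unsigned long)256 << 39) = 0x0000800000000000
     *                              = 0xffff800000000000
     *
     * Without the -1 << 47 term the result would not be canonical, since
     * bit 47 must be replicated through bits 48-63.
     */
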
+
+#define NKPML4E 1 /* number of kernel PML4 slots */
+
+#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
+#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
+#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */
+
+/*
+ * NDMPML4E is the number of PML4 entries that are used to implement the
+ * direct map. It must be a power of two.
+ */
+#define NDMPML4E 2
+
+/*
+ * The *PDI values control the layout of virtual memory. The starting address
+ * of the direct map, which is controlled by DMPML4I, must be a multiple of
+ * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ */
+#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
+
+#define KPML4I (NPML4EPG-1) /* Top 512GB for KVM */
+#define DMPML4I rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
+
+#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
+
+/*
+ * XXX doesn't really belong here I guess...
+ */
+#define ISA_HOLE_START 0xa0000
+#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START)
+
+#ifndef LOCORE
+
+#include <sys/queue.h>
+#include <sys/_cpuset.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+
+#include <vm/_vm_radix.h>
+
+typedef u_int64_t pd_entry_t;
+typedef u_int64_t pt_entry_t;
+typedef u_int64_t pdp_entry_t;
+typedef u_int64_t pml4_entry_t;
+
+#define PML4ESHIFT (3)
+#define PDPESHIFT (3)
+#define PTESHIFT (3)
+#define PDESHIFT (3)
+
+/*
+ * Address of current address space page table maps and directories.
+ */
+#ifdef _KERNEL
+#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
+#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
+#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
+#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
+#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
+#define PTmap ((pt_entry_t *)(addr_PTmap))
+#define PDmap ((pd_entry_t *)(addr_PDmap))
+#define PDPmap ((pd_entry_t *)(addr_PDPmap))
+#define PML4map ((pd_entry_t *)(addr_PML4map))
+#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
+
+extern int nkpt; /* Initial number of kernel page tables */
+extern u_int64_t KPDPphys; /* physical address of kernel level 3 */
+extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
+
+/*
+ * Translate a virtual address to its page table entry and to a
+ * physical address.
+ * Note: these work recursively, thus vtopte() of a pte will give
+ * the corresponding pde that in turn maps it.
+ */
+pt_entry_t *vtopte(vm_offset_t);
+#define vtophys(va) pmap_kextract(((vm_offset_t) (va)))
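
A sketch of how vtopte() can be implemented on top of the recursive map
(the real definition lives in pmap.c; the per-level 9-bit index shift
constants are assumed from <machine/param.h>):

    static __inline pt_entry_t *
    vtopte_sketch(vm_offset_t va)
    {
        u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
            NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

        /* Shift the VA down one level and index into the PTE window. */
        return (PTmap + ((va >> PAGE_SHIFT) & mask));
    }
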
+
+static __inline pt_entry_t
+pte_load(pt_entry_t *ptep)
+{
+ pt_entry_t r;
+
+ r = *ptep;
+ return (r);
+}
+
+static __inline pt_entry_t
+pte_load_store(pt_entry_t *ptep, pt_entry_t pte)
+{
+ pt_entry_t r;
+
+ __asm __volatile(
+ "xchgq %0,%1"
+ : "=m" (*ptep),
+ "=r" (r)
+ : "1" (pte),
+ "m" (*ptep));
+ return (r);
+}
+
+#define pte_load_clear(pte) atomic_readandclear_long(pte)
+
+static __inline void
+pte_store(pt_entry_t *ptep, pt_entry_t pte)
+{
+
+ *ptep = pte;
+}
+
+#define pte_clear(ptep) pte_store((ptep), (pt_entry_t)0ULL)
+
+#define pde_store(pdep, pde) pte_store((pdep), (pde))
+
+extern pt_entry_t pg_nx;
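
Put together, a typical kernel mapping update looks like the following
(illustrative only):

    pt_entry_t *pte, old;

    pte = vtopte(va);
    pte_store(pte, pa | PG_RW | PG_V);      /* establish the mapping */
    /* ... use the mapping ... */
    old = pte_load_clear(pte);              /* atomically read and clear */
    pmap_invalidate_page(kernel_pmap, va);  /* flush stale TLB entries */

pte_load_store() uses xchgq because an exchange with a memory operand is
implicitly locked, so a concurrent hardware update of the accessed or dirty
bits cannot be lost.
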
+
+#endif /* _KERNEL */
+
+/*
+ * Pmap stuff
+ */
+struct pv_entry;
+struct pv_chunk;
+
+struct md_page {
+ TAILQ_HEAD(,pv_entry) pv_list;
+ int pat_mode;
+};
+
+/*
+ * The kernel virtual address (KVA) of the level 4 page table page is always
+ * within the direct map (DMAP) region.
+ */
+struct pmap {
+ struct mtx pm_mtx;
+ pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
+ TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
+ cpuset_t pm_active; /* active on cpus */
+ /* spare u_int here due to padding */
+ struct pmap_statistics pm_stats; /* pmap statistics */
+ struct vm_radix pm_root; /* spare page table pages */
+};
+
+typedef struct pmap *pmap_t;
+
+#ifdef _KERNEL
+extern struct pmap kernel_pmap_store;
+#define kernel_pmap (&kernel_pmap_store)
+
+#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx)
+#define PMAP_LOCK_ASSERT(pmap, type) \
+ mtx_assert(&(pmap)->pm_mtx, (type))
+#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx)
+#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \
+ NULL, MTX_DEF | MTX_DUPOK)
+#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx)
+#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
+#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
+#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
+#endif
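
The lock protects the pmap's page-table and pv-chunk state; a typical
caller brackets its work as follows (sketch):

    PMAP_LOCK(pmap);
    /* ... walk or modify the pmap's page tables ... */
    PMAP_UNLOCK(pmap);

MTX_DUPOK is passed to mtx_init() because some operations, such as copying
mappings between two address spaces, legitimately hold two pmap locks at
once.
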
+
+/*
+ * For each vm_page_t, there is a list of all currently valid virtual
+ * mappings of that page. An entry is a pv_entry_t, the list is pv_list.
+ */
+typedef struct pv_entry {
+ vm_offset_t pv_va; /* virtual address for mapping */
+ TAILQ_ENTRY(pv_entry) pv_next;
+} *pv_entry_t;
+
+/*
+ * pv_entries are allocated in chunks per-process. This avoids the
+ * need to track per-pmap assignments.
+ */
+#define _NPCM 3
+#define _NPCPV 168
+struct pv_chunk {
+ pmap_t pc_pmap;
+ TAILQ_ENTRY(pv_chunk) pc_list;
+ uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */
+ TAILQ_ENTRY(pv_chunk) pc_lru;
+ struct pv_entry pc_pventry[_NPCPV];
+};
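
With 1 meaning free in pc_map, allocating a pv_entry from a chunk reduces
to a first-set-bit scan (a sketch of the idea; the real allocator is in
pmap.c):

    struct pv_chunk *pc;   /* a chunk with at least one free entry */
    pv_entry_t pv = NULL;
    int field, bit;

    for (field = 0; field < _NPCM; field++) {
        if (pc->pc_map[field] != 0) {
            bit = ffsl(pc->pc_map[field]) - 1;   /* first free slot */
            pc->pc_map[field] &= ~(1ul << bit);  /* mark it allocated */
            pv = &pc->pc_pventry[field * 64 + bit];
            break;
        }
    }

Three 64-bit words cover the 168 entries, with the unused tail bits of the
last word kept clear so they are never handed out.
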
+
+#ifdef _KERNEL
+
+extern caddr_t CADDR1;
+extern pt_entry_t *CMAP1;
+extern vm_paddr_t phys_avail[];
+extern vm_paddr_t dump_avail[];
+extern vm_offset_t virtual_avail;
+extern vm_offset_t virtual_end;
+
+#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode)
+#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0)
+#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz))
+
+void pmap_bootstrap(vm_paddr_t *);
+int pmap_change_attr(vm_offset_t, vm_size_t, int);
+void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate);
+void pmap_init_pat(void);
+void pmap_kenter(vm_offset_t va, vm_paddr_t pa);
+void *pmap_kenter_temporary(vm_paddr_t pa, int i);
+vm_paddr_t pmap_kextract(vm_offset_t);
+void pmap_kremove(vm_offset_t);
+void *pmap_mapbios(vm_paddr_t, vm_size_t);
+void *pmap_mapdev(vm_paddr_t, vm_size_t);
+void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
+boolean_t pmap_page_is_mapped(vm_page_t m);
+void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
+void pmap_unmapdev(vm_offset_t, vm_size_t);
+void pmap_invalidate_page(pmap_t, vm_offset_t);
+void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
+void pmap_invalidate_all(pmap_t);
+void pmap_invalidate_cache(void);
+void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
+void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
+
+#endif /* _KERNEL */
+
+#endif /* !LOCORE */
+
+#endif /* !_MACHINE_PMAP_H_ */
diff --git a/sys/amd64/include/pmc_mdep.h b/sys/amd64/include/pmc_mdep.h
new file mode 100644
index 0000000..73c93fe
--- /dev/null
+++ b/sys/amd64/include/pmc_mdep.h
@@ -0,0 +1,143 @@
+/*-
+ * Copyright (c) 2003-2008 Joseph Koshy
+ * Copyright (c) 2007 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* Machine dependent interfaces */
+
+#ifndef _MACHINE_PMC_MDEP_H
+#define _MACHINE_PMC_MDEP_H 1
+
+#ifdef _KERNEL
+struct pmc_mdep;
+#endif
+
+#include <dev/hwpmc/hwpmc_amd.h>
+#include <dev/hwpmc/hwpmc_core.h>
+#include <dev/hwpmc/hwpmc_piv.h>
+#include <dev/hwpmc/hwpmc_tsc.h>
+#include <dev/hwpmc/hwpmc_uncore.h>
+
+/*
+ * Intel processors implementing V2 and later of the Intel performance
+ * measurement architecture have PMCs of the following classes: TSC,
+ * IAF, IAP, UCF and UCP.
+ */
+#define PMC_MDEP_CLASS_INDEX_TSC 1
+#define PMC_MDEP_CLASS_INDEX_K8 2
+#define PMC_MDEP_CLASS_INDEX_P4 2
+#define PMC_MDEP_CLASS_INDEX_IAP 2
+#define PMC_MDEP_CLASS_INDEX_IAF 3
+#define PMC_MDEP_CLASS_INDEX_UCP 4
+#define PMC_MDEP_CLASS_INDEX_UCF 5
+
+/*
+ * On the amd64 platform we support the following PMCs.
+ *
+ * TSC The timestamp counter
+ * K8 AMD Athlon64 and Opteron PMCs in 64-bit mode.
+ * PIV Intel P4/HTT and P4/EMT64
+ * IAP Intel Core/Core2/Atom CPUs in 64-bit mode.
+ * IAF Intel fixed-function PMCs in Core2 and later CPUs.
+ * UCP Intel Uncore programmable PMCs.
+ * UCF Intel Uncore fixed-function PMCs.
+ */
+
+union pmc_md_op_pmcallocate {
+ struct pmc_md_amd_op_pmcallocate pm_amd;
+ struct pmc_md_iaf_op_pmcallocate pm_iaf;
+ struct pmc_md_iap_op_pmcallocate pm_iap;
+ struct pmc_md_ucf_op_pmcallocate pm_ucf;
+ struct pmc_md_ucp_op_pmcallocate pm_ucp;
+ struct pmc_md_p4_op_pmcallocate pm_p4;
+ uint64_t __pad[4];
+};
+
+/* Logging */
+#define PMCLOG_READADDR PMCLOG_READ64
+#define PMCLOG_EMITADDR PMCLOG_EMIT64
+
+#ifdef _KERNEL
+
+union pmc_md_pmc {
+ struct pmc_md_amd_pmc pm_amd;
+ struct pmc_md_iaf_pmc pm_iaf;
+ struct pmc_md_iap_pmc pm_iap;
+ struct pmc_md_ucf_pmc pm_ucf;
+ struct pmc_md_ucp_pmc pm_ucp;
+ struct pmc_md_p4_pmc pm_p4;
+};
+
+#define PMC_TRAPFRAME_TO_PC(TF) ((TF)->tf_rip)
+#define PMC_TRAPFRAME_TO_FP(TF) ((TF)->tf_rbp)
+#define PMC_TRAPFRAME_TO_USER_SP(TF) ((TF)->tf_rsp)
+#define PMC_TRAPFRAME_TO_KERNEL_SP(TF) ((TF)->tf_rsp)
+
+#define PMC_AT_FUNCTION_PROLOGUE_PUSH_BP(I) \
+ (((I) & 0xffffffff) == 0xe5894855) /* pushq %rbp; movq %rsp,%rbp */
+#define PMC_AT_FUNCTION_PROLOGUE_MOV_SP_BP(I) \
+ (((I) & 0x00ffffff) == 0x00e58948) /* movq %rsp,%rbp */
+#define PMC_AT_FUNCTION_EPILOGUE_RET(I) \
+ (((I) & 0xFF) == 0xC3) /* ret */
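
These matchers compare raw instruction bytes at a sampled PC.  The standard
prologue encodes as 0x55 (pushq %rbp) followed by 0x48 0x89 0xe5
(movq %rsp,%rbp); read as a little-endian 32-bit load from the function
entry, those bytes appear as 0xe5894855, which is exactly what the first
macro tests (sketch, with pc a sampled address):

    uint32_t insn = *(uint32_t *)pc;

    if (PMC_AT_FUNCTION_PROLOGUE_PUSH_BP(insn)) {
        /* %rbp not yet established; the unwinder must use %rsp. */
    }
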
+
+#define PMC_IN_TRAP_HANDLER(PC) \
+ ((PC) >= (uintptr_t) start_exceptions && \
+ (PC) < (uintptr_t) end_exceptions)
+
+#define PMC_IN_KERNEL_STACK(S,START,END) \
+ ((S) >= (START) && (S) < (END))
+#define PMC_IN_KERNEL(va) (((va) >= DMAP_MIN_ADDRESS && \
+ (va) < DMAP_MAX_ADDRESS) || ((va) >= VM_MIN_KERNEL_ADDRESS && \
+ (va) < VM_MAX_KERNEL_ADDRESS))
+
+#define PMC_IN_USERSPACE(va) ((va) <= VM_MAXUSER_ADDRESS)
+
+/* Build a fake kernel trapframe from the current instruction pointer. */
+#define PMC_FAKE_TRAPFRAME(TF) \
+ do { \
+ (TF)->tf_cs = 0; (TF)->tf_rflags = 0; \
+ __asm __volatile("movq %%rbp,%0" : "=r" ((TF)->tf_rbp)); \
+ __asm __volatile("movq %%rsp,%0" : "=r" ((TF)->tf_rsp)); \
+ __asm __volatile("call 1f \n\t1: pop %0" : "=r"((TF)->tf_rip)); \
+ } while (0)
+
+/*
+ * Prototypes
+ */
+
+void start_exceptions(void), end_exceptions(void);
+
+struct pmc_mdep *pmc_amd_initialize(void);
+void pmc_amd_finalize(struct pmc_mdep *_md);
+struct pmc_mdep *pmc_intel_initialize(void);
+void pmc_intel_finalize(struct pmc_mdep *_md);
+
+#endif /* _KERNEL */
+#endif /* _MACHINE_PMC_MDEP_H */
diff --git a/sys/amd64/include/ppireg.h b/sys/amd64/include/ppireg.h
new file mode 100644
index 0000000..5774757
--- /dev/null
+++ b/sys/amd64/include/ppireg.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PPIREG_H_
+#define _MACHINE_PPIREG_H_
+
+#ifdef _KERNEL
+
+#define IO_PPI 0x61 /* Programmable Peripheral Interface */
+
+/*
+ * PPI speaker control values
+ */
+
+#define PIT_ENABLETMR2 0x01 /* Enable timer/counter 2 */
+#define PIT_SPKRDATA 0x02 /* Direct to speaker */
+
+#define PIT_SPKR (PIT_ENABLETMR2 | PIT_SPKRDATA)
+
+#define ppi_spkr_on() outb(IO_PPI, inb(IO_PPI) | PIT_SPKR)
+#define ppi_spkr_off() outb(IO_PPI, inb(IO_PPI) & ~PIT_SPKR)
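
A minimal beep sketch using these macros (DELAY() and the timer 2 output
rate are assumed to be set up elsewhere; illustrative only):

    ppi_spkr_on();      /* gate timer 2's output onto the speaker */
    DELAY(100000);      /* let the tone play for ~100 ms */
    ppi_spkr_off();     /* silence it again */
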
+
+#endif /* _KERNEL */
+
+#endif /* _MACHINE_PPIREG_H_ */
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
new file mode 100644
index 0000000..14585fb
--- /dev/null
+++ b/sys/amd64/include/proc.h
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)proc.h 7.1 (Berkeley) 5/15/91
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PROC_H_
+#define _MACHINE_PROC_H_
+
+#include <machine/segments.h>
+
+struct proc_ldt {
+ caddr_t ldt_base;
+ int ldt_refcnt;
+};
+
+/*
+ * Machine-dependent part of the proc structure for AMD64.
+ */
+struct mdthread {
+ int md_spinlock_count; /* (k) */
+ register_t md_saved_flags; /* (k) */
+ register_t md_spurflt_addr; /* (k) Spurious page fault address. */
+};
+
+struct mdproc {
+ struct proc_ldt *md_ldt; /* (t) per-process ldt */
+ struct system_segment_descriptor md_ldt_sd;
+};
+
+#define KINFO_PROC_SIZE 1088
+#define KINFO_PROC32_SIZE 768
+
+#ifdef _KERNEL
+
+/* Get the current kernel thread stack usage. */
+#define GET_STACK_USAGE(total, used) do { \
+ struct thread *td = curthread; \
+ (total) = td->td_kstack_pages * PAGE_SIZE; \
+ (used) = (char *)td->td_kstack + \
+ td->td_kstack_pages * PAGE_SIZE - \
+ (char *)&td; \
+} while (0)
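
For example (sketch), a subsystem can check how deep it already is before
recursing further:

    int total, used;

    GET_STACK_USAGE(total, used);
    if (used > total / 2)
        printf("kernel stack over half used: %d of %d bytes\n",
            used, total);

The macro measures usage down to the address of its own local variable, so
the result reflects the stack depth at the point of the call.
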
+
+void set_user_ldt(struct mdproc *);
+struct proc_ldt *user_ldt_alloc(struct proc *, int);
+void user_ldt_free(struct thread *);
+void user_ldt_deref(struct proc_ldt *);
+struct sysarch_args;
+int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space);
+int amd64_set_ldt_data(struct thread *td, int start, int num,
+ struct user_segment_descriptor *descs);
+
+extern struct mtx dt_lock;
+extern int max_ldt_segment;
+
+struct syscall_args {
+ u_int code;
+ struct sysent *callp;
+ register_t args[8];
+ int narg;
+};
+#endif /* _KERNEL */
+
+#endif /* !_MACHINE_PROC_H_ */
diff --git a/sys/amd64/include/profile.h b/sys/amd64/include/profile.h
new file mode 100644
index 0000000..8fea371
--- /dev/null
+++ b/sys/amd64/include/profile.h
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)profile.h 8.1 (Berkeley) 6/11/93
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_PROFILE_H_
+#define _MACHINE_PROFILE_H_
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Config generates something to tell the compiler to align functions on 16
+ * byte boundaries. A strict alignment is good for keeping the tables small.
+ */
+#define FUNCTION_ALIGNMENT 16
+
+/*
+ * The kernel uses assembler stubs instead of unportable inlines.
+ * This is mainly to save a little time when profiling is not enabled,
+ * which is the usual case for the kernel.
+ */
+#define _MCOUNT_DECL void mcount
+#define MCOUNT
+
+#ifdef GUPROF
+#define MCOUNT_DECL(s)
+#define MCOUNT_ENTER(s)
+#define MCOUNT_EXIT(s)
+#ifdef __GNUCLIKE_ASM
+#define MCOUNT_OVERHEAD(label) \
+ __asm __volatile("pushq %0; call __mcount; popq %%rcx" \
+ : \
+ : "i" (label) \
+ : "ax", "dx", "cx", "di", "si", "r8", "r9", "memory")
+#define MEXITCOUNT_OVERHEAD() \
+ __asm __volatile("call .mexitcount; 1:" \
+ : : \
+ : "ax", "dx", "cx", "di", "si", "r8", "r9", "memory")
+#define MEXITCOUNT_OVERHEAD_GETLABEL(labelp) \
+ __asm __volatile("movq $1b,%0" : "=rm" (labelp))
+#elif defined(lint)
+#define MCOUNT_OVERHEAD(label)
+#define MEXITCOUNT_OVERHEAD()
+#define MEXITCOUNT_OVERHEAD_GETLABEL()
+#else
+#error this file needs to be ported to your compiler
+#endif /* !__GNUCLIKE_ASM */
+#else /* !GUPROF */
+#define MCOUNT_DECL(s) register_t s;
+#ifdef SMP
+extern int mcount_lock;
+#define MCOUNT_ENTER(s) { s = intr_disable(); \
+ while (!atomic_cmpset_acq_int(&mcount_lock, 0, 1)) \
+ /* nothing */ ; }
+#define MCOUNT_EXIT(s) { atomic_store_rel_int(&mcount_lock, 0); \
+ intr_restore(s); }
+#else
+#define MCOUNT_ENTER(s) { s = intr_disable(); }
+#define MCOUNT_EXIT(s) (intr_restore(s))
+#endif
+#endif /* GUPROF */
+
+void bintr(void);
+void btrap(void);
+void eintr(void);
+void user(void);
+
+#define MCOUNT_FROMPC_USER(pc) \
+ ((pc < (uintfptr_t)VM_MAXUSER_ADDRESS) ? (uintfptr_t)user : pc)
+
+#define MCOUNT_FROMPC_INTR(pc) \
+ ((pc >= (uintfptr_t)btrap && pc < (uintfptr_t)eintr) ? \
+ ((pc >= (uintfptr_t)bintr) ? (uintfptr_t)bintr : \
+ (uintfptr_t)btrap) : ~0UL)
+
+#else /* !_KERNEL */
+
+#define FUNCTION_ALIGNMENT 4
+
+#define _MCOUNT_DECL \
+static void _mcount(uintfptr_t frompc, uintfptr_t selfpc) __used; \
+static void _mcount
+
+#ifdef __GNUCLIKE_ASM
+#define MCOUNT __asm(" \n\
+ .text \n\
+ .p2align 4,0x90 \n\
+ .globl .mcount \n\
+ .type .mcount,@function \n\
+.mcount: \n\
+ pushq %rdi \n\
+ pushq %rsi \n\
+ pushq %rdx \n\
+ pushq %rcx \n\
+ pushq %r8 \n\
+ pushq %r9 \n\
+ pushq %rax \n\
+ movq 8(%rbp),%rdi \n\
+ movq 7*8(%rsp),%rsi \n\
+ call _mcount \n\
+ popq %rax \n\
+ popq %r9 \n\
+ popq %r8 \n\
+ popq %rcx \n\
+ popq %rdx \n\
+ popq %rsi \n\
+ popq %rdi \n\
+ ret \n\
+ .size .mcount, . - .mcount");
+#if 0
+/*
+ * We could use this, except it doesn't preserve the registers that were
+ * being passed with arguments to the function that we were inserted
+ * into. I've left it here as documentation of what the code above is
+ * supposed to do.
+ */
+#define MCOUNT \
+void \
+mcount() \
+{ \
+ uintfptr_t selfpc, frompc; \
+ /* \
+ * Find the return address for mcount, \
+ * and the return address for mcount's caller. \
+ * \
+ * selfpc = pc pushed by call to mcount \
+ */ \
+ __asm("movq 8(%%rbp),%0" : "=r" (selfpc)); \
+ /* \
+ * frompc = pc pushed by call to mcount's caller. \
+ * The caller's stack frame has already been built, so %rbp is \
+ * the caller's frame pointer. The caller's raddr is in the \
+ * caller's frame following the caller's caller's frame pointer.\
+ */ \
+ __asm("movq (%%rbp),%0" : "=r" (frompc)); \
+ frompc = ((uintfptr_t *)frompc)[1]; \
+ _mcount(frompc, selfpc); \
+}
+#endif
+#else /* !__GNUCLIKE_ASM */
+#define MCOUNT
+#endif /* __GNUCLIKE_ASM */
+
+typedef u_long uintfptr_t;
+
+#endif /* _KERNEL */
+
+/*
+ * An unsigned integral type that can hold a non-negative difference
+ * between function pointers.
+ */
+typedef u_long fptrdiff_t;
+
+#ifdef _KERNEL
+
+void mcount(uintfptr_t frompc, uintfptr_t selfpc);
+
+#else /* !_KERNEL */
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+#ifdef __GNUCLIKE_ASM
+void mcount(void) __asm(".mcount");
+#endif
+__END_DECLS
+
+#endif /* _KERNEL */
+
+#endif /* !_MACHINE_PROFILE_H_ */
diff --git a/sys/amd64/include/psl.h b/sys/amd64/include/psl.h
new file mode 100644
index 0000000..4d945a1
--- /dev/null
+++ b/sys/amd64/include/psl.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/psl.h>
diff --git a/sys/amd64/include/ptrace.h b/sys/amd64/include/ptrace.h
new file mode 100644
index 0000000..bf86754
--- /dev/null
+++ b/sys/amd64/include/ptrace.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/ptrace.h>
diff --git a/sys/amd64/include/reg.h b/sys/amd64/include/reg.h
new file mode 100644
index 0000000..f6fb2bc
--- /dev/null
+++ b/sys/amd64/include/reg.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/reg.h>
diff --git a/sys/amd64/include/reloc.h b/sys/amd64/include/reloc.h
new file mode 100644
index 0000000..1883193
--- /dev/null
+++ b/sys/amd64/include/reloc.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)reloc.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _I386_MACHINE_RELOC_H_
+#define _I386_MACHINE_RELOC_H_
+
+/* Relocation format. */
+struct relocation_info {
+ int r_address; /* offset in text or data segment */
+ unsigned int r_symbolnum : 24, /* ordinal number of add symbol */
+ r_pcrel : 1, /* 1 if value should be pc-relative */
+ r_length : 2, /* log base 2 of value's width */
+ r_extern : 1, /* 1 if need to add symbol to value */
+ r_baserel : 1, /* linkage table relative */
+ r_jmptable : 1, /* relocate to jump table */
+ r_relative : 1, /* load address relative */
+ r_copy : 1; /* run time copy */
+};
+
+#endif
diff --git a/sys/amd64/include/resource.h b/sys/amd64/include/resource.h
new file mode 100644
index 0000000..edde5eb
--- /dev/null
+++ b/sys/amd64/include/resource.h
@@ -0,0 +1,44 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MACHINE_RESOURCE_H_
+#define _MACHINE_RESOURCE_H_ 1
+
+/*
+ * Definitions of resource types for Intel Architecture machines
+ * with support for legacy ISA devices and drivers.
+ */
+
+#define SYS_RES_IRQ 1 /* interrupt lines */
+#define SYS_RES_DRQ 2 /* isa dma lines */
+#define SYS_RES_MEMORY 3 /* i/o memory */
+#define SYS_RES_IOPORT 4 /* i/o ports */
+
+#endif /* !_MACHINE_RESOURCE_H_ */
diff --git a/sys/amd64/include/runq.h b/sys/amd64/include/runq.h
new file mode 100644
index 0000000..855e315
--- /dev/null
+++ b/sys/amd64/include/runq.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_RUNQ_H_
+#define _MACHINE_RUNQ_H_
+
+#define RQB_LEN (1) /* Number of priority status words. */
+#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY). */
+#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
+
+#define RQB_BIT(pri) (1ul << ((pri) & (RQB_BPW - 1)))
+#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
+
+#define RQB_FFS(word) (bsfq(word))
+
+/*
+ * Type of run queue status word.
+ */
+typedef u_int64_t rqb_word_t;
+
+#endif
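
A worked example of the status-word arithmetic (with the single 64-bit word
covering all 64 queue indices):

    /*
     * For priority index 37:
     *   RQB_WORD(37) = 37 >> 6          = 0         (word 0 of 1)
     *   RQB_BIT(37)  = 1ul << (37 & 63) = 1ul << 37
     *
     * RQB_FFS() then locates the lowest set bit -- i.e. the best
     * (lowest-numbered) non-empty priority -- with a single bsfq.
     */
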
diff --git a/sys/amd64/include/segments.h b/sys/amd64/include/segments.h
new file mode 100644
index 0000000..d9f4280
--- /dev/null
+++ b/sys/amd64/include/segments.h
@@ -0,0 +1,106 @@
+/*-
+ * Copyright (c) 1989, 1990 William F. Jolitz
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)segments.h 7.1 (Berkeley) 5/9/91
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_SEGMENTS_H_
+#define _MACHINE_SEGMENTS_H_
+
+/*
+ * AMD64 Segmentation Data Structures and definitions
+ */
+
+#include <x86/segments.h>
+
+/*
+ * System segment descriptors (128 bit wide)
+ */
+struct system_segment_descriptor {
+ u_int64_t sd_lolimit:16; /* segment extent (lsb) */
+ u_int64_t sd_lobase:24; /* segment base address (lsb) */
+ u_int64_t sd_type:5; /* segment type */
+ u_int64_t sd_dpl:2; /* segment descriptor priority level */
+ u_int64_t sd_p:1; /* segment descriptor present */
+ u_int64_t sd_hilimit:4; /* segment extent (msb) */
+ u_int64_t sd_xx0:3; /* unused */
+ u_int64_t sd_gran:1; /* limit granularity (byte/page units)*/
+ u_int64_t sd_hibase:40 __packed;/* segment base address (msb) */
+ u_int64_t sd_xx1:8;
+ u_int64_t sd_mbz:5; /* MUST be zero */
+ u_int64_t sd_xx2:19;
+} __packed;
+
+/*
+ * Software definitions are in this convenient format, which is
+ * translated into the less convenient hardware segment descriptor
+ * format when needed by the processor.
+ */
+
+struct soft_segment_descriptor {
+ unsigned long ssd_base; /* segment base address */
+ unsigned long ssd_limit; /* segment extent */
+ unsigned long ssd_type:5; /* segment type */
+ unsigned long ssd_dpl:2; /* segment descriptor priority level */
+ unsigned long ssd_p:1; /* segment descriptor present */
+ unsigned long ssd_long:1; /* long mode (for %cs) */
+ unsigned long ssd_def32:1; /* default 32 vs 16 bit size */
+ unsigned long ssd_gran:1; /* limit granularity (byte/page units)*/
+} __packed;
+
+/*
+ * Region descriptors, used to load the gdt/idt tables before segments exist.
+ */
+struct region_descriptor {
+ unsigned long rd_limit:16; /* segment extent */
+ unsigned long rd_base:64 __packed; /* base address */
+} __packed;
+
+#ifdef _KERNEL
+extern struct user_segment_descriptor gdt[];
+extern struct soft_segment_descriptor gdt_segs[];
+extern struct gate_descriptor *idt;
+extern struct region_descriptor r_gdt, r_idt;
+
+void lgdt(struct region_descriptor *rdp);
+void sdtossd(struct user_segment_descriptor *sdp,
+ struct soft_segment_descriptor *ssdp);
+void ssdtosd(struct soft_segment_descriptor *ssdp,
+ struct user_segment_descriptor *sdp);
+void ssdtosyssd(struct soft_segment_descriptor *ssdp,
+ struct system_segment_descriptor *sdp);
+void update_gdt_gsbase(struct thread *td, uint32_t base);
+void update_gdt_fsbase(struct thread *td, uint32_t base);
+#endif /* _KERNEL */
+
+#endif /* !_MACHINE_SEGMENTS_H_ */
diff --git a/sys/amd64/include/setjmp.h b/sys/amd64/include/setjmp.h
new file mode 100644
index 0000000..c4101a7
--- /dev/null
+++ b/sys/amd64/include/setjmp.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/setjmp.h>
diff --git a/sys/amd64/include/sf_buf.h b/sys/amd64/include/sf_buf.h
new file mode 100644
index 0000000..b5245e6
--- /dev/null
+++ b/sys/amd64/include/sf_buf.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2003, 2005 Alan L. Cox <alc@cs.rice.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_SF_BUF_H_
+#define _MACHINE_SF_BUF_H_
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_page.h>
+
+/*
+ * On this machine, the only purpose for which sf_buf is used is to implement
+ * an opaque pointer required by the machine-independent parts of the kernel.
+ * That pointer references the vm_page that is "mapped" by the sf_buf. The
+ * actual mapping is provided by the direct virtual-to-physical mapping.
+ */
+struct sf_buf;
+
+static __inline vm_offset_t
+sf_buf_kva(struct sf_buf *sf)
+{
+
+ return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS((vm_page_t)sf)));
+}
+
+static __inline vm_page_t
+sf_buf_page(struct sf_buf *sf)
+{
+
+ return ((vm_page_t)sf);
+}
+
+#endif /* !_MACHINE_SF_BUF_H_ */
diff --git a/sys/amd64/include/sigframe.h b/sys/amd64/include/sigframe.h
new file mode 100644
index 0000000..d5cdb56
--- /dev/null
+++ b/sys/amd64/include/sigframe.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/sigframe.h>
diff --git a/sys/amd64/include/signal.h b/sys/amd64/include/signal.h
new file mode 100644
index 0000000..db9fe6a
--- /dev/null
+++ b/sys/amd64/include/signal.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/signal.h>
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
new file mode 100644
index 0000000..16d87ea
--- /dev/null
+++ b/sys/amd64/include/smp.h
@@ -0,0 +1,82 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _MACHINE_SMP_H_
+#define _MACHINE_SMP_H_
+
+#ifdef _KERNEL
+
+#ifdef SMP
+
+#ifndef LOCORE
+
+#include <sys/bus.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/pcb.h>
+
+/* global symbols in mpboot.S */
+extern char mptramp_start[];
+extern char mptramp_end[];
+extern u_int32_t mptramp_pagetables;
+
+/* global data in mp_machdep.c */
+extern int mp_naps;
+extern int boot_cpu_id;
+extern struct pcb stoppcbs[];
+extern int cpu_apic_ids[];
+#ifdef COUNT_IPIS
+extern u_long *ipi_invltlb_counts[MAXCPU];
+extern u_long *ipi_invlrng_counts[MAXCPU];
+extern u_long *ipi_invlpg_counts[MAXCPU];
+extern u_long *ipi_invlcache_counts[MAXCPU];
+extern u_long *ipi_rendezvous_counts[MAXCPU];
+#endif
+
+/* IPI handlers */
+inthand_t
+ IDTVEC(invltlb), /* TLB shootdowns - global */
+ IDTVEC(invlpg), /* TLB shootdowns - 1 page */
+ IDTVEC(invlrng), /* TLB shootdowns - page range */
+ IDTVEC(invlcache), /* Write back and invalidate cache */
+ IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */
+ IDTVEC(cpustop), /* CPU stops & waits to be restarted */
+ IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */
+ IDTVEC(rendezvous); /* handle CPU rendezvous */
+
+/* functions in mp_machdep.c */
+void cpu_add(u_int apic_id, char boot_cpu);
+void cpustop_handler(void);
+void cpususpend_handler(void);
+void init_secondary(void);
+void ipi_startup(int apic_id, int vector);
+void ipi_all_but_self(u_int ipi);
+void ipi_bitmap_handler(struct trapframe frame);
+void ipi_cpu(int cpu, u_int ipi);
+int ipi_nmi_handler(void);
+void ipi_selected(cpuset_t cpus, u_int ipi);
+u_int mp_bootaddress(u_int);
+void smp_cache_flush(void);
+void smp_invlpg(vm_offset_t addr);
+void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
+void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
+void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
+ vm_offset_t endva);
+void smp_invltlb(void);
+void smp_masked_invltlb(cpuset_t mask);
+
+#endif /* !LOCORE */
+#endif /* SMP */
+
+#endif /* _KERNEL */
+#endif /* _MACHINE_SMP_H_ */
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h
new file mode 100644
index 0000000..aace4bf
--- /dev/null
+++ b/sys/amd64/include/specialreg.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/specialreg.h>
diff --git a/sys/amd64/include/stack.h b/sys/amd64/include/stack.h
new file mode 100644
index 0000000..24e2547
--- /dev/null
+++ b/sys/amd64/include/stack.h
@@ -0,0 +1,44 @@
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_STACK_H_
+#define _MACHINE_STACK_H_
+
+/*
+ * Stack trace.
+ */
+#define INKERNEL(va) (((va) >= DMAP_MIN_ADDRESS && (va) < DMAP_MAX_ADDRESS) \
+ || ((va) >= VM_MIN_KERNEL_ADDRESS && (va) < VM_MAX_KERNEL_ADDRESS))
+
+struct amd64_frame {
+ struct amd64_frame *f_frame;
+ long f_retaddr;
+ long f_arg0;
+};
+
+#endif /* !_MACHINE_STACK_H_ */
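
A stack-walk sketch using the definitions above (illustrative only; the
real walker lives in stack_machdep.c, and record_pc() is a hypothetical
consumer):

    struct amd64_frame *frame;

    frame = (struct amd64_frame *)rbp;      /* a captured %rbp value */
    while (INKERNEL((vm_offset_t)frame)) {
        record_pc(frame->f_retaddr);        /* saved return address */
        frame = frame->f_frame;             /* follow the saved %rbp */
    }

The loop terminates when the chain leaves kernel address space, which is
how trap frames and user frames are kept out of kernel stack traces.
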
diff --git a/sys/amd64/include/stdarg.h b/sys/amd64/include/stdarg.h
new file mode 100644
index 0000000..1f80090
--- /dev/null
+++ b/sys/amd64/include/stdarg.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/stdarg.h>
diff --git a/sys/amd64/include/sysarch.h b/sys/amd64/include/sysarch.h
new file mode 100644
index 0000000..cd380d4
--- /dev/null
+++ b/sys/amd64/include/sysarch.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/sysarch.h>
diff --git a/sys/amd64/include/timerreg.h b/sys/amd64/include/timerreg.h
new file mode 100644
index 0000000..cf5f281
--- /dev/null
+++ b/sys/amd64/include/timerreg.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The outputs of the three timers are connected as follows:
+ *
+ * timer 0 -> irq 0
+ * timer 1 -> dma chan 0 (for dram refresh)
+ * timer 2 -> speaker (via keyboard controller)
+ *
+ * Timer 0 is used to call hardclock.
+ * Timer 2 is used to generate console beeps.
+ */
+
+#ifndef _MACHINE_TIMERREG_H_
+#define _MACHINE_TIMERREG_H_
+
+#ifdef _KERNEL
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
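+
+/*
+ * Example (editor's sketch, not part of the original header): a console
+ * beep programs timer 2 in square-wave mode and loads a divisor; the
+ * mode bits come from i8253reg.h and the ~1 kHz divisor is illustrative.
+ *
+ *	outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
+ *	outb(TIMER_CNTR2, 1193 & 0xff);		(low byte of divisor)
+ *	outb(TIMER_CNTR2, 1193 >> 8);		(high byte of divisor)
+ */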
+
+#endif /* _KERNEL */
+
+#endif /* _MACHINE_TIMERREG_H_ */
diff --git a/sys/amd64/include/trap.h b/sys/amd64/include/trap.h
new file mode 100644
index 0000000..4d95077
--- /dev/null
+++ b/sys/amd64/include/trap.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/trap.h>
diff --git a/sys/amd64/include/tss.h b/sys/amd64/include/tss.h
new file mode 100644
index 0000000..fbbe3af
--- /dev/null
+++ b/sys/amd64/include/tss.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)tss.h 5.4 (Berkeley) 1/18/91
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_TSS_H_
+#define _MACHINE_TSS_H_ 1
+
+/*
+ * amd64 Context Data Type
+ *
+ * The alignment is pretty messed up here due to reuse of the original 32 bit
+ * fields. It might be worth trying to set the tss on a +4 byte offset to
+ * make the 64 bit fields aligned in practice.
+ */
+struct amd64tss {
+ u_int32_t tss_rsvd0;
+ u_int64_t tss_rsp0 __packed; /* kernel stack pointer ring 0 */
+ u_int64_t tss_rsp1 __packed; /* kernel stack pointer ring 1 */
+ u_int64_t tss_rsp2 __packed; /* kernel stack pointer ring 2 */
+ u_int32_t tss_rsvd1;
+ u_int32_t tss_rsvd2;
+ u_int64_t tss_ist1 __packed; /* Interrupt stack table 1 */
+ u_int64_t tss_ist2 __packed; /* Interrupt stack table 2 */
+ u_int64_t tss_ist3 __packed; /* Interrupt stack table 3 */
+ u_int64_t tss_ist4 __packed; /* Interrupt stack table 4 */
+ u_int64_t tss_ist5 __packed; /* Interrupt stack table 5 */
+ u_int64_t tss_ist6 __packed; /* Interrupt stack table 6 */
+ u_int64_t tss_ist7 __packed; /* Interrupt stack table 7 */
+ u_int32_t tss_rsvd3;
+ u_int32_t tss_rsvd4;
+ u_int16_t tss_rsvd5;
+ u_int16_t tss_iobase; /* io bitmap offset */
+};
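+
+/*
+ * Example (editor's sketch, not part of the original header): on context
+ * switch the kernel points tss_rsp0 at the top of the incoming thread's
+ * kernel stack so that ring transitions land on it; new_kstack_top is
+ * hypothetical.
+ *
+ *	common_tss[curcpu].tss_rsp0 = (u_int64_t)new_kstack_top;
+ */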
+
+#ifdef _KERNEL
+extern struct amd64tss common_tss[];
+#endif
+
+#endif /* _MACHINE_TSS_H_ */
diff --git a/sys/amd64/include/ucontext.h b/sys/amd64/include/ucontext.h
new file mode 100644
index 0000000..aea80e3
--- /dev/null
+++ b/sys/amd64/include/ucontext.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/ucontext.h>
diff --git a/sys/amd64/include/varargs.h b/sys/amd64/include/varargs.h
new file mode 100644
index 0000000..93faac6
--- /dev/null
+++ b/sys/amd64/include/varargs.h
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2002 David E. O'Brien. All rights reserved.
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)varargs.h 8.2 (Berkeley) 3/22/94
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_VARARGS_H_
+#define _MACHINE_VARARGS_H_
+
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+
+#ifdef __GNUCLIKE_BUILTIN_VARARGS
+
+#include <sys/_types.h>
+
+#ifndef _VA_LIST_DECLARED
+#define _VA_LIST_DECLARED
+typedef __va_list va_list;
+#endif
+
+typedef int __builtin_va_alist_t __attribute__((__mode__(__word__)));
+
+#define va_alist __builtin_va_alist
+#define va_dcl __builtin_va_alist_t __builtin_va_alist; ...
+#define va_start(ap) __builtin_varargs_start(ap)
+#define va_arg(ap, type) __builtin_va_arg((ap), type)
+#define va_end(ap) __builtin_va_end(ap)
+
+#else /* !__GNUCLIKE_BUILTIN_VARARGS */
+
+typedef char *va_list;
+
+#define __va_size(type) \
+ (((sizeof(type) + sizeof(int) - 1) / sizeof(int)) * sizeof(int))
+
+#if defined(__GNUCLIKE_BUILTIN_VAALIST)
+#define va_alist __builtin_va_alist
+#endif
+#define va_dcl int va_alist; ...
+
+#define va_start(ap) \
+ ((ap) = (va_list)&va_alist)
+
+#define va_arg(ap, type) \
+ (*(type *)((ap) += __va_size(type), (ap) - __va_size(type)))
+
+#define va_end(ap)
+
+#endif /* __GNUCLIKE_BUILTIN_VARARGS */
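+
+/*
+ * Example (editor's sketch, not part of the original header): an
+ * old-style varargs function is defined with va_alist/va_dcl instead of
+ * a '...' prototype; the hypothetical sum() below adds arguments until
+ * it sees a zero.
+ *
+ *	int
+ *	sum(va_alist)
+ *	va_dcl
+ *	{
+ *		va_list ap;
+ *		int n, total = 0;
+ *
+ *		va_start(ap);
+ *		while ((n = va_arg(ap, int)) != 0)
+ *			total += n;
+ *		va_end(ap);
+ *		return (total);
+ *	}
+ */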
+
+#endif /* !_MACHINE_VARARGS_H_ */
diff --git a/sys/amd64/include/vdso.h b/sys/amd64/include/vdso.h
new file mode 100644
index 0000000..b81c455
--- /dev/null
+++ b/sys/amd64/include/vdso.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD$ */
+
+#include <x86/vdso.h>
diff --git a/sys/amd64/include/vm.h b/sys/amd64/include/vm.h
new file mode 100644
index 0000000..6573e37
--- /dev/null
+++ b/sys/amd64/include/vm.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2009 Advanced Computing Technologies LLC
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_VM_H_
+#define _MACHINE_VM_H_
+
+#include <machine/specialreg.h>
+
+/* Memory attributes. */
+#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE)
+#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING)
+#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH)
+#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED)
+#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK)
+#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED)
+
+#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK
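+
+/*
+ * Example (editor's sketch, not part of the original header): drivers
+ * commonly request write-combining for frame-buffer-style mappings,
+ * e.g. via pmap_change_attr(); the va and size here are illustrative.
+ *
+ *	pmap_change_attr(va, size, VM_MEMATTR_WRITE_COMBINING);
+ */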
+
+#endif /* !_MACHINE_VM_H_ */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 0000000..9a3063e
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,291 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+enum x2apic_state;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+int vm_create(const char *name, struct vm **retvm);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_RUNNING,
+ VCPU_CANNOT_RUN,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
+
+static __inline int
+vcpu_is_running(struct vm *vm, int vcpu)
+{
+ return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* _KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+enum x2apic_state {
+ X2APIC_ENABLED,
+ X2APIC_AVAILABLE,
+ X2APIC_DISABLED,
+ X2APIC_STATE_LAST
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_MAX
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ struct {
+ uint64_t gpa;
+ struct vie vie;
+ } paging;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ } u;
+};
+
+#endif /* _VMM_H_ */
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 0000000..0729927
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+ IOCNUM_SET_X2APIC_STATE,
+ IOCNUM_GET_X2APIC_STATE,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
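+
+/*
+ * Example (editor's sketch, not part of the original header): userland
+ * drives a virtual machine by opening its /dev/vmm node and issuing
+ * these ioctls; the path and helper below are hypothetical.
+ *
+ *	struct vm_run vmrun = { .cpuid = 0, .rip = guest_rip };
+ *	int fd = open("/dev/vmm/guest0", O_RDWR);
+ *
+ *	if (fd >= 0 && ioctl(fd, VM_RUN, &vmrun) == 0)
+ *		handle_exit(&vmrun.vm_exit);
+ */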
+#endif /* _VMM_DEV_H_ */
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..a812a73
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding.  Their contents are exposed only
+ * because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1,
+ rex_present:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
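+
+/*
+ * Example (editor's sketch, not part of the original header): a consumer
+ * supplies callbacks matching the typedefs above and hands the decoded
+ * instruction to the emulator; mmio_read() and mmio_write() are
+ * hypothetical.
+ *
+ *	static int
+ *	mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval,
+ *	    int rsize, void *arg)
+ *	{
+ *		*rval = 0;	(device-specific read goes here)
+ *		return (0);
+ *	}
+ *
+ *	error = vmm_emulate_instruction(vm, cpuid, gpa, vie,
+ *	    mmio_read, mmio_write, NULL);
+ */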
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+/*
+ * Decode the instruction fetched into 'vie' so it can be emulated.
+ *
+ * 'gla' is the guest linear address provided by the hardware assist
+ * that caused the nested page table fault. It is used to verify that
+ * the software instruction decoding is in agreement with the hardware.
+ *
+ * Some hardware assists do not provide the 'gla' to the hypervisor.
+ * To skip the 'gla' verification for this or any other reason, pass
+ * in VIE_INVALID_GLA instead.
+ */
+#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
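+
+/*
+ * Example (editor's sketch, not part of the original header): the nested
+ * page fault handler fetches and then decodes in sequence, passing
+ * VIE_INVALID_GLA when the hardware assist supplies no guest linear
+ * address.
+ *
+ *	if (vmm_fetch_instruction(vm, cpuid, rip, inst_length, cr3,
+ *	    &vie) == 0)
+ *		error = vmm_decode_instruction(vm, cpuid,
+ *		    VIE_INVALID_GLA, &vie);
+ */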
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
new file mode 100644
index 0000000..e06fa39
--- /dev/null
+++ b/sys/amd64/include/vmparam.h
@@ -0,0 +1,217 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vmparam.h 5.9 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+
+#ifndef _MACHINE_VMPARAM_H_
+#define _MACHINE_VMPARAM_H_ 1
+
+/*
+ * Machine dependent constants for AMD64.
+ */
+
+/*
+ * Virtual memory related constants, all in bytes
+ */
+#define MAXTSIZ (128UL*1024*1024) /* max text size */
+#ifndef DFLDSIZ
+#define DFLDSIZ (32768UL*1024*1024) /* initial data size limit */
+#endif
+#ifndef MAXDSIZ
+#define MAXDSIZ (32768UL*1024*1024) /* max data size */
+#endif
+#ifndef DFLSSIZ
+#define DFLSSIZ (8UL*1024*1024) /* initial stack size limit */
+#endif
+#ifndef MAXSSIZ
+#define MAXSSIZ (512UL*1024*1024) /* max stack size */
+#endif
+#ifndef SGROWSIZ
+#define SGROWSIZ (128UL*1024) /* amount to grow stack */
+#endif
+
+/*
+ * We provide a machine specific single page allocator through the use
+ * of the direct mapped segment. This uses 2MB pages for reduced
+ * TLB pressure.
+ */
+#define UMA_MD_SMALL_ALLOC
+
+/*
+ * The physical address space is densely populated.
+ */
+#define VM_PHYSSEG_DENSE
+
+/*
+ * The number of PHYSSEG entries must be one greater than the number
+ * of phys_avail entries because the phys_avail entry that spans the
+ * largest physical address that is accessible by ISA DMA is split
+ * into two PHYSSEG entries.
+ */
+#define VM_PHYSSEG_MAX 31
+
+/*
+ * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
+ * from which physical pages are allocated, VM_FREEPOOL_DIRECT is the
+ * pool from which physical pages for page tables and small UMA objects
+ * are allocated, and VM_FREEPOOL_CACHE is the pool backing cached pages.
+ */
+#define VM_NFREEPOOL 3
+#define VM_FREEPOOL_CACHE 2
+#define VM_FREEPOOL_DEFAULT 0
+#define VM_FREEPOOL_DIRECT 1
+
+/*
+ * Create two free page lists: VM_FREELIST_DEFAULT is for physical
+ * pages that are above the largest physical address that is
+ * accessible by ISA DMA and VM_FREELIST_ISADMA is for physical pages
+ * that are below that address.
+ */
+#define VM_NFREELIST 2
+#define VM_FREELIST_DEFAULT 0
+#define VM_FREELIST_ISADMA 1
+
+/*
+ * An allocation size of 16MB is supported in order to optimize the
+ * use of the direct map by UMA. Specifically, a cache line contains
+ * at most 8 PDEs, collectively mapping 16MB of physical memory. By
+ * reducing the number of distinct 16MB "pages" that are used by UMA,
+ * the physical memory allocator reduces the likelihood of both 2MB
+ * page TLB misses and cache misses caused by 2MB page TLB misses.
+ */
+#define VM_NFREEORDER 13
+
+/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
+ * Enable superpage reservations: 1 level.
+ */
+#ifndef VM_NRESERVLEVEL
+#define VM_NRESERVLEVEL 1
+#endif
+
+/*
+ * Level 0 reservations consist of 512 pages.
+ */
+#ifndef VM_LEVEL_0_ORDER
+#define VM_LEVEL_0_ORDER 9
+#endif
+
+#ifdef SMP
+#define PA_LOCK_COUNT 256
+#endif
+
+/*
+ * Virtual addresses of things. Derived from the page directory and
+ * page table indexes from pmap.h for precision.
+ *
+ * 0x0000000000000000 - 0x00007fffffffffff user map
+ * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole)
+ * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot)
+ * 0xffff804020101000 - 0xfffffdffffffffff unused
+ * 0xfffffe0000000000 - 0xfffffeffffffffff 1TB direct map
+ * 0xffffff0000000000 - 0xffffff7fffffffff unused
+ * 0xffffff8000000000 - 0xffffffffffffffff 512GB kernel map
+ *
+ * Within the kernel map:
+ *
+ * 0xffffffff80000000 KERNBASE
+ */
+
+#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
+#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-512, 0, 0)
+
+#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
+#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+
+#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)
+
+#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
+#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0)
+
+#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0)
+
+#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
+#define USRSTACK SHAREDPAGE
+
+#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
+#define VM_MIN_ADDRESS (0)
+
+#define PHYS_TO_DMAP(x) ((x) | DMAP_MIN_ADDRESS)
+#define DMAP_TO_PHYS(x) ((x) & ~DMAP_MIN_ADDRESS)
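+
+/*
+ * Example (editor's sketch, not part of the original header): because the
+ * direct map is a fixed offset, converting a physical address to a usable
+ * kernel virtual address is a single OR, and the conversion round-trips.
+ *
+ *	vm_paddr_t pa = 0x1000;
+ *	void *va = (void *)PHYS_TO_DMAP(pa);
+ *	KASSERT(DMAP_TO_PHYS((vm_offset_t)va) == pa, ("round trip"));
+ */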
+
+/* virtual sizes (bytes) for various kernel submaps */
+#ifndef VM_KMEM_SIZE
+#define VM_KMEM_SIZE (12 * 1024 * 1024)
+#endif
+
+/*
+ * How many physical pages per KVA page allocated.
+ * min(max(max(VM_KMEM_SIZE, Physical memory/VM_KMEM_SIZE_SCALE),
+ * VM_KMEM_SIZE_MIN), VM_KMEM_SIZE_MAX)
+ * is the total KVA space allocated for kmem_map.
+ */
+#ifndef VM_KMEM_SIZE_SCALE
+#define VM_KMEM_SIZE_SCALE (1)
+#endif
+
+/*
+ * Ceiling on amount of kmem_map kva space.
+ */
+#ifndef VM_KMEM_SIZE_MAX
+#define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \
+ VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5)
+#endif
+
+/* initial pagein size of beginning of executable file */
+#ifndef VM_INITIAL_PAGEIN
+#define VM_INITIAL_PAGEIN 16
+#endif
+
+#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */
+
+#endif /* _MACHINE_VMPARAM_H_ */
diff --git a/sys/amd64/include/xen/hypercall.h b/sys/amd64/include/xen/hypercall.h
new file mode 100644
index 0000000..50fa376
--- /dev/null
+++ b/sys/amd64/include/xen/hypercall.h
@@ -0,0 +1,415 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * 64-bit updates:
+ * Benjamin Liu <benjamin.liu@intel.com>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __MACHINE_XEN_HYPERCALL_H__
+#define __MACHINE_XEN_HYPERCALL_H__
+
+#include <sys/systm.h>
+
+#ifndef __XEN_HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+#define ENOXENSYS 38
+#define CONFIG_XEN_COMPAT 0x030002
+#define __must_check
+
+#ifdef XEN
+#define HYPERCALL_STR(name) \
+ "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
+#else
+#define HYPERCALL_STR(name) \
+ "mov $("STR(__HYPERVISOR_##name)" * 32),%%eax; "\
+ "add hypercall_stubs(%%rip),%%rax; " \
+ "call *%%rax"
+#endif
+
+#define _hypercall0(type, name) \
+({ \
+ type __res; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res) \
+ : \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ type __res; \
+ long __ign1; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1) \
+ : "1" ((long)(a1)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ type __res; \
+ long __ign1, __ign2; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
+ : "1" ((long)(a1)), "2" ((long)(a2)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 __asm__("r10") = (long)(a4); \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 __asm__("r10") = (long)(a4); \
+ register long __arg5 __asm__("r8") = (long)(a5); \
+ __asm__ volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
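+
+/*
+ * Example (editor's sketch, not part of the original header): each
+ * HYPERVISOR_*() wrapper below expands one of the _hypercallN() macros,
+ * which place the arguments in %rdi, %rsi, %rdx, %r10 and %r8 per the
+ * 64-bit Xen hypercall ABI and return the result in %rax.  A hypothetical
+ * two-argument wrapper would look like:
+ *
+ *	static inline int __must_check
+ *	HYPERVISOR_example_op(int cmd, void *arg)
+ *	{
+ *		return _hypercall2(int, example_op, cmd, arg);
+ *	}
+ */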
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+ const trap_info_t *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+ unsigned long *frame_list, unsigned int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+ unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_address, unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address, syscall_address);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+ int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+ int cmd, unsigned long arg)
+{
+ return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ uint64_t timeout)
+{
+ return _hypercall1(long, set_timer_op, timeout);
+}
+
+static inline int __must_check
+HYPERVISOR_platform_op(
+ struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+ unsigned int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+ unsigned int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ unsigned long ma, unsigned long word)
+{
+ return _hypercall2(int, update_descriptor, ma, word);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+ unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+ multicall_entry_t *call_list, unsigned int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, uint64_t new_val, unsigned long flags)
+{
+ return _hypercall3(int, update_va_mapping, va, new_val, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOXENSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+ int cmd, unsigned int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOXENSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+ unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid)
+{
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+ unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+ int cmd, unsigned int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_set_segment_base(
+ int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+ unsigned long srec)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = SHUTDOWN_suspend
+ };
+
+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOXENSYS)
+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+#endif
+
+ return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+ unsigned long op, void *arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+ int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+ int cmd, const void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+ int op, void *arg)
+{
+ return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
+#undef __must_check
+
+#endif /* __MACHINE_XEN_HYPERCALL_H__ */
diff --git a/sys/amd64/include/xen/synch_bitops.h b/sys/amd64/include/xen/synch_bitops.h
new file mode 100644
index 0000000..746687a
--- /dev/null
+++ b/sys/amd64/include/xen/synch_bitops.h
@@ -0,0 +1,129 @@
+#ifndef __XEN_SYNCH_BITOPS_H__
+#define __XEN_SYNCH_BITOPS_H__
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ * Heavily modified to provide guaranteed strong synchronisation
+ * when communicating with Xen or other guest OSes running on other CPUs.
+ */
+
+
+#define ADDR (*(volatile long *) addr)
+
+static __inline__ void synch_set_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btsl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btrl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_change_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btcl %1,%0"
+ : "=m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__ (
+ "lock btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+struct __synch_xchg_dummy { unsigned long a[100]; };
+#define __synch_xg(x) ((volatile struct __synch_xchg_dummy *)(x))
+
+#define synch_cmpxchg(ptr, old, new) \
+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
+ (unsigned long)(old), \
+ (unsigned long)(new), \
+ sizeof(*(ptr))))
+
+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ __asm__ __volatile__("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ __asm__ __volatile__("lock; cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ __asm__ __volatile__("lock; cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
+{
+ return ((1UL << (nr & 31)) &
+ (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+}
+
+static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "btl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
+ return oldbit;
+}
+
+#define synch_test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ synch_const_test_bit((nr),(addr)) : \
+ synch_var_test_bit((nr),(addr)))
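+
+/*
+ * Example (editor's sketch, not part of the original header): a spinlock
+ * shared with another guest can be built from the atomic test-and-set
+ * and clear pair; shared_flags is hypothetical.
+ *
+ *	while (synch_test_and_set_bit(0, shared_flags))
+ *		;	(spin until bit 0 was previously clear)
+ *	(... critical section ...)
+ *	synch_clear_bit(0, shared_flags);
+ */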
+
+#endif /* __XEN_SYNCH_BITOPS_H__ */
diff --git a/sys/amd64/include/xen/xen-os.h b/sys/amd64/include/xen/xen-os.h
new file mode 100644
index 0000000..163e7f2
--- /dev/null
+++ b/sys/amd64/include/xen/xen-os.h
@@ -0,0 +1,296 @@
+/******************************************************************************
+ * os.h
+ *
+ * A random collection of macros and definitions.
+ */
+
+#ifndef _XEN_OS_H_
+#define _XEN_OS_H_
+
+#ifdef PAE
+#define CONFIG_X86_PAE
+#endif
+
+#if !defined(__XEN_INTERFACE_VERSION__)
+/*
+ * Can update to a more recent version when we implement
+ * the hypercall page
+ */
+#define __XEN_INTERFACE_VERSION__ 0x00030204
+#endif
+
+#include <xen/interface/xen.h>
+
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+extern int gdtset;
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ __asm__ __volatile__ ( "rep;nop" : : : "memory" );
+}
+#define cpu_relax() rep_nop()
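+
+/*
+ * Example (editor's sketch, not part of the original header): cpu_relax()
+ * belongs in the body of a busy-wait loop, e.g. while waiting for another
+ * CPU to finish loading the GDT:
+ *
+ *	while (gdtset == 0)
+ *		cpu_relax();
+ */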
+
+/*
+ * Crude memory allocator for memory allocations early in boot.
+ */
+void *bootmem_alloc(unsigned int size);
+void bootmem_free(void *ptr, unsigned int size);
+
+
+/* Everything below this point is not included by assembler (.S) files. */
+#ifndef __ASSEMBLY__
+
+void printk(const char *fmt, ...);
+
+/* some function prototypes */
+void trap_init(void);
+
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+#ifndef XENHVM
+
+/*
+ * STI/CLI equivalents. These basically set and clear the virtual
+ * event_enable flag in the shared_info structure. Note that when
+ * the enable bit is set, there may be pending events to be handled.
+ * We may therefore call into do_hypervisor_callback() directly.
+ */
+
+#define __cli() \
+do { \
+ vcpu_info_t *_vcpu; \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ _vcpu->evtchn_upcall_mask = 1; \
+ barrier(); \
+} while (0)
+
+#define __sti() \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ _vcpu->evtchn_upcall_mask = 0; \
+ barrier(); /* unmask then check (avoid races) */ \
+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+ force_evtchn_callback(); \
+} while (0)
+
+#define __restore_flags(x) \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
+ barrier(); /* unmask then check (avoid races) */ \
+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+ force_evtchn_callback(); \
+ } \
+} while (0)
+
+/*
+ * Add critical_{enter, exit}?
+ *
+ */
+#define __save_and_cli(x) \
+do { \
+ vcpu_info_t *_vcpu; \
+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \
+ (x) = _vcpu->evtchn_upcall_mask; \
+ _vcpu->evtchn_upcall_mask = 1; \
+ barrier(); \
+} while (0)
+
+
+#define cli() __cli()
+#define sti() __sti()
+#define save_flags(x) __save_flags(x)
+#define restore_flags(x) __restore_flags(x)
+#define save_and_cli(x) __save_and_cli(x)
+
+#define local_irq_save(x) __save_and_cli(x)
+#define local_irq_restore(x) __restore_flags(x)
+#define local_irq_disable() __cli()
+#define local_irq_enable() __sti()
+
+#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));}
+#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); }
+#define spin_lock_irqsave mtx_lock_irqsave
+#define spin_unlock_irqrestore mtx_unlock_irqrestore
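+
+/*
+ * Example (editor's sketch, not part of the original header): the usual
+ * pairing saves the current upcall mask, masks events for the critical
+ * section, and restores the saved state; sc_lock and flags are
+ * hypothetical.
+ *
+ *	unsigned long flags;
+ *
+ *	mtx_lock_irqsave(&sc_lock, flags);
+ *	(... events masked on this vcpu ...)
+ *	mtx_unlock_irqrestore(&sc_lock, flags);
+ */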
+
+#endif /* !XENHVM */
+
+#ifndef mb
+#define mb() __asm__ __volatile__("mfence":::"memory")
+#endif
+#ifndef rmb
+#define rmb() __asm__ __volatile__("lfence":::"memory");
+#endif
+#ifndef wmb
+#define wmb() barrier()
+#endif
+#ifdef SMP
+#define smp_mb() mb()
+#define smp_rmb() rmb()
+#define smp_wmb() wmb()
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while(0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+
+/* This is a barrier for the compiler only, NOT the processor! */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+#define LOCK_PREFIX ""
+#define LOCK ""
+#define ADDR (*(volatile long *) addr)
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct { volatile int counter; } atomic_t;
+
+
+
+#define xen_xchg(ptr,v) \
+ ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
+struct __xchg_dummy { unsigned long a[100]; };
+#define __xg(x) ((volatile struct __xchg_dummy *)(x))
+static __inline unsigned long __xchg(unsigned long x, volatile void * ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("xchgb %b0,%1"
+ :"=q" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 2:
+ __asm__ __volatile__("xchgw %w0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 4:
+ __asm__ __volatile__("xchgl %0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ }
+ return x;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static __inline int test_and_clear_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__( LOCK_PREFIX
+ "btrl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit),"=m" (ADDR)
+ :"Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline int constant_test_bit(int nr, const volatile void * addr)
+{
+ return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
+}
+
+static __inline int variable_test_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__(
+ "btl %2,%1\n\tsbbl %0,%0"
+ :"=r" (oldbit)
+ :"m" (ADDR),"Ir" (nr));
+ return oldbit;
+}
+
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
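+
+/*
+ * Usage sketch (field names hypothetical): a constant bit number takes
+ * the constant-folded path, a computed one the btl-based path, through
+ * the same interface:
+ *
+ *	if (test_bit(0, &s->evtchn_pending[0]))
+ *		handle_event(0);
+ *	if (test_bit(port & 31, &s->evtchn_pending[port >> 5]))
+ *		handle_event(port);
+ */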
+
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__( LOCK_PREFIX
+ "btsl %1,%0"
+ :"=m" (ADDR)
+ :"Ir" (nr));
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static __inline__ void clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__( LOCK_PREFIX
+ "btrl %1,%0"
+ :"=m" (ADDR)
+ :"Ir" (nr));
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1. On x86 the full 32-bit range of an
+ * atomic_t is usable.
+ */
+static __inline__ void atomic_inc(atomic_t *v)
+{
+ __asm__ __volatile__(
+ LOCK "incl %0"
+ :"=m" (v->counter)
+ :"m" (v->counter));
+}
+
+
+/* Note: "=A" does not form the %edx:%eax pair on amd64; read both halves. */
+#define rdtscll(val) do {						\
+	unsigned int __rdtsc_lo, __rdtsc_hi;				\
+	__asm__ __volatile__("rdtsc" : "=a" (__rdtsc_lo), "=d" (__rdtsc_hi)); \
+	(val) = ((unsigned long)__rdtsc_hi << 32) | __rdtsc_lo;	\
+} while (0)
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _OS_H_ */
diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h
new file mode 100644
index 0000000..d03d4f6
--- /dev/null
+++ b/sys/amd64/include/xen/xenfunc.h
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2004, 2005 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XEN_XENFUNC_H_
+#define _XEN_XENFUNC_H_
+
+#ifdef XENHVM
+#include <machine/xen/xenvar.h>
+#else
+#include <machine/xen/xenpmap.h>
+#include <machine/segments.h>
+#endif
+
+#define BKPT __asm__("int3")
+#define XPQ_CALL_DEPTH 5
+#define XPQ_CALL_COUNT 2
+#define PG_PRIV PG_AVAIL3
+typedef struct {
+ unsigned long pt_ref;
+ unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH];
+} pteinfo_t;
+
+extern pteinfo_t *pteinfo_list;
+#ifdef XENDEBUG_LOW
+#define __PRINTK(x) printk x
+#else
+#define __PRINTK(x)
+#endif
+
+char *xen_setbootenv(char *cmd_line);
+
+int xen_boothowto(char *envp);
+
+void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line);
+
+#ifdef INVARIANTS
+#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), __FILE__, __LINE__)
+#else
+#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0)
+#endif
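+
+/*
+ * Call-site sketch: callers always use the two-argument form; the
+ * __FILE__/__LINE__ provenance is appended only under INVARIANTS.
+ *
+ *	xen_machphys_update(mfn, pfn);
+ */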
+
+#ifndef XENHVM
+void xen_update_descriptor(union descriptor *, union descriptor *);
+#endif
+
+extern struct mtx balloon_lock;
+#if 0
+#define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags)
+#define balloon_unlock(__flags) mtx_unlock_irqrestore(&balloon_lock, __flags)
+#else
+#define balloon_lock(__flags) __flags = 1
+#define balloon_unlock(__flags) __flags = 0
+#endif
+
+#endif /* _XEN_XENFUNC_H_ */
diff --git a/sys/amd64/include/xen/xenpmap.h b/sys/amd64/include/xen/xenpmap.h
new file mode 100644
index 0000000..d768dad
--- /dev/null
+++ b/sys/amd64/include/xen/xenpmap.h
@@ -0,0 +1,227 @@
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004,2005 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENPMAP_H_
+#define _XEN_XENPMAP_H_
+
+#include <machine/xen/features.h>
+
+void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int);
+void xen_pt_switch(vm_paddr_t);
+void xen_set_ldt(vm_paddr_t, unsigned long);
+void xen_pgdpt_pin(vm_paddr_t);
+void xen_pgd_pin(vm_paddr_t);
+void xen_pgd_unpin(vm_paddr_t);
+void xen_pt_pin(vm_paddr_t);
+void xen_pt_unpin(vm_paddr_t);
+void xen_flush_queue(void);
+void xen_check_queue(void);
+#if 0
+void pmap_ref(pt_entry_t *pte, vm_paddr_t ma);
+#endif
+
+#ifdef INVARIANTS
+#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__)
+#else
+#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0)
+#endif
+
+#ifdef PMAP_DEBUG
+#define PMAP_REF pmap_ref
+#define PMAP_DEC_REF_PAGE pmap_dec_ref_page
+#define PMAP_MARK_PRIV pmap_mark_privileged
+#define PMAP_MARK_UNPRIV pmap_mark_unprivileged
+#else
+#define PMAP_MARK_PRIV(a)
+#define PMAP_MARK_UNPRIV(a)
+#define PMAP_REF(a, b)
+#define PMAP_DEC_REF_PAGE(a)
+#endif
+
+#define ALWAYS_SYNC 0
+
+#ifdef PT_DEBUG
+#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__)
+#else
+#define PT_LOG()
+#endif
+
+#define INVALID_P2M_ENTRY (~0UL)
+
+#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */
+
+#define SH_PD_SET_VA 1
+#define SH_PD_SET_VA_MA 2
+#define SH_PD_SET_VA_CLEAR 3
+
+struct pmap;
+void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type);
+#ifdef notyet
+static vm_paddr_t
+vptetomachpte(vm_paddr_t *pte)
+{
+ vm_offset_t offset, ppte;
+ vm_paddr_t pgoffset, retval, *pdir_shadow_ptr;
+ int pgindex;
+
+ ppte = (vm_offset_t)pte;
+ pgoffset = (ppte & PAGE_MASK);
+ offset = ppte - (vm_offset_t)PTmap;
+ pgindex = ppte >> PDRSHIFT;
+
+ pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow);
+ retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset;
+ return (retval);
+}
+#endif
+#define PT_GET(_ptp) \
+ (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : (0))
+
+#ifdef WRITABLE_PAGETABLES
+
+#define PT_SET_VA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ PT_LOG(); \
+ *(_ptp) = xpmap_ptom((_npte)); \
+} while (/*CONSTCOND*/0)
+#define PT_SET_VA_MA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ PT_LOG(); \
+ *(_ptp) = (_npte); \
+} while (/*CONSTCOND*/0)
+#define PT_CLEAR_VA(_ptp, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ PT_LOG(); \
+ *(_ptp) = 0; \
+} while (/*CONSTCOND*/0)
+
+#define PD_SET_VA(_pmap, _ptepindex, _npte, sync) do {		\
+	PMAP_REF((_ptepindex), xpmap_ptom(_npte));		\
+	pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+#define PD_SET_VA_MA(_pmap, _ptepindex, _npte, sync) do {	\
+	PMAP_REF((_ptepindex), (_npte));			\
+	pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do {		\
+	PMAP_REF((pt_entry_t *)(_ptepindex), 0);		\
+	pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+
+#else /* !WRITABLE_PAGETABLES */
+
+#define PT_SET_VA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), xpmap_ptom(_npte)); \
+ xen_queue_pt_update(vtomach(_ptp), \
+ xpmap_ptom(_npte)); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PT_SET_VA_MA(_ptp,_npte,sync) do { \
+ PMAP_REF((_ptp), (_npte)); \
+ xen_queue_pt_update(vtomach(_ptp), _npte); \
+ if (sync || ALWAYS_SYNC) xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PT_CLEAR_VA(_ptp, sync) do { \
+ PMAP_REF((pt_entry_t *)(_ptp), 0); \
+ xen_queue_pt_update(vtomach(_ptp), 0); \
+ if (sync || ALWAYS_SYNC) \
+ xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+#define PD_SET_VA(_pmap, _ptepindex, _npte, sync) do {		\
+	PMAP_REF((_ptepindex), xpmap_ptom(_npte));		\
+	pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+#define PD_SET_VA_MA(_pmap, _ptepindex, _npte, sync) do {	\
+	PMAP_REF((_ptepindex), (_npte));			\
+	pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do {		\
+	PMAP_REF((pt_entry_t *)(_ptepindex), 0);		\
+	pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR);	\
+	if (sync || ALWAYS_SYNC) xen_flush_queue();		\
+} while (/*CONSTCOND*/0)
+
+#endif
+
+#define PT_SET_MA(_va, _ma) \
+do { \
+ PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\
+ (_ma), \
+ UVMF_INVLPG| UVMF_ALL) < 0); \
+} while (/*CONSTCOND*/0)
+
+#define PT_UPDATES_FLUSH() do { \
+ xen_flush_queue(); \
+} while (/*CONSTCOND*/0)
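+
+/*
+ * Batching sketch (caller names hypothetical): under !WRITABLE_PAGETABLES,
+ * a run of PTE writes can be queued with sync == FALSE and paid for with
+ * a single flush instead of one hypercall per entry:
+ *
+ *	for (i = 0; i < npages; i++)
+ *		PT_SET_VA(&ptep[i], npte[i], FALSE);
+ *	PT_UPDATES_FLUSH();
+ *
+ * The writes are queued and the final call issues the one flush.
+ */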
+
+static __inline vm_paddr_t
+xpmap_mtop(vm_paddr_t mpa)
+{
+ vm_paddr_t tmp = (mpa & PG_FRAME);
+
+	return (machtophys(tmp) | (mpa & ~PG_FRAME));
+}
+
+static __inline vm_paddr_t
+xpmap_ptom(vm_paddr_t ppa)
+{
+ vm_paddr_t tmp = (ppa & PG_FRAME);
+
+	return (phystomach(tmp) | (ppa & ~PG_FRAME));
+}
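+
+/*
+ * Only the frame bits are translated; the low flag bits pass through
+ * unchanged, so a complete PTE converts directly (sketch):
+ *
+ *	pt_entry_t ma_pte = xpmap_ptom(pa | PG_V | PG_RW);
+ */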
+
+static __inline void
+set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+#ifdef notyet
+ PANIC_IF(max_mapnr && pfn >= max_mapnr);
+#endif
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef notyet
+ PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY));
+#endif
+ return;
+ }
+ xen_phys_machine[pfn] = mfn;
+}
+
+
+#endif /* _XEN_XENPMAP_H_ */
diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h
new file mode 100644
index 0000000..d9dbc5d
--- /dev/null
+++ b/sys/amd64/include/xen/xenvar.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef XENVAR_H_
+#define XENVAR_H_
+#define XBOOTUP 0x1
+#define XPMAP 0x2
+extern int xendebug_flags;
+#ifndef NOXENDEBUG
+#define XENPRINTF printk
+#else
+#define XENPRINTF printf
+#endif
+#include <xen/features.h>
+
+#if 0
+#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__)
+#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__)
+#define TRACE_DEBUG(argflags, _f, _a...) \
+if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a);
+#else
+#define TRACE_ENTER
+#define TRACE_EXIT
+#define TRACE_DEBUG(argflags, _f, _a...)
+#endif
+
+#ifdef XENHVM
+
+static inline vm_paddr_t
+phystomach(vm_paddr_t pa)
+{
+
+ return (pa);
+}
+
+static inline vm_paddr_t
+machtophys(vm_paddr_t ma)
+{
+
+ return (ma);
+}
+
+#define vtomach(va) pmap_kextract((vm_offset_t) (va))
+#define PFNTOMFN(pa) (pa)
+#define MFNTOPFN(ma) (ma)
+
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) (TRUE)
+#define PT_UPDATES_FLUSH() ((void)0)
+
+#else
+
+extern xen_pfn_t *xen_phys_machine;
+extern xen_pfn_t *xen_machine_phys;
+
+/*
+ * Xen starts physical pages after the 4MB ISA hole; FreeBSD does not.
+ */
+#undef ADD_ISA_HOLE /* XXX */
+
+#ifdef ADD_ISA_HOLE
+#define ISA_INDEX_OFFSET 1024
+#define ISA_PDR_OFFSET 1
+#else
+#define ISA_INDEX_OFFSET 0
+#define ISA_PDR_OFFSET 0
+#endif
+
+
+#define PFNTOMFN(i) (xen_phys_machine[(i)])
+#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)])
+
+#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE)
+#define PTOV(x) (((uintptr_t)(x)) + KERNBASE)
+
+#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT)
+#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT)
+
+#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT)
+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
+
+#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT)
+#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT)
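+
+/*
+ * Note (sketch): these translate whole frames, so a sub-page offset is
+ * dropped and must be carried separately:
+ *
+ *	ma = phystomach(pa & ~PAGE_MASK) | (pa & PAGE_MASK);
+ */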
+
+#endif
+
+void xpq_init(void);
+
+int xen_create_contiguous_region(vm_page_t pages, int npages);
+
+void xen_destroy_contiguous_region(void * addr, int npages);
+
+#endif
diff --git a/sys/amd64/linux32/Makefile b/sys/amd64/linux32/Makefile
new file mode 100644
index 0000000..4826981
--- /dev/null
+++ b/sys/amd64/linux32/Makefile
@@ -0,0 +1,17 @@
+# Makefile for syscall tables
+#
+# $FreeBSD$
+
+all:
+ @echo "make sysent only"
+
+sysent: linux32_sysent.c linux32_syscall.h linux32_proto.h linux32_syscalls.c linux32_systrace_args.c
+
+linux32_sysent.c linux32_syscall.h linux32_proto.h linux32_syscalls.c linux32_systrace_args.c: ../../kern/makesyscalls.sh \
+ syscalls.master syscalls.conf
+ -mv -f linux32_sysent.c linux32_sysent.c.bak
+ -mv -f linux32_syscall.h linux32_syscall.h.bak
+ -mv -f linux32_proto.h linux32_proto.h.bak
+ -mv -f linux32_syscalls.c linux32_syscalls.c.bak
+ -mv -f linux32_systrace_args.c linux32_systrace_args.c.bak
+ sh ../../kern/makesyscalls.sh syscalls.master syscalls.conf
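+
+# Usage sketch: after editing syscalls.master, regenerate the tables with
+#	cd sys/amd64/linux32 && make sysent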
diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h
new file mode 100644
index 0000000..7b52a64
--- /dev/null
+++ b/sys/amd64/linux32/linux.h
@@ -0,0 +1,788 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins
+ * Copyright (c) 2001 Doug Rabson
+ * Copyright (c) 1994-1996 Søren Schmidt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AMD64_LINUX_H_
+#define _AMD64_LINUX_H_
+
+#include <amd64/linux32/linux32_syscall.h>
+
+/*
+ * debugging support
+ */
+extern u_char linux_debug_map[];
+#define ldebug(name) isclr(linux_debug_map, LINUX_SYS_linux_ ## name)
+#define ARGS(nm, fmt) "linux(%ld): "#nm"("fmt")\n", (long)td->td_proc->p_pid
+#define LMSG(fmt) "linux(%ld): "fmt"\n", (long)td->td_proc->p_pid
+#define LINUX_DTRACE linuxulator32
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_LINUX);
+#endif
+
+#define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE)
+#define LINUX32_SHAREDPAGE (LINUX32_MAXUSER - PAGE_SIZE)
+#define LINUX32_USRSTACK LINUX32_SHAREDPAGE
+
+/* XXX 16 = sizeof(linux32_ps_strings) */
+#define LINUX32_PS_STRINGS (LINUX32_USRSTACK - 16)
+#define LINUX32_MAXDSIZ (512 * 1024 * 1024) /* 512MB */
+#define LINUX32_MAXSSIZ (64 * 1024 * 1024) /* 64MB */
+#define LINUX32_MAXVMEM 0 /* Unlimited */
+
+#define PTRIN(v) (void *)(uintptr_t)(v)
+#define PTROUT(v) (l_uintptr_t)(uintptr_t)(v)
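+
+/*
+ * Conversion sketch (iov32/iov hypothetical): 32-bit user pointers travel
+ * as l_uintptr_t and are widened back before kernel use:
+ *
+ *	iov.iov_base = PTRIN(iov32.iov_base);
+ *	iov32.iov_base = PTROUT(iov.iov_base);
+ */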
+
+/*
+ * Provide a separate set of types for the 32-bit Linux ABI.
+ */
+typedef int l_int;
+typedef int32_t l_long;
+typedef int64_t l_longlong;
+typedef short l_short;
+typedef unsigned int l_uint;
+typedef uint32_t l_ulong;
+typedef uint64_t l_ulonglong;
+typedef unsigned short l_ushort;
+
+typedef l_ulong l_uintptr_t;
+typedef l_long l_clock_t;
+typedef l_int l_daddr_t;
+typedef l_ushort l_dev_t;
+typedef l_uint l_gid_t;
+typedef l_ushort l_gid16_t;
+typedef l_ulong l_ino_t;
+typedef l_int l_key_t;
+typedef l_longlong l_loff_t;
+typedef l_ushort l_mode_t;
+typedef l_long l_off_t;
+typedef l_int l_pid_t;
+typedef l_uint l_size_t;
+typedef l_long l_suseconds_t;
+typedef l_long l_time_t;
+typedef l_uint l_uid_t;
+typedef l_ushort l_uid16_t;
+typedef l_int l_timer_t;
+typedef l_int l_mqd_t;
+
+typedef struct {
+ l_int val[2];
+} __packed l_fsid_t;
+
+typedef struct {
+ l_time_t tv_sec;
+ l_suseconds_t tv_usec;
+} l_timeval;
+
+#define l_fd_set fd_set
+
+/*
+ * Miscellaneous
+ */
+#define LINUX_AT_COUNT 16 /* Count of used aux entry types.
+ * Keep this synchronized with
+ * elf_linux_fixup() code.
+ */
+struct l___sysctl_args {
+ l_uintptr_t name;
+ l_int nlen;
+ l_uintptr_t oldval;
+ l_uintptr_t oldlenp;
+ l_uintptr_t newval;
+ l_size_t newlen;
+ l_ulong __spare[4];
+} __packed;
+
+/* Resource limits */
+#define LINUX_RLIMIT_CPU 0
+#define LINUX_RLIMIT_FSIZE 1
+#define LINUX_RLIMIT_DATA 2
+#define LINUX_RLIMIT_STACK 3
+#define LINUX_RLIMIT_CORE 4
+#define LINUX_RLIMIT_RSS 5
+#define LINUX_RLIMIT_NPROC 6
+#define LINUX_RLIMIT_NOFILE 7
+#define LINUX_RLIMIT_MEMLOCK 8
+#define LINUX_RLIMIT_AS 9 /* Address space limit */
+
+#define LINUX_RLIM_NLIMITS 10
+
+struct l_rlimit {
+ l_ulong rlim_cur;
+ l_ulong rlim_max;
+} __packed;
+
+struct l_rusage {
+ l_timeval ru_utime;
+ l_timeval ru_stime;
+ l_long ru_maxrss;
+ l_long ru_ixrss;
+ l_long ru_idrss;
+ l_long ru_isrss;
+ l_long ru_minflt;
+ l_long ru_majflt;
+ l_long ru_nswap;
+ l_long ru_inblock;
+ l_long ru_oublock;
+ l_long ru_msgsnd;
+ l_long ru_msgrcv;
+ l_long ru_nsignals;
+ l_long ru_nvcsw;
+ l_long ru_nivcsw;
+} __packed;
+
+/* mmap options */
+#define LINUX_MAP_SHARED 0x0001
+#define LINUX_MAP_PRIVATE 0x0002
+#define LINUX_MAP_FIXED 0x0010
+#define LINUX_MAP_ANON 0x0020
+#define LINUX_MAP_GROWSDOWN 0x0100
+
+struct l_mmap_argv {
+ l_uintptr_t addr;
+ l_size_t len;
+ l_int prot;
+ l_int flags;
+ l_int fd;
+ l_ulong pgoff;
+};
+
+/*
+ * stat family of syscalls
+ */
+struct l_timespec {
+ l_time_t tv_sec;
+ l_long tv_nsec;
+} __packed;
+
+struct l_newstat {
+ l_ushort st_dev;
+ l_ushort __pad1;
+ l_ulong st_ino;
+ l_ushort st_mode;
+ l_ushort st_nlink;
+ l_ushort st_uid;
+ l_ushort st_gid;
+ l_ushort st_rdev;
+ l_ushort __pad2;
+ l_ulong st_size;
+ l_ulong st_blksize;
+ l_ulong st_blocks;
+ struct l_timespec st_atim;
+ struct l_timespec st_mtim;
+ struct l_timespec st_ctim;
+ l_ulong __unused4;
+ l_ulong __unused5;
+} __packed;
+
+struct l_stat {
+ l_ushort st_dev;
+ l_ulong st_ino;
+ l_ushort st_mode;
+ l_ushort st_nlink;
+ l_ushort st_uid;
+ l_ushort st_gid;
+ l_ushort st_rdev;
+ l_long st_size;
+ struct l_timespec st_atim;
+ struct l_timespec st_mtim;
+ struct l_timespec st_ctim;
+ l_long st_blksize;
+ l_long st_blocks;
+ l_ulong st_flags;
+ l_ulong st_gen;
+};
+
+struct l_stat64 {
+ l_ushort st_dev;
+ u_char __pad0[10];
+ l_ulong __st_ino;
+ l_uint st_mode;
+ l_uint st_nlink;
+ l_ulong st_uid;
+ l_ulong st_gid;
+ l_ushort st_rdev;
+ u_char __pad3[10];
+ l_longlong st_size;
+ l_ulong st_blksize;
+ l_ulong st_blocks;
+ l_ulong __pad4;
+ struct l_timespec st_atim;
+ struct l_timespec st_mtim;
+ struct l_timespec st_ctim;
+ l_ulonglong st_ino;
+} __packed;
+
+struct l_statfs64 {
+ l_int f_type;
+ l_int f_bsize;
+ uint64_t f_blocks;
+ uint64_t f_bfree;
+ uint64_t f_bavail;
+ uint64_t f_files;
+ uint64_t f_ffree;
+ l_fsid_t f_fsid;
+ l_int f_namelen;
+ l_int f_spare[6];
+} __packed;
+
+/*
+ * Signalling
+ */
+#define LINUX_SIGHUP 1
+#define LINUX_SIGINT 2
+#define LINUX_SIGQUIT 3
+#define LINUX_SIGILL 4
+#define LINUX_SIGTRAP 5
+#define LINUX_SIGABRT 6
+#define LINUX_SIGIOT LINUX_SIGABRT
+#define LINUX_SIGBUS 7
+#define LINUX_SIGFPE 8
+#define LINUX_SIGKILL 9
+#define LINUX_SIGUSR1 10
+#define LINUX_SIGSEGV 11
+#define LINUX_SIGUSR2 12
+#define LINUX_SIGPIPE 13
+#define LINUX_SIGALRM 14
+#define LINUX_SIGTERM 15
+#define LINUX_SIGSTKFLT 16
+#define LINUX_SIGCHLD 17
+#define LINUX_SIGCONT 18
+#define LINUX_SIGSTOP 19
+#define LINUX_SIGTSTP 20
+#define LINUX_SIGTTIN 21
+#define LINUX_SIGTTOU 22
+#define LINUX_SIGURG 23
+#define LINUX_SIGXCPU 24
+#define LINUX_SIGXFSZ 25
+#define LINUX_SIGVTALRM 26
+#define LINUX_SIGPROF 27
+#define LINUX_SIGWINCH 28
+#define LINUX_SIGIO 29
+#define LINUX_SIGPOLL LINUX_SIGIO
+#define LINUX_SIGPWR 30
+#define LINUX_SIGSYS 31
+#define LINUX_SIGRTMIN 32
+
+#define LINUX_SIGTBLSZ 31
+#define LINUX_NSIG_WORDS 2
+#define LINUX_NBPW 32
+#define LINUX_NSIG (LINUX_NBPW * LINUX_NSIG_WORDS)
+
+/* sigaction flags */
+#define LINUX_SA_NOCLDSTOP 0x00000001
+#define LINUX_SA_NOCLDWAIT 0x00000002
+#define LINUX_SA_SIGINFO 0x00000004
+#define LINUX_SA_RESTORER 0x04000000
+#define LINUX_SA_ONSTACK 0x08000000
+#define LINUX_SA_RESTART 0x10000000
+#define LINUX_SA_INTERRUPT 0x20000000
+#define LINUX_SA_NOMASK 0x40000000
+#define LINUX_SA_ONESHOT 0x80000000
+
+/* sigprocmask actions */
+#define LINUX_SIG_BLOCK 0
+#define LINUX_SIG_UNBLOCK 1
+#define LINUX_SIG_SETMASK 2
+
+/* sigset_t macros */
+#define LINUX_SIGEMPTYSET(set) (set).__bits[0] = (set).__bits[1] = 0
+#define LINUX_SIGISMEMBER(set, sig) SIGISMEMBER(set, sig)
+#define LINUX_SIGADDSET(set, sig) SIGADDSET(set, sig)
+
+/* sigaltstack */
+#define LINUX_MINSIGSTKSZ 2048
+#define LINUX_SS_ONSTACK 1
+#define LINUX_SS_DISABLE 2
+
+int linux_to_bsd_sigaltstack(int lsa);
+int bsd_to_linux_sigaltstack(int bsa);
+
+typedef l_uintptr_t l_handler_t;
+typedef l_ulong l_osigset_t;
+
+typedef struct {
+ l_uint __bits[LINUX_NSIG_WORDS];
+} __packed l_sigset_t;
+
+typedef struct {
+ l_handler_t lsa_handler;
+ l_osigset_t lsa_mask;
+ l_ulong lsa_flags;
+ l_uintptr_t lsa_restorer;
+} __packed l_osigaction_t;
+
+typedef struct {
+ l_handler_t lsa_handler;
+ l_ulong lsa_flags;
+ l_uintptr_t lsa_restorer;
+ l_sigset_t lsa_mask;
+} __packed l_sigaction_t;
+
+typedef struct {
+ l_uintptr_t ss_sp;
+ l_int ss_flags;
+ l_size_t ss_size;
+} __packed l_stack_t;
+
+/* The Linux sigcontext, pretty much a standard 386 trapframe. */
+struct l_sigcontext {
+ l_uint sc_gs;
+ l_uint sc_fs;
+ l_uint sc_es;
+ l_uint sc_ds;
+ l_uint sc_edi;
+ l_uint sc_esi;
+ l_uint sc_ebp;
+ l_uint sc_esp;
+ l_uint sc_ebx;
+ l_uint sc_edx;
+ l_uint sc_ecx;
+ l_uint sc_eax;
+ l_uint sc_trapno;
+ l_uint sc_err;
+ l_uint sc_eip;
+ l_uint sc_cs;
+ l_uint sc_eflags;
+ l_uint sc_esp_at_signal;
+ l_uint sc_ss;
+ l_uint sc_387;
+ l_uint sc_mask;
+ l_uint sc_cr2;
+} __packed;
+
+struct l_ucontext {
+ l_ulong uc_flags;
+ l_uintptr_t uc_link;
+ l_stack_t uc_stack;
+ struct l_sigcontext uc_mcontext;
+ l_sigset_t uc_sigmask;
+} __packed;
+
+#define LINUX_SI_MAX_SIZE 128
+#define LINUX_SI_PAD_SIZE ((LINUX_SI_MAX_SIZE/sizeof(l_int)) - 3)
+
+typedef union l_sigval {
+ l_int sival_int;
+ l_uintptr_t sival_ptr;
+} l_sigval_t;
+
+typedef struct l_siginfo {
+ l_int lsi_signo;
+ l_int lsi_errno;
+ l_int lsi_code;
+ union {
+ l_int _pad[LINUX_SI_PAD_SIZE];
+
+ struct {
+ l_pid_t _pid;
+ l_uid_t _uid;
+ } __packed _kill;
+
+ struct {
+ l_timer_t _tid;
+ l_int _overrun;
+ char _pad[sizeof(l_uid_t) - sizeof(l_int)];
+ l_sigval_t _sigval;
+ l_int _sys_private;
+ } __packed _timer;
+
+ struct {
+ l_pid_t _pid; /* sender's pid */
+ l_uid_t _uid; /* sender's uid */
+ l_sigval_t _sigval;
+ } __packed _rt;
+
+ struct {
+ l_pid_t _pid; /* which child */
+ l_uid_t _uid; /* sender's uid */
+ l_int _status; /* exit code */
+ l_clock_t _utime;
+ l_clock_t _stime;
+ } __packed _sigchld;
+
+ struct {
+ l_uintptr_t _addr; /* Faulting insn/memory ref. */
+ } __packed _sigfault;
+
+ struct {
+ l_long _band; /* POLL_IN,POLL_OUT,POLL_MSG */
+ l_int _fd;
+ } __packed _sigpoll;
+ } _sifields;
+} __packed l_siginfo_t;
+
+#define lsi_pid _sifields._kill._pid
+#define lsi_uid _sifields._kill._uid
+#define lsi_tid _sifields._timer._tid
+#define lsi_overrun _sifields._timer._overrun
+#define lsi_sys_private _sifields._timer._sys_private
+#define lsi_status _sifields._sigchld._status
+#define lsi_utime _sifields._sigchld._utime
+#define lsi_stime _sifields._sigchld._stime
+#define lsi_value _sifields._rt._sigval
+#define lsi_int _sifields._rt._sigval.sival_int
+#define lsi_ptr _sifields._rt._sigval.sival_ptr
+#define lsi_addr _sifields._sigfault._addr
+#define lsi_band _sifields._sigpoll._band
+#define lsi_fd _sifields._sigpoll._fd
+
+struct l_fpreg {
+ u_int16_t significand[4];
+ u_int16_t exponent;
+} __packed;
+
+struct l_fpxreg {
+ u_int16_t significand[4];
+ u_int16_t exponent;
+ u_int16_t padding[3];
+} __packed;
+
+struct l_xmmreg {
+ u_int32_t element[4];
+} __packed;
+
+struct l_fpstate {
+ /* Regular FPU environment */
+ u_int32_t cw;
+ u_int32_t sw;
+ u_int32_t tag;
+ u_int32_t ipoff;
+ u_int32_t cssel;
+ u_int32_t dataoff;
+ u_int32_t datasel;
+ struct l_fpreg _st[8];
+ u_int16_t status;
+ u_int16_t magic; /* 0xffff = regular FPU data */
+
+ /* FXSR FPU environment */
+ u_int32_t _fxsr_env[6]; /* env is ignored. */
+ u_int32_t mxcsr;
+ u_int32_t reserved;
+ struct l_fpxreg _fxsr_st[8]; /* reg data is ignored. */
+ struct l_xmmreg _xmm[8];
+ u_int32_t padding[56];
+} __packed;
+
+/*
+ * We make the stack look like Linux expects it when calling a signal
+ * handler, but use the BSD way of calling the handler and sigreturn().
+ * This means that we need to pass the pointer to the handler too.
+ * It is appended to the frame to not interfere with the rest of it.
+ */
+struct l_sigframe {
+ l_int sf_sig;
+ struct l_sigcontext sf_sc;
+ struct l_fpstate sf_fpstate;
+ l_uint sf_extramask[LINUX_NSIG_WORDS-1];
+ l_handler_t sf_handler;
+} __packed;
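+
+/*
+ * Frame layout on the 32-bit user stack at sigcode entry (sketch):
+ *
+ *	sf_sig		signal number; becomes the handler's argument
+ *	sf_sc		sigcontext consumed by linux_sigreturn()
+ *	sf_fpstate	FPU state
+ *	sf_extramask	upper words of the signal mask
+ *	sf_handler	handler address, called through the trampoline
+ */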
+
+struct l_rt_sigframe {
+ l_int sf_sig;
+ l_uintptr_t sf_siginfo;
+ l_uintptr_t sf_ucontext;
+ l_siginfo_t sf_si;
+ struct l_ucontext sf_sc;
+ l_handler_t sf_handler;
+} __packed;
+
+extern struct sysentvec elf_linux_sysvec;
+
+/*
+ * open/fcntl flags
+ */
+#define LINUX_O_RDONLY 00000000
+#define LINUX_O_WRONLY 00000001
+#define LINUX_O_RDWR 00000002
+#define LINUX_O_ACCMODE 00000003
+#define LINUX_O_CREAT 00000100
+#define LINUX_O_EXCL 00000200
+#define LINUX_O_NOCTTY 00000400
+#define LINUX_O_TRUNC 00001000
+#define LINUX_O_APPEND 00002000
+#define LINUX_O_NONBLOCK 00004000
+#define LINUX_O_NDELAY LINUX_O_NONBLOCK
+#define LINUX_O_SYNC 00010000
+#define LINUX_FASYNC 00020000
+#define LINUX_O_DIRECT 00040000 /* Direct disk access hint */
+#define LINUX_O_LARGEFILE 00100000
+#define LINUX_O_DIRECTORY 00200000 /* Must be a directory */
+#define LINUX_O_NOFOLLOW 00400000 /* Do not follow links */
+#define LINUX_O_NOATIME 01000000
+#define LINUX_O_CLOEXEC 02000000
+
+#define LINUX_F_DUPFD 0
+#define LINUX_F_GETFD 1
+#define LINUX_F_SETFD 2
+#define LINUX_F_GETFL 3
+#define LINUX_F_SETFL 4
+#define LINUX_F_GETLK 5
+#define LINUX_F_SETLK 6
+#define LINUX_F_SETLKW 7
+#define LINUX_F_SETOWN 8
+#define LINUX_F_GETOWN 9
+
+#define LINUX_F_GETLK64 12
+#define LINUX_F_SETLK64 13
+#define LINUX_F_SETLKW64 14
+
+#define LINUX_F_RDLCK 0
+#define LINUX_F_WRLCK 1
+#define LINUX_F_UNLCK 2
+
+union l_semun {
+ l_int val;
+ l_uintptr_t buf;
+ l_uintptr_t array;
+ l_uintptr_t __buf;
+ l_uintptr_t __pad;
+} __packed;
+
+/*
+ * Socket defines
+ */
+#define LINUX_SOL_SOCKET 1
+#define LINUX_SOL_IP 0
+#define LINUX_SOL_IPX 256
+#define LINUX_SOL_AX25 257
+#define LINUX_SOL_TCP 6
+#define LINUX_SOL_UDP 17
+
+#define LINUX_SO_DEBUG 1
+#define LINUX_SO_REUSEADDR 2
+#define LINUX_SO_TYPE 3
+#define LINUX_SO_ERROR 4
+#define LINUX_SO_DONTROUTE 5
+#define LINUX_SO_BROADCAST 6
+#define LINUX_SO_SNDBUF 7
+#define LINUX_SO_RCVBUF 8
+#define LINUX_SO_KEEPALIVE 9
+#define LINUX_SO_OOBINLINE 10
+#define LINUX_SO_NO_CHECK 11
+#define LINUX_SO_PRIORITY 12
+#define LINUX_SO_LINGER 13
+#define LINUX_SO_PEERCRED 17
+#define LINUX_SO_RCVLOWAT 18
+#define LINUX_SO_SNDLOWAT 19
+#define LINUX_SO_RCVTIMEO 20
+#define LINUX_SO_SNDTIMEO 21
+#define LINUX_SO_TIMESTAMP 29
+#define LINUX_SO_ACCEPTCONN 30
+
+struct l_sockaddr {
+ l_ushort sa_family;
+ char sa_data[14];
+} __packed;
+
+struct l_msghdr {
+ l_uintptr_t msg_name;
+ l_int msg_namelen;
+ l_uintptr_t msg_iov;
+ l_size_t msg_iovlen;
+ l_uintptr_t msg_control;
+ l_size_t msg_controllen;
+ l_uint msg_flags;
+};
+
+struct l_cmsghdr {
+ l_size_t cmsg_len;
+ l_int cmsg_level;
+ l_int cmsg_type;
+};
+
+struct l_ifmap {
+ l_ulong mem_start;
+ l_ulong mem_end;
+ l_ushort base_addr;
+ u_char irq;
+ u_char dma;
+ u_char port;
+} __packed;
+
+#define LINUX_IFHWADDRLEN 6
+#define LINUX_IFNAMSIZ 16
+
+struct l_ifreq {
+ union {
+ char ifrn_name[LINUX_IFNAMSIZ];
+ } ifr_ifrn;
+
+ union {
+ struct l_sockaddr ifru_addr;
+ struct l_sockaddr ifru_dstaddr;
+ struct l_sockaddr ifru_broadaddr;
+ struct l_sockaddr ifru_netmask;
+ struct l_sockaddr ifru_hwaddr;
+ l_short ifru_flags[1];
+ l_int ifru_metric;
+ l_int ifru_mtu;
+ struct l_ifmap ifru_map;
+ char ifru_slave[LINUX_IFNAMSIZ];
+ l_uintptr_t ifru_data;
+ } ifr_ifru;
+} __packed;
+
+#define ifr_name ifr_ifrn.ifrn_name /* Interface name */
+#define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */
+
+struct l_ifconf {
+ int ifc_len;
+ union {
+ l_uintptr_t ifcu_buf;
+ l_uintptr_t ifcu_req;
+ } ifc_ifcu;
+} __packed;
+
+#define ifc_buf ifc_ifcu.ifcu_buf
+#define ifc_req ifc_ifcu.ifcu_req
+
+/*
+ * poll()
+ */
+#define LINUX_POLLIN 0x0001
+#define LINUX_POLLPRI 0x0002
+#define LINUX_POLLOUT 0x0004
+#define LINUX_POLLERR 0x0008
+#define LINUX_POLLHUP 0x0010
+#define LINUX_POLLNVAL 0x0020
+#define LINUX_POLLRDNORM 0x0040
+#define LINUX_POLLRDBAND 0x0080
+#define LINUX_POLLWRNORM 0x0100
+#define LINUX_POLLWRBAND 0x0200
+#define LINUX_POLLMSG 0x0400
+
+struct l_pollfd {
+ l_int fd;
+ l_short events;
+ l_short revents;
+} __packed;
+
+struct l_user_desc {
+ l_uint entry_number;
+ l_uint base_addr;
+ l_uint limit;
+ l_uint seg_32bit:1;
+ l_uint contents:2;
+ l_uint read_exec_only:1;
+ l_uint limit_in_pages:1;
+ l_uint seg_not_present:1;
+ l_uint useable:1;
+};
+
+#define LINUX_LOWERWORD 0x0000ffff
+
+/*
+ * Macros that do the same thing as those in Linux include/asm-um/ldt-i386.h.
+ * They convert a Linux user-space descriptor into a machine descriptor.
+ */
+#define LINUX_LDT_entry_a(info) \
+ ((((info)->base_addr & LINUX_LOWERWORD) << 16) | \
+ ((info)->limit & LINUX_LOWERWORD))
+
+#define LINUX_ENTRY_B_READ_EXEC_ONLY 9
+#define LINUX_ENTRY_B_CONTENTS 10
+#define LINUX_ENTRY_B_SEG_NOT_PRESENT 15
+#define LINUX_ENTRY_B_BASE_ADDR 16
+#define LINUX_ENTRY_B_USEABLE 20
+#define LINUX_ENTRY_B_SEG32BIT 22
+#define LINUX_ENTRY_B_LIMIT 23
+
+#define LINUX_LDT_entry_b(info) \
+ (((info)->base_addr & 0xff000000) | \
+ ((info)->limit & 0xf0000) | \
+ ((info)->contents << LINUX_ENTRY_B_CONTENTS) | \
+ (((info)->seg_not_present == 0) << LINUX_ENTRY_B_SEG_NOT_PRESENT) | \
+ (((info)->base_addr & 0x00ff0000) >> LINUX_ENTRY_B_BASE_ADDR) | \
+ (((info)->read_exec_only == 0) << LINUX_ENTRY_B_READ_EXEC_ONLY) | \
+ ((info)->seg_32bit << LINUX_ENTRY_B_SEG32BIT) | \
+ ((info)->useable << LINUX_ENTRY_B_USEABLE) | \
+ ((info)->limit_in_pages << LINUX_ENTRY_B_LIMIT) | 0x7000)
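+
+/*
+ * Worked example (sketch): a flat, present, writable 32-bit data segment
+ * with page-granular limit 0xfffff:
+ *
+ *	struct l_user_desc d = { .base_addr = 0, .limit = 0xfffff,
+ *	    .seg_32bit = 1, .contents = 0, .read_exec_only = 0,
+ *	    .limit_in_pages = 1, .seg_not_present = 0, .useable = 1 };
+ *
+ * gives LINUX_LDT_entry_a(&d) == 0x0000ffff and
+ * LINUX_LDT_entry_b(&d) == 0x00dff200.
+ */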
+
+#define LINUX_LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->useable == 0)
+
+/*
+ * Macros for converting segments.
+ * They do the same as those in arch/i386/kernel/process.c in Linux.
+ */
+#define LINUX_GET_BASE(desc) \
+ ((((desc)->a >> 16) & LINUX_LOWERWORD) | \
+ (((desc)->b << 16) & 0x00ff0000) | \
+ ((desc)->b & 0xff000000))
+
+#define LINUX_GET_LIMIT(desc) \
+ (((desc)->a & LINUX_LOWERWORD) | \
+ ((desc)->b & 0xf0000))
+
+#define LINUX_GET_32BIT(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_SEG32BIT) & 1)
+#define LINUX_GET_CONTENTS(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_CONTENTS) & 3)
+#define LINUX_GET_WRITABLE(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_READ_EXEC_ONLY) & 1)
+#define LINUX_GET_LIMIT_PAGES(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_LIMIT) & 1)
+#define LINUX_GET_PRESENT(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_SEG_NOT_PRESENT) & 1)
+#define LINUX_GET_USEABLE(desc) \
+ (((desc)->b >> LINUX_ENTRY_B_USEABLE) & 1)
+
+struct iovec;
+
+struct l_iovec32 {
+ uint32_t iov_base;
+ l_size_t iov_len;
+};
+
+int linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt,
+ struct iovec **iovp, int error);
+
+/* robust futexes */
+struct linux_robust_list {
+ l_uintptr_t next;
+};
+
+struct linux_robust_list_head {
+ struct linux_robust_list list;
+ l_long futex_offset;
+ l_uintptr_t pending_list;
+};
+
+#endif /* !_AMD64_LINUX_H_ */
diff --git a/sys/amd64/linux32/linux32_dummy.c b/sys/amd64/linux32/linux32_dummy.c
new file mode 100644
index 0000000..95bf3ec
--- /dev/null
+++ b/sys/amd64/linux32/linux32_dummy.c
@@ -0,0 +1,176 @@
+/*-
+ * Copyright (c) 1994-1995 Søren Schmidt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sdt.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+#include <compat/linux/linux_dtrace.h>
+#include <compat/linux/linux_util.h>
+
+/* DTrace init */
+LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
+
+DUMMY(stime);
+DUMMY(olduname);
+DUMMY(syslog);
+DUMMY(uname);
+DUMMY(vhangup);
+DUMMY(swapoff);
+DUMMY(adjtimex);
+DUMMY(create_module);
+DUMMY(init_module);
+DUMMY(delete_module);
+DUMMY(get_kernel_syms);
+DUMMY(quotactl);
+DUMMY(bdflush);
+DUMMY(sysfs);
+DUMMY(query_module);
+DUMMY(nfsservctl);
+DUMMY(rt_sigqueueinfo);
+DUMMY(sendfile);
+DUMMY(setfsuid);
+DUMMY(setfsgid);
+DUMMY(pivot_root);
+DUMMY(mincore);
+DUMMY(ptrace);
+DUMMY(lookup_dcookie);
+DUMMY(epoll_create);
+DUMMY(epoll_ctl);
+DUMMY(epoll_wait);
+DUMMY(remap_file_pages);
+DUMMY(timer_create);
+DUMMY(timer_settime);
+DUMMY(timer_gettime);
+DUMMY(timer_getoverrun);
+DUMMY(timer_delete);
+DUMMY(fstatfs64);
+DUMMY(mbind);
+DUMMY(get_mempolicy);
+DUMMY(set_mempolicy);
+DUMMY(mq_open);
+DUMMY(mq_unlink);
+DUMMY(mq_timedsend);
+DUMMY(mq_timedreceive);
+DUMMY(mq_notify);
+DUMMY(mq_getsetattr);
+DUMMY(kexec_load);
+DUMMY(waitid);
+/* linux 2.6.11: */
+DUMMY(add_key);
+DUMMY(request_key);
+DUMMY(keyctl);
+/* linux 2.6.13: */
+DUMMY(ioprio_set);
+DUMMY(ioprio_get);
+DUMMY(inotify_init);
+DUMMY(inotify_add_watch);
+DUMMY(inotify_rm_watch);
+/* linux 2.6.16: */
+DUMMY(migrate_pages);
+DUMMY(pselect6);
+DUMMY(ppoll);
+DUMMY(unshare);
+/* linux 2.6.17: */
+DUMMY(splice);
+DUMMY(sync_file_range);
+DUMMY(tee);
+DUMMY(vmsplice);
+/* linux 2.6.18: */
+DUMMY(move_pages);
+/* linux 2.6.19: */
+DUMMY(getcpu);
+DUMMY(epoll_pwait);
+/* linux 2.6.22: */
+DUMMY(utimensat);
+DUMMY(signalfd);
+DUMMY(timerfd_create);
+DUMMY(eventfd);
+/* linux 2.6.23: */
+DUMMY(fallocate);
+/* linux 2.6.25: */
+DUMMY(timerfd_settime);
+DUMMY(timerfd_gettime);
+/* linux 2.6.27: */
+DUMMY(signalfd4);
+DUMMY(eventfd2);
+DUMMY(epoll_create1);
+DUMMY(dup3);
+DUMMY(inotify_init1);
+/* linux 2.6.30: */
+DUMMY(preadv);
+DUMMY(pwritev);
+/* linux 2.6.31: */
+DUMMY(rt_tsigqueueinfo);
+DUMMY(perf_event_open);
+/* linux 2.6.33: */
+DUMMY(recvmmsg);
+DUMMY(fanotify_init);
+DUMMY(fanotify_mark);
+/* linux 2.6.36: */
+DUMMY(prlimit64);
+/* later: */
+DUMMY(name_to_handle_at);
+DUMMY(open_by_handle_at);
+DUMMY(clock_adjtime);
+DUMMY(syncfs);
+DUMMY(sendmmsg);
+DUMMY(setns);
+DUMMY(process_vm_readv);
+DUMMY(process_vm_writev);
+
+#define DUMMY_XATTR(s) \
+int \
+linux_ ## s ## xattr( \
+ struct thread *td, struct linux_ ## s ## xattr_args *arg) \
+{ \
+ \
+ return (ENOATTR); \
+}
+DUMMY_XATTR(set);
+DUMMY_XATTR(lset);
+DUMMY_XATTR(fset);
+DUMMY_XATTR(get);
+DUMMY_XATTR(lget);
+DUMMY_XATTR(fget);
+DUMMY_XATTR(list);
+DUMMY_XATTR(llist);
+DUMMY_XATTR(flist);
+DUMMY_XATTR(remove);
+DUMMY_XATTR(lremove);
+DUMMY_XATTR(fremove);
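+
+/*
+ * For reference, DUMMY_XATTR(set) expands to (sketch):
+ *
+ *	int
+ *	linux_setxattr(struct thread *td, struct linux_setxattr_args *arg)
+ *	{
+ *
+ *		return (ENOATTR);
+ *	}
+ */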
diff --git a/sys/amd64/linux32/linux32_genassym.c b/sys/amd64/linux32/linux32_genassym.c
new file mode 100644
index 0000000..a022fac
--- /dev/null
+++ b/sys/amd64/linux32/linux32_genassym.c
@@ -0,0 +1,14 @@
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/assym.h>
+#include <sys/systm.h>
+
+#include <amd64/linux32/linux.h>
+
+ASSYM(LINUX_SIGF_HANDLER, offsetof(struct l_sigframe, sf_handler));
+ASSYM(LINUX_SIGF_SC, offsetof(struct l_sigframe, sf_sc));
+ASSYM(LINUX_RT_SIGF_HANDLER, offsetof(struct l_rt_sigframe, sf_handler));
+ASSYM(LINUX_RT_SIGF_UC, offsetof(struct l_rt_sigframe, sf_sc));
+ASSYM(LINUX_RT_SIGF_SC, offsetof(struct l_ucontext, uc_mcontext));
diff --git a/sys/amd64/linux32/linux32_ipc64.h b/sys/amd64/linux32/linux32_ipc64.h
new file mode 100644
index 0000000..f8c92c4
--- /dev/null
+++ b/sys/amd64/linux32/linux32_ipc64.h
@@ -0,0 +1,145 @@
+/*-
+ * Copyright (c) 2002 Maxim Sobolev <sobomax@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AMD64_LINUX_LINUX_IPC64_H_
+#define _AMD64_LINUX_LINUX_IPC64_H_
+
+/*
+ * The ipc64_perm structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 32-bit mode_t and seq
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct l_ipc64_perm {
+ l_key_t key;
+ l_uid_t uid;
+ l_gid_t gid;
+ l_uid_t cuid;
+ l_gid_t cgid;
+ l_mode_t mode;
+ l_ushort __pad1;
+ l_ushort seq;
+ l_ushort __pad2;
+ l_ulong __unused1;
+ l_ulong __unused2;
+} __packed;
+
+/*
+ * The msqid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct l_msqid64_ds {
+ struct l_ipc64_perm msg_perm;
+ l_time_t msg_stime; /* last msgsnd time */
+ l_ulong __unused1;
+ l_time_t msg_rtime; /* last msgrcv time */
+ l_ulong __unused2;
+ l_time_t msg_ctime; /* last change time */
+ l_ulong __unused3;
+ l_ulong msg_cbytes; /* current number of bytes on queue */
+ l_ulong msg_qnum; /* number of messages in queue */
+ l_ulong msg_qbytes; /* max number of bytes on queue */
+ l_pid_t msg_lspid; /* pid of last msgsnd */
+ l_pid_t msg_lrpid; /* last receive pid */
+ l_ulong __unused4;
+ l_ulong __unused5;
+} __packed;
+
+/*
+ * The semid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct l_semid64_ds {
+ struct l_ipc64_perm sem_perm; /* permissions */
+ l_time_t sem_otime; /* last semop time */
+ l_ulong __unused1;
+ l_time_t sem_ctime; /* last change time */
+ l_ulong __unused2;
+ l_ulong sem_nsems; /* no. of semaphores in array */
+ l_ulong __unused3;
+ l_ulong __unused4;
+} __packed;
+
+/*
+ * The shmid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct l_shmid64_ds {
+ struct l_ipc64_perm shm_perm; /* operation perms */
+ l_size_t shm_segsz; /* size of segment (bytes) */
+ l_time_t shm_atime; /* last attach time */
+ l_ulong __unused1;
+ l_time_t shm_dtime; /* last detach time */
+ l_ulong __unused2;
+ l_time_t shm_ctime; /* last change time */
+ l_ulong __unused3;
+ l_pid_t shm_cpid; /* pid of creator */
+ l_pid_t shm_lpid; /* pid of last operator */
+ l_ulong shm_nattch; /* no. of current attaches */
+ l_ulong __unused4;
+ l_ulong __unused5;
+} __packed;
+
+struct l_shminfo64 {
+ l_ulong shmmax;
+ l_ulong shmmin;
+ l_ulong shmmni;
+ l_ulong shmseg;
+ l_ulong shmall;
+ l_ulong __unused1;
+ l_ulong __unused2;
+ l_ulong __unused3;
+ l_ulong __unused4;
+} __packed;
+
+#endif /* !_AMD64_LINUX_LINUX_IPC64_H_ */
diff --git a/sys/amd64/linux32/linux32_locore.s b/sys/amd64/linux32/linux32_locore.s
new file mode 100644
index 0000000..36e1abf
--- /dev/null
+++ b/sys/amd64/linux32/linux32_locore.s
@@ -0,0 +1,38 @@
+/* $FreeBSD$ */
+
+#include "linux32_assym.h" /* system definitions */
+#include <machine/asmacros.h> /* miscellaneous asm macros */
+
+#include <amd64/linux32/linux32_syscall.h> /* system call numbers */
+
+.text
+.code32
+
+NON_GPROF_ENTRY(linux_sigcode)
+ call *LINUX_SIGF_HANDLER(%esp)
+ leal LINUX_SIGF_SC(%esp),%ebx /* linux scp */
+ movl %esp, %ebx /* pass sigframe */
+ push %eax /* fake ret addr */
+ movl $LINUX_SYS_linux_sigreturn,%eax /* linux_sigreturn() */
+ int $0x80 /* enter kernel with args */
+0: jmp 0b
+ ALIGN_TEXT
+/* XXXXX */
+linux_rt_sigcode:
+ call *LINUX_RT_SIGF_HANDLER(%esp)
+ leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */
+ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */
+ push %eax /* fake ret addr */
+ movl $LINUX_SYS_linux_rt_sigreturn,%eax /* linux_rt_sigreturn() */
+ int $0x80 /* enter kernel with args */
+0: jmp 0b
+ ALIGN_TEXT
+/* XXXXX */
+linux_esigcode:
+
+ .data
+ .globl linux_szsigcode, linux_sznonrtsigcode
+linux_szsigcode:
+ .long linux_esigcode-linux_sigcode
+linux_sznonrtsigcode:
+ .long linux_rt_sigcode-linux_sigcode
diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c
new file mode 100644
index 0000000..7725163
--- /dev/null
+++ b/sys/amd64/linux32/linux32_machdep.c
@@ -0,0 +1,1067 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins
+ * Copyright (c) 2002 Doug Rabson
+ * Copyright (c) 2000 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/clock.h>
+#include <sys/imgact.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+
+#include <machine/frame.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <compat/freebsd32/freebsd32_util.h>
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+#include <compat/linux/linux_ipc.h>
+#include <compat/linux/linux_misc.h>
+#include <compat/linux/linux_signal.h>
+#include <compat/linux/linux_util.h>
+#include <compat/linux/linux_emul.h>
+
+struct l_old_select_argv {
+ l_int nfds;
+ l_uintptr_t readfds;
+ l_uintptr_t writefds;
+ l_uintptr_t exceptfds;
+ l_uintptr_t timeout;
+} __packed;
+
+int
+linux_to_bsd_sigaltstack(int lsa)
+{
+ int bsa = 0;
+
+ if (lsa & LINUX_SS_DISABLE)
+ bsa |= SS_DISABLE;
+ if (lsa & LINUX_SS_ONSTACK)
+ bsa |= SS_ONSTACK;
+ return (bsa);
+}
+
+static int linux_mmap_common(struct thread *td, l_uintptr_t addr,
+ l_size_t len, l_int prot, l_int flags, l_int fd,
+ l_loff_t pos);
+
+int
+bsd_to_linux_sigaltstack(int bsa)
+{
+ int lsa = 0;
+
+ if (bsa & SS_DISABLE)
+ lsa |= LINUX_SS_DISABLE;
+ if (bsa & SS_ONSTACK)
+ lsa |= LINUX_SS_ONSTACK;
+ return (lsa);
+}
+
+static void
+bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
+{
+
+ lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
+ lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
+ lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
+ lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
+ lru->ru_maxrss = ru->ru_maxrss;
+ lru->ru_ixrss = ru->ru_ixrss;
+ lru->ru_idrss = ru->ru_idrss;
+ lru->ru_isrss = ru->ru_isrss;
+ lru->ru_minflt = ru->ru_minflt;
+ lru->ru_majflt = ru->ru_majflt;
+ lru->ru_nswap = ru->ru_nswap;
+ lru->ru_inblock = ru->ru_inblock;
+ lru->ru_oublock = ru->ru_oublock;
+ lru->ru_msgsnd = ru->ru_msgsnd;
+ lru->ru_msgrcv = ru->ru_msgrcv;
+ lru->ru_nsignals = ru->ru_nsignals;
+ lru->ru_nvcsw = ru->ru_nvcsw;
+ lru->ru_nivcsw = ru->ru_nivcsw;
+}
+
+int
+linux_execve(struct thread *td, struct linux_execve_args *args)
+{
+ struct image_args eargs;
+ char *path;
+ int error;
+
+ LCONVPATHEXIST(td, args->path, &path);
+
+#ifdef DEBUG
+ if (ldebug(execve))
+ printf(ARGS(execve, "%s"), path);
+#endif
+
+ error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
+ args->argp, args->envp);
+ free(path, M_TEMP);
+ if (error == 0)
+ error = kern_execve(td, &eargs, NULL);
+ if (error == 0)
+		/*
+		 * A Linux process may execute a FreeBSD binary; do not
+		 * attempt to create emuldata for such a process via
+		 * linux_proc_init(), as that panics on a KASSERT
+		 * because the process has p->p_emuldata == NULL.
+		 */
+ if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
+ error = linux_proc_init(td, 0, 0);
+ return (error);
+}
+
+CTASSERT(sizeof(struct l_iovec32) == 8);
+
+static int
+linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
+{
+ struct l_iovec32 iov32;
+ struct iovec *iov;
+ struct uio *uio;
+ uint32_t iovlen;
+ int error, i;
+
+ *uiop = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (EINVAL);
+ iovlen = iovcnt * sizeof(struct iovec);
+ uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
+ iov = (struct iovec *)(uio + 1);
+ for (i = 0; i < iovcnt; i++) {
+ error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
+ if (error) {
+ free(uio, M_IOV);
+ return (error);
+ }
+ iov[i].iov_base = PTRIN(iov32.iov_base);
+ iov[i].iov_len = iov32.iov_len;
+ }
+ uio->uio_iov = iov;
+ uio->uio_iovcnt = iovcnt;
+ uio->uio_segflg = UIO_USERSPACE;
+ uio->uio_offset = -1;
+ uio->uio_resid = 0;
+ for (i = 0; i < iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - uio->uio_resid) {
+ free(uio, M_IOV);
+ return (EINVAL);
+ }
+ uio->uio_resid += iov->iov_len;
+ iov++;
+ }
+ *uiop = uio;
+ return (0);
+}
+
+int
+linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
+ int error)
+{
+ struct l_iovec32 iov32;
+ struct iovec *iov;
+ uint32_t iovlen;
+ int i;
+
+ *iovp = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (error);
+ iovlen = iovcnt * sizeof(struct iovec);
+ iov = malloc(iovlen, M_IOV, M_WAITOK);
+ for (i = 0; i < iovcnt; i++) {
+ error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
+ if (error) {
+ free(iov, M_IOV);
+ return (error);
+ }
+ iov[i].iov_base = PTRIN(iov32.iov_base);
+ iov[i].iov_len = iov32.iov_len;
+ }
+ *iovp = iov;
+	return (0);
+}
+
+int
+linux_readv(struct thread *td, struct linux_readv_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_readv(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+linux_writev(struct thread *td, struct linux_writev_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_writev(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+struct l_ipc_kludge {
+ l_uintptr_t msgp;
+ l_long msgtyp;
+} __packed;
+
+int
+linux_ipc(struct thread *td, struct linux_ipc_args *args)
+{
+
+ switch (args->what & 0xFFFF) {
+ case LINUX_SEMOP: {
+ struct linux_semop_args a;
+
+ a.semid = args->arg1;
+ a.tsops = args->ptr;
+ a.nsops = args->arg2;
+ return (linux_semop(td, &a));
+ }
+ case LINUX_SEMGET: {
+ struct linux_semget_args a;
+
+ a.key = args->arg1;
+ a.nsems = args->arg2;
+ a.semflg = args->arg3;
+ return (linux_semget(td, &a));
+ }
+ case LINUX_SEMCTL: {
+ struct linux_semctl_args a;
+ int error;
+
+ a.semid = args->arg1;
+ a.semnum = args->arg2;
+ a.cmd = args->arg3;
+ error = copyin(args->ptr, &a.arg, sizeof(a.arg));
+ if (error)
+ return (error);
+ return (linux_semctl(td, &a));
+ }
+ case LINUX_MSGSND: {
+ struct linux_msgsnd_args a;
+
+ a.msqid = args->arg1;
+ a.msgp = args->ptr;
+ a.msgsz = args->arg2;
+ a.msgflg = args->arg3;
+ return (linux_msgsnd(td, &a));
+ }
+ case LINUX_MSGRCV: {
+ struct linux_msgrcv_args a;
+
+ a.msqid = args->arg1;
+ a.msgsz = args->arg2;
+ a.msgflg = args->arg3;
+ if ((args->what >> 16) == 0) {
+ struct l_ipc_kludge tmp;
+ int error;
+
+ if (args->ptr == 0)
+ return (EINVAL);
+ error = copyin(args->ptr, &tmp, sizeof(tmp));
+ if (error)
+ return (error);
+ a.msgp = PTRIN(tmp.msgp);
+ a.msgtyp = tmp.msgtyp;
+ } else {
+ a.msgp = args->ptr;
+ a.msgtyp = args->arg5;
+ }
+ return (linux_msgrcv(td, &a));
+ }
+ case LINUX_MSGGET: {
+ struct linux_msgget_args a;
+
+ a.key = args->arg1;
+ a.msgflg = args->arg2;
+ return (linux_msgget(td, &a));
+ }
+ case LINUX_MSGCTL: {
+ struct linux_msgctl_args a;
+
+ a.msqid = args->arg1;
+ a.cmd = args->arg2;
+ a.buf = args->ptr;
+ return (linux_msgctl(td, &a));
+ }
+ case LINUX_SHMAT: {
+ struct linux_shmat_args a;
+
+ a.shmid = args->arg1;
+ a.shmaddr = args->ptr;
+ a.shmflg = args->arg2;
+ a.raddr = PTRIN((l_uint)args->arg3);
+ return (linux_shmat(td, &a));
+ }
+ case LINUX_SHMDT: {
+ struct linux_shmdt_args a;
+
+ a.shmaddr = args->ptr;
+ return (linux_shmdt(td, &a));
+ }
+ case LINUX_SHMGET: {
+ struct linux_shmget_args a;
+
+ a.key = args->arg1;
+ a.size = args->arg2;
+ a.shmflg = args->arg3;
+ return (linux_shmget(td, &a));
+ }
+ case LINUX_SHMCTL: {
+ struct linux_shmctl_args a;
+
+ a.shmid = args->arg1;
+ a.cmd = args->arg2;
+ a.buf = args->ptr;
+ return (linux_shmctl(td, &a));
+ }
+ default:
+ break;
+ }
+
+ return (EINVAL);
+}
+
+int
+linux_old_select(struct thread *td, struct linux_old_select_args *args)
+{
+ struct l_old_select_argv linux_args;
+ struct linux_select_args newsel;
+ int error;
+
+#ifdef DEBUG
+ if (ldebug(old_select))
+ printf(ARGS(old_select, "%p"), args->ptr);
+#endif
+
+ error = copyin(args->ptr, &linux_args, sizeof(linux_args));
+ if (error)
+ return (error);
+
+ newsel.nfds = linux_args.nfds;
+ newsel.readfds = PTRIN(linux_args.readfds);
+ newsel.writefds = PTRIN(linux_args.writefds);
+ newsel.exceptfds = PTRIN(linux_args.exceptfds);
+ newsel.timeout = PTRIN(linux_args.timeout);
+ return (linux_select(td, &newsel));
+}
+
+int
+linux_set_cloned_tls(struct thread *td, void *desc)
+{
+ struct user_segment_descriptor sd;
+ struct l_user_desc info;
+ struct pcb *pcb;
+ int error;
+ int a[2];
+
+ error = copyin(desc, &info, sizeof(struct l_user_desc));
+ if (error) {
+ printf(LMSG("copyin failed!"));
+ } else {
+		/* Copy the entry_number back out to user space as GUGS32_SEL. */
+ info.entry_number = GUGS32_SEL;
+ error = copyout(&info, desc, sizeof(struct l_user_desc));
+ if (error)
+ printf(LMSG("copyout failed!"));
+
+ a[0] = LINUX_LDT_entry_a(&info);
+ a[1] = LINUX_LDT_entry_b(&info);
+
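+		/*
+		 * The two LDT_entry words form the raw 64-bit descriptor
+		 * image; overlay them onto the user_segment_descriptor view.
+		 */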
+ memcpy(&sd, &a, sizeof(a));
+#ifdef DEBUG
+ if (ldebug(clone))
+ printf("Segment created in clone with "
+ "CLONE_SETTLS: lobase: %x, hibase: %x, "
+ "lolimit: %x, hilimit: %x, type: %i, "
+ "dpl: %i, p: %i, xx: %i, long: %i, "
+ "def32: %i, gran: %i\n", sd.sd_lobase,
+ sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
+ sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
+ sd.sd_long, sd.sd_def32, sd.sd_gran);
+#endif
+ pcb = td->td_pcb;
+ pcb->pcb_gsbase = (register_t)info.base_addr;
+/* XXXKIB pcb->pcb_gs32sd = sd; */
+ td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
+ set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT);
+ }
+
+ return (error);
+}
+
+int
+linux_set_upcall_kse(struct thread *td, register_t stack)
+{
+
+ td->td_frame->tf_rsp = stack;
+
+ return (0);
+}
+
+#define STACK_SIZE (2 * 1024 * 1024)
+#define GUARD_SIZE (4 * PAGE_SIZE)
+
+int
+linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
+{
+
+#ifdef DEBUG
+ if (ldebug(mmap2))
+ printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
+ args->addr, args->len, args->prot,
+ args->flags, args->fd, args->pgoff);
+#endif
+
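+	/*
+	 * mmap2(2) passes the file offset in PAGE_SIZE units so 32-bit
+	 * callers can reach offsets beyond 4GB; widen pgoff to 64 bits
+	 * before scaling to avoid truncation.
+	 */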
+ return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
+ args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
+ PAGE_SIZE));
+}
+
+int
+linux_mmap(struct thread *td, struct linux_mmap_args *args)
+{
+ int error;
+ struct l_mmap_argv linux_args;
+
+ error = copyin(args->ptr, &linux_args, sizeof(linux_args));
+ if (error)
+ return (error);
+
+#ifdef DEBUG
+ if (ldebug(mmap))
+ printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
+ linux_args.addr, linux_args.len, linux_args.prot,
+ linux_args.flags, linux_args.fd, linux_args.pgoff);
+#endif
+
+ return (linux_mmap_common(td, linux_args.addr, linux_args.len,
+ linux_args.prot, linux_args.flags, linux_args.fd,
+ (uint32_t)linux_args.pgoff));
+}
+
+static int
+linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
+ l_int flags, l_int fd, l_loff_t pos)
+{
+ struct proc *p = td->td_proc;
+ struct mmap_args /* {
+ caddr_t addr;
+ size_t len;
+ int prot;
+ int flags;
+ int fd;
+ long pad;
+ off_t pos;
+ } */ bsd_args;
+ int error;
+ struct file *fp;
+
+ error = 0;
+ bsd_args.flags = 0;
+ fp = NULL;
+
+ /*
+ * Linux mmap(2):
+ * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
+ */
+ if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
+ return (EINVAL);
+
+ if (flags & LINUX_MAP_SHARED)
+ bsd_args.flags |= MAP_SHARED;
+ if (flags & LINUX_MAP_PRIVATE)
+ bsd_args.flags |= MAP_PRIVATE;
+ if (flags & LINUX_MAP_FIXED)
+ bsd_args.flags |= MAP_FIXED;
+ if (flags & LINUX_MAP_ANON) {
+ /* Enforce pos to be on page boundary, then ignore. */
+ if ((pos & PAGE_MASK) != 0)
+ return (EINVAL);
+ pos = 0;
+ bsd_args.flags |= MAP_ANON;
+ } else
+ bsd_args.flags |= MAP_NOSYNC;
+ if (flags & LINUX_MAP_GROWSDOWN)
+ bsd_args.flags |= MAP_STACK;
+
+ /*
+ * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
+ * on Linux/i386. We do this to ensure maximum compatibility.
+ * Linux/ia64 does the same in i386 emulation mode.
+ */
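+	/* E.g. a Linux PROT_WRITE-only request maps as RWX here. */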
+ bsd_args.prot = prot;
+ if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
+ bsd_args.prot |= PROT_READ | PROT_EXEC;
+
+	/* Linux does not check the file descriptor when MAP_ANONYMOUS is set. */
+ bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
+ if (bsd_args.fd != -1) {
+ /*
+ * Linux follows Solaris mmap(2) description:
+ * The file descriptor fildes is opened with
+ * read permission, regardless of the
+ * protection options specified.
+ */
+
+ if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+
+ /* Linux mmap() just fails for O_WRONLY files */
+ if (!(fp->f_flag & FREAD)) {
+ fdrop(fp, td);
+ return (EACCES);
+ }
+
+ fdrop(fp, td);
+ }
+
+ if (flags & LINUX_MAP_GROWSDOWN) {
+ /*
+ * The Linux MAP_GROWSDOWN option does not limit auto
+ * growth of the region. Linux mmap with this option
+		 * takes as addr the initial BOS, and as len, the initial
+		 * region size. It can then grow down from addr without
+		 * limit. However, Linux threads have an implicit internal
+		 * limit to stack size of STACK_SIZE. It is just not
+		 * enforced explicitly in Linux. But here we impose
+ * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
+ * region, since we can do this with our mmap.
+ *
+ * Our mmap with MAP_STACK takes addr as the maximum
+ * downsize limit on BOS, and as len the max size of
+ * the region. It then maps the top SGROWSIZ bytes,
+ * and auto grows the region down, up to the limit
+ * in addr.
+ *
+ * If we don't use the MAP_STACK option, the effect
+ * of this code is to allocate a stack region of a
+ * fixed size of (STACK_SIZE - GUARD_SIZE).
+ */
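+
+	/*
+	 * Illustrative numbers (assuming the default 4KB pages, so
+	 * GUARD_SIZE is 16KB): a request of len = 64KB at addr = A maps
+	 * bsd_args.addr = A - (STACK_SIZE - GUARD_SIZE - 64KB) with
+	 * bsd_args.len = STACK_SIZE - GUARD_SIZE, i.e. the region can
+	 * auto-grow down within roughly 2MB less the guard.
+	 */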
+
+ if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
+ /*
+ * Some Linux apps will attempt to mmap
+ * thread stacks near the top of their
+ * address space. If their TOS is greater
+ * than vm_maxsaddr, vm_map_growstack()
+ * will confuse the thread stack with the
+ * process stack and deliver a SEGV if they
+ * attempt to grow the thread stack past their
+ * current stacksize rlimit. To avoid this,
+ * adjust vm_maxsaddr upwards to reflect
+ * the current stacksize rlimit rather
+ * than the maximum possible stacksize.
+ * It would be better to adjust the
+ * mmap'ed region, but some apps do not check
+ * mmap's return value.
+ */
+ PROC_LOCK(p);
+ p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
+ lim_cur(p, RLIMIT_STACK);
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * This gives us our maximum stack size and a new BOS.
+ * If we're using VM_STACK, then mmap will just map
+ * the top SGROWSIZ bytes, and let the stack grow down
+ * to the limit at BOS. If we're not using VM_STACK
+ * we map the full stack, since we don't have a way
+ * to autogrow it.
+ */
+ if (len > STACK_SIZE - GUARD_SIZE) {
+ bsd_args.addr = (caddr_t)PTRIN(addr);
+ bsd_args.len = len;
+ } else {
+ bsd_args.addr = (caddr_t)PTRIN(addr) -
+ (STACK_SIZE - GUARD_SIZE - len);
+ bsd_args.len = STACK_SIZE - GUARD_SIZE;
+ }
+ } else {
+ bsd_args.addr = (caddr_t)PTRIN(addr);
+ bsd_args.len = len;
+ }
+ bsd_args.pos = pos;
+
+#ifdef DEBUG
+ if (ldebug(mmap))
+ printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
+ __func__,
+ (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
+ bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
+#endif
+ error = sys_mmap(td, &bsd_args);
+#ifdef DEBUG
+ if (ldebug(mmap))
+ printf("-> %s() return: 0x%x (0x%08x)\n",
+ __func__, error, (u_int)td->td_retval[0]);
+#endif
+ return (error);
+}
+
+int
+linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
+{
+ struct mprotect_args bsd_args;
+
+ bsd_args.addr = uap->addr;
+ bsd_args.len = uap->len;
+ bsd_args.prot = uap->prot;
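+	/* Mirror the Linux/i386 PROT handling from linux_mmap_common(). */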
+ if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
+ bsd_args.prot |= PROT_READ | PROT_EXEC;
+ return (sys_mprotect(td, &bsd_args));
+}
+
+int
+linux_iopl(struct thread *td, struct linux_iopl_args *args)
+{
+ int error;
+
+ if (args->level < 0 || args->level > 3)
+ return (EINVAL);
+ if ((error = priv_check(td, PRIV_IO)) != 0)
+ return (error);
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
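+	/*
+	 * PSL_IOPL is the two-bit I/O privilege field (0x3000), so
+	 * PSL_IOPL / 3 is 0x1000 and level * 0x1000 places the requested
+	 * level (0-3) directly into bits 12-13 of %rflags.
+	 */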
+ td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
+ (args->level * (PSL_IOPL / 3));
+
+ return (0);
+}
+
+int
+linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
+{
+ l_osigaction_t osa;
+ l_sigaction_t act, oact;
+ int error;
+
+#ifdef DEBUG
+ if (ldebug(sigaction))
+ printf(ARGS(sigaction, "%d, %p, %p"),
+ args->sig, (void *)args->nsa, (void *)args->osa);
+#endif
+
+ if (args->nsa != NULL) {
+ error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
+ if (error)
+ return (error);
+ act.lsa_handler = osa.lsa_handler;
+ act.lsa_flags = osa.lsa_flags;
+ act.lsa_restorer = osa.lsa_restorer;
+ LINUX_SIGEMPTYSET(act.lsa_mask);
+ act.lsa_mask.__bits[0] = osa.lsa_mask;
+ }
+
+ error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
+ args->osa ? &oact : NULL);
+
+ if (args->osa != NULL && !error) {
+ osa.lsa_handler = oact.lsa_handler;
+ osa.lsa_flags = oact.lsa_flags;
+ osa.lsa_restorer = oact.lsa_restorer;
+ osa.lsa_mask = oact.lsa_mask.__bits[0];
+ error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
+ }
+
+ return (error);
+}
+
+/*
+ * Linux has two extra args, restart and oldmask. We don't use these,
+ * but it seems that "restart" is actually a context pointer that
+ * enables the signal to happen with a different register set.
+ */
+int
+linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
+{
+ sigset_t sigmask;
+ l_sigset_t mask;
+
+#ifdef DEBUG
+ if (ldebug(sigsuspend))
+ printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
+#endif
+
+ LINUX_SIGEMPTYSET(mask);
+ mask.__bits[0] = args->mask;
+ linux_to_bsd_sigset(&mask, &sigmask);
+ return (kern_sigsuspend(td, sigmask));
+}
+
+int
+linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
+{
+ l_sigset_t lmask;
+ sigset_t sigmask;
+ int error;
+
+#ifdef DEBUG
+ if (ldebug(rt_sigsuspend))
+ printf(ARGS(rt_sigsuspend, "%p, %d"),
+ (void *)uap->newset, uap->sigsetsize);
+#endif
+
+ if (uap->sigsetsize != sizeof(l_sigset_t))
+ return (EINVAL);
+
+ error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
+ if (error)
+ return (error);
+
+ linux_to_bsd_sigset(&lmask, &sigmask);
+ return (kern_sigsuspend(td, sigmask));
+}
+
+int
+linux_pause(struct thread *td, struct linux_pause_args *args)
+{
+ struct proc *p = td->td_proc;
+ sigset_t sigmask;
+
+#ifdef DEBUG
+ if (ldebug(pause))
+ printf(ARGS(pause, ""));
+#endif
+
+ PROC_LOCK(p);
+ sigmask = td->td_sigmask;
+ PROC_UNLOCK(p);
+ return (kern_sigsuspend(td, sigmask));
+}
+
+int
+linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
+{
+ stack_t ss, oss;
+ l_stack_t lss;
+ int error;
+
+#ifdef DEBUG
+ if (ldebug(sigaltstack))
+ printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
+#endif
+
+ if (uap->uss != NULL) {
+ error = copyin(uap->uss, &lss, sizeof(l_stack_t));
+ if (error)
+ return (error);
+
+ ss.ss_sp = PTRIN(lss.ss_sp);
+ ss.ss_size = lss.ss_size;
+ ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
+ }
+ error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
+ (uap->uoss != NULL) ? &oss : NULL);
+ if (!error && uap->uoss != NULL) {
+ lss.ss_sp = PTROUT(oss.ss_sp);
+ lss.ss_size = oss.ss_size;
+ lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
+ error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
+ }
+
+ return (error);
+}
+
+int
+linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
+{
+ struct ftruncate_args sa;
+
+#ifdef DEBUG
+ if (ldebug(ftruncate64))
+ printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
+ (intmax_t)args->length);
+#endif
+
+ sa.fd = args->fd;
+ sa.length = args->length;
+	return (sys_ftruncate(td, &sa));
+}
+
+int
+linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
+{
+ struct timeval atv;
+ l_timeval atv32;
+ struct timezone rtz;
+ int error = 0;
+
+ if (uap->tp) {
+ microtime(&atv);
+ atv32.tv_sec = atv.tv_sec;
+ atv32.tv_usec = atv.tv_usec;
+ error = copyout(&atv32, uap->tp, sizeof(atv32));
+ }
+ if (error == 0 && uap->tzp != NULL) {
+ rtz.tz_minuteswest = tz_minuteswest;
+ rtz.tz_dsttime = tz_dsttime;
+ error = copyout(&rtz, uap->tzp, sizeof(rtz));
+ }
+ return (error);
+}
+
+int
+linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
+{
+ l_timeval atv32;
+ struct timeval atv, *tvp;
+ struct timezone atz, *tzp;
+ int error;
+
+ if (uap->tp) {
+ error = copyin(uap->tp, &atv32, sizeof(atv32));
+ if (error)
+ return (error);
+ atv.tv_sec = atv32.tv_sec;
+ atv.tv_usec = atv32.tv_usec;
+ tvp = &atv;
+ } else
+ tvp = NULL;
+ if (uap->tzp) {
+ error = copyin(uap->tzp, &atz, sizeof(atz));
+ if (error)
+ return (error);
+ tzp = &atz;
+ } else
+ tzp = NULL;
+ return (kern_settimeofday(td, tvp, tzp));
+}
+
+int
+linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
+{
+ struct l_rusage s32;
+ struct rusage s;
+ int error;
+
+ error = kern_getrusage(td, uap->who, &s);
+ if (error != 0)
+ return (error);
+ if (uap->rusage != NULL) {
+ bsd_to_linux_rusage(&s, &s32);
+ error = copyout(&s32, uap->rusage, sizeof(s32));
+ }
+ return (error);
+}
+
+int
+linux_sched_rr_get_interval(struct thread *td,
+ struct linux_sched_rr_get_interval_args *uap)
+{
+ struct timespec ts;
+ struct l_timespec ts32;
+ int error;
+
+ error = kern_sched_rr_get_interval(td, uap->pid, &ts);
+ if (error != 0)
+ return (error);
+ ts32.tv_sec = ts.tv_sec;
+ ts32.tv_nsec = ts.tv_nsec;
+ return (copyout(&ts32, uap->interval, sizeof(ts32)));
+}
+
+int
+linux_set_thread_area(struct thread *td,
+ struct linux_set_thread_area_args *args)
+{
+ struct l_user_desc info;
+ struct user_segment_descriptor sd;
+ struct pcb *pcb;
+ int a[2];
+ int error;
+
+ error = copyin(args->desc, &info, sizeof(struct l_user_desc));
+ if (error)
+ return (error);
+
+#ifdef DEBUG
+ if (ldebug(set_thread_area))
+ printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
+ "%i, %i, %i"), info.entry_number, info.base_addr,
+ info.limit, info.seg_32bit, info.contents,
+ info.read_exec_only, info.limit_in_pages,
+ info.seg_not_present, info.useable);
+#endif
+
+ /*
+	 * Semantics of the Linux version: every thread in the system has an
+	 * array of three TLS descriptors. The 1st is GLIBC TLS, the 2nd is
+	 * WINE, and the 3rd is unknown. This syscall loads one of the
+	 * selected TLS descriptors with a value
+ * and also loads GDT descriptors 6, 7 and 8 with the content of
+ * the per-thread descriptors.
+ *
+ * Semantics of FreeBSD version: I think we can ignore that Linux has
+ * three per-thread descriptors and use just the first one.
+ * The tls_array[] is used only in [gs]et_thread_area() syscalls and
+ * for loading the GDT descriptors. We use just one GDT descriptor
+ * for TLS, so we will load just one.
+ *
+	 * XXX: This doesn't work when a user space process tries to use more
+	 * than one TLS segment. A comment in the Linux source says Wine might
+	 * do this.
+ */
+
+ /*
+	 * GLIBC reads the current %gs and calls set_thread_area() with it.
+ * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
+ * we use these segments.
+ */
+ switch (info.entry_number) {
+ case GUGS32_SEL:
+ case GUDATA_SEL:
+ case 6:
+ case -1:
+ info.entry_number = GUGS32_SEL;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * We have to copy out the GDT entry we use.
+ *
+ * XXX: What if a user space program does not check the return value
+ * and tries to use 6, 7 or 8?
+ */
+ error = copyout(&info, args->desc, sizeof(struct l_user_desc));
+ if (error)
+ return (error);
+
+ if (LINUX_LDT_empty(&info)) {
+ a[0] = 0;
+ a[1] = 0;
+ } else {
+ a[0] = LINUX_LDT_entry_a(&info);
+ a[1] = LINUX_LDT_entry_b(&info);
+ }
+
+ memcpy(&sd, &a, sizeof(a));
+#ifdef DEBUG
+ if (ldebug(set_thread_area))
+ printf("Segment created in set_thread_area: "
+ "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
+ "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
+ "def32: %i, gran: %i\n",
+ sd.sd_lobase,
+ sd.sd_hibase,
+ sd.sd_lolimit,
+ sd.sd_hilimit,
+ sd.sd_type,
+ sd.sd_dpl,
+ sd.sd_p,
+ sd.sd_xx,
+ sd.sd_long,
+ sd.sd_def32,
+ sd.sd_gran);
+#endif
+
+ pcb = td->td_pcb;
+ pcb->pcb_gsbase = (register_t)info.base_addr;
+ set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
+ update_gdt_gsbase(td, info.base_addr);
+
+ return (0);
+}
+
+int
+linux_wait4(struct thread *td, struct linux_wait4_args *args)
+{
+ int error, options;
+ struct rusage ru, *rup;
+ struct l_rusage lru;
+
+#ifdef DEBUG
+ if (ldebug(wait4))
+ printf(ARGS(wait4, "%d, %p, %d, %p"),
+ args->pid, (void *)args->status, args->options,
+ (void *)args->rusage);
+#endif
+
+ options = (args->options & (WNOHANG | WUNTRACED));
+ /* WLINUXCLONE should be equal to __WCLONE, but we make sure */
+ if (args->options & __WCLONE)
+ options |= WLINUXCLONE;
+
+ if (args->rusage != NULL)
+ rup = &ru;
+ else
+ rup = NULL;
+ error = linux_common_wait(td, args->pid, args->status, options, rup);
+ if (error)
+ return (error);
+ if (args->rusage != NULL) {
+ bsd_to_linux_rusage(rup, &lru);
+ error = copyout(&lru, args->rusage, sizeof(lru));
+ }
+
+ return (error);
+}
diff --git a/sys/amd64/linux32/linux32_proto.h b/sys/amd64/linux32/linux32_proto.h
new file mode 100644
index 0000000..2049445
--- /dev/null
+++ b/sys/amd64/linux32/linux32_proto.h
@@ -0,0 +1,1682 @@
+/*
+ * System call prototypes.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed
+ */
+
+#ifndef _LINUX_SYSPROTO_H_
+#define _LINUX_SYSPROTO_H_
+
+#include <sys/signal.h>
+#include <sys/acl.h>
+#include <sys/cpuset.h>
+#include <sys/_ffcounter.h>
+#include <sys/_semaphore.h>
+#include <sys/ucontext.h>
+
+#include <bsm/audit_kevents.h>
+
+struct proc;
+
+struct thread;
+
+#define PAD_(t) (sizeof(register_t) <= sizeof(t) ? \
+ 0 : sizeof(register_t) - sizeof(t))
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define PADL_(t) 0
+#define PADR_(t) PAD_(t)
+#else
+#define PADL_(t) PAD_(t)
+#define PADR_(t) 0
+#endif
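+
+/*
+ * Illustrative expansion: on amd64, register_t is 8 bytes and l_int is
+ * 4, so PAD_(l_int) == 4, and with BYTE_ORDER == LITTLE_ENDIAN we get
+ * PADL_(l_int) == 0 and PADR_(l_int) == 4.  A member declared as
+ *     char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+ * therefore occupies one full register slot with the value in the
+ * low-order bytes, matching how syscall arguments arrive in registers.
+ */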
+
+#define nosys linux_nosys
+struct linux_fork_args {
+ register_t dummy;
+};
+struct linux_open_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+};
+struct linux_waitpid_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char status_l_[PADL_(l_int *)]; l_int * status; char status_r_[PADR_(l_int *)];
+ char options_l_[PADL_(l_int)]; l_int options; char options_r_[PADR_(l_int)];
+};
+struct linux_creat_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+};
+struct linux_link_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char to_l_[PADL_(char *)]; char * to; char to_r_[PADR_(char *)];
+};
+struct linux_unlink_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+};
+struct linux_execve_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char argp_l_[PADL_(uint32_t *)]; uint32_t * argp; char argp_r_[PADR_(uint32_t *)];
+ char envp_l_[PADL_(uint32_t *)]; uint32_t * envp; char envp_r_[PADR_(uint32_t *)];
+};
+struct linux_chdir_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+};
+struct linux_time_args {
+ char tm_l_[PADL_(l_time_t *)]; l_time_t * tm; char tm_r_[PADR_(l_time_t *)];
+};
+struct linux_mknod_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+ char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)];
+};
+struct linux_chmod_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char mode_l_[PADL_(l_mode_t)]; l_mode_t mode; char mode_r_[PADR_(l_mode_t)];
+};
+struct linux_lchown16_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)];
+ char gid_l_[PADL_(l_gid16_t)]; l_gid16_t gid; char gid_r_[PADR_(l_gid16_t)];
+};
+struct linux_stat_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char up_l_[PADL_(struct linux_stat *)]; struct linux_stat * up; char up_r_[PADR_(struct linux_stat *)];
+};
+struct linux_lseek_args {
+ char fdes_l_[PADL_(l_uint)]; l_uint fdes; char fdes_r_[PADR_(l_uint)];
+ char off_l_[PADL_(l_off_t)]; l_off_t off; char off_r_[PADR_(l_off_t)];
+ char whence_l_[PADL_(l_int)]; l_int whence; char whence_r_[PADR_(l_int)];
+};
+struct linux_getpid_args {
+ register_t dummy;
+};
+struct linux_mount_args {
+ char specialfile_l_[PADL_(char *)]; char * specialfile; char specialfile_r_[PADR_(char *)];
+ char dir_l_[PADL_(char *)]; char * dir; char dir_r_[PADR_(char *)];
+ char filesystemtype_l_[PADL_(char *)]; char * filesystemtype; char filesystemtype_r_[PADR_(char *)];
+ char rwflag_l_[PADL_(l_ulong)]; l_ulong rwflag; char rwflag_r_[PADR_(l_ulong)];
+ char data_l_[PADL_(void *)]; void * data; char data_r_[PADR_(void *)];
+};
+struct linux_oldumount_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+};
+struct linux_setuid16_args {
+ char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)];
+};
+struct linux_getuid16_args {
+ register_t dummy;
+};
+struct linux_stime_args {
+ register_t dummy;
+};
+struct linux_ptrace_args {
+ char req_l_[PADL_(l_long)]; l_long req; char req_r_[PADR_(l_long)];
+ char pid_l_[PADL_(l_long)]; l_long pid; char pid_r_[PADR_(l_long)];
+ char addr_l_[PADL_(l_long)]; l_long addr; char addr_r_[PADR_(l_long)];
+ char data_l_[PADL_(l_long)]; l_long data; char data_r_[PADR_(l_long)];
+};
+struct linux_alarm_args {
+ char secs_l_[PADL_(l_uint)]; l_uint secs; char secs_r_[PADR_(l_uint)];
+};
+struct linux_pause_args {
+ register_t dummy;
+};
+struct linux_utime_args {
+ char fname_l_[PADL_(char *)]; char * fname; char fname_r_[PADR_(char *)];
+ char times_l_[PADL_(struct l_utimbuf *)]; struct l_utimbuf * times; char times_r_[PADR_(struct l_utimbuf *)];
+};
+struct linux_access_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char amode_l_[PADL_(l_int)]; l_int amode; char amode_r_[PADR_(l_int)];
+};
+struct linux_nice_args {
+ char inc_l_[PADL_(l_int)]; l_int inc; char inc_r_[PADR_(l_int)];
+};
+struct linux_kill_args {
+ char pid_l_[PADL_(l_int)]; l_int pid; char pid_r_[PADR_(l_int)];
+ char signum_l_[PADL_(l_int)]; l_int signum; char signum_r_[PADR_(l_int)];
+};
+struct linux_rename_args {
+ char from_l_[PADL_(char *)]; char * from; char from_r_[PADR_(char *)];
+ char to_l_[PADL_(char *)]; char * to; char to_r_[PADR_(char *)];
+};
+struct linux_mkdir_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+};
+struct linux_rmdir_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+};
+struct linux_pipe_args {
+ char pipefds_l_[PADL_(l_int *)]; l_int * pipefds; char pipefds_r_[PADR_(l_int *)];
+};
+struct linux_times_args {
+ char buf_l_[PADL_(struct l_times_argv *)]; struct l_times_argv * buf; char buf_r_[PADR_(struct l_times_argv *)];
+};
+struct linux_brk_args {
+ char dsend_l_[PADL_(l_ulong)]; l_ulong dsend; char dsend_r_[PADR_(l_ulong)];
+};
+struct linux_setgid16_args {
+ char gid_l_[PADL_(l_gid16_t)]; l_gid16_t gid; char gid_r_[PADR_(l_gid16_t)];
+};
+struct linux_getgid16_args {
+ register_t dummy;
+};
+struct linux_signal_args {
+ char sig_l_[PADL_(l_int)]; l_int sig; char sig_r_[PADR_(l_int)];
+ char handler_l_[PADL_(l_handler_t)]; l_handler_t handler; char handler_r_[PADR_(l_handler_t)];
+};
+struct linux_geteuid16_args {
+ register_t dummy;
+};
+struct linux_getegid16_args {
+ register_t dummy;
+};
+struct linux_umount_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+};
+struct linux_ioctl_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char cmd_l_[PADL_(l_uint)]; l_uint cmd; char cmd_r_[PADR_(l_uint)];
+ char arg_l_[PADL_(uintptr_t)]; uintptr_t arg; char arg_r_[PADR_(uintptr_t)];
+};
+struct linux_fcntl_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char cmd_l_[PADL_(l_uint)]; l_uint cmd; char cmd_r_[PADR_(l_uint)];
+ char arg_l_[PADL_(uintptr_t)]; uintptr_t arg; char arg_r_[PADR_(uintptr_t)];
+};
+struct linux_olduname_args {
+ register_t dummy;
+};
+struct linux_ustat_args {
+ char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)];
+ char ubuf_l_[PADL_(struct l_ustat *)]; struct l_ustat * ubuf; char ubuf_r_[PADR_(struct l_ustat *)];
+};
+struct linux_getppid_args {
+ register_t dummy;
+};
+struct linux_sigaction_args {
+ char sig_l_[PADL_(l_int)]; l_int sig; char sig_r_[PADR_(l_int)];
+ char nsa_l_[PADL_(l_osigaction_t *)]; l_osigaction_t * nsa; char nsa_r_[PADR_(l_osigaction_t *)];
+ char osa_l_[PADL_(l_osigaction_t *)]; l_osigaction_t * osa; char osa_r_[PADR_(l_osigaction_t *)];
+};
+struct linux_sgetmask_args {
+ register_t dummy;
+};
+struct linux_ssetmask_args {
+ char mask_l_[PADL_(l_osigset_t)]; l_osigset_t mask; char mask_r_[PADR_(l_osigset_t)];
+};
+struct linux_setreuid16_args {
+ char ruid_l_[PADL_(l_uid16_t)]; l_uid16_t ruid; char ruid_r_[PADR_(l_uid16_t)];
+ char euid_l_[PADL_(l_uid16_t)]; l_uid16_t euid; char euid_r_[PADR_(l_uid16_t)];
+};
+struct linux_setregid16_args {
+ char rgid_l_[PADL_(l_gid16_t)]; l_gid16_t rgid; char rgid_r_[PADR_(l_gid16_t)];
+ char egid_l_[PADL_(l_gid16_t)]; l_gid16_t egid; char egid_r_[PADR_(l_gid16_t)];
+};
+struct linux_sigsuspend_args {
+ char hist0_l_[PADL_(l_int)]; l_int hist0; char hist0_r_[PADR_(l_int)];
+ char hist1_l_[PADL_(l_int)]; l_int hist1; char hist1_r_[PADR_(l_int)];
+ char mask_l_[PADL_(l_osigset_t)]; l_osigset_t mask; char mask_r_[PADR_(l_osigset_t)];
+};
+struct linux_sigpending_args {
+ char mask_l_[PADL_(l_osigset_t *)]; l_osigset_t * mask; char mask_r_[PADR_(l_osigset_t *)];
+};
+struct linux_sethostname_args {
+ char hostname_l_[PADL_(char *)]; char * hostname; char hostname_r_[PADR_(char *)];
+ char len_l_[PADL_(u_int)]; u_int len; char len_r_[PADR_(u_int)];
+};
+struct linux_setrlimit_args {
+ char resource_l_[PADL_(l_uint)]; l_uint resource; char resource_r_[PADR_(l_uint)];
+ char rlim_l_[PADL_(struct l_rlimit *)]; struct l_rlimit * rlim; char rlim_r_[PADR_(struct l_rlimit *)];
+};
+struct linux_old_getrlimit_args {
+ char resource_l_[PADL_(l_uint)]; l_uint resource; char resource_r_[PADR_(l_uint)];
+ char rlim_l_[PADL_(struct l_rlimit *)]; struct l_rlimit * rlim; char rlim_r_[PADR_(struct l_rlimit *)];
+};
+struct linux_getrusage_args {
+ char who_l_[PADL_(int)]; int who; char who_r_[PADR_(int)];
+ char rusage_l_[PADL_(struct l_rusage *)]; struct l_rusage * rusage; char rusage_r_[PADR_(struct l_rusage *)];
+};
+struct linux_gettimeofday_args {
+ char tp_l_[PADL_(struct l_timeval *)]; struct l_timeval * tp; char tp_r_[PADR_(struct l_timeval *)];
+ char tzp_l_[PADL_(struct timezone *)]; struct timezone * tzp; char tzp_r_[PADR_(struct timezone *)];
+};
+struct linux_settimeofday_args {
+ char tp_l_[PADL_(struct l_timeval *)]; struct l_timeval * tp; char tp_r_[PADR_(struct l_timeval *)];
+ char tzp_l_[PADL_(struct timezone *)]; struct timezone * tzp; char tzp_r_[PADR_(struct timezone *)];
+};
+struct linux_getgroups16_args {
+ char gidsetsize_l_[PADL_(l_uint)]; l_uint gidsetsize; char gidsetsize_r_[PADR_(l_uint)];
+ char gidset_l_[PADL_(l_gid16_t *)]; l_gid16_t * gidset; char gidset_r_[PADR_(l_gid16_t *)];
+};
+struct linux_setgroups16_args {
+ char gidsetsize_l_[PADL_(l_uint)]; l_uint gidsetsize; char gidsetsize_r_[PADR_(l_uint)];
+ char gidset_l_[PADL_(l_gid16_t *)]; l_gid16_t * gidset; char gidset_r_[PADR_(l_gid16_t *)];
+};
+struct linux_old_select_args {
+ char ptr_l_[PADL_(struct l_old_select_argv *)]; struct l_old_select_argv * ptr; char ptr_r_[PADR_(struct l_old_select_argv *)];
+};
+struct linux_symlink_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char to_l_[PADL_(char *)]; char * to; char to_r_[PADR_(char *)];
+};
+struct linux_lstat_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char up_l_[PADL_(struct linux_lstat *)]; struct linux_lstat * up; char up_r_[PADR_(struct linux_lstat *)];
+};
+struct linux_readlink_args {
+ char name_l_[PADL_(char *)]; char * name; char name_r_[PADR_(char *)];
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char count_l_[PADL_(l_int)]; l_int count; char count_r_[PADR_(l_int)];
+};
+struct linux_reboot_args {
+ char magic1_l_[PADL_(l_int)]; l_int magic1; char magic1_r_[PADR_(l_int)];
+ char magic2_l_[PADL_(l_int)]; l_int magic2; char magic2_r_[PADR_(l_int)];
+ char cmd_l_[PADL_(l_uint)]; l_uint cmd; char cmd_r_[PADR_(l_uint)];
+ char arg_l_[PADL_(void *)]; void * arg; char arg_r_[PADR_(void *)];
+};
+struct linux_readdir_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char dent_l_[PADL_(struct l_dirent *)]; struct l_dirent * dent; char dent_r_[PADR_(struct l_dirent *)];
+ char count_l_[PADL_(l_uint)]; l_uint count; char count_r_[PADR_(l_uint)];
+};
+struct linux_mmap_args {
+ char ptr_l_[PADL_(struct l_mmap_argv *)]; struct l_mmap_argv * ptr; char ptr_r_[PADR_(struct l_mmap_argv *)];
+};
+struct linux_truncate_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char length_l_[PADL_(l_ulong)]; l_ulong length; char length_r_[PADR_(l_ulong)];
+};
+struct linux_ftruncate_args {
+ char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+ char length_l_[PADL_(long)]; long length; char length_r_[PADR_(long)];
+};
+struct linux_getpriority_args {
+ char which_l_[PADL_(int)]; int which; char which_r_[PADR_(int)];
+ char who_l_[PADL_(int)]; int who; char who_r_[PADR_(int)];
+};
+struct linux_statfs_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char buf_l_[PADL_(struct l_statfs_buf *)]; struct l_statfs_buf * buf; char buf_r_[PADR_(struct l_statfs_buf *)];
+};
+struct linux_fstatfs_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char buf_l_[PADL_(struct l_statfs_buf *)]; struct l_statfs_buf * buf; char buf_r_[PADR_(struct l_statfs_buf *)];
+};
+struct linux_socketcall_args {
+ char what_l_[PADL_(l_int)]; l_int what; char what_r_[PADR_(l_int)];
+ char args_l_[PADL_(l_ulong)]; l_ulong args; char args_r_[PADR_(l_ulong)];
+};
+struct linux_syslog_args {
+ char type_l_[PADL_(l_int)]; l_int type; char type_r_[PADR_(l_int)];
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char len_l_[PADL_(l_int)]; l_int len; char len_r_[PADR_(l_int)];
+};
+struct linux_setitimer_args {
+ char which_l_[PADL_(l_int)]; l_int which; char which_r_[PADR_(l_int)];
+ char itv_l_[PADL_(struct l_itimerval *)]; struct l_itimerval * itv; char itv_r_[PADR_(struct l_itimerval *)];
+ char oitv_l_[PADL_(struct l_itimerval *)]; struct l_itimerval * oitv; char oitv_r_[PADR_(struct l_itimerval *)];
+};
+struct linux_getitimer_args {
+ char which_l_[PADL_(l_int)]; l_int which; char which_r_[PADR_(l_int)];
+ char itv_l_[PADL_(struct l_itimerval *)]; struct l_itimerval * itv; char itv_r_[PADR_(struct l_itimerval *)];
+};
+struct linux_newstat_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char buf_l_[PADL_(struct l_newstat *)]; struct l_newstat * buf; char buf_r_[PADR_(struct l_newstat *)];
+};
+struct linux_newlstat_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char buf_l_[PADL_(struct l_newstat *)]; struct l_newstat * buf; char buf_r_[PADR_(struct l_newstat *)];
+};
+struct linux_newfstat_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char buf_l_[PADL_(struct l_newstat *)]; struct l_newstat * buf; char buf_r_[PADR_(struct l_newstat *)];
+};
+struct linux_uname_args {
+ register_t dummy;
+};
+struct linux_iopl_args {
+ char level_l_[PADL_(l_int)]; l_int level; char level_r_[PADR_(l_int)];
+};
+struct linux_vhangup_args {
+ register_t dummy;
+};
+struct linux_wait4_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char status_l_[PADL_(l_uint *)]; l_uint * status; char status_r_[PADR_(l_uint *)];
+ char options_l_[PADL_(l_int)]; l_int options; char options_r_[PADR_(l_int)];
+ char rusage_l_[PADL_(struct l_rusage *)]; struct l_rusage * rusage; char rusage_r_[PADR_(struct l_rusage *)];
+};
+struct linux_swapoff_args {
+ register_t dummy;
+};
+struct linux_sysinfo_args {
+ char info_l_[PADL_(struct l_sysinfo *)]; struct l_sysinfo * info; char info_r_[PADR_(struct l_sysinfo *)];
+};
+struct linux_ipc_args {
+ char what_l_[PADL_(l_uint)]; l_uint what; char what_r_[PADR_(l_uint)];
+ char arg1_l_[PADL_(l_int)]; l_int arg1; char arg1_r_[PADR_(l_int)];
+ char arg2_l_[PADL_(l_int)]; l_int arg2; char arg2_r_[PADR_(l_int)];
+ char arg3_l_[PADL_(l_int)]; l_int arg3; char arg3_r_[PADR_(l_int)];
+ char ptr_l_[PADL_(void *)]; void * ptr; char ptr_r_[PADR_(void *)];
+ char arg5_l_[PADL_(l_long)]; l_long arg5; char arg5_r_[PADR_(l_long)];
+};
+struct linux_sigreturn_args {
+ char sfp_l_[PADL_(struct l_sigframe *)]; struct l_sigframe * sfp; char sfp_r_[PADR_(struct l_sigframe *)];
+};
+struct linux_clone_args {
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+ char stack_l_[PADL_(void *)]; void * stack; char stack_r_[PADR_(void *)];
+ char parent_tidptr_l_[PADL_(void *)]; void * parent_tidptr; char parent_tidptr_r_[PADR_(void *)];
+ char tls_l_[PADL_(void *)]; void * tls; char tls_r_[PADR_(void *)];
+ char child_tidptr_l_[PADL_(void *)]; void * child_tidptr; char child_tidptr_r_[PADR_(void *)];
+};
+struct linux_setdomainname_args {
+ char name_l_[PADL_(char *)]; char * name; char name_r_[PADR_(char *)];
+ char len_l_[PADL_(int)]; int len; char len_r_[PADR_(int)];
+};
+struct linux_newuname_args {
+ char buf_l_[PADL_(struct l_new_utsname *)]; struct l_new_utsname * buf; char buf_r_[PADR_(struct l_new_utsname *)];
+};
+struct linux_adjtimex_args {
+ register_t dummy;
+};
+struct linux_mprotect_args {
+ char addr_l_[PADL_(caddr_t)]; caddr_t addr; char addr_r_[PADR_(caddr_t)];
+ char len_l_[PADL_(int)]; int len; char len_r_[PADR_(int)];
+ char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)];
+};
+struct linux_sigprocmask_args {
+ char how_l_[PADL_(l_int)]; l_int how; char how_r_[PADR_(l_int)];
+ char mask_l_[PADL_(l_osigset_t *)]; l_osigset_t * mask; char mask_r_[PADR_(l_osigset_t *)];
+ char omask_l_[PADL_(l_osigset_t *)]; l_osigset_t * omask; char omask_r_[PADR_(l_osigset_t *)];
+};
+struct linux_create_module_args {
+ register_t dummy;
+};
+struct linux_init_module_args {
+ register_t dummy;
+};
+struct linux_delete_module_args {
+ register_t dummy;
+};
+struct linux_get_kernel_syms_args {
+ register_t dummy;
+};
+struct linux_quotactl_args {
+ register_t dummy;
+};
+struct linux_bdflush_args {
+ register_t dummy;
+};
+struct linux_sysfs_args {
+ char option_l_[PADL_(l_int)]; l_int option; char option_r_[PADR_(l_int)];
+ char arg1_l_[PADL_(l_ulong)]; l_ulong arg1; char arg1_r_[PADR_(l_ulong)];
+ char arg2_l_[PADL_(l_ulong)]; l_ulong arg2; char arg2_r_[PADR_(l_ulong)];
+};
+struct linux_personality_args {
+ char per_l_[PADL_(l_ulong)]; l_ulong per; char per_r_[PADR_(l_ulong)];
+};
+struct linux_setfsuid16_args {
+ char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)];
+};
+struct linux_setfsgid16_args {
+ char gid_l_[PADL_(l_gid16_t)]; l_gid16_t gid; char gid_r_[PADR_(l_gid16_t)];
+};
+struct linux_llseek_args {
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char ohigh_l_[PADL_(l_ulong)]; l_ulong ohigh; char ohigh_r_[PADR_(l_ulong)];
+ char olow_l_[PADL_(l_ulong)]; l_ulong olow; char olow_r_[PADR_(l_ulong)];
+ char res_l_[PADL_(l_loff_t *)]; l_loff_t * res; char res_r_[PADR_(l_loff_t *)];
+ char whence_l_[PADL_(l_uint)]; l_uint whence; char whence_r_[PADR_(l_uint)];
+};
+struct linux_getdents_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char dent_l_[PADL_(void *)]; void * dent; char dent_r_[PADR_(void *)];
+ char count_l_[PADL_(l_uint)]; l_uint count; char count_r_[PADR_(l_uint)];
+};
+struct linux_select_args {
+ char nfds_l_[PADL_(l_int)]; l_int nfds; char nfds_r_[PADR_(l_int)];
+ char readfds_l_[PADL_(l_fd_set *)]; l_fd_set * readfds; char readfds_r_[PADR_(l_fd_set *)];
+ char writefds_l_[PADL_(l_fd_set *)]; l_fd_set * writefds; char writefds_r_[PADR_(l_fd_set *)];
+ char exceptfds_l_[PADL_(l_fd_set *)]; l_fd_set * exceptfds; char exceptfds_r_[PADR_(l_fd_set *)];
+ char timeout_l_[PADL_(struct l_timeval *)]; struct l_timeval * timeout; char timeout_r_[PADR_(struct l_timeval *)];
+};
+struct linux_msync_args {
+ char addr_l_[PADL_(l_ulong)]; l_ulong addr; char addr_r_[PADR_(l_ulong)];
+ char len_l_[PADL_(l_size_t)]; l_size_t len; char len_r_[PADR_(l_size_t)];
+ char fl_l_[PADL_(l_int)]; l_int fl; char fl_r_[PADR_(l_int)];
+};
+struct linux_readv_args {
+ char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)];
+ char iovp_l_[PADL_(struct l_iovec32 *)]; struct l_iovec32 * iovp; char iovp_r_[PADR_(struct l_iovec32 *)];
+ char iovcnt_l_[PADL_(l_ulong)]; l_ulong iovcnt; char iovcnt_r_[PADR_(l_ulong)];
+};
+struct linux_writev_args {
+ char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)];
+ char iovp_l_[PADL_(struct l_iovec32 *)]; struct l_iovec32 * iovp; char iovp_r_[PADR_(struct l_iovec32 *)];
+ char iovcnt_l_[PADL_(l_ulong)]; l_ulong iovcnt; char iovcnt_r_[PADR_(l_ulong)];
+};
+struct linux_getsid_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+};
+struct linux_fdatasync_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+};
+struct linux_sysctl_args {
+ char args_l_[PADL_(struct l___sysctl_args *)]; struct l___sysctl_args * args; char args_r_[PADR_(struct l___sysctl_args *)];
+};
+struct linux_sched_setscheduler_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char policy_l_[PADL_(l_int)]; l_int policy; char policy_r_[PADR_(l_int)];
+ char param_l_[PADL_(struct l_sched_param *)]; struct l_sched_param * param; char param_r_[PADR_(struct l_sched_param *)];
+};
+struct linux_sched_getscheduler_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+};
+struct linux_sched_get_priority_max_args {
+ char policy_l_[PADL_(l_int)]; l_int policy; char policy_r_[PADR_(l_int)];
+};
+struct linux_sched_get_priority_min_args {
+ char policy_l_[PADL_(l_int)]; l_int policy; char policy_r_[PADR_(l_int)];
+};
+struct linux_sched_rr_get_interval_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char interval_l_[PADL_(struct l_timespec *)]; struct l_timespec * interval; char interval_r_[PADR_(struct l_timespec *)];
+};
+struct linux_nanosleep_args {
+ char rqtp_l_[PADL_(const struct l_timespec *)]; const struct l_timespec * rqtp; char rqtp_r_[PADR_(const struct l_timespec *)];
+ char rmtp_l_[PADL_(struct l_timespec *)]; struct l_timespec * rmtp; char rmtp_r_[PADR_(struct l_timespec *)];
+};
+struct linux_mremap_args {
+ char addr_l_[PADL_(l_ulong)]; l_ulong addr; char addr_r_[PADR_(l_ulong)];
+ char old_len_l_[PADL_(l_ulong)]; l_ulong old_len; char old_len_r_[PADR_(l_ulong)];
+ char new_len_l_[PADL_(l_ulong)]; l_ulong new_len; char new_len_r_[PADR_(l_ulong)];
+ char flags_l_[PADL_(l_ulong)]; l_ulong flags; char flags_r_[PADR_(l_ulong)];
+ char new_addr_l_[PADL_(l_ulong)]; l_ulong new_addr; char new_addr_r_[PADR_(l_ulong)];
+};
+struct linux_setresuid16_args {
+ char ruid_l_[PADL_(l_uid16_t)]; l_uid16_t ruid; char ruid_r_[PADR_(l_uid16_t)];
+ char euid_l_[PADL_(l_uid16_t)]; l_uid16_t euid; char euid_r_[PADR_(l_uid16_t)];
+ char suid_l_[PADL_(l_uid16_t)]; l_uid16_t suid; char suid_r_[PADR_(l_uid16_t)];
+};
+struct linux_getresuid16_args {
+ char ruid_l_[PADL_(l_uid16_t *)]; l_uid16_t * ruid; char ruid_r_[PADR_(l_uid16_t *)];
+ char euid_l_[PADL_(l_uid16_t *)]; l_uid16_t * euid; char euid_r_[PADR_(l_uid16_t *)];
+ char suid_l_[PADL_(l_uid16_t *)]; l_uid16_t * suid; char suid_r_[PADR_(l_uid16_t *)];
+};
+struct linux_query_module_args {
+ register_t dummy;
+};
+struct linux_nfsservctl_args {
+ register_t dummy;
+};
+struct linux_setresgid16_args {
+ char rgid_l_[PADL_(l_gid16_t)]; l_gid16_t rgid; char rgid_r_[PADR_(l_gid16_t)];
+ char egid_l_[PADL_(l_gid16_t)]; l_gid16_t egid; char egid_r_[PADR_(l_gid16_t)];
+ char sgid_l_[PADL_(l_gid16_t)]; l_gid16_t sgid; char sgid_r_[PADR_(l_gid16_t)];
+};
+struct linux_getresgid16_args {
+ char rgid_l_[PADL_(l_gid16_t *)]; l_gid16_t * rgid; char rgid_r_[PADR_(l_gid16_t *)];
+ char egid_l_[PADL_(l_gid16_t *)]; l_gid16_t * egid; char egid_r_[PADR_(l_gid16_t *)];
+ char sgid_l_[PADL_(l_gid16_t *)]; l_gid16_t * sgid; char sgid_r_[PADR_(l_gid16_t *)];
+};
+struct linux_prctl_args {
+ char option_l_[PADL_(l_int)]; l_int option; char option_r_[PADR_(l_int)];
+ char arg2_l_[PADL_(l_int)]; l_int arg2; char arg2_r_[PADR_(l_int)];
+ char arg3_l_[PADL_(l_int)]; l_int arg3; char arg3_r_[PADR_(l_int)];
+ char arg4_l_[PADL_(l_int)]; l_int arg4; char arg4_r_[PADR_(l_int)];
+ char arg5_l_[PADL_(l_int)]; l_int arg5; char arg5_r_[PADR_(l_int)];
+};
+struct linux_rt_sigreturn_args {
+ char ucp_l_[PADL_(struct l_ucontext *)]; struct l_ucontext * ucp; char ucp_r_[PADR_(struct l_ucontext *)];
+};
+struct linux_rt_sigaction_args {
+ char sig_l_[PADL_(l_int)]; l_int sig; char sig_r_[PADR_(l_int)];
+ char act_l_[PADL_(l_sigaction_t *)]; l_sigaction_t * act; char act_r_[PADR_(l_sigaction_t *)];
+ char oact_l_[PADL_(l_sigaction_t *)]; l_sigaction_t * oact; char oact_r_[PADR_(l_sigaction_t *)];
+ char sigsetsize_l_[PADL_(l_size_t)]; l_size_t sigsetsize; char sigsetsize_r_[PADR_(l_size_t)];
+};
+struct linux_rt_sigprocmask_args {
+ char how_l_[PADL_(l_int)]; l_int how; char how_r_[PADR_(l_int)];
+ char mask_l_[PADL_(l_sigset_t *)]; l_sigset_t * mask; char mask_r_[PADR_(l_sigset_t *)];
+ char omask_l_[PADL_(l_sigset_t *)]; l_sigset_t * omask; char omask_r_[PADR_(l_sigset_t *)];
+ char sigsetsize_l_[PADL_(l_size_t)]; l_size_t sigsetsize; char sigsetsize_r_[PADR_(l_size_t)];
+};
+struct linux_rt_sigpending_args {
+ char set_l_[PADL_(l_sigset_t *)]; l_sigset_t * set; char set_r_[PADR_(l_sigset_t *)];
+ char sigsetsize_l_[PADL_(l_size_t)]; l_size_t sigsetsize; char sigsetsize_r_[PADR_(l_size_t)];
+};
+struct linux_rt_sigtimedwait_args {
+ char mask_l_[PADL_(l_sigset_t *)]; l_sigset_t * mask; char mask_r_[PADR_(l_sigset_t *)];
+ char ptr_l_[PADL_(l_siginfo_t *)]; l_siginfo_t * ptr; char ptr_r_[PADR_(l_siginfo_t *)];
+ char timeout_l_[PADL_(struct l_timeval *)]; struct l_timeval * timeout; char timeout_r_[PADR_(struct l_timeval *)];
+ char sigsetsize_l_[PADL_(l_size_t)]; l_size_t sigsetsize; char sigsetsize_r_[PADR_(l_size_t)];
+};
+struct linux_rt_sigqueueinfo_args {
+ register_t dummy;
+};
+struct linux_rt_sigsuspend_args {
+ char newset_l_[PADL_(l_sigset_t *)]; l_sigset_t * newset; char newset_r_[PADR_(l_sigset_t *)];
+ char sigsetsize_l_[PADL_(l_size_t)]; l_size_t sigsetsize; char sigsetsize_r_[PADR_(l_size_t)];
+};
+struct linux_pread_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char nbyte_l_[PADL_(l_size_t)]; l_size_t nbyte; char nbyte_r_[PADR_(l_size_t)];
+ char offset_l_[PADL_(l_loff_t)]; l_loff_t offset; char offset_r_[PADR_(l_loff_t)];
+};
+struct linux_pwrite_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char nbyte_l_[PADL_(l_size_t)]; l_size_t nbyte; char nbyte_r_[PADR_(l_size_t)];
+ char offset_l_[PADL_(l_loff_t)]; l_loff_t offset; char offset_r_[PADR_(l_loff_t)];
+};
+struct linux_chown16_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)];
+ char gid_l_[PADL_(l_gid16_t)]; l_gid16_t gid; char gid_r_[PADR_(l_gid16_t)];
+};
+struct linux_getcwd_args {
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char bufsize_l_[PADL_(l_ulong)]; l_ulong bufsize; char bufsize_r_[PADR_(l_ulong)];
+};
+struct linux_capget_args {
+ char hdrp_l_[PADL_(struct l_user_cap_header *)]; struct l_user_cap_header * hdrp; char hdrp_r_[PADR_(struct l_user_cap_header *)];
+ char datap_l_[PADL_(struct l_user_cap_data *)]; struct l_user_cap_data * datap; char datap_r_[PADR_(struct l_user_cap_data *)];
+};
+struct linux_capset_args {
+ char hdrp_l_[PADL_(struct l_user_cap_header *)]; struct l_user_cap_header * hdrp; char hdrp_r_[PADR_(struct l_user_cap_header *)];
+ char datap_l_[PADL_(struct l_user_cap_data *)]; struct l_user_cap_data * datap; char datap_r_[PADR_(struct l_user_cap_data *)];
+};
+struct linux_sigaltstack_args {
+ char uss_l_[PADL_(l_stack_t *)]; l_stack_t * uss; char uss_r_[PADR_(l_stack_t *)];
+ char uoss_l_[PADL_(l_stack_t *)]; l_stack_t * uoss; char uoss_r_[PADR_(l_stack_t *)];
+};
+struct linux_sendfile_args {
+ register_t dummy;
+};
+struct linux_vfork_args {
+ register_t dummy;
+};
+struct linux_getrlimit_args {
+ char resource_l_[PADL_(l_uint)]; l_uint resource; char resource_r_[PADR_(l_uint)];
+ char rlim_l_[PADL_(struct l_rlimit *)]; struct l_rlimit * rlim; char rlim_r_[PADR_(struct l_rlimit *)];
+};
+struct linux_mmap2_args {
+ char addr_l_[PADL_(l_ulong)]; l_ulong addr; char addr_r_[PADR_(l_ulong)];
+ char len_l_[PADL_(l_ulong)]; l_ulong len; char len_r_[PADR_(l_ulong)];
+ char prot_l_[PADL_(l_ulong)]; l_ulong prot; char prot_r_[PADR_(l_ulong)];
+ char flags_l_[PADL_(l_ulong)]; l_ulong flags; char flags_r_[PADR_(l_ulong)];
+ char fd_l_[PADL_(l_ulong)]; l_ulong fd; char fd_r_[PADR_(l_ulong)];
+ char pgoff_l_[PADL_(l_ulong)]; l_ulong pgoff; char pgoff_r_[PADR_(l_ulong)];
+};
+struct linux_truncate64_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char length_l_[PADL_(l_loff_t)]; l_loff_t length; char length_r_[PADR_(l_loff_t)];
+};
+struct linux_ftruncate64_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char length_l_[PADL_(l_loff_t)]; l_loff_t length; char length_r_[PADR_(l_loff_t)];
+};
+struct linux_stat64_args {
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char statbuf_l_[PADL_(struct l_stat64 *)]; struct l_stat64 * statbuf; char statbuf_r_[PADR_(struct l_stat64 *)];
+};
+struct linux_lstat64_args {
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char statbuf_l_[PADL_(struct l_stat64 *)]; struct l_stat64 * statbuf; char statbuf_r_[PADR_(struct l_stat64 *)];
+};
+struct linux_fstat64_args {
+ char fd_l_[PADL_(l_int)]; l_int fd; char fd_r_[PADR_(l_int)];
+ char statbuf_l_[PADL_(struct l_stat64 *)]; struct l_stat64 * statbuf; char statbuf_r_[PADR_(struct l_stat64 *)];
+};
+struct linux_lchown_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char uid_l_[PADL_(l_uid_t)]; l_uid_t uid; char uid_r_[PADR_(l_uid_t)];
+ char gid_l_[PADL_(l_gid_t)]; l_gid_t gid; char gid_r_[PADR_(l_gid_t)];
+};
+struct linux_getuid_args {
+ register_t dummy;
+};
+struct linux_getgid_args {
+ register_t dummy;
+};
+struct linux_getgroups_args {
+ char gidsetsize_l_[PADL_(l_int)]; l_int gidsetsize; char gidsetsize_r_[PADR_(l_int)];
+ char grouplist_l_[PADL_(l_gid_t *)]; l_gid_t * grouplist; char grouplist_r_[PADR_(l_gid_t *)];
+};
+struct linux_setgroups_args {
+ char gidsetsize_l_[PADL_(l_int)]; l_int gidsetsize; char gidsetsize_r_[PADR_(l_int)];
+ char grouplist_l_[PADL_(l_gid_t *)]; l_gid_t * grouplist; char grouplist_r_[PADR_(l_gid_t *)];
+};
+struct linux_chown_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char uid_l_[PADL_(l_uid_t)]; l_uid_t uid; char uid_r_[PADR_(l_uid_t)];
+ char gid_l_[PADL_(l_gid_t)]; l_gid_t gid; char gid_r_[PADR_(l_gid_t)];
+};
+struct linux_setfsuid_args {
+ char uid_l_[PADL_(l_uid_t)]; l_uid_t uid; char uid_r_[PADR_(l_uid_t)];
+};
+struct linux_setfsgid_args {
+ char gid_l_[PADL_(l_gid_t)]; l_gid_t gid; char gid_r_[PADR_(l_gid_t)];
+};
+struct linux_pivot_root_args {
+ char new_root_l_[PADL_(char *)]; char * new_root; char new_root_r_[PADR_(char *)];
+ char put_old_l_[PADL_(char *)]; char * put_old; char put_old_r_[PADR_(char *)];
+};
+struct linux_mincore_args {
+ char start_l_[PADL_(l_ulong)]; l_ulong start; char start_r_[PADR_(l_ulong)];
+ char len_l_[PADL_(l_size_t)]; l_size_t len; char len_r_[PADR_(l_size_t)];
+ char vec_l_[PADL_(u_char *)]; u_char * vec; char vec_r_[PADR_(u_char *)];
+};
+struct linux_getdents64_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char dirent_l_[PADL_(void *)]; void * dirent; char dirent_r_[PADR_(void *)];
+ char count_l_[PADL_(l_uint)]; l_uint count; char count_r_[PADR_(l_uint)];
+};
+struct linux_fcntl64_args {
+ char fd_l_[PADL_(l_uint)]; l_uint fd; char fd_r_[PADR_(l_uint)];
+ char cmd_l_[PADL_(l_uint)]; l_uint cmd; char cmd_r_[PADR_(l_uint)];
+ char arg_l_[PADL_(uintptr_t)]; uintptr_t arg; char arg_r_[PADR_(uintptr_t)];
+};
+struct linux_gettid_args {
+ register_t dummy;
+};
+struct linux_setxattr_args {
+ register_t dummy;
+};
+struct linux_lsetxattr_args {
+ register_t dummy;
+};
+struct linux_fsetxattr_args {
+ register_t dummy;
+};
+struct linux_getxattr_args {
+ register_t dummy;
+};
+struct linux_lgetxattr_args {
+ register_t dummy;
+};
+struct linux_fgetxattr_args {
+ register_t dummy;
+};
+struct linux_listxattr_args {
+ register_t dummy;
+};
+struct linux_llistxattr_args {
+ register_t dummy;
+};
+struct linux_flistxattr_args {
+ register_t dummy;
+};
+struct linux_removexattr_args {
+ register_t dummy;
+};
+struct linux_lremovexattr_args {
+ register_t dummy;
+};
+struct linux_fremovexattr_args {
+ register_t dummy;
+};
+struct linux_tkill_args {
+ char tid_l_[PADL_(int)]; int tid; char tid_r_[PADR_(int)];
+ char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)];
+};
+struct linux_sys_futex_args {
+ char uaddr_l_[PADL_(void *)]; void * uaddr; char uaddr_r_[PADR_(void *)];
+ char op_l_[PADL_(int)]; int op; char op_r_[PADR_(int)];
+ char val_l_[PADL_(uint32_t)]; uint32_t val; char val_r_[PADR_(uint32_t)];
+ char timeout_l_[PADL_(struct l_timespec *)]; struct l_timespec * timeout; char timeout_r_[PADR_(struct l_timespec *)];
+ char uaddr2_l_[PADL_(uint32_t *)]; uint32_t * uaddr2; char uaddr2_r_[PADR_(uint32_t *)];
+ char val3_l_[PADL_(uint32_t)]; uint32_t val3; char val3_r_[PADR_(uint32_t)];
+};
+struct linux_sched_setaffinity_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char len_l_[PADL_(l_uint)]; l_uint len; char len_r_[PADR_(l_uint)];
+ char user_mask_ptr_l_[PADL_(l_ulong *)]; l_ulong * user_mask_ptr; char user_mask_ptr_r_[PADR_(l_ulong *)];
+};
+struct linux_sched_getaffinity_args {
+ char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)];
+ char len_l_[PADL_(l_uint)]; l_uint len; char len_r_[PADR_(l_uint)];
+ char user_mask_ptr_l_[PADL_(l_ulong *)]; l_ulong * user_mask_ptr; char user_mask_ptr_r_[PADR_(l_ulong *)];
+};
+struct linux_set_thread_area_args {
+ char desc_l_[PADL_(struct l_user_desc *)]; struct l_user_desc * desc; char desc_r_[PADR_(struct l_user_desc *)];
+};
+struct linux_fadvise64_args {
+ char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+ char offset_l_[PADL_(l_loff_t)]; l_loff_t offset; char offset_r_[PADR_(l_loff_t)];
+ char len_l_[PADL_(l_size_t)]; l_size_t len; char len_r_[PADR_(l_size_t)];
+ char advice_l_[PADL_(int)]; int advice; char advice_r_[PADR_(int)];
+};
+struct linux_exit_group_args {
+ char error_code_l_[PADL_(int)]; int error_code; char error_code_r_[PADR_(int)];
+};
+struct linux_lookup_dcookie_args {
+ register_t dummy;
+};
+struct linux_epoll_create_args {
+ register_t dummy;
+};
+struct linux_epoll_ctl_args {
+ register_t dummy;
+};
+struct linux_epoll_wait_args {
+ register_t dummy;
+};
+struct linux_remap_file_pages_args {
+ register_t dummy;
+};
+struct linux_set_tid_address_args {
+ char tidptr_l_[PADL_(int *)]; int * tidptr; char tidptr_r_[PADR_(int *)];
+};
+struct linux_timer_create_args {
+ register_t dummy;
+};
+struct linux_timer_settime_args {
+ register_t dummy;
+};
+struct linux_timer_gettime_args {
+ register_t dummy;
+};
+struct linux_timer_getoverrun_args {
+ register_t dummy;
+};
+struct linux_timer_delete_args {
+ register_t dummy;
+};
+struct linux_clock_settime_args {
+ char which_l_[PADL_(clockid_t)]; clockid_t which; char which_r_[PADR_(clockid_t)];
+ char tp_l_[PADL_(struct l_timespec *)]; struct l_timespec * tp; char tp_r_[PADR_(struct l_timespec *)];
+};
+struct linux_clock_gettime_args {
+ char which_l_[PADL_(clockid_t)]; clockid_t which; char which_r_[PADR_(clockid_t)];
+ char tp_l_[PADL_(struct l_timespec *)]; struct l_timespec * tp; char tp_r_[PADR_(struct l_timespec *)];
+};
+struct linux_clock_getres_args {
+ char which_l_[PADL_(clockid_t)]; clockid_t which; char which_r_[PADR_(clockid_t)];
+ char tp_l_[PADL_(struct l_timespec *)]; struct l_timespec * tp; char tp_r_[PADR_(struct l_timespec *)];
+};
+struct linux_clock_nanosleep_args {
+ char which_l_[PADL_(clockid_t)]; clockid_t which; char which_r_[PADR_(clockid_t)];
+ char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+ char rqtp_l_[PADL_(struct l_timespec *)]; struct l_timespec * rqtp; char rqtp_r_[PADR_(struct l_timespec *)];
+ char rmtp_l_[PADL_(struct l_timespec *)]; struct l_timespec * rmtp; char rmtp_r_[PADR_(struct l_timespec *)];
+};
+struct linux_statfs64_args {
+ char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+ char bufsize_l_[PADL_(size_t)]; size_t bufsize; char bufsize_r_[PADR_(size_t)];
+ char buf_l_[PADL_(struct l_statfs64_buf *)]; struct l_statfs64_buf * buf; char buf_r_[PADR_(struct l_statfs64_buf *)];
+};
+struct linux_fstatfs64_args {
+ register_t dummy;
+};
+struct linux_tgkill_args {
+ char tgid_l_[PADL_(int)]; int tgid; char tgid_r_[PADR_(int)];
+ char pid_l_[PADL_(int)]; int pid; char pid_r_[PADR_(int)];
+ char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)];
+};
+struct linux_utimes_args {
+ char fname_l_[PADL_(char *)]; char * fname; char fname_r_[PADR_(char *)];
+ char tptr_l_[PADL_(struct l_timeval *)]; struct l_timeval * tptr; char tptr_r_[PADR_(struct l_timeval *)];
+};
+struct linux_fadvise64_64_args {
+ char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+ char offset_l_[PADL_(l_loff_t)]; l_loff_t offset; char offset_r_[PADR_(l_loff_t)];
+ char len_l_[PADL_(l_loff_t)]; l_loff_t len; char len_r_[PADR_(l_loff_t)];
+ char advice_l_[PADL_(int)]; int advice; char advice_r_[PADR_(int)];
+};
+struct linux_mbind_args {
+ register_t dummy;
+};
+struct linux_get_mempolicy_args {
+ register_t dummy;
+};
+struct linux_set_mempolicy_args {
+ register_t dummy;
+};
+struct linux_mq_open_args {
+ register_t dummy;
+};
+struct linux_mq_unlink_args {
+ register_t dummy;
+};
+struct linux_mq_timedsend_args {
+ register_t dummy;
+};
+struct linux_mq_timedreceive_args {
+ register_t dummy;
+};
+struct linux_mq_notify_args {
+ register_t dummy;
+};
+struct linux_mq_getsetattr_args {
+ register_t dummy;
+};
+struct linux_kexec_load_args {
+ register_t dummy;
+};
+struct linux_waitid_args {
+ register_t dummy;
+};
+struct linux_add_key_args {
+ register_t dummy;
+};
+struct linux_request_key_args {
+ register_t dummy;
+};
+struct linux_keyctl_args {
+ register_t dummy;
+};
+struct linux_ioprio_set_args {
+ register_t dummy;
+};
+struct linux_ioprio_get_args {
+ register_t dummy;
+};
+struct linux_inotify_init_args {
+ register_t dummy;
+};
+struct linux_inotify_add_watch_args {
+ register_t dummy;
+};
+struct linux_inotify_rm_watch_args {
+ register_t dummy;
+};
+struct linux_migrate_pages_args {
+ register_t dummy;
+};
+struct linux_openat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+};
+struct linux_mkdirat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+};
+struct linux_mknodat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)];
+ char dev_l_[PADL_(l_uint)]; l_uint dev; char dev_r_[PADR_(l_uint)];
+};
+struct linux_fchownat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char uid_l_[PADL_(l_uid16_t)]; l_uid16_t uid; char uid_r_[PADR_(l_uid16_t)];
+ char gid_l_[PADL_(l_gid16_t)]; l_gid16_t gid; char gid_r_[PADR_(l_gid16_t)];
+ char flag_l_[PADL_(l_int)]; l_int flag; char flag_r_[PADR_(l_int)];
+};
+struct linux_futimesat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(char *)]; char * filename; char filename_r_[PADR_(char *)];
+ char utimes_l_[PADL_(struct l_timeval *)]; struct l_timeval * utimes; char utimes_r_[PADR_(struct l_timeval *)];
+};
+struct linux_fstatat64_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(char *)]; char * pathname; char pathname_r_[PADR_(char *)];
+ char statbuf_l_[PADL_(struct l_stat64 *)]; struct l_stat64 * statbuf; char statbuf_r_[PADR_(struct l_stat64 *)];
+ char flag_l_[PADL_(l_int)]; l_int flag; char flag_r_[PADR_(l_int)];
+};
+struct linux_unlinkat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char pathname_l_[PADL_(const char *)]; const char * pathname; char pathname_r_[PADR_(const char *)];
+ char flag_l_[PADL_(l_int)]; l_int flag; char flag_r_[PADR_(l_int)];
+};
+struct linux_renameat_args {
+ char olddfd_l_[PADL_(l_int)]; l_int olddfd; char olddfd_r_[PADR_(l_int)];
+ char oldname_l_[PADL_(const char *)]; const char * oldname; char oldname_r_[PADR_(const char *)];
+ char newdfd_l_[PADL_(l_int)]; l_int newdfd; char newdfd_r_[PADR_(l_int)];
+ char newname_l_[PADL_(const char *)]; const char * newname; char newname_r_[PADR_(const char *)];
+};
+struct linux_linkat_args {
+ char olddfd_l_[PADL_(l_int)]; l_int olddfd; char olddfd_r_[PADR_(l_int)];
+ char oldname_l_[PADL_(const char *)]; const char * oldname; char oldname_r_[PADR_(const char *)];
+ char newdfd_l_[PADL_(l_int)]; l_int newdfd; char newdfd_r_[PADR_(l_int)];
+ char newname_l_[PADL_(const char *)]; const char * newname; char newname_r_[PADR_(const char *)];
+ char flag_l_[PADL_(l_int)]; l_int flag; char flag_r_[PADR_(l_int)];
+};
+struct linux_symlinkat_args {
+ char oldname_l_[PADL_(const char *)]; const char * oldname; char oldname_r_[PADR_(const char *)];
+ char newdfd_l_[PADL_(l_int)]; l_int newdfd; char newdfd_r_[PADR_(l_int)];
+ char newname_l_[PADL_(const char *)]; const char * newname; char newname_r_[PADR_(const char *)];
+};
+struct linux_readlinkat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)];
+ char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)];
+ char bufsiz_l_[PADL_(l_int)]; l_int bufsiz; char bufsiz_r_[PADR_(l_int)];
+};
+struct linux_fchmodat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char mode_l_[PADL_(l_mode_t)]; l_mode_t mode; char mode_r_[PADR_(l_mode_t)];
+};
+struct linux_faccessat_args {
+ char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)];
+ char filename_l_[PADL_(const char *)]; const char * filename; char filename_r_[PADR_(const char *)];
+ char amode_l_[PADL_(l_int)]; l_int amode; char amode_r_[PADR_(l_int)];
+ char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)];
+};
+struct linux_pselect6_args {
+ register_t dummy;
+};
+struct linux_ppoll_args {
+ register_t dummy;
+};
+struct linux_unshare_args {
+ register_t dummy;
+};
+struct linux_set_robust_list_args {
+ char head_l_[PADL_(struct linux_robust_list_head *)]; struct linux_robust_list_head * head; char head_r_[PADR_(struct linux_robust_list_head *)];
+ char len_l_[PADL_(l_size_t)]; l_size_t len; char len_r_[PADR_(l_size_t)];
+};
+struct linux_get_robust_list_args {
+ char pid_l_[PADL_(l_int)]; l_int pid; char pid_r_[PADR_(l_int)];
+ char head_l_[PADL_(struct linux_robust_list_head *)]; struct linux_robust_list_head * head; char head_r_[PADR_(struct linux_robust_list_head *)];
+ char len_l_[PADL_(l_size_t *)]; l_size_t * len; char len_r_[PADR_(l_size_t *)];
+};
+struct linux_splice_args {
+ register_t dummy;
+};
+struct linux_sync_file_range_args {
+ register_t dummy;
+};
+struct linux_tee_args {
+ register_t dummy;
+};
+struct linux_vmsplice_args {
+ register_t dummy;
+};
+struct linux_move_pages_args {
+ register_t dummy;
+};
+struct linux_getcpu_args {
+ register_t dummy;
+};
+struct linux_epoll_pwait_args {
+ register_t dummy;
+};
+struct linux_utimensat_args {
+ register_t dummy;
+};
+struct linux_signalfd_args {
+ register_t dummy;
+};
+struct linux_timerfd_create_args {
+ register_t dummy;
+};
+struct linux_eventfd_args {
+ register_t dummy;
+};
+struct linux_fallocate_args {
+ register_t dummy;
+};
+struct linux_timerfd_settime_args {
+ register_t dummy;
+};
+struct linux_timerfd_gettime_args {
+ register_t dummy;
+};
+struct linux_signalfd4_args {
+ register_t dummy;
+};
+struct linux_eventfd2_args {
+ register_t dummy;
+};
+struct linux_epoll_create1_args {
+ register_t dummy;
+};
+struct linux_dup3_args {
+ register_t dummy;
+};
+struct linux_pipe2_args {
+ char pipefds_l_[PADL_(l_int *)]; l_int * pipefds; char pipefds_r_[PADR_(l_int *)];
+ char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)];
+};
+struct linux_inotify_init1_args {
+ register_t dummy;
+};
+struct linux_preadv_args {
+ register_t dummy;
+};
+struct linux_pwritev_args {
+ register_t dummy;
+};
+struct linux_rt_tsigqueueinfo_args {
+ register_t dummy;
+};
+struct linux_perf_event_open_args {
+ register_t dummy;
+};
+struct linux_recvmmsg_args {
+ register_t dummy;
+};
+struct linux_fanotify_init_args {
+ register_t dummy;
+};
+struct linux_fanotify_mark_args {
+ register_t dummy;
+};
+struct linux_prlimit64_args {
+ register_t dummy;
+};
+struct linux_name_to_handle_at_args {
+ register_t dummy;
+};
+struct linux_open_by_handle_at_args {
+ register_t dummy;
+};
+struct linux_clock_adjtime_args {
+ register_t dummy;
+};
+struct linux_syncfs_args {
+ register_t dummy;
+};
+struct linux_sendmmsg_args {
+ register_t dummy;
+};
+struct linux_setns_args {
+ register_t dummy;
+};
+struct linux_process_vm_readv_args {
+ register_t dummy;
+};
+struct linux_process_vm_writev_args {
+ register_t dummy;
+};
+#define nosys linux_nosys
+int linux_fork(struct thread *, struct linux_fork_args *);
+int linux_open(struct thread *, struct linux_open_args *);
+int linux_waitpid(struct thread *, struct linux_waitpid_args *);
+int linux_creat(struct thread *, struct linux_creat_args *);
+int linux_link(struct thread *, struct linux_link_args *);
+int linux_unlink(struct thread *, struct linux_unlink_args *);
+int linux_execve(struct thread *, struct linux_execve_args *);
+int linux_chdir(struct thread *, struct linux_chdir_args *);
+int linux_time(struct thread *, struct linux_time_args *);
+int linux_mknod(struct thread *, struct linux_mknod_args *);
+int linux_chmod(struct thread *, struct linux_chmod_args *);
+int linux_lchown16(struct thread *, struct linux_lchown16_args *);
+int linux_stat(struct thread *, struct linux_stat_args *);
+int linux_lseek(struct thread *, struct linux_lseek_args *);
+int linux_getpid(struct thread *, struct linux_getpid_args *);
+int linux_mount(struct thread *, struct linux_mount_args *);
+int linux_oldumount(struct thread *, struct linux_oldumount_args *);
+int linux_setuid16(struct thread *, struct linux_setuid16_args *);
+int linux_getuid16(struct thread *, struct linux_getuid16_args *);
+int linux_stime(struct thread *, struct linux_stime_args *);
+int linux_ptrace(struct thread *, struct linux_ptrace_args *);
+int linux_alarm(struct thread *, struct linux_alarm_args *);
+int linux_pause(struct thread *, struct linux_pause_args *);
+int linux_utime(struct thread *, struct linux_utime_args *);
+int linux_access(struct thread *, struct linux_access_args *);
+int linux_nice(struct thread *, struct linux_nice_args *);
+int linux_kill(struct thread *, struct linux_kill_args *);
+int linux_rename(struct thread *, struct linux_rename_args *);
+int linux_mkdir(struct thread *, struct linux_mkdir_args *);
+int linux_rmdir(struct thread *, struct linux_rmdir_args *);
+int linux_pipe(struct thread *, struct linux_pipe_args *);
+int linux_times(struct thread *, struct linux_times_args *);
+int linux_brk(struct thread *, struct linux_brk_args *);
+int linux_setgid16(struct thread *, struct linux_setgid16_args *);
+int linux_getgid16(struct thread *, struct linux_getgid16_args *);
+int linux_signal(struct thread *, struct linux_signal_args *);
+int linux_geteuid16(struct thread *, struct linux_geteuid16_args *);
+int linux_getegid16(struct thread *, struct linux_getegid16_args *);
+int linux_umount(struct thread *, struct linux_umount_args *);
+int linux_ioctl(struct thread *, struct linux_ioctl_args *);
+int linux_fcntl(struct thread *, struct linux_fcntl_args *);
+int linux_olduname(struct thread *, struct linux_olduname_args *);
+int linux_ustat(struct thread *, struct linux_ustat_args *);
+int linux_getppid(struct thread *, struct linux_getppid_args *);
+int linux_sigaction(struct thread *, struct linux_sigaction_args *);
+int linux_sgetmask(struct thread *, struct linux_sgetmask_args *);
+int linux_ssetmask(struct thread *, struct linux_ssetmask_args *);
+int linux_setreuid16(struct thread *, struct linux_setreuid16_args *);
+int linux_setregid16(struct thread *, struct linux_setregid16_args *);
+int linux_sigsuspend(struct thread *, struct linux_sigsuspend_args *);
+int linux_sigpending(struct thread *, struct linux_sigpending_args *);
+int linux_sethostname(struct thread *, struct linux_sethostname_args *);
+int linux_setrlimit(struct thread *, struct linux_setrlimit_args *);
+int linux_old_getrlimit(struct thread *, struct linux_old_getrlimit_args *);
+int linux_getrusage(struct thread *, struct linux_getrusage_args *);
+int linux_gettimeofday(struct thread *, struct linux_gettimeofday_args *);
+int linux_settimeofday(struct thread *, struct linux_settimeofday_args *);
+int linux_getgroups16(struct thread *, struct linux_getgroups16_args *);
+int linux_setgroups16(struct thread *, struct linux_setgroups16_args *);
+int linux_old_select(struct thread *, struct linux_old_select_args *);
+int linux_symlink(struct thread *, struct linux_symlink_args *);
+int linux_lstat(struct thread *, struct linux_lstat_args *);
+int linux_readlink(struct thread *, struct linux_readlink_args *);
+int linux_reboot(struct thread *, struct linux_reboot_args *);
+int linux_readdir(struct thread *, struct linux_readdir_args *);
+int linux_mmap(struct thread *, struct linux_mmap_args *);
+int linux_truncate(struct thread *, struct linux_truncate_args *);
+int linux_ftruncate(struct thread *, struct linux_ftruncate_args *);
+int linux_getpriority(struct thread *, struct linux_getpriority_args *);
+int linux_statfs(struct thread *, struct linux_statfs_args *);
+int linux_fstatfs(struct thread *, struct linux_fstatfs_args *);
+int linux_socketcall(struct thread *, struct linux_socketcall_args *);
+int linux_syslog(struct thread *, struct linux_syslog_args *);
+int linux_setitimer(struct thread *, struct linux_setitimer_args *);
+int linux_getitimer(struct thread *, struct linux_getitimer_args *);
+int linux_newstat(struct thread *, struct linux_newstat_args *);
+int linux_newlstat(struct thread *, struct linux_newlstat_args *);
+int linux_newfstat(struct thread *, struct linux_newfstat_args *);
+int linux_uname(struct thread *, struct linux_uname_args *);
+int linux_iopl(struct thread *, struct linux_iopl_args *);
+int linux_vhangup(struct thread *, struct linux_vhangup_args *);
+int linux_wait4(struct thread *, struct linux_wait4_args *);
+int linux_swapoff(struct thread *, struct linux_swapoff_args *);
+int linux_sysinfo(struct thread *, struct linux_sysinfo_args *);
+int linux_ipc(struct thread *, struct linux_ipc_args *);
+int linux_sigreturn(struct thread *, struct linux_sigreturn_args *);
+int linux_clone(struct thread *, struct linux_clone_args *);
+int linux_setdomainname(struct thread *, struct linux_setdomainname_args *);
+int linux_newuname(struct thread *, struct linux_newuname_args *);
+int linux_adjtimex(struct thread *, struct linux_adjtimex_args *);
+int linux_mprotect(struct thread *, struct linux_mprotect_args *);
+int linux_sigprocmask(struct thread *, struct linux_sigprocmask_args *);
+int linux_create_module(struct thread *, struct linux_create_module_args *);
+int linux_init_module(struct thread *, struct linux_init_module_args *);
+int linux_delete_module(struct thread *, struct linux_delete_module_args *);
+int linux_get_kernel_syms(struct thread *, struct linux_get_kernel_syms_args *);
+int linux_quotactl(struct thread *, struct linux_quotactl_args *);
+int linux_bdflush(struct thread *, struct linux_bdflush_args *);
+int linux_sysfs(struct thread *, struct linux_sysfs_args *);
+int linux_personality(struct thread *, struct linux_personality_args *);
+int linux_setfsuid16(struct thread *, struct linux_setfsuid16_args *);
+int linux_setfsgid16(struct thread *, struct linux_setfsgid16_args *);
+int linux_llseek(struct thread *, struct linux_llseek_args *);
+int linux_getdents(struct thread *, struct linux_getdents_args *);
+int linux_select(struct thread *, struct linux_select_args *);
+int linux_msync(struct thread *, struct linux_msync_args *);
+int linux_readv(struct thread *, struct linux_readv_args *);
+int linux_writev(struct thread *, struct linux_writev_args *);
+int linux_getsid(struct thread *, struct linux_getsid_args *);
+int linux_fdatasync(struct thread *, struct linux_fdatasync_args *);
+int linux_sysctl(struct thread *, struct linux_sysctl_args *);
+int linux_sched_setscheduler(struct thread *, struct linux_sched_setscheduler_args *);
+int linux_sched_getscheduler(struct thread *, struct linux_sched_getscheduler_args *);
+int linux_sched_get_priority_max(struct thread *, struct linux_sched_get_priority_max_args *);
+int linux_sched_get_priority_min(struct thread *, struct linux_sched_get_priority_min_args *);
+int linux_sched_rr_get_interval(struct thread *, struct linux_sched_rr_get_interval_args *);
+int linux_nanosleep(struct thread *, struct linux_nanosleep_args *);
+int linux_mremap(struct thread *, struct linux_mremap_args *);
+int linux_setresuid16(struct thread *, struct linux_setresuid16_args *);
+int linux_getresuid16(struct thread *, struct linux_getresuid16_args *);
+int linux_query_module(struct thread *, struct linux_query_module_args *);
+int linux_nfsservctl(struct thread *, struct linux_nfsservctl_args *);
+int linux_setresgid16(struct thread *, struct linux_setresgid16_args *);
+int linux_getresgid16(struct thread *, struct linux_getresgid16_args *);
+int linux_prctl(struct thread *, struct linux_prctl_args *);
+int linux_rt_sigreturn(struct thread *, struct linux_rt_sigreturn_args *);
+int linux_rt_sigaction(struct thread *, struct linux_rt_sigaction_args *);
+int linux_rt_sigprocmask(struct thread *, struct linux_rt_sigprocmask_args *);
+int linux_rt_sigpending(struct thread *, struct linux_rt_sigpending_args *);
+int linux_rt_sigtimedwait(struct thread *, struct linux_rt_sigtimedwait_args *);
+int linux_rt_sigqueueinfo(struct thread *, struct linux_rt_sigqueueinfo_args *);
+int linux_rt_sigsuspend(struct thread *, struct linux_rt_sigsuspend_args *);
+int linux_pread(struct thread *, struct linux_pread_args *);
+int linux_pwrite(struct thread *, struct linux_pwrite_args *);
+int linux_chown16(struct thread *, struct linux_chown16_args *);
+int linux_getcwd(struct thread *, struct linux_getcwd_args *);
+int linux_capget(struct thread *, struct linux_capget_args *);
+int linux_capset(struct thread *, struct linux_capset_args *);
+int linux_sigaltstack(struct thread *, struct linux_sigaltstack_args *);
+int linux_sendfile(struct thread *, struct linux_sendfile_args *);
+int linux_vfork(struct thread *, struct linux_vfork_args *);
+int linux_getrlimit(struct thread *, struct linux_getrlimit_args *);
+int linux_mmap2(struct thread *, struct linux_mmap2_args *);
+int linux_truncate64(struct thread *, struct linux_truncate64_args *);
+int linux_ftruncate64(struct thread *, struct linux_ftruncate64_args *);
+int linux_stat64(struct thread *, struct linux_stat64_args *);
+int linux_lstat64(struct thread *, struct linux_lstat64_args *);
+int linux_fstat64(struct thread *, struct linux_fstat64_args *);
+int linux_lchown(struct thread *, struct linux_lchown_args *);
+int linux_getuid(struct thread *, struct linux_getuid_args *);
+int linux_getgid(struct thread *, struct linux_getgid_args *);
+int linux_getgroups(struct thread *, struct linux_getgroups_args *);
+int linux_setgroups(struct thread *, struct linux_setgroups_args *);
+int linux_chown(struct thread *, struct linux_chown_args *);
+int linux_setfsuid(struct thread *, struct linux_setfsuid_args *);
+int linux_setfsgid(struct thread *, struct linux_setfsgid_args *);
+int linux_pivot_root(struct thread *, struct linux_pivot_root_args *);
+int linux_mincore(struct thread *, struct linux_mincore_args *);
+int linux_getdents64(struct thread *, struct linux_getdents64_args *);
+int linux_fcntl64(struct thread *, struct linux_fcntl64_args *);
+int linux_gettid(struct thread *, struct linux_gettid_args *);
+int linux_setxattr(struct thread *, struct linux_setxattr_args *);
+int linux_lsetxattr(struct thread *, struct linux_lsetxattr_args *);
+int linux_fsetxattr(struct thread *, struct linux_fsetxattr_args *);
+int linux_getxattr(struct thread *, struct linux_getxattr_args *);
+int linux_lgetxattr(struct thread *, struct linux_lgetxattr_args *);
+int linux_fgetxattr(struct thread *, struct linux_fgetxattr_args *);
+int linux_listxattr(struct thread *, struct linux_listxattr_args *);
+int linux_llistxattr(struct thread *, struct linux_llistxattr_args *);
+int linux_flistxattr(struct thread *, struct linux_flistxattr_args *);
+int linux_removexattr(struct thread *, struct linux_removexattr_args *);
+int linux_lremovexattr(struct thread *, struct linux_lremovexattr_args *);
+int linux_fremovexattr(struct thread *, struct linux_fremovexattr_args *);
+int linux_tkill(struct thread *, struct linux_tkill_args *);
+int linux_sys_futex(struct thread *, struct linux_sys_futex_args *);
+int linux_sched_setaffinity(struct thread *, struct linux_sched_setaffinity_args *);
+int linux_sched_getaffinity(struct thread *, struct linux_sched_getaffinity_args *);
+int linux_set_thread_area(struct thread *, struct linux_set_thread_area_args *);
+int linux_fadvise64(struct thread *, struct linux_fadvise64_args *);
+int linux_exit_group(struct thread *, struct linux_exit_group_args *);
+int linux_lookup_dcookie(struct thread *, struct linux_lookup_dcookie_args *);
+int linux_epoll_create(struct thread *, struct linux_epoll_create_args *);
+int linux_epoll_ctl(struct thread *, struct linux_epoll_ctl_args *);
+int linux_epoll_wait(struct thread *, struct linux_epoll_wait_args *);
+int linux_remap_file_pages(struct thread *, struct linux_remap_file_pages_args *);
+int linux_set_tid_address(struct thread *, struct linux_set_tid_address_args *);
+int linux_timer_create(struct thread *, struct linux_timer_create_args *);
+int linux_timer_settime(struct thread *, struct linux_timer_settime_args *);
+int linux_timer_gettime(struct thread *, struct linux_timer_gettime_args *);
+int linux_timer_getoverrun(struct thread *, struct linux_timer_getoverrun_args *);
+int linux_timer_delete(struct thread *, struct linux_timer_delete_args *);
+int linux_clock_settime(struct thread *, struct linux_clock_settime_args *);
+int linux_clock_gettime(struct thread *, struct linux_clock_gettime_args *);
+int linux_clock_getres(struct thread *, struct linux_clock_getres_args *);
+int linux_clock_nanosleep(struct thread *, struct linux_clock_nanosleep_args *);
+int linux_statfs64(struct thread *, struct linux_statfs64_args *);
+int linux_fstatfs64(struct thread *, struct linux_fstatfs64_args *);
+int linux_tgkill(struct thread *, struct linux_tgkill_args *);
+int linux_utimes(struct thread *, struct linux_utimes_args *);
+int linux_fadvise64_64(struct thread *, struct linux_fadvise64_64_args *);
+int linux_mbind(struct thread *, struct linux_mbind_args *);
+int linux_get_mempolicy(struct thread *, struct linux_get_mempolicy_args *);
+int linux_set_mempolicy(struct thread *, struct linux_set_mempolicy_args *);
+int linux_mq_open(struct thread *, struct linux_mq_open_args *);
+int linux_mq_unlink(struct thread *, struct linux_mq_unlink_args *);
+int linux_mq_timedsend(struct thread *, struct linux_mq_timedsend_args *);
+int linux_mq_timedreceive(struct thread *, struct linux_mq_timedreceive_args *);
+int linux_mq_notify(struct thread *, struct linux_mq_notify_args *);
+int linux_mq_getsetattr(struct thread *, struct linux_mq_getsetattr_args *);
+int linux_kexec_load(struct thread *, struct linux_kexec_load_args *);
+int linux_waitid(struct thread *, struct linux_waitid_args *);
+int linux_add_key(struct thread *, struct linux_add_key_args *);
+int linux_request_key(struct thread *, struct linux_request_key_args *);
+int linux_keyctl(struct thread *, struct linux_keyctl_args *);
+int linux_ioprio_set(struct thread *, struct linux_ioprio_set_args *);
+int linux_ioprio_get(struct thread *, struct linux_ioprio_get_args *);
+int linux_inotify_init(struct thread *, struct linux_inotify_init_args *);
+int linux_inotify_add_watch(struct thread *, struct linux_inotify_add_watch_args *);
+int linux_inotify_rm_watch(struct thread *, struct linux_inotify_rm_watch_args *);
+int linux_migrate_pages(struct thread *, struct linux_migrate_pages_args *);
+int linux_openat(struct thread *, struct linux_openat_args *);
+int linux_mkdirat(struct thread *, struct linux_mkdirat_args *);
+int linux_mknodat(struct thread *, struct linux_mknodat_args *);
+int linux_fchownat(struct thread *, struct linux_fchownat_args *);
+int linux_futimesat(struct thread *, struct linux_futimesat_args *);
+int linux_fstatat64(struct thread *, struct linux_fstatat64_args *);
+int linux_unlinkat(struct thread *, struct linux_unlinkat_args *);
+int linux_renameat(struct thread *, struct linux_renameat_args *);
+int linux_linkat(struct thread *, struct linux_linkat_args *);
+int linux_symlinkat(struct thread *, struct linux_symlinkat_args *);
+int linux_readlinkat(struct thread *, struct linux_readlinkat_args *);
+int linux_fchmodat(struct thread *, struct linux_fchmodat_args *);
+int linux_faccessat(struct thread *, struct linux_faccessat_args *);
+int linux_pselect6(struct thread *, struct linux_pselect6_args *);
+int linux_ppoll(struct thread *, struct linux_ppoll_args *);
+int linux_unshare(struct thread *, struct linux_unshare_args *);
+int linux_set_robust_list(struct thread *, struct linux_set_robust_list_args *);
+int linux_get_robust_list(struct thread *, struct linux_get_robust_list_args *);
+int linux_splice(struct thread *, struct linux_splice_args *);
+int linux_sync_file_range(struct thread *, struct linux_sync_file_range_args *);
+int linux_tee(struct thread *, struct linux_tee_args *);
+int linux_vmsplice(struct thread *, struct linux_vmsplice_args *);
+int linux_move_pages(struct thread *, struct linux_move_pages_args *);
+int linux_getcpu(struct thread *, struct linux_getcpu_args *);
+int linux_epoll_pwait(struct thread *, struct linux_epoll_pwait_args *);
+int linux_utimensat(struct thread *, struct linux_utimensat_args *);
+int linux_signalfd(struct thread *, struct linux_signalfd_args *);
+int linux_timerfd_create(struct thread *, struct linux_timerfd_create_args *);
+int linux_eventfd(struct thread *, struct linux_eventfd_args *);
+int linux_fallocate(struct thread *, struct linux_fallocate_args *);
+int linux_timerfd_settime(struct thread *, struct linux_timerfd_settime_args *);
+int linux_timerfd_gettime(struct thread *, struct linux_timerfd_gettime_args *);
+int linux_signalfd4(struct thread *, struct linux_signalfd4_args *);
+int linux_eventfd2(struct thread *, struct linux_eventfd2_args *);
+int linux_epoll_create1(struct thread *, struct linux_epoll_create1_args *);
+int linux_dup3(struct thread *, struct linux_dup3_args *);
+int linux_pipe2(struct thread *, struct linux_pipe2_args *);
+int linux_inotify_init1(struct thread *, struct linux_inotify_init1_args *);
+int linux_preadv(struct thread *, struct linux_preadv_args *);
+int linux_pwritev(struct thread *, struct linux_pwritev_args *);
+int linux_rt_tsigqueueinfo(struct thread *, struct linux_rt_tsigqueueinfo_args *);
+int linux_perf_event_open(struct thread *, struct linux_perf_event_open_args *);
+int linux_recvmmsg(struct thread *, struct linux_recvmmsg_args *);
+int linux_fanotify_init(struct thread *, struct linux_fanotify_init_args *);
+int linux_fanotify_mark(struct thread *, struct linux_fanotify_mark_args *);
+int linux_prlimit64(struct thread *, struct linux_prlimit64_args *);
+int linux_name_to_handle_at(struct thread *, struct linux_name_to_handle_at_args *);
+int linux_open_by_handle_at(struct thread *, struct linux_open_by_handle_at_args *);
+int linux_clock_adjtime(struct thread *, struct linux_clock_adjtime_args *);
+int linux_syncfs(struct thread *, struct linux_syncfs_args *);
+int linux_sendmmsg(struct thread *, struct linux_sendmmsg_args *);
+int linux_setns(struct thread *, struct linux_setns_args *);
+int linux_process_vm_readv(struct thread *, struct linux_process_vm_readv_args *);
+int linux_process_vm_writev(struct thread *, struct linux_process_vm_writev_args *);
+
+#ifdef COMPAT_43
+
+#define nosys linux_nosys
+
+#endif /* COMPAT_43 */
+
+
+#ifdef COMPAT_FREEBSD4
+
+#define nosys linux_nosys
+
+#endif /* COMPAT_FREEBSD4 */
+
+
+#ifdef COMPAT_FREEBSD6
+
+#define nosys linux_nosys
+
+#endif /* COMPAT_FREEBSD6 */
+
+
+#ifdef COMPAT_FREEBSD7
+
+#define nosys linux_nosys
+
+#endif /* COMPAT_FREEBSD7 */
+
+#define LINUX_SYS_AUE_linux_fork AUE_FORK
+#define LINUX_SYS_AUE_linux_open AUE_OPEN_RWTC
+#define LINUX_SYS_AUE_linux_waitpid AUE_WAIT4
+#define LINUX_SYS_AUE_linux_creat AUE_CREAT
+#define LINUX_SYS_AUE_linux_link AUE_LINK
+#define LINUX_SYS_AUE_linux_unlink AUE_UNLINK
+#define LINUX_SYS_AUE_linux_execve AUE_EXECVE
+#define LINUX_SYS_AUE_linux_chdir AUE_CHDIR
+#define LINUX_SYS_AUE_linux_time AUE_NULL
+#define LINUX_SYS_AUE_linux_mknod AUE_MKNOD
+#define LINUX_SYS_AUE_linux_chmod AUE_CHMOD
+#define LINUX_SYS_AUE_linux_lchown16 AUE_LCHOWN
+#define LINUX_SYS_AUE_linux_stat AUE_STAT
+#define LINUX_SYS_AUE_linux_lseek AUE_LSEEK
+#define LINUX_SYS_AUE_linux_getpid AUE_GETPID
+#define LINUX_SYS_AUE_linux_mount AUE_MOUNT
+#define LINUX_SYS_AUE_linux_oldumount AUE_UMOUNT
+#define LINUX_SYS_AUE_linux_setuid16 AUE_SETUID
+#define LINUX_SYS_AUE_linux_getuid16 AUE_GETUID
+#define LINUX_SYS_AUE_linux_stime AUE_SETTIMEOFDAY
+#define LINUX_SYS_AUE_linux_ptrace AUE_PTRACE
+#define LINUX_SYS_AUE_linux_alarm AUE_NULL
+#define LINUX_SYS_AUE_linux_pause AUE_NULL
+#define LINUX_SYS_AUE_linux_utime AUE_UTIME
+#define LINUX_SYS_AUE_linux_access AUE_ACCESS
+#define LINUX_SYS_AUE_linux_nice AUE_NICE
+#define LINUX_SYS_AUE_linux_kill AUE_KILL
+#define LINUX_SYS_AUE_linux_rename AUE_RENAME
+#define LINUX_SYS_AUE_linux_mkdir AUE_MKDIR
+#define LINUX_SYS_AUE_linux_rmdir AUE_RMDIR
+#define LINUX_SYS_AUE_linux_pipe AUE_PIPE
+#define LINUX_SYS_AUE_linux_times AUE_NULL
+#define LINUX_SYS_AUE_linux_brk AUE_NULL
+#define LINUX_SYS_AUE_linux_setgid16 AUE_SETGID
+#define LINUX_SYS_AUE_linux_getgid16 AUE_GETGID
+#define LINUX_SYS_AUE_linux_signal AUE_NULL
+#define LINUX_SYS_AUE_linux_geteuid16 AUE_GETEUID
+#define LINUX_SYS_AUE_linux_getegid16 AUE_GETEGID
+#define LINUX_SYS_AUE_linux_umount AUE_UMOUNT
+#define LINUX_SYS_AUE_linux_ioctl AUE_IOCTL
+#define LINUX_SYS_AUE_linux_fcntl AUE_FCNTL
+#define LINUX_SYS_AUE_linux_olduname AUE_NULL
+#define LINUX_SYS_AUE_linux_ustat AUE_NULL
+#define LINUX_SYS_AUE_linux_getppid AUE_GETPPID
+#define LINUX_SYS_AUE_linux_sigaction AUE_NULL
+#define LINUX_SYS_AUE_linux_sgetmask AUE_NULL
+#define LINUX_SYS_AUE_linux_ssetmask AUE_NULL
+#define LINUX_SYS_AUE_linux_setreuid16 AUE_SETREUID
+#define LINUX_SYS_AUE_linux_setregid16 AUE_SETREGID
+#define LINUX_SYS_AUE_linux_sigsuspend AUE_NULL
+#define LINUX_SYS_AUE_linux_sigpending AUE_NULL
+#define LINUX_SYS_AUE_linux_sethostname AUE_SYSCTL
+#define LINUX_SYS_AUE_linux_setrlimit AUE_SETRLIMIT
+#define LINUX_SYS_AUE_linux_old_getrlimit AUE_GETRLIMIT
+#define LINUX_SYS_AUE_linux_getrusage AUE_GETRUSAGE
+#define LINUX_SYS_AUE_linux_gettimeofday AUE_NULL
+#define LINUX_SYS_AUE_linux_settimeofday AUE_SETTIMEOFDAY
+#define LINUX_SYS_AUE_linux_getgroups16 AUE_GETGROUPS
+#define LINUX_SYS_AUE_linux_setgroups16 AUE_SETGROUPS
+#define LINUX_SYS_AUE_linux_old_select AUE_SELECT
+#define LINUX_SYS_AUE_linux_symlink AUE_SYMLINK
+#define LINUX_SYS_AUE_linux_lstat AUE_LSTAT
+#define LINUX_SYS_AUE_linux_readlink AUE_READLINK
+#define LINUX_SYS_AUE_linux_reboot AUE_REBOOT
+#define LINUX_SYS_AUE_linux_readdir AUE_GETDIRENTRIES
+#define LINUX_SYS_AUE_linux_mmap AUE_MMAP
+#define LINUX_SYS_AUE_linux_truncate AUE_TRUNCATE
+#define LINUX_SYS_AUE_linux_ftruncate AUE_FTRUNCATE
+#define LINUX_SYS_AUE_linux_getpriority AUE_GETPRIORITY
+#define LINUX_SYS_AUE_linux_statfs AUE_STATFS
+#define LINUX_SYS_AUE_linux_fstatfs AUE_FSTATFS
+#define LINUX_SYS_AUE_linux_socketcall AUE_NULL
+#define LINUX_SYS_AUE_linux_syslog AUE_NULL
+#define LINUX_SYS_AUE_linux_setitimer AUE_SETITIMER
+#define LINUX_SYS_AUE_linux_getitimer AUE_GETITIMER
+#define LINUX_SYS_AUE_linux_newstat AUE_STAT
+#define LINUX_SYS_AUE_linux_newlstat AUE_LSTAT
+#define LINUX_SYS_AUE_linux_newfstat AUE_FSTAT
+#define LINUX_SYS_AUE_linux_uname AUE_NULL
+#define LINUX_SYS_AUE_linux_iopl AUE_NULL
+#define LINUX_SYS_AUE_linux_vhangup AUE_NULL
+#define LINUX_SYS_AUE_linux_wait4 AUE_WAIT4
+#define LINUX_SYS_AUE_linux_swapoff AUE_SWAPOFF
+#define LINUX_SYS_AUE_linux_sysinfo AUE_NULL
+#define LINUX_SYS_AUE_linux_ipc AUE_NULL
+#define LINUX_SYS_AUE_linux_sigreturn AUE_SIGRETURN
+#define LINUX_SYS_AUE_linux_clone AUE_RFORK
+#define LINUX_SYS_AUE_linux_setdomainname AUE_SYSCTL
+#define LINUX_SYS_AUE_linux_newuname AUE_NULL
+#define LINUX_SYS_AUE_linux_adjtimex AUE_ADJTIME
+#define LINUX_SYS_AUE_linux_mprotect AUE_MPROTECT
+#define LINUX_SYS_AUE_linux_sigprocmask AUE_SIGPROCMASK
+#define LINUX_SYS_AUE_linux_create_module AUE_NULL
+#define LINUX_SYS_AUE_linux_init_module AUE_NULL
+#define LINUX_SYS_AUE_linux_delete_module AUE_NULL
+#define LINUX_SYS_AUE_linux_get_kernel_syms AUE_NULL
+#define LINUX_SYS_AUE_linux_quotactl AUE_QUOTACTL
+#define LINUX_SYS_AUE_linux_bdflush AUE_BDFLUSH
+#define LINUX_SYS_AUE_linux_sysfs AUE_NULL
+#define LINUX_SYS_AUE_linux_personality AUE_PERSONALITY
+#define LINUX_SYS_AUE_linux_setfsuid16 AUE_SETFSUID
+#define LINUX_SYS_AUE_linux_setfsgid16 AUE_SETFSGID
+#define LINUX_SYS_AUE_linux_llseek AUE_LSEEK
+#define LINUX_SYS_AUE_linux_getdents AUE_GETDIRENTRIES
+#define LINUX_SYS_AUE_linux_select AUE_SELECT
+#define LINUX_SYS_AUE_linux_msync AUE_MSYNC
+#define LINUX_SYS_AUE_linux_readv AUE_READV
+#define LINUX_SYS_AUE_linux_writev AUE_WRITEV
+#define LINUX_SYS_AUE_linux_getsid AUE_GETSID
+#define LINUX_SYS_AUE_linux_fdatasync AUE_NULL
+#define LINUX_SYS_AUE_linux_sysctl AUE_SYSCTL
+#define LINUX_SYS_AUE_linux_sched_setscheduler AUE_SCHED_SETSCHEDULER
+#define LINUX_SYS_AUE_linux_sched_getscheduler AUE_SCHED_GETSCHEDULER
+#define LINUX_SYS_AUE_linux_sched_get_priority_max AUE_SCHED_GET_PRIORITY_MAX
+#define LINUX_SYS_AUE_linux_sched_get_priority_min AUE_SCHED_GET_PRIORITY_MIN
+#define LINUX_SYS_AUE_linux_sched_rr_get_interval AUE_SCHED_RR_GET_INTERVAL
+#define LINUX_SYS_AUE_linux_nanosleep AUE_NULL
+#define LINUX_SYS_AUE_linux_mremap AUE_NULL
+#define LINUX_SYS_AUE_linux_setresuid16 AUE_SETRESUID
+#define LINUX_SYS_AUE_linux_getresuid16 AUE_GETRESUID
+#define LINUX_SYS_AUE_linux_query_module AUE_NULL
+#define LINUX_SYS_AUE_linux_nfsservctl AUE_NULL
+#define LINUX_SYS_AUE_linux_setresgid16 AUE_SETRESGID
+#define LINUX_SYS_AUE_linux_getresgid16 AUE_GETRESGID
+#define LINUX_SYS_AUE_linux_prctl AUE_PRCTL
+#define LINUX_SYS_AUE_linux_rt_sigreturn AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigaction AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigprocmask AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigpending AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigtimedwait AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigqueueinfo AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_sigsuspend AUE_NULL
+#define LINUX_SYS_AUE_linux_pread AUE_PREAD
+#define LINUX_SYS_AUE_linux_pwrite AUE_PWRITE
+#define LINUX_SYS_AUE_linux_chown16 AUE_CHOWN
+#define LINUX_SYS_AUE_linux_getcwd AUE_GETCWD
+#define LINUX_SYS_AUE_linux_capget AUE_CAPGET
+#define LINUX_SYS_AUE_linux_capset AUE_CAPSET
+#define LINUX_SYS_AUE_linux_sigaltstack AUE_NULL
+#define LINUX_SYS_AUE_linux_sendfile AUE_SENDFILE
+#define LINUX_SYS_AUE_linux_vfork AUE_VFORK
+#define LINUX_SYS_AUE_linux_getrlimit AUE_GETRLIMIT
+#define LINUX_SYS_AUE_linux_mmap2 AUE_MMAP
+#define LINUX_SYS_AUE_linux_truncate64 AUE_TRUNCATE
+#define LINUX_SYS_AUE_linux_ftruncate64 AUE_FTRUNCATE
+#define LINUX_SYS_AUE_linux_stat64 AUE_STAT
+#define LINUX_SYS_AUE_linux_lstat64 AUE_LSTAT
+#define LINUX_SYS_AUE_linux_fstat64 AUE_FSTAT
+#define LINUX_SYS_AUE_linux_lchown AUE_LCHOWN
+#define LINUX_SYS_AUE_linux_getuid AUE_GETUID
+#define LINUX_SYS_AUE_linux_getgid AUE_GETGID
+#define LINUX_SYS_AUE_linux_getgroups AUE_GETGROUPS
+#define LINUX_SYS_AUE_linux_setgroups AUE_SETGROUPS
+#define LINUX_SYS_AUE_linux_chown AUE_CHOWN
+#define LINUX_SYS_AUE_linux_setfsuid AUE_SETFSUID
+#define LINUX_SYS_AUE_linux_setfsgid AUE_SETFSGID
+#define LINUX_SYS_AUE_linux_pivot_root AUE_PIVOT_ROOT
+#define LINUX_SYS_AUE_linux_mincore AUE_MINCORE
+#define LINUX_SYS_AUE_linux_getdents64 AUE_GETDIRENTRIES
+#define LINUX_SYS_AUE_linux_fcntl64 AUE_FCNTL
+#define LINUX_SYS_AUE_linux_gettid AUE_NULL
+#define LINUX_SYS_AUE_linux_setxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_lsetxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_fsetxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_getxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_lgetxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_fgetxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_listxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_llistxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_flistxattr AUE_NULL
+#define LINUX_SYS_AUE_linux_removexattr AUE_NULL
+#define LINUX_SYS_AUE_linux_lremovexattr AUE_NULL
+#define LINUX_SYS_AUE_linux_fremovexattr AUE_NULL
+#define LINUX_SYS_AUE_linux_tkill AUE_NULL
+#define LINUX_SYS_AUE_linux_sys_futex AUE_NULL
+#define LINUX_SYS_AUE_linux_sched_setaffinity AUE_NULL
+#define LINUX_SYS_AUE_linux_sched_getaffinity AUE_NULL
+#define LINUX_SYS_AUE_linux_set_thread_area AUE_NULL
+#define LINUX_SYS_AUE_linux_fadvise64 AUE_NULL
+#define LINUX_SYS_AUE_linux_exit_group AUE_EXIT
+#define LINUX_SYS_AUE_linux_lookup_dcookie AUE_NULL
+#define LINUX_SYS_AUE_linux_epoll_create AUE_NULL
+#define LINUX_SYS_AUE_linux_epoll_ctl AUE_NULL
+#define LINUX_SYS_AUE_linux_epoll_wait AUE_NULL
+#define LINUX_SYS_AUE_linux_remap_file_pages AUE_NULL
+#define LINUX_SYS_AUE_linux_set_tid_address AUE_NULL
+#define LINUX_SYS_AUE_linux_timer_create AUE_NULL
+#define LINUX_SYS_AUE_linux_timer_settime AUE_NULL
+#define LINUX_SYS_AUE_linux_timer_gettime AUE_NULL
+#define LINUX_SYS_AUE_linux_timer_getoverrun AUE_NULL
+#define LINUX_SYS_AUE_linux_timer_delete AUE_NULL
+#define LINUX_SYS_AUE_linux_clock_settime AUE_CLOCK_SETTIME
+#define LINUX_SYS_AUE_linux_clock_gettime AUE_NULL
+#define LINUX_SYS_AUE_linux_clock_getres AUE_NULL
+#define LINUX_SYS_AUE_linux_clock_nanosleep AUE_NULL
+#define LINUX_SYS_AUE_linux_statfs64 AUE_STATFS
+#define LINUX_SYS_AUE_linux_fstatfs64 AUE_FSTATFS
+#define LINUX_SYS_AUE_linux_tgkill AUE_NULL
+#define LINUX_SYS_AUE_linux_utimes AUE_UTIMES
+#define LINUX_SYS_AUE_linux_fadvise64_64 AUE_NULL
+#define LINUX_SYS_AUE_linux_mbind AUE_NULL
+#define LINUX_SYS_AUE_linux_get_mempolicy AUE_NULL
+#define LINUX_SYS_AUE_linux_set_mempolicy AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_open AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_unlink AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_timedsend AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_timedreceive AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_notify AUE_NULL
+#define LINUX_SYS_AUE_linux_mq_getsetattr AUE_NULL
+#define LINUX_SYS_AUE_linux_kexec_load AUE_NULL
+#define LINUX_SYS_AUE_linux_waitid AUE_NULL
+#define LINUX_SYS_AUE_linux_add_key AUE_NULL
+#define LINUX_SYS_AUE_linux_request_key AUE_NULL
+#define LINUX_SYS_AUE_linux_keyctl AUE_NULL
+#define LINUX_SYS_AUE_linux_ioprio_set AUE_NULL
+#define LINUX_SYS_AUE_linux_ioprio_get AUE_NULL
+#define LINUX_SYS_AUE_linux_inotify_init AUE_NULL
+#define LINUX_SYS_AUE_linux_inotify_add_watch AUE_NULL
+#define LINUX_SYS_AUE_linux_inotify_rm_watch AUE_NULL
+#define LINUX_SYS_AUE_linux_migrate_pages AUE_NULL
+#define LINUX_SYS_AUE_linux_openat AUE_OPEN_RWTC
+#define LINUX_SYS_AUE_linux_mkdirat AUE_MKDIRAT
+#define LINUX_SYS_AUE_linux_mknodat AUE_MKNODAT
+#define LINUX_SYS_AUE_linux_fchownat AUE_FCHOWNAT
+#define LINUX_SYS_AUE_linux_futimesat AUE_FUTIMESAT
+#define LINUX_SYS_AUE_linux_fstatat64 AUE_FSTATAT
+#define LINUX_SYS_AUE_linux_unlinkat AUE_UNLINKAT
+#define LINUX_SYS_AUE_linux_renameat AUE_RENAMEAT
+#define LINUX_SYS_AUE_linux_linkat AUE_LINKAT
+#define LINUX_SYS_AUE_linux_symlinkat AUE_SYMLINKAT
+#define LINUX_SYS_AUE_linux_readlinkat AUE_READLINKAT
+#define LINUX_SYS_AUE_linux_fchmodat AUE_FCHMODAT
+#define LINUX_SYS_AUE_linux_faccessat AUE_FACCESSAT
+#define LINUX_SYS_AUE_linux_pselect6 AUE_NULL
+#define LINUX_SYS_AUE_linux_ppoll AUE_NULL
+#define LINUX_SYS_AUE_linux_unshare AUE_NULL
+#define LINUX_SYS_AUE_linux_set_robust_list AUE_NULL
+#define LINUX_SYS_AUE_linux_get_robust_list AUE_NULL
+#define LINUX_SYS_AUE_linux_splice AUE_NULL
+#define LINUX_SYS_AUE_linux_sync_file_range AUE_NULL
+#define LINUX_SYS_AUE_linux_tee AUE_NULL
+#define LINUX_SYS_AUE_linux_vmsplice AUE_NULL
+#define LINUX_SYS_AUE_linux_move_pages AUE_NULL
+#define LINUX_SYS_AUE_linux_getcpu AUE_NULL
+#define LINUX_SYS_AUE_linux_epoll_pwait AUE_NULL
+#define LINUX_SYS_AUE_linux_utimensat AUE_NULL
+#define LINUX_SYS_AUE_linux_signalfd AUE_NULL
+#define LINUX_SYS_AUE_linux_timerfd_create AUE_NULL
+#define LINUX_SYS_AUE_linux_eventfd AUE_NULL
+#define LINUX_SYS_AUE_linux_fallocate AUE_NULL
+#define LINUX_SYS_AUE_linux_timerfd_settime AUE_NULL
+#define LINUX_SYS_AUE_linux_timerfd_gettime AUE_NULL
+#define LINUX_SYS_AUE_linux_signalfd4 AUE_NULL
+#define LINUX_SYS_AUE_linux_eventfd2 AUE_NULL
+#define LINUX_SYS_AUE_linux_epoll_create1 AUE_NULL
+#define LINUX_SYS_AUE_linux_dup3 AUE_NULL
+#define LINUX_SYS_AUE_linux_pipe2 AUE_NULL
+#define LINUX_SYS_AUE_linux_inotify_init1 AUE_NULL
+#define LINUX_SYS_AUE_linux_preadv AUE_NULL
+#define LINUX_SYS_AUE_linux_pwritev AUE_NULL
+#define LINUX_SYS_AUE_linux_rt_tsigqueueinfo AUE_NULL
+#define LINUX_SYS_AUE_linux_perf_event_open AUE_NULL
+#define LINUX_SYS_AUE_linux_recvmmsg AUE_NULL
+#define LINUX_SYS_AUE_linux_fanotify_init AUE_NULL
+#define LINUX_SYS_AUE_linux_fanotify_mark AUE_NULL
+#define LINUX_SYS_AUE_linux_prlimit64 AUE_NULL
+#define LINUX_SYS_AUE_linux_name_to_handle_at AUE_NULL
+#define LINUX_SYS_AUE_linux_open_by_handle_at AUE_NULL
+#define LINUX_SYS_AUE_linux_clock_adjtime AUE_NULL
+#define LINUX_SYS_AUE_linux_syncfs AUE_NULL
+#define LINUX_SYS_AUE_linux_sendmmsg AUE_NULL
+#define LINUX_SYS_AUE_linux_setns AUE_NULL
+#define LINUX_SYS_AUE_linux_process_vm_readv AUE_NULL
+#define LINUX_SYS_AUE_linux_process_vm_writev AUE_NULL
+
+#undef PAD_
+#undef PADL_
+#undef PADR_
+
+#endif /* !_LINUX_SYSPROTO_H_ */
diff --git a/sys/amd64/linux32/linux32_support.s b/sys/amd64/linux32/linux32_support.s
new file mode 100644
index 0000000..42375c3
--- /dev/null
+++ b/sys/amd64/linux32/linux32_support.s
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2007 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "linux32_assym.h" /* system definitions */
+#include <machine/asmacros.h> /* miscellaneous asm macros */
+
+#include "assym.s"
+
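+/*
+ * Fault recovery target for the user-space accesses below.  If any
+ * of the futex_* entries faults while touching user memory, the
+ * onfault handler lands here: clear pcb_onfault and hand -EFAULT
+ * back to the caller.  %r8 holds curpcb throughout these routines.
+ */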
+futex_fault:
+ movq $0,PCB_ONFAULT(%r8)
+ movl $-EFAULT,%eax
+ ret
+
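+/*
+ * Atomically exchange %edi with the 32-bit user word at (%rsi) and
+ * return the previous contents through (%rdx).  Assumed C prototype,
+ * judging from the register usage (not taken from this diff):
+ * int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval).
+ * Each entry below installs futex_fault in pcb_onfault and rejects
+ * addresses above VM_MAXUSER_ADDRESS-4, so a wild or kernel address
+ * fails with -EFAULT instead of being dereferenced.
+ */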
+ENTRY(futex_xchgl)
+ movq PCPU(CURPCB),%r8
+ movq $futex_fault,PCB_ONFAULT(%r8)
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rsi
+ ja futex_fault
+ xchgl %edi,(%rsi)
+ movl %edi,(%rdx)
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+
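+/*
+ * Atomically add %edi to the user word at (%rsi) via xadd
+ * (lock-prefixed on SMP kernels) and return the pre-add value
+ * through (%rdx).
+ */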
+ENTRY(futex_addl)
+ movq PCPU(CURPCB),%r8
+ movq $futex_fault,PCB_ONFAULT(%r8)
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rsi
+ ja futex_fault
+#ifdef SMP
+ lock
+#endif
+ xaddl %edi,(%rsi)
+ movl %edi,(%rdx)
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+
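+/*
+ * Atomically OR %edi into the user word at (%rsi).  There is no
+ * fetch-and-or instruction, so load the old value and retry a
+ * cmpxchg loop until the update commits without interference; the
+ * old value is returned through (%rdx).
+ */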
+ENTRY(futex_orl)
+ movq PCPU(CURPCB),%r8
+ movq $futex_fault,PCB_ONFAULT(%r8)
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rsi
+ ja futex_fault
+ movl (%rsi),%eax
+1: movl %eax,%ecx
+ orl %edi,%ecx
+#ifdef SMP
+ lock
+#endif
+ cmpxchgl %ecx,(%rsi)
+ jnz 1b
+ movl %eax,(%rdx)
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+
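+/*
+ * Atomically AND %edi into the user word at (%rsi); same cmpxchg
+ * retry loop as futex_orl.
+ */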
+ENTRY(futex_andl)
+ movq PCPU(CURPCB),%r8
+ movq $futex_fault,PCB_ONFAULT(%r8)
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rsi
+ ja futex_fault
+ movl (%rsi),%eax
+1: movl %eax,%ecx
+ andl %edi,%ecx
+#ifdef SMP
+ lock
+#endif
+ cmpxchgl %ecx,(%rsi)
+ jnz 1b
+ movl %eax,(%rdx)
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
+
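+/*
+ * Atomically XOR %edi into the user word at (%rsi); same cmpxchg
+ * retry loop as futex_orl.
+ */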
+ENTRY(futex_xorl)
+ movq PCPU(CURPCB),%r8
+ movq $futex_fault,PCB_ONFAULT(%r8)
+ movq $VM_MAXUSER_ADDRESS-4,%rax
+ cmpq %rax,%rsi
+ ja futex_fault
+ movl (%rsi),%eax
+1: movl %eax,%ecx
+ xorl %edi,%ecx
+#ifdef SMP
+ lock
+#endif
+ cmpxchgl %ecx,(%rsi)
+ jnz 1b
+ movl %eax,(%rdx)
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ ret
diff --git a/sys/amd64/linux32/linux32_syscall.h b/sys/amd64/linux32/linux32_syscall.h
new file mode 100644
index 0000000..5a411f8
--- /dev/null
+++ b/sys/amd64/linux32/linux32_syscall.h
@@ -0,0 +1,324 @@
+/*
+ * System call numbers.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed
+ */
+
+#define LINUX_SYS_exit 1
+#define LINUX_SYS_linux_fork 2
+#define LINUX_SYS_read 3
+#define LINUX_SYS_write 4
+#define LINUX_SYS_linux_open 5
+#define LINUX_SYS_close 6
+#define LINUX_SYS_linux_waitpid 7
+#define LINUX_SYS_linux_creat 8
+#define LINUX_SYS_linux_link 9
+#define LINUX_SYS_linux_unlink 10
+#define LINUX_SYS_linux_execve 11
+#define LINUX_SYS_linux_chdir 12
+#define LINUX_SYS_linux_time 13
+#define LINUX_SYS_linux_mknod 14
+#define LINUX_SYS_linux_chmod 15
+#define LINUX_SYS_linux_lchown16 16
+#define LINUX_SYS_linux_stat 18
+#define LINUX_SYS_linux_lseek 19
+#define LINUX_SYS_linux_getpid 20
+#define LINUX_SYS_linux_mount 21
+#define LINUX_SYS_linux_oldumount 22
+#define LINUX_SYS_linux_setuid16 23
+#define LINUX_SYS_linux_getuid16 24
+#define LINUX_SYS_linux_stime 25
+#define LINUX_SYS_linux_ptrace 26
+#define LINUX_SYS_linux_alarm 27
+#define LINUX_SYS_linux_pause 29
+#define LINUX_SYS_linux_utime 30
+#define LINUX_SYS_linux_access 33
+#define LINUX_SYS_linux_nice 34
+#define LINUX_SYS_sync 36
+#define LINUX_SYS_linux_kill 37
+#define LINUX_SYS_linux_rename 38
+#define LINUX_SYS_linux_mkdir 39
+#define LINUX_SYS_linux_rmdir 40
+#define LINUX_SYS_dup 41
+#define LINUX_SYS_linux_pipe 42
+#define LINUX_SYS_linux_times 43
+#define LINUX_SYS_linux_brk 45
+#define LINUX_SYS_linux_setgid16 46
+#define LINUX_SYS_linux_getgid16 47
+#define LINUX_SYS_linux_signal 48
+#define LINUX_SYS_linux_geteuid16 49
+#define LINUX_SYS_linux_getegid16 50
+#define LINUX_SYS_acct 51
+#define LINUX_SYS_linux_umount 52
+#define LINUX_SYS_linux_ioctl 54
+#define LINUX_SYS_linux_fcntl 55
+#define LINUX_SYS_setpgid 57
+#define LINUX_SYS_linux_olduname 59
+#define LINUX_SYS_umask 60
+#define LINUX_SYS_chroot 61
+#define LINUX_SYS_linux_ustat 62
+#define LINUX_SYS_dup2 63
+#define LINUX_SYS_linux_getppid 64
+#define LINUX_SYS_getpgrp 65
+#define LINUX_SYS_setsid 66
+#define LINUX_SYS_linux_sigaction 67
+#define LINUX_SYS_linux_sgetmask 68
+#define LINUX_SYS_linux_ssetmask 69
+#define LINUX_SYS_linux_setreuid16 70
+#define LINUX_SYS_linux_setregid16 71
+#define LINUX_SYS_linux_sigsuspend 72
+#define LINUX_SYS_linux_sigpending 73
+#define LINUX_SYS_linux_sethostname 74
+#define LINUX_SYS_linux_setrlimit 75
+#define LINUX_SYS_linux_old_getrlimit 76
+#define LINUX_SYS_linux_getrusage 77
+#define LINUX_SYS_linux_gettimeofday 78
+#define LINUX_SYS_linux_settimeofday 79
+#define LINUX_SYS_linux_getgroups16 80
+#define LINUX_SYS_linux_setgroups16 81
+#define LINUX_SYS_linux_old_select 82
+#define LINUX_SYS_linux_symlink 83
+#define LINUX_SYS_linux_lstat 84
+#define LINUX_SYS_linux_readlink 85
+#define LINUX_SYS_swapon 87
+#define LINUX_SYS_linux_reboot 88
+#define LINUX_SYS_linux_readdir 89
+#define LINUX_SYS_linux_mmap 90
+#define LINUX_SYS_munmap 91
+#define LINUX_SYS_linux_truncate 92
+#define LINUX_SYS_linux_ftruncate 93
+#define LINUX_SYS_fchmod 94
+#define LINUX_SYS_fchown 95
+#define LINUX_SYS_linux_getpriority 96
+#define LINUX_SYS_setpriority 97
+#define LINUX_SYS_linux_statfs 99
+#define LINUX_SYS_linux_fstatfs 100
+#define LINUX_SYS_linux_socketcall 102
+#define LINUX_SYS_linux_syslog 103
+#define LINUX_SYS_linux_setitimer 104
+#define LINUX_SYS_linux_getitimer 105
+#define LINUX_SYS_linux_newstat 106
+#define LINUX_SYS_linux_newlstat 107
+#define LINUX_SYS_linux_newfstat 108
+#define LINUX_SYS_linux_uname 109
+#define LINUX_SYS_linux_iopl 110
+#define LINUX_SYS_linux_vhangup 111
+#define LINUX_SYS_linux_wait4 114
+#define LINUX_SYS_linux_swapoff 115
+#define LINUX_SYS_linux_sysinfo 116
+#define LINUX_SYS_linux_ipc 117
+#define LINUX_SYS_fsync 118
+#define LINUX_SYS_linux_sigreturn 119
+#define LINUX_SYS_linux_clone 120
+#define LINUX_SYS_linux_setdomainname 121
+#define LINUX_SYS_linux_newuname 122
+#define LINUX_SYS_linux_adjtimex 124
+#define LINUX_SYS_linux_mprotect 125
+#define LINUX_SYS_linux_sigprocmask 126
+#define LINUX_SYS_linux_create_module 127
+#define LINUX_SYS_linux_init_module 128
+#define LINUX_SYS_linux_delete_module 129
+#define LINUX_SYS_linux_get_kernel_syms 130
+#define LINUX_SYS_linux_quotactl 131
+#define LINUX_SYS_getpgid 132
+#define LINUX_SYS_fchdir 133
+#define LINUX_SYS_linux_bdflush 134
+#define LINUX_SYS_linux_sysfs 135
+#define LINUX_SYS_linux_personality 136
+#define LINUX_SYS_linux_setfsuid16 138
+#define LINUX_SYS_linux_setfsgid16 139
+#define LINUX_SYS_linux_llseek 140
+#define LINUX_SYS_linux_getdents 141
+#define LINUX_SYS_linux_select 142
+#define LINUX_SYS_flock 143
+#define LINUX_SYS_linux_msync 144
+#define LINUX_SYS_linux_readv 145
+#define LINUX_SYS_linux_writev 146
+#define LINUX_SYS_linux_getsid 147
+#define LINUX_SYS_linux_fdatasync 148
+#define LINUX_SYS_linux_sysctl 149
+#define LINUX_SYS_mlock 150
+#define LINUX_SYS_munlock 151
+#define LINUX_SYS_mlockall 152
+#define LINUX_SYS_munlockall 153
+#define LINUX_SYS_sched_setparam 154
+#define LINUX_SYS_sched_getparam 155
+#define LINUX_SYS_linux_sched_setscheduler 156
+#define LINUX_SYS_linux_sched_getscheduler 157
+#define LINUX_SYS_sched_yield 158
+#define LINUX_SYS_linux_sched_get_priority_max 159
+#define LINUX_SYS_linux_sched_get_priority_min 160
+#define LINUX_SYS_linux_sched_rr_get_interval 161
+#define LINUX_SYS_linux_nanosleep 162
+#define LINUX_SYS_linux_mremap 163
+#define LINUX_SYS_linux_setresuid16 164
+#define LINUX_SYS_linux_getresuid16 165
+#define LINUX_SYS_linux_query_module 167
+#define LINUX_SYS_poll 168
+#define LINUX_SYS_linux_nfsservctl 169
+#define LINUX_SYS_linux_setresgid16 170
+#define LINUX_SYS_linux_getresgid16 171
+#define LINUX_SYS_linux_prctl 172
+#define LINUX_SYS_linux_rt_sigreturn 173
+#define LINUX_SYS_linux_rt_sigaction 174
+#define LINUX_SYS_linux_rt_sigprocmask 175
+#define LINUX_SYS_linux_rt_sigpending 176
+#define LINUX_SYS_linux_rt_sigtimedwait 177
+#define LINUX_SYS_linux_rt_sigqueueinfo 178
+#define LINUX_SYS_linux_rt_sigsuspend 179
+#define LINUX_SYS_linux_pread 180
+#define LINUX_SYS_linux_pwrite 181
+#define LINUX_SYS_linux_chown16 182
+#define LINUX_SYS_linux_getcwd 183
+#define LINUX_SYS_linux_capget 184
+#define LINUX_SYS_linux_capset 185
+#define LINUX_SYS_linux_sigaltstack 186
+#define LINUX_SYS_linux_sendfile 187
+#define LINUX_SYS_linux_vfork 190
+#define LINUX_SYS_linux_getrlimit 191
+#define LINUX_SYS_linux_mmap2 192
+#define LINUX_SYS_linux_truncate64 193
+#define LINUX_SYS_linux_ftruncate64 194
+#define LINUX_SYS_linux_stat64 195
+#define LINUX_SYS_linux_lstat64 196
+#define LINUX_SYS_linux_fstat64 197
+#define LINUX_SYS_linux_lchown 198
+#define LINUX_SYS_linux_getuid 199
+#define LINUX_SYS_linux_getgid 200
+#define LINUX_SYS_geteuid 201
+#define LINUX_SYS_getegid 202
+#define LINUX_SYS_setreuid 203
+#define LINUX_SYS_setregid 204
+#define LINUX_SYS_linux_getgroups 205
+#define LINUX_SYS_linux_setgroups 206
+#define LINUX_SYS_setresuid 208
+#define LINUX_SYS_getresuid 209
+#define LINUX_SYS_setresgid 210
+#define LINUX_SYS_getresgid 211
+#define LINUX_SYS_linux_chown 212
+#define LINUX_SYS_setuid 213
+#define LINUX_SYS_setgid 214
+#define LINUX_SYS_linux_setfsuid 215
+#define LINUX_SYS_linux_setfsgid 216
+#define LINUX_SYS_linux_pivot_root 217
+#define LINUX_SYS_linux_mincore 218
+#define LINUX_SYS_madvise 219
+#define LINUX_SYS_linux_getdents64 220
+#define LINUX_SYS_linux_fcntl64 221
+#define LINUX_SYS_linux_gettid 224
+#define LINUX_SYS_linux_setxattr 226
+#define LINUX_SYS_linux_lsetxattr 227
+#define LINUX_SYS_linux_fsetxattr 228
+#define LINUX_SYS_linux_getxattr 229
+#define LINUX_SYS_linux_lgetxattr 230
+#define LINUX_SYS_linux_fgetxattr 231
+#define LINUX_SYS_linux_listxattr 232
+#define LINUX_SYS_linux_llistxattr 233
+#define LINUX_SYS_linux_flistxattr 234
+#define LINUX_SYS_linux_removexattr 235
+#define LINUX_SYS_linux_lremovexattr 236
+#define LINUX_SYS_linux_fremovexattr 237
+#define LINUX_SYS_linux_tkill 238
+#define LINUX_SYS_linux_sys_futex 240
+#define LINUX_SYS_linux_sched_setaffinity 241
+#define LINUX_SYS_linux_sched_getaffinity 242
+#define LINUX_SYS_linux_set_thread_area 243
+#define LINUX_SYS_linux_fadvise64 250
+#define LINUX_SYS_linux_exit_group 252
+#define LINUX_SYS_linux_lookup_dcookie 253
+#define LINUX_SYS_linux_epoll_create 254
+#define LINUX_SYS_linux_epoll_ctl 255
+#define LINUX_SYS_linux_epoll_wait 256
+#define LINUX_SYS_linux_remap_file_pages 257
+#define LINUX_SYS_linux_set_tid_address 258
+#define LINUX_SYS_linux_timer_create 259
+#define LINUX_SYS_linux_timer_settime 260
+#define LINUX_SYS_linux_timer_gettime 261
+#define LINUX_SYS_linux_timer_getoverrun 262
+#define LINUX_SYS_linux_timer_delete 263
+#define LINUX_SYS_linux_clock_settime 264
+#define LINUX_SYS_linux_clock_gettime 265
+#define LINUX_SYS_linux_clock_getres 266
+#define LINUX_SYS_linux_clock_nanosleep 267
+#define LINUX_SYS_linux_statfs64 268
+#define LINUX_SYS_linux_fstatfs64 269
+#define LINUX_SYS_linux_tgkill 270
+#define LINUX_SYS_linux_utimes 271
+#define LINUX_SYS_linux_fadvise64_64 272
+#define LINUX_SYS_linux_mbind 274
+#define LINUX_SYS_linux_get_mempolicy 275
+#define LINUX_SYS_linux_set_mempolicy 276
+#define LINUX_SYS_linux_mq_open 277
+#define LINUX_SYS_linux_mq_unlink 278
+#define LINUX_SYS_linux_mq_timedsend 279
+#define LINUX_SYS_linux_mq_timedreceive 280
+#define LINUX_SYS_linux_mq_notify 281
+#define LINUX_SYS_linux_mq_getsetattr 282
+#define LINUX_SYS_linux_kexec_load 283
+#define LINUX_SYS_linux_waitid 284
+#define LINUX_SYS_linux_add_key 286
+#define LINUX_SYS_linux_request_key 287
+#define LINUX_SYS_linux_keyctl 288
+#define LINUX_SYS_linux_ioprio_set 289
+#define LINUX_SYS_linux_ioprio_get 290
+#define LINUX_SYS_linux_inotify_init 291
+#define LINUX_SYS_linux_inotify_add_watch 292
+#define LINUX_SYS_linux_inotify_rm_watch 293
+#define LINUX_SYS_linux_migrate_pages 294
+#define LINUX_SYS_linux_openat 295
+#define LINUX_SYS_linux_mkdirat 296
+#define LINUX_SYS_linux_mknodat 297
+#define LINUX_SYS_linux_fchownat 298
+#define LINUX_SYS_linux_futimesat 299
+#define LINUX_SYS_linux_fstatat64 300
+#define LINUX_SYS_linux_unlinkat 301
+#define LINUX_SYS_linux_renameat 302
+#define LINUX_SYS_linux_linkat 303
+#define LINUX_SYS_linux_symlinkat 304
+#define LINUX_SYS_linux_readlinkat 305
+#define LINUX_SYS_linux_fchmodat 306
+#define LINUX_SYS_linux_faccessat 307
+#define LINUX_SYS_linux_pselect6 308
+#define LINUX_SYS_linux_ppoll 309
+#define LINUX_SYS_linux_unshare 310
+#define LINUX_SYS_linux_set_robust_list 311
+#define LINUX_SYS_linux_get_robust_list 312
+#define LINUX_SYS_linux_splice 313
+#define LINUX_SYS_linux_sync_file_range 314
+#define LINUX_SYS_linux_tee 315
+#define LINUX_SYS_linux_vmsplice 316
+#define LINUX_SYS_linux_move_pages 317
+#define LINUX_SYS_linux_getcpu 318
+#define LINUX_SYS_linux_epoll_pwait 319
+#define LINUX_SYS_linux_utimensat 320
+#define LINUX_SYS_linux_signalfd 321
+#define LINUX_SYS_linux_timerfd_create 322
+#define LINUX_SYS_linux_eventfd 323
+#define LINUX_SYS_linux_fallocate 324
+#define LINUX_SYS_linux_timerfd_settime 325
+#define LINUX_SYS_linux_timerfd_gettime 326
+#define LINUX_SYS_linux_signalfd4 327
+#define LINUX_SYS_linux_eventfd2 328
+#define LINUX_SYS_linux_epoll_create1 329
+#define LINUX_SYS_linux_dup3 330
+#define LINUX_SYS_linux_pipe2 331
+#define LINUX_SYS_linux_inotify_init1 332
+#define LINUX_SYS_linux_preadv 333
+#define LINUX_SYS_linux_pwritev 334
+#define LINUX_SYS_linux_rt_tsigqueueinfo 335
+#define LINUX_SYS_linux_perf_event_open 336
+#define LINUX_SYS_linux_recvmmsg 337
+#define LINUX_SYS_linux_fanotify_init 338
+#define LINUX_SYS_linux_fanotify_mark 339
+#define LINUX_SYS_linux_prlimit64 340
+#define LINUX_SYS_linux_name_to_handle_at 341
+#define LINUX_SYS_linux_open_by_handle_at 342
+#define LINUX_SYS_linux_clock_adjtime 343
+#define LINUX_SYS_linux_syncfs 344
+#define LINUX_SYS_linux_sendmmsg 345
+#define LINUX_SYS_linux_setns 346
+#define LINUX_SYS_linux_process_vm_readv 347
+#define LINUX_SYS_linux_process_vm_writev 348
+#define LINUX_SYS_MAXSYSCALL 349
diff --git a/sys/amd64/linux32/linux32_syscalls.c b/sys/amd64/linux32/linux32_syscalls.c
new file mode 100644
index 0000000..ebde899
--- /dev/null
+++ b/sys/amd64/linux32/linux32_syscalls.c
@@ -0,0 +1,360 @@
+/*
+ * System call names.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed
+ */
+
+const char *linux_syscallnames[] = {
+#define nosys linux_nosys
+ "#0", /* 0 = setup */
+ "exit", /* 1 = exit */
+ "linux_fork", /* 2 = linux_fork */
+ "read", /* 3 = read */
+ "write", /* 4 = write */
+ "linux_open", /* 5 = linux_open */
+ "close", /* 6 = close */
+ "linux_waitpid", /* 7 = linux_waitpid */
+ "linux_creat", /* 8 = linux_creat */
+ "linux_link", /* 9 = linux_link */
+ "linux_unlink", /* 10 = linux_unlink */
+ "linux_execve", /* 11 = linux_execve */
+ "linux_chdir", /* 12 = linux_chdir */
+ "linux_time", /* 13 = linux_time */
+ "linux_mknod", /* 14 = linux_mknod */
+ "linux_chmod", /* 15 = linux_chmod */
+ "linux_lchown16", /* 16 = linux_lchown16 */
+ "#17", /* 17 = break */
+ "linux_stat", /* 18 = linux_stat */
+ "linux_lseek", /* 19 = linux_lseek */
+ "linux_getpid", /* 20 = linux_getpid */
+ "linux_mount", /* 21 = linux_mount */
+ "linux_oldumount", /* 22 = linux_oldumount */
+ "linux_setuid16", /* 23 = linux_setuid16 */
+ "linux_getuid16", /* 24 = linux_getuid16 */
+ "linux_stime", /* 25 = linux_stime */
+ "linux_ptrace", /* 26 = linux_ptrace */
+ "linux_alarm", /* 27 = linux_alarm */
+ "#28", /* 28 = fstat */
+ "linux_pause", /* 29 = linux_pause */
+ "linux_utime", /* 30 = linux_utime */
+ "#31", /* 31 = stty */
+ "#32", /* 32 = gtty */
+ "linux_access", /* 33 = linux_access */
+ "linux_nice", /* 34 = linux_nice */
+ "#35", /* 35 = ftime */
+ "sync", /* 36 = sync */
+ "linux_kill", /* 37 = linux_kill */
+ "linux_rename", /* 38 = linux_rename */
+ "linux_mkdir", /* 39 = linux_mkdir */
+ "linux_rmdir", /* 40 = linux_rmdir */
+ "dup", /* 41 = dup */
+ "linux_pipe", /* 42 = linux_pipe */
+ "linux_times", /* 43 = linux_times */
+ "#44", /* 44 = prof */
+ "linux_brk", /* 45 = linux_brk */
+ "linux_setgid16", /* 46 = linux_setgid16 */
+ "linux_getgid16", /* 47 = linux_getgid16 */
+ "linux_signal", /* 48 = linux_signal */
+ "linux_geteuid16", /* 49 = linux_geteuid16 */
+ "linux_getegid16", /* 50 = linux_getegid16 */
+ "acct", /* 51 = acct */
+ "linux_umount", /* 52 = linux_umount */
+ "#53", /* 53 = lock */
+ "linux_ioctl", /* 54 = linux_ioctl */
+ "linux_fcntl", /* 55 = linux_fcntl */
+ "#56", /* 56 = mpx */
+ "setpgid", /* 57 = setpgid */
+ "#58", /* 58 = ulimit */
+ "linux_olduname", /* 59 = linux_olduname */
+ "umask", /* 60 = umask */
+ "chroot", /* 61 = chroot */
+ "linux_ustat", /* 62 = linux_ustat */
+ "dup2", /* 63 = dup2 */
+ "linux_getppid", /* 64 = linux_getppid */
+ "getpgrp", /* 65 = getpgrp */
+ "setsid", /* 66 = setsid */
+ "linux_sigaction", /* 67 = linux_sigaction */
+ "linux_sgetmask", /* 68 = linux_sgetmask */
+ "linux_ssetmask", /* 69 = linux_ssetmask */
+ "linux_setreuid16", /* 70 = linux_setreuid16 */
+ "linux_setregid16", /* 71 = linux_setregid16 */
+ "linux_sigsuspend", /* 72 = linux_sigsuspend */
+ "linux_sigpending", /* 73 = linux_sigpending */
+ "linux_sethostname", /* 74 = linux_sethostname */
+ "linux_setrlimit", /* 75 = linux_setrlimit */
+ "linux_old_getrlimit", /* 76 = linux_old_getrlimit */
+ "linux_getrusage", /* 77 = linux_getrusage */
+ "linux_gettimeofday", /* 78 = linux_gettimeofday */
+ "linux_settimeofday", /* 79 = linux_settimeofday */
+ "linux_getgroups16", /* 80 = linux_getgroups16 */
+ "linux_setgroups16", /* 81 = linux_setgroups16 */
+ "linux_old_select", /* 82 = linux_old_select */
+ "linux_symlink", /* 83 = linux_symlink */
+ "linux_lstat", /* 84 = linux_lstat */
+ "linux_readlink", /* 85 = linux_readlink */
+ "#86", /* 86 = linux_uselib */
+ "swapon", /* 87 = swapon */
+ "linux_reboot", /* 88 = linux_reboot */
+ "linux_readdir", /* 89 = linux_readdir */
+ "linux_mmap", /* 90 = linux_mmap */
+ "munmap", /* 91 = munmap */
+ "linux_truncate", /* 92 = linux_truncate */
+ "linux_ftruncate", /* 93 = linux_ftruncate */
+ "fchmod", /* 94 = fchmod */
+ "fchown", /* 95 = fchown */
+ "linux_getpriority", /* 96 = linux_getpriority */
+ "setpriority", /* 97 = setpriority */
+ "#98", /* 98 = profil */
+ "linux_statfs", /* 99 = linux_statfs */
+ "linux_fstatfs", /* 100 = linux_fstatfs */
+ "#101", /* 101 = ioperm */
+ "linux_socketcall", /* 102 = linux_socketcall */
+ "linux_syslog", /* 103 = linux_syslog */
+ "linux_setitimer", /* 104 = linux_setitimer */
+ "linux_getitimer", /* 105 = linux_getitimer */
+ "linux_newstat", /* 106 = linux_newstat */
+ "linux_newlstat", /* 107 = linux_newlstat */
+ "linux_newfstat", /* 108 = linux_newfstat */
+ "linux_uname", /* 109 = linux_uname */
+ "linux_iopl", /* 110 = linux_iopl */
+ "linux_vhangup", /* 111 = linux_vhangup */
+ "#112", /* 112 = idle */
+ "#113", /* 113 = vm86old */
+ "linux_wait4", /* 114 = linux_wait4 */
+ "linux_swapoff", /* 115 = linux_swapoff */
+ "linux_sysinfo", /* 116 = linux_sysinfo */
+ "linux_ipc", /* 117 = linux_ipc */
+ "fsync", /* 118 = fsync */
+ "linux_sigreturn", /* 119 = linux_sigreturn */
+ "linux_clone", /* 120 = linux_clone */
+ "linux_setdomainname", /* 121 = linux_setdomainname */
+ "linux_newuname", /* 122 = linux_newuname */
+ "#123", /* 123 = modify_ldt */
+ "linux_adjtimex", /* 124 = linux_adjtimex */
+ "linux_mprotect", /* 125 = linux_mprotect */
+ "linux_sigprocmask", /* 126 = linux_sigprocmask */
+ "linux_create_module", /* 127 = linux_create_module */
+ "linux_init_module", /* 128 = linux_init_module */
+ "linux_delete_module", /* 129 = linux_delete_module */
+ "linux_get_kernel_syms", /* 130 = linux_get_kernel_syms */
+ "linux_quotactl", /* 131 = linux_quotactl */
+ "getpgid", /* 132 = getpgid */
+ "fchdir", /* 133 = fchdir */
+ "linux_bdflush", /* 134 = linux_bdflush */
+ "linux_sysfs", /* 135 = linux_sysfs */
+ "linux_personality", /* 136 = linux_personality */
+ "#137", /* 137 = afs_syscall */
+ "linux_setfsuid16", /* 138 = linux_setfsuid16 */
+ "linux_setfsgid16", /* 139 = linux_setfsgid16 */
+ "linux_llseek", /* 140 = linux_llseek */
+ "linux_getdents", /* 141 = linux_getdents */
+ "linux_select", /* 142 = linux_select */
+ "flock", /* 143 = flock */
+ "linux_msync", /* 144 = linux_msync */
+ "linux_readv", /* 145 = linux_readv */
+ "linux_writev", /* 146 = linux_writev */
+ "linux_getsid", /* 147 = linux_getsid */
+ "linux_fdatasync", /* 148 = linux_fdatasync */
+ "linux_sysctl", /* 149 = linux_sysctl */
+ "mlock", /* 150 = mlock */
+ "munlock", /* 151 = munlock */
+ "mlockall", /* 152 = mlockall */
+ "munlockall", /* 153 = munlockall */
+ "sched_setparam", /* 154 = sched_setparam */
+ "sched_getparam", /* 155 = sched_getparam */
+ "linux_sched_setscheduler", /* 156 = linux_sched_setscheduler */
+ "linux_sched_getscheduler", /* 157 = linux_sched_getscheduler */
+ "sched_yield", /* 158 = sched_yield */
+ "linux_sched_get_priority_max", /* 159 = linux_sched_get_priority_max */
+ "linux_sched_get_priority_min", /* 160 = linux_sched_get_priority_min */
+ "linux_sched_rr_get_interval", /* 161 = linux_sched_rr_get_interval */
+ "linux_nanosleep", /* 162 = linux_nanosleep */
+ "linux_mremap", /* 163 = linux_mremap */
+ "linux_setresuid16", /* 164 = linux_setresuid16 */
+ "linux_getresuid16", /* 165 = linux_getresuid16 */
+ "#166", /* 166 = vm86 */
+ "linux_query_module", /* 167 = linux_query_module */
+ "poll", /* 168 = poll */
+ "linux_nfsservctl", /* 169 = linux_nfsservctl */
+ "linux_setresgid16", /* 170 = linux_setresgid16 */
+ "linux_getresgid16", /* 171 = linux_getresgid16 */
+ "linux_prctl", /* 172 = linux_prctl */
+ "linux_rt_sigreturn", /* 173 = linux_rt_sigreturn */
+ "linux_rt_sigaction", /* 174 = linux_rt_sigaction */
+ "linux_rt_sigprocmask", /* 175 = linux_rt_sigprocmask */
+ "linux_rt_sigpending", /* 176 = linux_rt_sigpending */
+ "linux_rt_sigtimedwait", /* 177 = linux_rt_sigtimedwait */
+ "linux_rt_sigqueueinfo", /* 178 = linux_rt_sigqueueinfo */
+ "linux_rt_sigsuspend", /* 179 = linux_rt_sigsuspend */
+ "linux_pread", /* 180 = linux_pread */
+ "linux_pwrite", /* 181 = linux_pwrite */
+ "linux_chown16", /* 182 = linux_chown16 */
+ "linux_getcwd", /* 183 = linux_getcwd */
+ "linux_capget", /* 184 = linux_capget */
+ "linux_capset", /* 185 = linux_capset */
+ "linux_sigaltstack", /* 186 = linux_sigaltstack */
+ "linux_sendfile", /* 187 = linux_sendfile */
+ "#188", /* 188 = getpmsg */
+ "#189", /* 189 = putpmsg */
+ "linux_vfork", /* 190 = linux_vfork */
+ "linux_getrlimit", /* 191 = linux_getrlimit */
+ "linux_mmap2", /* 192 = linux_mmap2 */
+ "linux_truncate64", /* 193 = linux_truncate64 */
+ "linux_ftruncate64", /* 194 = linux_ftruncate64 */
+ "linux_stat64", /* 195 = linux_stat64 */
+ "linux_lstat64", /* 196 = linux_lstat64 */
+ "linux_fstat64", /* 197 = linux_fstat64 */
+ "linux_lchown", /* 198 = linux_lchown */
+ "linux_getuid", /* 199 = linux_getuid */
+ "linux_getgid", /* 200 = linux_getgid */
+ "geteuid", /* 201 = geteuid */
+ "getegid", /* 202 = getegid */
+ "setreuid", /* 203 = setreuid */
+ "setregid", /* 204 = setregid */
+ "linux_getgroups", /* 205 = linux_getgroups */
+ "linux_setgroups", /* 206 = linux_setgroups */
+ "fchown", /* 207 = fchown */
+ "setresuid", /* 208 = setresuid */
+ "getresuid", /* 209 = getresuid */
+ "setresgid", /* 210 = setresgid */
+ "getresgid", /* 211 = getresgid */
+ "linux_chown", /* 212 = linux_chown */
+ "setuid", /* 213 = setuid */
+ "setgid", /* 214 = setgid */
+ "linux_setfsuid", /* 215 = linux_setfsuid */
+ "linux_setfsgid", /* 216 = linux_setfsgid */
+ "linux_pivot_root", /* 217 = linux_pivot_root */
+ "linux_mincore", /* 218 = linux_mincore */
+ "madvise", /* 219 = madvise */
+ "linux_getdents64", /* 220 = linux_getdents64 */
+ "linux_fcntl64", /* 221 = linux_fcntl64 */
+ "#222", /* 222 = */
+ "#223", /* 223 = */
+ "linux_gettid", /* 224 = linux_gettid */
+ "#225", /* 225 = linux_readahead */
+ "linux_setxattr", /* 226 = linux_setxattr */
+ "linux_lsetxattr", /* 227 = linux_lsetxattr */
+ "linux_fsetxattr", /* 228 = linux_fsetxattr */
+ "linux_getxattr", /* 229 = linux_getxattr */
+ "linux_lgetxattr", /* 230 = linux_lgetxattr */
+ "linux_fgetxattr", /* 231 = linux_fgetxattr */
+ "linux_listxattr", /* 232 = linux_listxattr */
+ "linux_llistxattr", /* 233 = linux_llistxattr */
+ "linux_flistxattr", /* 234 = linux_flistxattr */
+ "linux_removexattr", /* 235 = linux_removexattr */
+ "linux_lremovexattr", /* 236 = linux_lremovexattr */
+ "linux_fremovexattr", /* 237 = linux_fremovexattr */
+ "linux_tkill", /* 238 = linux_tkill */
+ "#239", /* 239 = linux_sendfile64 */
+ "linux_sys_futex", /* 240 = linux_sys_futex */
+ "linux_sched_setaffinity", /* 241 = linux_sched_setaffinity */
+ "linux_sched_getaffinity", /* 242 = linux_sched_getaffinity */
+ "linux_set_thread_area", /* 243 = linux_set_thread_area */
+ "#244", /* 244 = linux_get_thread_area */
+ "#245", /* 245 = linux_io_setup */
+ "#246", /* 246 = linux_io_destroy */
+ "#247", /* 247 = linux_io_getevents */
+ "#248", /* 248 = linux_io_submit */
+ "#249", /* 249 = linux_io_cancel */
+ "linux_fadvise64", /* 250 = linux_fadvise64 */
+ "#251", /* 251 = */
+ "linux_exit_group", /* 252 = linux_exit_group */
+ "linux_lookup_dcookie", /* 253 = linux_lookup_dcookie */
+ "linux_epoll_create", /* 254 = linux_epoll_create */
+ "linux_epoll_ctl", /* 255 = linux_epoll_ctl */
+ "linux_epoll_wait", /* 256 = linux_epoll_wait */
+ "linux_remap_file_pages", /* 257 = linux_remap_file_pages */
+ "linux_set_tid_address", /* 258 = linux_set_tid_address */
+ "linux_timer_create", /* 259 = linux_timer_create */
+ "linux_timer_settime", /* 260 = linux_timer_settime */
+ "linux_timer_gettime", /* 261 = linux_timer_gettime */
+ "linux_timer_getoverrun", /* 262 = linux_timer_getoverrun */
+ "linux_timer_delete", /* 263 = linux_timer_delete */
+ "linux_clock_settime", /* 264 = linux_clock_settime */
+ "linux_clock_gettime", /* 265 = linux_clock_gettime */
+ "linux_clock_getres", /* 266 = linux_clock_getres */
+ "linux_clock_nanosleep", /* 267 = linux_clock_nanosleep */
+ "linux_statfs64", /* 268 = linux_statfs64 */
+ "linux_fstatfs64", /* 269 = linux_fstatfs64 */
+ "linux_tgkill", /* 270 = linux_tgkill */
+ "linux_utimes", /* 271 = linux_utimes */
+ "linux_fadvise64_64", /* 272 = linux_fadvise64_64 */
+ "#273", /* 273 = vserver */
+ "linux_mbind", /* 274 = linux_mbind */
+ "linux_get_mempolicy", /* 275 = linux_get_mempolicy */
+ "linux_set_mempolicy", /* 276 = linux_set_mempolicy */
+ "linux_mq_open", /* 277 = linux_mq_open */
+ "linux_mq_unlink", /* 278 = linux_mq_unlink */
+ "linux_mq_timedsend", /* 279 = linux_mq_timedsend */
+ "linux_mq_timedreceive", /* 280 = linux_mq_timedreceive */
+ "linux_mq_notify", /* 281 = linux_mq_notify */
+ "linux_mq_getsetattr", /* 282 = linux_mq_getsetattr */
+ "linux_kexec_load", /* 283 = linux_kexec_load */
+ "linux_waitid", /* 284 = linux_waitid */
+ "#285", /* 285 = */
+ "linux_add_key", /* 286 = linux_add_key */
+ "linux_request_key", /* 287 = linux_request_key */
+ "linux_keyctl", /* 288 = linux_keyctl */
+ "linux_ioprio_set", /* 289 = linux_ioprio_set */
+ "linux_ioprio_get", /* 290 = linux_ioprio_get */
+ "linux_inotify_init", /* 291 = linux_inotify_init */
+ "linux_inotify_add_watch", /* 292 = linux_inotify_add_watch */
+ "linux_inotify_rm_watch", /* 293 = linux_inotify_rm_watch */
+ "linux_migrate_pages", /* 294 = linux_migrate_pages */
+ "linux_openat", /* 295 = linux_openat */
+ "linux_mkdirat", /* 296 = linux_mkdirat */
+ "linux_mknodat", /* 297 = linux_mknodat */
+ "linux_fchownat", /* 298 = linux_fchownat */
+ "linux_futimesat", /* 299 = linux_futimesat */
+ "linux_fstatat64", /* 300 = linux_fstatat64 */
+ "linux_unlinkat", /* 301 = linux_unlinkat */
+ "linux_renameat", /* 302 = linux_renameat */
+ "linux_linkat", /* 303 = linux_linkat */
+ "linux_symlinkat", /* 304 = linux_symlinkat */
+ "linux_readlinkat", /* 305 = linux_readlinkat */
+ "linux_fchmodat", /* 306 = linux_fchmodat */
+ "linux_faccessat", /* 307 = linux_faccessat */
+ "linux_pselect6", /* 308 = linux_pselect6 */
+ "linux_ppoll", /* 309 = linux_ppoll */
+ "linux_unshare", /* 310 = linux_unshare */
+ "linux_set_robust_list", /* 311 = linux_set_robust_list */
+ "linux_get_robust_list", /* 312 = linux_get_robust_list */
+ "linux_splice", /* 313 = linux_splice */
+ "linux_sync_file_range", /* 314 = linux_sync_file_range */
+ "linux_tee", /* 315 = linux_tee */
+ "linux_vmsplice", /* 316 = linux_vmsplice */
+ "linux_move_pages", /* 317 = linux_move_pages */
+ "linux_getcpu", /* 318 = linux_getcpu */
+ "linux_epoll_pwait", /* 319 = linux_epoll_pwait */
+ "linux_utimensat", /* 320 = linux_utimensat */
+ "linux_signalfd", /* 321 = linux_signalfd */
+ "linux_timerfd_create", /* 322 = linux_timerfd_create */
+ "linux_eventfd", /* 323 = linux_eventfd */
+ "linux_fallocate", /* 324 = linux_fallocate */
+ "linux_timerfd_settime", /* 325 = linux_timerfd_settime */
+ "linux_timerfd_gettime", /* 326 = linux_timerfd_gettime */
+ "linux_signalfd4", /* 327 = linux_signalfd4 */
+ "linux_eventfd2", /* 328 = linux_eventfd2 */
+ "linux_epoll_create1", /* 329 = linux_epoll_create1 */
+ "linux_dup3", /* 330 = linux_dup3 */
+ "linux_pipe2", /* 331 = linux_pipe2 */
+ "linux_inotify_init1", /* 332 = linux_inotify_init1 */
+ "linux_preadv", /* 333 = linux_preadv */
+ "linux_pwritev", /* 334 = linux_pwritev */
+ "linux_rt_tsigqueueinfo", /* 335 = linux_rt_tsigqueueinfo */
+ "linux_perf_event_open", /* 336 = linux_perf_event_open */
+ "linux_recvmmsg", /* 337 = linux_recvmmsg */
+ "linux_fanotify_init", /* 338 = linux_fanotify_init */
+ "linux_fanotify_mark", /* 339 = linux_fanotify_mark */
+ "linux_prlimit64", /* 340 = linux_prlimit64 */
+ "linux_name_to_handle_at", /* 341 = linux_name_to_handle_at */
+ "linux_open_by_handle_at", /* 342 = linux_open_by_handle_at */
+ "linux_clock_adjtime", /* 343 = linux_clock_adjtime */
+ "linux_syncfs", /* 344 = linux_syncfs */
+ "linux_sendmmsg", /* 345 = linux_sendmmsg */
+ "linux_setns", /* 346 = linux_setns */
+ "linux_process_vm_readv", /* 347 = linux_process_vm_readv */
+ "linux_process_vm_writev", /* 348 = linux_process_vm_writev */
+};
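
Editorial note: the name table above parallels the sysent table that follows entry-for-entry; the generator emits "#<number>" strings for unimplemented or reserved slots, so a leading '#' reliably flags an absent syscall. An illustrative userland-style sketch under that assumption (not part of the generated file):

	/*
	 * Illustrative only: enumerate the implemented syscalls by
	 * skipping the "#<number>" placeholders the generator emits
	 * for absent slots.
	 */
	#include <stdio.h>

	extern const char *linux_syscallnames[];

	static void
	linux32_dump_names(unsigned int maxsyscall)
	{
		unsigned int i;

		for (i = 0; i < maxsyscall; i++)
			if (linux_syscallnames[i][0] != '#')
				printf("%3u\t%s\n", i, linux_syscallnames[i]);
	}
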
diff --git a/sys/amd64/linux32/linux32_sysent.c b/sys/amd64/linux32/linux32_sysent.c
new file mode 100644
index 0000000..1ece240
--- /dev/null
+++ b/sys/amd64/linux32/linux32_sysent.c
@@ -0,0 +1,371 @@
+/*
+ * System call switch table.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed
+ */
+
+#include "opt_compat.h"
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <compat/linux/linux_sysproto.h>
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+
+#define AS(name) (sizeof(struct name) / sizeof(register_t))
+
+/* The casts are bogus but will do for now. */
+struct sysent linux_sysent[] = {
+#define nosys linux_nosys
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 0 = setup */
+ { AS(sys_exit_args), (sy_call_t *)sys_sys_exit, AUE_EXIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 1 = exit */
+ { 0, (sy_call_t *)linux_fork, AUE_FORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 2 = linux_fork */
+ { AS(read_args), (sy_call_t *)sys_read, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 3 = read */
+ { AS(write_args), (sy_call_t *)sys_write, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 4 = write */
+ { AS(linux_open_args), (sy_call_t *)linux_open, AUE_OPEN_RWTC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 5 = linux_open */
+ { AS(close_args), (sy_call_t *)sys_close, AUE_CLOSE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 6 = close */
+ { AS(linux_waitpid_args), (sy_call_t *)linux_waitpid, AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 7 = linux_waitpid */
+ { AS(linux_creat_args), (sy_call_t *)linux_creat, AUE_CREAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 8 = linux_creat */
+ { AS(linux_link_args), (sy_call_t *)linux_link, AUE_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 9 = linux_link */
+ { AS(linux_unlink_args), (sy_call_t *)linux_unlink, AUE_UNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 10 = linux_unlink */
+ { AS(linux_execve_args), (sy_call_t *)linux_execve, AUE_EXECVE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 11 = linux_execve */
+ { AS(linux_chdir_args), (sy_call_t *)linux_chdir, AUE_CHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 12 = linux_chdir */
+ { AS(linux_time_args), (sy_call_t *)linux_time, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 13 = linux_time */
+ { AS(linux_mknod_args), (sy_call_t *)linux_mknod, AUE_MKNOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 14 = linux_mknod */
+ { AS(linux_chmod_args), (sy_call_t *)linux_chmod, AUE_CHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 15 = linux_chmod */
+ { AS(linux_lchown16_args), (sy_call_t *)linux_lchown16, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 16 = linux_lchown16 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 17 = break */
+ { AS(linux_stat_args), (sy_call_t *)linux_stat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 18 = linux_stat */
+ { AS(linux_lseek_args), (sy_call_t *)linux_lseek, AUE_LSEEK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 19 = linux_lseek */
+ { 0, (sy_call_t *)linux_getpid, AUE_GETPID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 20 = linux_getpid */
+ { AS(linux_mount_args), (sy_call_t *)linux_mount, AUE_MOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 21 = linux_mount */
+ { AS(linux_oldumount_args), (sy_call_t *)linux_oldumount, AUE_UMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 22 = linux_oldumount */
+ { AS(linux_setuid16_args), (sy_call_t *)linux_setuid16, AUE_SETUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 23 = linux_setuid16 */
+ { 0, (sy_call_t *)linux_getuid16, AUE_GETUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 24 = linux_getuid16 */
+ { 0, (sy_call_t *)linux_stime, AUE_SETTIMEOFDAY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 25 = linux_stime */
+ { AS(linux_ptrace_args), (sy_call_t *)linux_ptrace, AUE_PTRACE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 26 = linux_ptrace */
+ { AS(linux_alarm_args), (sy_call_t *)linux_alarm, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 27 = linux_alarm */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 28 = fstat */
+ { 0, (sy_call_t *)linux_pause, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 29 = linux_pause */
+ { AS(linux_utime_args), (sy_call_t *)linux_utime, AUE_UTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 30 = linux_utime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 31 = stty */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 32 = gtty */
+ { AS(linux_access_args), (sy_call_t *)linux_access, AUE_ACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 33 = linux_access */
+ { AS(linux_nice_args), (sy_call_t *)linux_nice, AUE_NICE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 34 = linux_nice */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 35 = ftime */
+ { 0, (sy_call_t *)sys_sync, AUE_SYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 36 = sync */
+ { AS(linux_kill_args), (sy_call_t *)linux_kill, AUE_KILL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 37 = linux_kill */
+ { AS(linux_rename_args), (sy_call_t *)linux_rename, AUE_RENAME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 38 = linux_rename */
+ { AS(linux_mkdir_args), (sy_call_t *)linux_mkdir, AUE_MKDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 39 = linux_mkdir */
+ { AS(linux_rmdir_args), (sy_call_t *)linux_rmdir, AUE_RMDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 40 = linux_rmdir */
+ { AS(dup_args), (sy_call_t *)sys_dup, AUE_DUP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 41 = dup */
+ { AS(linux_pipe_args), (sy_call_t *)linux_pipe, AUE_PIPE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 42 = linux_pipe */
+ { AS(linux_times_args), (sy_call_t *)linux_times, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 43 = linux_times */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 44 = prof */
+ { AS(linux_brk_args), (sy_call_t *)linux_brk, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 45 = linux_brk */
+ { AS(linux_setgid16_args), (sy_call_t *)linux_setgid16, AUE_SETGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 46 = linux_setgid16 */
+ { 0, (sy_call_t *)linux_getgid16, AUE_GETGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 47 = linux_getgid16 */
+ { AS(linux_signal_args), (sy_call_t *)linux_signal, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 48 = linux_signal */
+ { 0, (sy_call_t *)linux_geteuid16, AUE_GETEUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 49 = linux_geteuid16 */
+ { 0, (sy_call_t *)linux_getegid16, AUE_GETEGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 50 = linux_getegid16 */
+ { AS(acct_args), (sy_call_t *)sys_acct, AUE_ACCT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 51 = acct */
+ { AS(linux_umount_args), (sy_call_t *)linux_umount, AUE_UMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 52 = linux_umount */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 53 = lock */
+ { AS(linux_ioctl_args), (sy_call_t *)linux_ioctl, AUE_IOCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 54 = linux_ioctl */
+ { AS(linux_fcntl_args), (sy_call_t *)linux_fcntl, AUE_FCNTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 55 = linux_fcntl */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 56 = mpx */
+ { AS(setpgid_args), (sy_call_t *)sys_setpgid, AUE_SETPGRP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 57 = setpgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 58 = ulimit */
+ { 0, (sy_call_t *)linux_olduname, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 59 = linux_olduname */
+ { AS(umask_args), (sy_call_t *)sys_umask, AUE_UMASK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 60 = umask */
+ { AS(chroot_args), (sy_call_t *)sys_chroot, AUE_CHROOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 61 = chroot */
+ { AS(linux_ustat_args), (sy_call_t *)linux_ustat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 62 = linux_ustat */
+ { AS(dup2_args), (sy_call_t *)sys_dup2, AUE_DUP2, NULL, 0, 0, 0, SY_THR_STATIC }, /* 63 = dup2 */
+ { 0, (sy_call_t *)linux_getppid, AUE_GETPPID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 64 = linux_getppid */
+ { 0, (sy_call_t *)sys_getpgrp, AUE_GETPGRP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 65 = getpgrp */
+ { 0, (sy_call_t *)sys_setsid, AUE_SETSID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 66 = setsid */
+ { AS(linux_sigaction_args), (sy_call_t *)linux_sigaction, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 67 = linux_sigaction */
+ { 0, (sy_call_t *)linux_sgetmask, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 68 = linux_sgetmask */
+ { AS(linux_ssetmask_args), (sy_call_t *)linux_ssetmask, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 69 = linux_ssetmask */
+ { AS(linux_setreuid16_args), (sy_call_t *)linux_setreuid16, AUE_SETREUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 70 = linux_setreuid16 */
+ { AS(linux_setregid16_args), (sy_call_t *)linux_setregid16, AUE_SETREGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 71 = linux_setregid16 */
+ { AS(linux_sigsuspend_args), (sy_call_t *)linux_sigsuspend, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 72 = linux_sigsuspend */
+ { AS(linux_sigpending_args), (sy_call_t *)linux_sigpending, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 73 = linux_sigpending */
+ { AS(linux_sethostname_args), (sy_call_t *)linux_sethostname, AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 74 = linux_sethostname */
+ { AS(linux_setrlimit_args), (sy_call_t *)linux_setrlimit, AUE_SETRLIMIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 75 = linux_setrlimit */
+ { AS(linux_old_getrlimit_args), (sy_call_t *)linux_old_getrlimit, AUE_GETRLIMIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 76 = linux_old_getrlimit */
+ { AS(linux_getrusage_args), (sy_call_t *)linux_getrusage, AUE_GETRUSAGE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 77 = linux_getrusage */
+ { AS(linux_gettimeofday_args), (sy_call_t *)linux_gettimeofday, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 78 = linux_gettimeofday */
+ { AS(linux_settimeofday_args), (sy_call_t *)linux_settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 79 = linux_settimeofday */
+ { AS(linux_getgroups16_args), (sy_call_t *)linux_getgroups16, AUE_GETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 80 = linux_getgroups16 */
+ { AS(linux_setgroups16_args), (sy_call_t *)linux_setgroups16, AUE_SETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 81 = linux_setgroups16 */
+ { AS(linux_old_select_args), (sy_call_t *)linux_old_select, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 82 = linux_old_select */
+ { AS(linux_symlink_args), (sy_call_t *)linux_symlink, AUE_SYMLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 83 = linux_symlink */
+ { AS(linux_lstat_args), (sy_call_t *)linux_lstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 84 = linux_lstat */
+ { AS(linux_readlink_args), (sy_call_t *)linux_readlink, AUE_READLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 85 = linux_readlink */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 86 = linux_uselib */
+ { AS(swapon_args), (sy_call_t *)sys_swapon, AUE_SWAPON, NULL, 0, 0, 0, SY_THR_STATIC }, /* 87 = swapon */
+ { AS(linux_reboot_args), (sy_call_t *)linux_reboot, AUE_REBOOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 88 = linux_reboot */
+ { AS(linux_readdir_args), (sy_call_t *)linux_readdir, AUE_GETDIRENTRIES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 89 = linux_readdir */
+ { AS(linux_mmap_args), (sy_call_t *)linux_mmap, AUE_MMAP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 90 = linux_mmap */
+ { AS(munmap_args), (sy_call_t *)sys_munmap, AUE_MUNMAP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 91 = munmap */
+ { AS(linux_truncate_args), (sy_call_t *)linux_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 92 = linux_truncate */
+ { AS(linux_ftruncate_args), (sy_call_t *)linux_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 93 = linux_ftruncate */
+ { AS(fchmod_args), (sy_call_t *)sys_fchmod, AUE_FCHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 94 = fchmod */
+ { AS(fchown_args), (sy_call_t *)sys_fchown, AUE_FCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 95 = fchown */
+ { AS(linux_getpriority_args), (sy_call_t *)linux_getpriority, AUE_GETPRIORITY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 96 = linux_getpriority */
+ { AS(setpriority_args), (sy_call_t *)sys_setpriority, AUE_SETPRIORITY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 97 = setpriority */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 98 = profil */
+ { AS(linux_statfs_args), (sy_call_t *)linux_statfs, AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 99 = linux_statfs */
+ { AS(linux_fstatfs_args), (sy_call_t *)linux_fstatfs, AUE_FSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 100 = linux_fstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 101 = ioperm */
+ { AS(linux_socketcall_args), (sy_call_t *)linux_socketcall, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 102 = linux_socketcall */
+ { AS(linux_syslog_args), (sy_call_t *)linux_syslog, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 103 = linux_syslog */
+ { AS(linux_setitimer_args), (sy_call_t *)linux_setitimer, AUE_SETITIMER, NULL, 0, 0, 0, SY_THR_STATIC }, /* 104 = linux_setitimer */
+ { AS(linux_getitimer_args), (sy_call_t *)linux_getitimer, AUE_GETITIMER, NULL, 0, 0, 0, SY_THR_STATIC }, /* 105 = linux_getitimer */
+ { AS(linux_newstat_args), (sy_call_t *)linux_newstat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 106 = linux_newstat */
+ { AS(linux_newlstat_args), (sy_call_t *)linux_newlstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 107 = linux_newlstat */
+ { AS(linux_newfstat_args), (sy_call_t *)linux_newfstat, AUE_FSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 108 = linux_newfstat */
+ { 0, (sy_call_t *)linux_uname, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 109 = linux_uname */
+ { AS(linux_iopl_args), (sy_call_t *)linux_iopl, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 110 = linux_iopl */
+ { 0, (sy_call_t *)linux_vhangup, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 111 = linux_vhangup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 112 = idle */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 113 = vm86old */
+ { AS(linux_wait4_args), (sy_call_t *)linux_wait4, AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 114 = linux_wait4 */
+ { 0, (sy_call_t *)linux_swapoff, AUE_SWAPOFF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 115 = linux_swapoff */
+ { AS(linux_sysinfo_args), (sy_call_t *)linux_sysinfo, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 116 = linux_sysinfo */
+ { AS(linux_ipc_args), (sy_call_t *)linux_ipc, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 117 = linux_ipc */
+ { AS(fsync_args), (sy_call_t *)sys_fsync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 118 = fsync */
+ { AS(linux_sigreturn_args), (sy_call_t *)linux_sigreturn, AUE_SIGRETURN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 119 = linux_sigreturn */
+ { AS(linux_clone_args), (sy_call_t *)linux_clone, AUE_RFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 120 = linux_clone */
+ { AS(linux_setdomainname_args), (sy_call_t *)linux_setdomainname, AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 121 = linux_setdomainname */
+ { AS(linux_newuname_args), (sy_call_t *)linux_newuname, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 122 = linux_newuname */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 123 = modify_ldt */
+ { 0, (sy_call_t *)linux_adjtimex, AUE_ADJTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 124 = linux_adjtimex */
+ { AS(linux_mprotect_args), (sy_call_t *)linux_mprotect, AUE_MPROTECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 125 = linux_mprotect */
+ { AS(linux_sigprocmask_args), (sy_call_t *)linux_sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 126 = linux_sigprocmask */
+ { 0, (sy_call_t *)linux_create_module, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 127 = linux_create_module */
+ { 0, (sy_call_t *)linux_init_module, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 128 = linux_init_module */
+ { 0, (sy_call_t *)linux_delete_module, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 129 = linux_delete_module */
+ { 0, (sy_call_t *)linux_get_kernel_syms, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 130 = linux_get_kernel_syms */
+ { 0, (sy_call_t *)linux_quotactl, AUE_QUOTACTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 131 = linux_quotactl */
+ { AS(getpgid_args), (sy_call_t *)sys_getpgid, AUE_GETPGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 132 = getpgid */
+ { AS(fchdir_args), (sy_call_t *)sys_fchdir, AUE_FCHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 133 = fchdir */
+ { 0, (sy_call_t *)linux_bdflush, AUE_BDFLUSH, NULL, 0, 0, 0, SY_THR_STATIC }, /* 134 = linux_bdflush */
+ { AS(linux_sysfs_args), (sy_call_t *)linux_sysfs, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 135 = linux_sysfs */
+ { AS(linux_personality_args), (sy_call_t *)linux_personality, AUE_PERSONALITY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 136 = linux_personality */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 137 = afs_syscall */
+ { AS(linux_setfsuid16_args), (sy_call_t *)linux_setfsuid16, AUE_SETFSUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 138 = linux_setfsuid16 */
+ { AS(linux_setfsgid16_args), (sy_call_t *)linux_setfsgid16, AUE_SETFSGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 139 = linux_setfsgid16 */
+ { AS(linux_llseek_args), (sy_call_t *)linux_llseek, AUE_LSEEK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 140 = linux_llseek */
+ { AS(linux_getdents_args), (sy_call_t *)linux_getdents, AUE_GETDIRENTRIES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 141 = linux_getdents */
+ { AS(linux_select_args), (sy_call_t *)linux_select, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 142 = linux_select */
+ { AS(flock_args), (sy_call_t *)sys_flock, AUE_FLOCK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 143 = flock */
+ { AS(linux_msync_args), (sy_call_t *)linux_msync, AUE_MSYNC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 144 = linux_msync */
+ { AS(linux_readv_args), (sy_call_t *)linux_readv, AUE_READV, NULL, 0, 0, 0, SY_THR_STATIC }, /* 145 = linux_readv */
+ { AS(linux_writev_args), (sy_call_t *)linux_writev, AUE_WRITEV, NULL, 0, 0, 0, SY_THR_STATIC }, /* 146 = linux_writev */
+ { AS(linux_getsid_args), (sy_call_t *)linux_getsid, AUE_GETSID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 147 = linux_getsid */
+ { AS(linux_fdatasync_args), (sy_call_t *)linux_fdatasync, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 148 = linux_fdatasync */
+ { AS(linux_sysctl_args), (sy_call_t *)linux_sysctl, AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 149 = linux_sysctl */
+ { AS(mlock_args), (sy_call_t *)sys_mlock, AUE_MLOCK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 150 = mlock */
+ { AS(munlock_args), (sy_call_t *)sys_munlock, AUE_MUNLOCK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 151 = munlock */
+ { AS(mlockall_args), (sy_call_t *)sys_mlockall, AUE_MLOCKALL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 152 = mlockall */
+ { 0, (sy_call_t *)sys_munlockall, AUE_MUNLOCKALL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 153 = munlockall */
+ { AS(sched_setparam_args), (sy_call_t *)sys_sched_setparam, AUE_SCHED_SETPARAM, NULL, 0, 0, 0, SY_THR_STATIC }, /* 154 = sched_setparam */
+ { AS(sched_getparam_args), (sy_call_t *)sys_sched_getparam, AUE_SCHED_GETPARAM, NULL, 0, 0, 0, SY_THR_STATIC }, /* 155 = sched_getparam */
+ { AS(linux_sched_setscheduler_args), (sy_call_t *)linux_sched_setscheduler, AUE_SCHED_SETSCHEDULER, NULL, 0, 0, 0, SY_THR_STATIC }, /* 156 = linux_sched_setscheduler */
+ { AS(linux_sched_getscheduler_args), (sy_call_t *)linux_sched_getscheduler, AUE_SCHED_GETSCHEDULER, NULL, 0, 0, 0, SY_THR_STATIC }, /* 157 = linux_sched_getscheduler */
+ { 0, (sy_call_t *)sys_sched_yield, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 158 = sched_yield */
+ { AS(linux_sched_get_priority_max_args), (sy_call_t *)linux_sched_get_priority_max, AUE_SCHED_GET_PRIORITY_MAX, NULL, 0, 0, 0, SY_THR_STATIC }, /* 159 = linux_sched_get_priority_max */
+ { AS(linux_sched_get_priority_min_args), (sy_call_t *)linux_sched_get_priority_min, AUE_SCHED_GET_PRIORITY_MIN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 160 = linux_sched_get_priority_min */
+ { AS(linux_sched_rr_get_interval_args), (sy_call_t *)linux_sched_rr_get_interval, AUE_SCHED_RR_GET_INTERVAL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 161 = linux_sched_rr_get_interval */
+ { AS(linux_nanosleep_args), (sy_call_t *)linux_nanosleep, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 162 = linux_nanosleep */
+ { AS(linux_mremap_args), (sy_call_t *)linux_mremap, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 163 = linux_mremap */
+ { AS(linux_setresuid16_args), (sy_call_t *)linux_setresuid16, AUE_SETRESUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 164 = linux_setresuid16 */
+ { AS(linux_getresuid16_args), (sy_call_t *)linux_getresuid16, AUE_GETRESUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 165 = linux_getresuid16 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 166 = vm86 */
+ { 0, (sy_call_t *)linux_query_module, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 167 = linux_query_module */
+ { AS(poll_args), (sy_call_t *)sys_poll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 168 = poll */
+ { 0, (sy_call_t *)linux_nfsservctl, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 169 = linux_nfsservctl */
+ { AS(linux_setresgid16_args), (sy_call_t *)linux_setresgid16, AUE_SETRESGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 170 = linux_setresgid16 */
+ { AS(linux_getresgid16_args), (sy_call_t *)linux_getresgid16, AUE_GETRESGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 171 = linux_getresgid16 */
+ { AS(linux_prctl_args), (sy_call_t *)linux_prctl, AUE_PRCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 172 = linux_prctl */
+ { AS(linux_rt_sigreturn_args), (sy_call_t *)linux_rt_sigreturn, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 173 = linux_rt_sigreturn */
+ { AS(linux_rt_sigaction_args), (sy_call_t *)linux_rt_sigaction, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 174 = linux_rt_sigaction */
+ { AS(linux_rt_sigprocmask_args), (sy_call_t *)linux_rt_sigprocmask, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 175 = linux_rt_sigprocmask */
+ { AS(linux_rt_sigpending_args), (sy_call_t *)linux_rt_sigpending, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 176 = linux_rt_sigpending */
+ { AS(linux_rt_sigtimedwait_args), (sy_call_t *)linux_rt_sigtimedwait, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 177 = linux_rt_sigtimedwait */
+ { 0, (sy_call_t *)linux_rt_sigqueueinfo, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 178 = linux_rt_sigqueueinfo */
+ { AS(linux_rt_sigsuspend_args), (sy_call_t *)linux_rt_sigsuspend, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 179 = linux_rt_sigsuspend */
+ { AS(linux_pread_args), (sy_call_t *)linux_pread, AUE_PREAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 180 = linux_pread */
+ { AS(linux_pwrite_args), (sy_call_t *)linux_pwrite, AUE_PWRITE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 181 = linux_pwrite */
+ { AS(linux_chown16_args), (sy_call_t *)linux_chown16, AUE_CHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 182 = linux_chown16 */
+ { AS(linux_getcwd_args), (sy_call_t *)linux_getcwd, AUE_GETCWD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 183 = linux_getcwd */
+ { AS(linux_capget_args), (sy_call_t *)linux_capget, AUE_CAPGET, NULL, 0, 0, 0, SY_THR_STATIC }, /* 184 = linux_capget */
+ { AS(linux_capset_args), (sy_call_t *)linux_capset, AUE_CAPSET, NULL, 0, 0, 0, SY_THR_STATIC }, /* 185 = linux_capset */
+ { AS(linux_sigaltstack_args), (sy_call_t *)linux_sigaltstack, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 186 = linux_sigaltstack */
+ { 0, (sy_call_t *)linux_sendfile, AUE_SENDFILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 187 = linux_sendfile */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 188 = getpmsg */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 189 = putpmsg */
+ { 0, (sy_call_t *)linux_vfork, AUE_VFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = linux_vfork */
+ { AS(linux_getrlimit_args), (sy_call_t *)linux_getrlimit, AUE_GETRLIMIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 191 = linux_getrlimit */
+ { AS(linux_mmap2_args), (sy_call_t *)linux_mmap2, AUE_MMAP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 192 = linux_mmap2 */
+ { AS(linux_truncate64_args), (sy_call_t *)linux_truncate64, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 193 = linux_truncate64 */
+ { AS(linux_ftruncate64_args), (sy_call_t *)linux_ftruncate64, AUE_FTRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 194 = linux_ftruncate64 */
+ { AS(linux_stat64_args), (sy_call_t *)linux_stat64, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 195 = linux_stat64 */
+ { AS(linux_lstat64_args), (sy_call_t *)linux_lstat64, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 196 = linux_lstat64 */
+ { AS(linux_fstat64_args), (sy_call_t *)linux_fstat64, AUE_FSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 197 = linux_fstat64 */
+ { AS(linux_lchown_args), (sy_call_t *)linux_lchown, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 198 = linux_lchown */
+ { 0, (sy_call_t *)linux_getuid, AUE_GETUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 199 = linux_getuid */
+ { 0, (sy_call_t *)linux_getgid, AUE_GETGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 200 = linux_getgid */
+ { 0, (sy_call_t *)sys_geteuid, AUE_GETEUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 201 = geteuid */
+ { 0, (sy_call_t *)sys_getegid, AUE_GETEGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 202 = getegid */
+ { AS(setreuid_args), (sy_call_t *)sys_setreuid, AUE_SETREUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 203 = setreuid */
+ { AS(setregid_args), (sy_call_t *)sys_setregid, AUE_SETREGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 204 = setregid */
+ { AS(linux_getgroups_args), (sy_call_t *)linux_getgroups, AUE_GETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 205 = linux_getgroups */
+ { AS(linux_setgroups_args), (sy_call_t *)linux_setgroups, AUE_SETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 206 = linux_setgroups */
+ { AS(fchown_args), (sy_call_t *)sys_fchown, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 207 = fchown */
+ { AS(setresuid_args), (sy_call_t *)sys_setresuid, AUE_SETRESUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 208 = setresuid */
+ { AS(getresuid_args), (sy_call_t *)sys_getresuid, AUE_GETRESUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 209 = getresuid */
+ { AS(setresgid_args), (sy_call_t *)sys_setresgid, AUE_SETRESGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 210 = setresgid */
+ { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 211 = getresgid */
+ { AS(linux_chown_args), (sy_call_t *)linux_chown, AUE_CHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 212 = linux_chown */
+ { AS(setuid_args), (sy_call_t *)sys_setuid, AUE_SETUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 213 = setuid */
+ { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 214 = setgid */
+ { AS(linux_setfsuid_args), (sy_call_t *)linux_setfsuid, AUE_SETFSUID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 215 = linux_setfsuid */
+ { AS(linux_setfsgid_args), (sy_call_t *)linux_setfsgid, AUE_SETFSGID, NULL, 0, 0, 0, SY_THR_STATIC }, /* 216 = linux_setfsgid */
+ { AS(linux_pivot_root_args), (sy_call_t *)linux_pivot_root, AUE_PIVOT_ROOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 217 = linux_pivot_root */
+ { AS(linux_mincore_args), (sy_call_t *)linux_mincore, AUE_MINCORE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 218 = linux_mincore */
+ { AS(madvise_args), (sy_call_t *)sys_madvise, AUE_MADVISE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 219 = madvise */
+ { AS(linux_getdents64_args), (sy_call_t *)linux_getdents64, AUE_GETDIRENTRIES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 220 = linux_getdents64 */
+ { AS(linux_fcntl64_args), (sy_call_t *)linux_fcntl64, AUE_FCNTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 221 = linux_fcntl64 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = */
+ { 0, (sy_call_t *)linux_gettid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 224 = linux_gettid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = linux_readahead */
+ { 0, (sy_call_t *)linux_setxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 226 = linux_setxattr */
+ { 0, (sy_call_t *)linux_lsetxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 227 = linux_lsetxattr */
+ { 0, (sy_call_t *)linux_fsetxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 228 = linux_fsetxattr */
+ { 0, (sy_call_t *)linux_getxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 229 = linux_getxattr */
+ { 0, (sy_call_t *)linux_lgetxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 230 = linux_lgetxattr */
+ { 0, (sy_call_t *)linux_fgetxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 231 = linux_fgetxattr */
+ { 0, (sy_call_t *)linux_listxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 232 = linux_listxattr */
+ { 0, (sy_call_t *)linux_llistxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 233 = linux_llistxattr */
+ { 0, (sy_call_t *)linux_flistxattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 234 = linux_flistxattr */
+ { 0, (sy_call_t *)linux_removexattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 235 = linux_removexattr */
+ { 0, (sy_call_t *)linux_lremovexattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 236 = linux_lremovexattr */
+ { 0, (sy_call_t *)linux_fremovexattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 237 = linux_fremovexattr */
+ { AS(linux_tkill_args), (sy_call_t *)linux_tkill, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 238 = linux_tkill */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 239 = linux_sendfile64 */
+ { AS(linux_sys_futex_args), (sy_call_t *)linux_sys_futex, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 240 = linux_sys_futex */
+ { AS(linux_sched_setaffinity_args), (sy_call_t *)linux_sched_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 241 = linux_sched_setaffinity */
+ { AS(linux_sched_getaffinity_args), (sy_call_t *)linux_sched_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 242 = linux_sched_getaffinity */
+ { AS(linux_set_thread_area_args), (sy_call_t *)linux_set_thread_area, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 243 = linux_set_thread_area */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 244 = linux_get_thread_area */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 245 = linux_io_setup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 246 = linux_io_destroy */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 247 = linux_io_getevents */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 248 = linux_io_submit */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 249 = linux_io_cancel */
+ { AS(linux_fadvise64_args), (sy_call_t *)linux_fadvise64, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 250 = linux_fadvise64 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 251 = */
+ { AS(linux_exit_group_args), (sy_call_t *)linux_exit_group, AUE_EXIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 252 = linux_exit_group */
+ { 0, (sy_call_t *)linux_lookup_dcookie, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 253 = linux_lookup_dcookie */
+ { 0, (sy_call_t *)linux_epoll_create, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 254 = linux_epoll_create */
+ { 0, (sy_call_t *)linux_epoll_ctl, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 255 = linux_epoll_ctl */
+ { 0, (sy_call_t *)linux_epoll_wait, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 256 = linux_epoll_wait */
+ { 0, (sy_call_t *)linux_remap_file_pages, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 257 = linux_remap_file_pages */
+ { AS(linux_set_tid_address_args), (sy_call_t *)linux_set_tid_address, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 258 = linux_set_tid_address */
+ { 0, (sy_call_t *)linux_timer_create, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 259 = linux_timer_create */
+ { 0, (sy_call_t *)linux_timer_settime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 260 = linux_timer_settime */
+ { 0, (sy_call_t *)linux_timer_gettime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 261 = linux_timer_gettime */
+ { 0, (sy_call_t *)linux_timer_getoverrun, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 262 = linux_timer_getoverrun */
+ { 0, (sy_call_t *)linux_timer_delete, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 263 = linux_timer_delete */
+ { AS(linux_clock_settime_args), (sy_call_t *)linux_clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 264 = linux_clock_settime */
+ { AS(linux_clock_gettime_args), (sy_call_t *)linux_clock_gettime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 265 = linux_clock_gettime */
+ { AS(linux_clock_getres_args), (sy_call_t *)linux_clock_getres, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 266 = linux_clock_getres */
+ { AS(linux_clock_nanosleep_args), (sy_call_t *)linux_clock_nanosleep, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 267 = linux_clock_nanosleep */
+ { AS(linux_statfs64_args), (sy_call_t *)linux_statfs64, AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 268 = linux_statfs64 */
+ { 0, (sy_call_t *)linux_fstatfs64, AUE_FSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 269 = linux_fstatfs64 */
+ { AS(linux_tgkill_args), (sy_call_t *)linux_tgkill, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 270 = linux_tgkill */
+ { AS(linux_utimes_args), (sy_call_t *)linux_utimes, AUE_UTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 271 = linux_utimes */
+ { AS(linux_fadvise64_64_args), (sy_call_t *)linux_fadvise64_64, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 272 = linux_fadvise64_64 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 273 = vserver */
+ { 0, (sy_call_t *)linux_mbind, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 274 = linux_mbind */
+ { 0, (sy_call_t *)linux_get_mempolicy, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 275 = linux_get_mempolicy */
+ { 0, (sy_call_t *)linux_set_mempolicy, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 276 = linux_set_mempolicy */
+ { 0, (sy_call_t *)linux_mq_open, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 277 = linux_mq_open */
+ { 0, (sy_call_t *)linux_mq_unlink, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 278 = linux_mq_unlink */
+ { 0, (sy_call_t *)linux_mq_timedsend, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 279 = linux_mq_timedsend */
+ { 0, (sy_call_t *)linux_mq_timedreceive, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 280 = linux_mq_timedreceive */
+ { 0, (sy_call_t *)linux_mq_notify, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 281 = linux_mq_notify */
+ { 0, (sy_call_t *)linux_mq_getsetattr, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 282 = linux_mq_getsetattr */
+ { 0, (sy_call_t *)linux_kexec_load, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 283 = linux_kexec_load */
+ { 0, (sy_call_t *)linux_waitid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 284 = linux_waitid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 285 = */
+ { 0, (sy_call_t *)linux_add_key, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 286 = linux_add_key */
+ { 0, (sy_call_t *)linux_request_key, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 287 = linux_request_key */
+ { 0, (sy_call_t *)linux_keyctl, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 288 = linux_keyctl */
+ { 0, (sy_call_t *)linux_ioprio_set, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 289 = linux_ioprio_set */
+ { 0, (sy_call_t *)linux_ioprio_get, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 290 = linux_ioprio_get */
+ { 0, (sy_call_t *)linux_inotify_init, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 291 = linux_inotify_init */
+ { 0, (sy_call_t *)linux_inotify_add_watch, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 292 = linux_inotify_add_watch */
+ { 0, (sy_call_t *)linux_inotify_rm_watch, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 293 = linux_inotify_rm_watch */
+ { 0, (sy_call_t *)linux_migrate_pages, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 294 = linux_migrate_pages */
+ { AS(linux_openat_args), (sy_call_t *)linux_openat, AUE_OPEN_RWTC, NULL, 0, 0, 0, SY_THR_STATIC }, /* 295 = linux_openat */
+ { AS(linux_mkdirat_args), (sy_call_t *)linux_mkdirat, AUE_MKDIRAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 296 = linux_mkdirat */
+ { AS(linux_mknodat_args), (sy_call_t *)linux_mknodat, AUE_MKNODAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 297 = linux_mknodat */
+ { AS(linux_fchownat_args), (sy_call_t *)linux_fchownat, AUE_FCHOWNAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 298 = linux_fchownat */
+ { AS(linux_futimesat_args), (sy_call_t *)linux_futimesat, AUE_FUTIMESAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 299 = linux_futimesat */
+ { AS(linux_fstatat64_args), (sy_call_t *)linux_fstatat64, AUE_FSTATAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 300 = linux_fstatat64 */
+ { AS(linux_unlinkat_args), (sy_call_t *)linux_unlinkat, AUE_UNLINKAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 301 = linux_unlinkat */
+ { AS(linux_renameat_args), (sy_call_t *)linux_renameat, AUE_RENAMEAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 302 = linux_renameat */
+ { AS(linux_linkat_args), (sy_call_t *)linux_linkat, AUE_LINKAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 303 = linux_linkat */
+ { AS(linux_symlinkat_args), (sy_call_t *)linux_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 304 = linux_symlinkat */
+ { AS(linux_readlinkat_args), (sy_call_t *)linux_readlinkat, AUE_READLINKAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 305 = linux_readlinkat */
+ { AS(linux_fchmodat_args), (sy_call_t *)linux_fchmodat, AUE_FCHMODAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 306 = linux_fchmodat */
+ { AS(linux_faccessat_args), (sy_call_t *)linux_faccessat, AUE_FACCESSAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 307 = linux_faccessat */
+ { 0, (sy_call_t *)linux_pselect6, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 308 = linux_pselect6 */
+ { 0, (sy_call_t *)linux_ppoll, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 309 = linux_ppoll */
+ { 0, (sy_call_t *)linux_unshare, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 310 = linux_unshare */
+ { AS(linux_set_robust_list_args), (sy_call_t *)linux_set_robust_list, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 311 = linux_set_robust_list */
+ { AS(linux_get_robust_list_args), (sy_call_t *)linux_get_robust_list, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 312 = linux_get_robust_list */
+ { 0, (sy_call_t *)linux_splice, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 313 = linux_splice */
+ { 0, (sy_call_t *)linux_sync_file_range, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 314 = linux_sync_file_range */
+ { 0, (sy_call_t *)linux_tee, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 315 = linux_tee */
+ { 0, (sy_call_t *)linux_vmsplice, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 316 = linux_vmsplice */
+ { 0, (sy_call_t *)linux_move_pages, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 317 = linux_move_pages */
+ { 0, (sy_call_t *)linux_getcpu, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 318 = linux_getcpu */
+ { 0, (sy_call_t *)linux_epoll_pwait, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 319 = linux_epoll_pwait */
+ { 0, (sy_call_t *)linux_utimensat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 320 = linux_utimensat */
+ { 0, (sy_call_t *)linux_signalfd, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 321 = linux_signalfd */
+ { 0, (sy_call_t *)linux_timerfd_create, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 322 = linux_timerfd_create */
+ { 0, (sy_call_t *)linux_eventfd, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 323 = linux_eventfd */
+ { 0, (sy_call_t *)linux_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 324 = linux_fallocate */
+ { 0, (sy_call_t *)linux_timerfd_settime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 325 = linux_timerfd_settime */
+ { 0, (sy_call_t *)linux_timerfd_gettime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 326 = linux_timerfd_gettime */
+ { 0, (sy_call_t *)linux_signalfd4, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 327 = linux_signalfd4 */
+ { 0, (sy_call_t *)linux_eventfd2, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 328 = linux_eventfd2 */
+ { 0, (sy_call_t *)linux_epoll_create1, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 329 = linux_epoll_create1 */
+ { 0, (sy_call_t *)linux_dup3, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 330 = linux_dup3 */
+ { AS(linux_pipe2_args), (sy_call_t *)linux_pipe2, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 331 = linux_pipe2 */
+ { 0, (sy_call_t *)linux_inotify_init1, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 332 = linux_inotify_init1 */
+ { 0, (sy_call_t *)linux_preadv, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 333 = linux_preadv */
+ { 0, (sy_call_t *)linux_pwritev, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 334 = linux_pwritev */
+ { 0, (sy_call_t *)linux_rt_tsigqueueinfo, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 335 = linux_rt_tsigqueueinfo */
+ { 0, (sy_call_t *)linux_perf_event_open, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 336 = linux_perf_event_open */
+ { 0, (sy_call_t *)linux_recvmmsg, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 337 = linux_recvmmsg */
+ { 0, (sy_call_t *)linux_fanotify_init, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 338 = linux_fanotify_init */
+ { 0, (sy_call_t *)linux_fanotify_mark, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 339 = linux_fanotify_mark */
+ { 0, (sy_call_t *)linux_prlimit64, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 340 = linux_prlimit64 */
+ { 0, (sy_call_t *)linux_name_to_handle_at, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 341 = linux_name_to_handle_at */
+ { 0, (sy_call_t *)linux_open_by_handle_at, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 342 = linux_open_by_handle_at */
+ { 0, (sy_call_t *)linux_clock_adjtime, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 343 = linux_clock_adjtime */
+ { 0, (sy_call_t *)linux_syncfs, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 344 = linux_syncfs */
+ { 0, (sy_call_t *)linux_sendmmsg, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 345 = linux_sendmmsg */
+ { 0, (sy_call_t *)linux_setns, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 346 = linux_setns */
+ { 0, (sy_call_t *)linux_process_vm_readv, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 347 = linux_process_vm_readv */
+ { 0, (sy_call_t *)linux_process_vm_writev, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 348 = linux_process_vm_writev */
+};
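
Each row of the table above follows the shape of FreeBSD's struct sysent: an argument-size field (AS(...) when the syscall takes arguments, 0 otherwise), the handler cast to sy_call_t, the audit event, and dispatch flags. The sketch below is a minimal, self-contained illustration of this table-driven dispatch pattern; demo_sysent, demo_syscall, and demo_getpid are hypothetical stand-ins, not the kernel's actual definitions.

#include <stdint.h>
#include <stdio.h>

typedef int (sy_call_fn)(void *args);		/* stand-in for sy_call_t */

struct demo_sysent {
	int	    sy_narg;			/* size of the argument struct */
	sy_call_fn *sy_call;			/* handler implementing the call */
};

static int
demo_getpid(void *args)
{
	(void)args;
	return (42);				/* pretend pid */
}

static struct demo_sysent demo_table[] = {
	[20] = { 0, demo_getpid },		/* 20 = getpid, as in the table above */
};

static int
demo_syscall(int sysnum, void *args)
{
	struct demo_sysent *se = &demo_table[sysnum];

	if (se->sy_call == NULL)
		return (-1);			/* a real kernel would return ENOSYS */
	return (se->sy_call(args));
}

int
main(void)
{
	printf("getpid -> %d\n", demo_syscall(20, NULL));
	return (0);
}

The real table additionally records the audit event (AUE_*) and the SY_THR_STATIC threading flag, so dispatch, auditing, and DTrace all share one generated source of truth.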
diff --git a/sys/amd64/linux32/linux32_systrace_args.c b/sys/amd64/linux32/linux32_systrace_args.c
new file mode 100644
index 0000000..0b020a7
--- /dev/null
+++ b/sys/amd64/linux32/linux32_systrace_args.c
@@ -0,0 +1,6667 @@
+/*
+ * System call argument to DTrace register array conversion.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * This file is part of the DTrace syscall provider.
+ */
+
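+/*
+ * systrace_args() flattens the typed argument struct of the given syscall
+ * number into the uint64_t register array that the DTrace systrace
+ * provider exposes as arg0..argN; pointers are narrowed through intptr_t
+ * and signed values travel via the aliased iarg view of the same array.
+ */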
+static void
+systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
+{
+ int64_t *iarg = (int64_t *) uarg;
+ switch (sysnum) {
+#define nosys linux_nosys
+ /* sys_exit */
+ case 1: {
+ struct sys_exit_args *p = params;
+ iarg[0] = p->rval; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_fork */
+ case 2: {
+ *n_args = 0;
+ break;
+ }
+ /* read */
+ case 3: {
+ struct read_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->nbyte; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* write */
+ case 4: {
+ struct write_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->nbyte; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_open */
+ case 5: {
+ struct linux_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* l_int */
+ iarg[2] = p->mode; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* close */
+ case 6: {
+ struct close_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_waitpid */
+ case 7: {
+ struct linux_waitpid_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ uarg[1] = (intptr_t) p->status; /* l_int * */
+ iarg[2] = p->options; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_creat */
+ case 8: {
+ struct linux_creat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_link */
+ case 9: {
+ struct linux_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_unlink */
+ case 10: {
+ struct linux_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_execve */
+ case 11: {
+ struct linux_execve_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->argp; /* uint32_t * */
+ uarg[2] = (intptr_t) p->envp; /* uint32_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_chdir */
+ case 12: {
+ struct linux_chdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_time */
+ case 13: {
+ struct linux_time_args *p = params;
+ uarg[0] = (intptr_t) p->tm; /* l_time_t * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_mknod */
+ case 14: {
+ struct linux_mknod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* l_int */
+ iarg[2] = p->dev; /* l_dev_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_chmod */
+ case 15: {
+ struct linux_chmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* l_mode_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_lchown16 */
+ case 16: {
+ struct linux_lchown16_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* l_uid16_t */
+ iarg[2] = p->gid; /* l_gid16_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_stat */
+ case 18: {
+ struct linux_stat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->up; /* struct linux_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_lseek */
+ case 19: {
+ struct linux_lseek_args *p = params;
+ iarg[0] = p->fdes; /* l_uint */
+ iarg[1] = p->off; /* l_off_t */
+ iarg[2] = p->whence; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getpid */
+ case 20: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mount */
+ case 21: {
+ struct linux_mount_args *p = params;
+ uarg[0] = (intptr_t) p->specialfile; /* char * */
+ uarg[1] = (intptr_t) p->dir; /* char * */
+ uarg[2] = (intptr_t) p->filesystemtype; /* char * */
+ iarg[3] = p->rwflag; /* l_ulong */
+ uarg[4] = (intptr_t) p->data; /* void * */
+ *n_args = 5;
+ break;
+ }
+ /* linux_oldumount */
+ case 22: {
+ struct linux_oldumount_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setuid16 */
+ case 23: {
+ struct linux_setuid16_args *p = params;
+ iarg[0] = p->uid; /* l_uid16_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_getuid16 */
+ case 24: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_stime */
+ case 25: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_ptrace */
+ case 26: {
+ struct linux_ptrace_args *p = params;
+ iarg[0] = p->req; /* l_long */
+ iarg[1] = p->pid; /* l_long */
+ iarg[2] = p->addr; /* l_long */
+ iarg[3] = p->data; /* l_long */
+ *n_args = 4;
+ break;
+ }
+ /* linux_alarm */
+ case 27: {
+ struct linux_alarm_args *p = params;
+ iarg[0] = p->secs; /* l_uint */
+ *n_args = 1;
+ break;
+ }
+ /* linux_pause */
+ case 29: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_utime */
+ case 30: {
+ struct linux_utime_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->times; /* struct l_utimbuf * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_access */
+ case 33: {
+ struct linux_access_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->amode; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_nice */
+ case 34: {
+ struct linux_nice_args *p = params;
+ iarg[0] = p->inc; /* l_int */
+ *n_args = 1;
+ break;
+ }
+ /* sync */
+ case 36: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_kill */
+ case 37: {
+ struct linux_kill_args *p = params;
+ iarg[0] = p->pid; /* l_int */
+ iarg[1] = p->signum; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_rename */
+ case 38: {
+ struct linux_rename_args *p = params;
+ uarg[0] = (intptr_t) p->from; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_mkdir */
+ case 39: {
+ struct linux_mkdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_rmdir */
+ case 40: {
+ struct linux_rmdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* dup */
+ case 41: {
+ struct dup_args *p = params;
+ uarg[0] = p->fd; /* u_int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_pipe */
+ case 42: {
+ struct linux_pipe_args *p = params;
+ uarg[0] = (intptr_t) p->pipefds; /* l_int * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_times */
+ case 43: {
+ struct linux_times_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* struct l_times_argv * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_brk */
+ case 45: {
+ struct linux_brk_args *p = params;
+ iarg[0] = p->dsend; /* l_ulong */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setgid16 */
+ case 46: {
+ struct linux_setgid16_args *p = params;
+ iarg[0] = p->gid; /* l_gid16_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_getgid16 */
+ case 47: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_signal */
+ case 48: {
+ struct linux_signal_args *p = params;
+ iarg[0] = p->sig; /* l_int */
+ iarg[1] = p->handler; /* l_handler_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_geteuid16 */
+ case 49: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_getegid16 */
+ case 50: {
+ *n_args = 0;
+ break;
+ }
+ /* acct */
+ case 51: {
+ struct acct_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_umount */
+ case 52: {
+ struct linux_umount_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_ioctl */
+ case 54: {
+ struct linux_ioctl_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ iarg[1] = p->cmd; /* l_uint */
+ uarg[2] = p->arg; /* uintptr_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_fcntl */
+ case 55: {
+ struct linux_fcntl_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ iarg[1] = p->cmd; /* l_uint */
+ uarg[2] = p->arg; /* uintptr_t */
+ *n_args = 3;
+ break;
+ }
+ /* setpgid */
+ case 57: {
+ struct setpgid_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->pgid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_olduname */
+ case 59: {
+ *n_args = 0;
+ break;
+ }
+ /* umask */
+ case 60: {
+ struct umask_args *p = params;
+ iarg[0] = p->newmask; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* chroot */
+ case 61: {
+ struct chroot_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_ustat */
+ case 62: {
+ struct linux_ustat_args *p = params;
+ iarg[0] = p->dev; /* l_dev_t */
+ uarg[1] = (intptr_t) p->ubuf; /* struct l_ustat * */
+ *n_args = 2;
+ break;
+ }
+ /* dup2 */
+ case 63: {
+ struct dup2_args *p = params;
+ uarg[0] = p->from; /* u_int */
+ uarg[1] = p->to; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_getppid */
+ case 64: {
+ *n_args = 0;
+ break;
+ }
+ /* getpgrp */
+ case 65: {
+ *n_args = 0;
+ break;
+ }
+ /* setsid */
+ case 66: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sigaction */
+ case 67: {
+ struct linux_sigaction_args *p = params;
+ iarg[0] = p->sig; /* l_int */
+ uarg[1] = (intptr_t) p->nsa; /* l_osigaction_t * */
+ uarg[2] = (intptr_t) p->osa; /* l_osigaction_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_sgetmask */
+ case 68: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_ssetmask */
+ case 69: {
+ struct linux_ssetmask_args *p = params;
+ iarg[0] = p->mask; /* l_osigset_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setreuid16 */
+ case 70: {
+ struct linux_setreuid16_args *p = params;
+ iarg[0] = p->ruid; /* l_uid16_t */
+ iarg[1] = p->euid; /* l_uid16_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_setregid16 */
+ case 71: {
+ struct linux_setregid16_args *p = params;
+ iarg[0] = p->rgid; /* l_gid16_t */
+ iarg[1] = p->egid; /* l_gid16_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_sigsuspend */
+ case 72: {
+ struct linux_sigsuspend_args *p = params;
+ iarg[0] = p->hist0; /* l_int */
+ iarg[1] = p->hist1; /* l_int */
+ iarg[2] = p->mask; /* l_osigset_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_sigpending */
+ case 73: {
+ struct linux_sigpending_args *p = params;
+ uarg[0] = (intptr_t) p->mask; /* l_osigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_sethostname */
+ case 74: {
+ struct linux_sethostname_args *p = params;
+ uarg[0] = (intptr_t) p->hostname; /* char * */
+ uarg[1] = p->len; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_setrlimit */
+ case 75: {
+ struct linux_setrlimit_args *p = params;
+ iarg[0] = p->resource; /* l_uint */
+ uarg[1] = (intptr_t) p->rlim; /* struct l_rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_old_getrlimit */
+ case 76: {
+ struct linux_old_getrlimit_args *p = params;
+ iarg[0] = p->resource; /* l_uint */
+ uarg[1] = (intptr_t) p->rlim; /* struct l_rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_getrusage */
+ case 77: {
+ struct linux_getrusage_args *p = params;
+ iarg[0] = p->who; /* int */
+ uarg[1] = (intptr_t) p->rusage; /* struct l_rusage * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_gettimeofday */
+ case 78: {
+ struct linux_gettimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct l_timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_settimeofday */
+ case 79: {
+ struct linux_settimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct l_timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_getgroups16 */
+ case 80: {
+ struct linux_getgroups16_args *p = params;
+ iarg[0] = p->gidsetsize; /* l_uint */
+ uarg[1] = (intptr_t) p->gidset; /* l_gid16_t * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_setgroups16 */
+ case 81: {
+ struct linux_setgroups16_args *p = params;
+ iarg[0] = p->gidsetsize; /* l_uint */
+ uarg[1] = (intptr_t) p->gidset; /* l_gid16_t * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_old_select */
+ case 82: {
+ struct linux_old_select_args *p = params;
+ uarg[0] = (intptr_t) p->ptr; /* struct l_old_select_argv * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_symlink */
+ case 83: {
+ struct linux_symlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_lstat */
+ case 84: {
+ struct linux_lstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->up; /* struct linux_lstat * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_readlink */
+ case 85: {
+ struct linux_readlink_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ iarg[2] = p->count; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* swapon */
+ case 87: {
+ struct swapon_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_reboot */
+ case 88: {
+ struct linux_reboot_args *p = params;
+ iarg[0] = p->magic1; /* l_int */
+ iarg[1] = p->magic2; /* l_int */
+ iarg[2] = p->cmd; /* l_uint */
+ uarg[3] = (intptr_t) p->arg; /* void * */
+ *n_args = 4;
+ break;
+ }
+ /* linux_readdir */
+ case 89: {
+ struct linux_readdir_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->dent; /* struct l_dirent * */
+ iarg[2] = p->count; /* l_uint */
+ *n_args = 3;
+ break;
+ }
+ /* linux_mmap */
+ case 90: {
+ struct linux_mmap_args *p = params;
+ uarg[0] = (intptr_t) p->ptr; /* struct l_mmap_argv * */
+ *n_args = 1;
+ break;
+ }
+ /* munmap */
+ case 91: {
+ struct munmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ iarg[1] = p->len; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_truncate */
+ case 92: {
+ struct linux_truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->length; /* l_ulong */
+ *n_args = 2;
+ break;
+ }
+ /* linux_ftruncate */
+ case 93: {
+ struct linux_ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->length; /* long */
+ *n_args = 2;
+ break;
+ }
+ /* fchmod */
+ case 94: {
+ struct fchmod_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fchown */
+ case 95: {
+ struct fchown_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getpriority */
+ case 96: {
+ struct linux_getpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setpriority */
+ case 97: {
+ struct setpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ iarg[2] = p->prio; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_statfs */
+ case 99: {
+ struct linux_statfs_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct l_statfs_buf * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_fstatfs */
+ case 100: {
+ struct linux_fstatfs_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->buf; /* struct l_statfs_buf * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_socketcall */
+ case 102: {
+ struct linux_socketcall_args *p = params;
+ iarg[0] = p->what; /* l_int */
+ iarg[1] = p->args; /* l_ulong */
+ *n_args = 2;
+ break;
+ }
+ /* linux_syslog */
+ case 103: {
+ struct linux_syslog_args *p = params;
+ iarg[0] = p->type; /* l_int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ iarg[2] = p->len; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_setitimer */
+ case 104: {
+ struct linux_setitimer_args *p = params;
+ iarg[0] = p->which; /* l_int */
+ uarg[1] = (intptr_t) p->itv; /* struct l_itimerval * */
+ uarg[2] = (intptr_t) p->oitv; /* struct l_itimerval * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getitimer */
+ case 105: {
+ struct linux_getitimer_args *p = params;
+ iarg[0] = p->which; /* l_int */
+ uarg[1] = (intptr_t) p->itv; /* struct l_itimerval * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_newstat */
+ case 106: {
+ struct linux_newstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct l_newstat * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_newlstat */
+ case 107: {
+ struct linux_newlstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct l_newstat * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_newfstat */
+ case 108: {
+ struct linux_newfstat_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->buf; /* struct l_newstat * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_uname */
+ case 109: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_iopl */
+ case 110: {
+ struct linux_iopl_args *p = params;
+ iarg[0] = p->level; /* l_int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_vhangup */
+ case 111: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_wait4 */
+ case 114: {
+ struct linux_wait4_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ uarg[1] = (intptr_t) p->status; /* l_uint * */
+ iarg[2] = p->options; /* l_int */
+ uarg[3] = (intptr_t) p->rusage; /* struct l_rusage * */
+ *n_args = 4;
+ break;
+ }
+ /* linux_swapoff */
+ case 115: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sysinfo */
+ case 116: {
+ struct linux_sysinfo_args *p = params;
+ uarg[0] = (intptr_t) p->info; /* struct l_sysinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_ipc */
+ case 117: {
+ struct linux_ipc_args *p = params;
+ iarg[0] = p->what; /* l_uint */
+ iarg[1] = p->arg1; /* l_int */
+ iarg[2] = p->arg2; /* l_int */
+ iarg[3] = p->arg3; /* l_int */
+ uarg[4] = (intptr_t) p->ptr; /* void * */
+ iarg[5] = p->arg5; /* l_long */
+ *n_args = 6;
+ break;
+ }
+ /* fsync */
+ case 118: {
+ struct fsync_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_sigreturn */
+ case 119: {
+ struct linux_sigreturn_args *p = params;
+ uarg[0] = (intptr_t) p->sfp; /* struct l_sigframe * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_clone */
+ case 120: {
+ struct linux_clone_args *p = params;
+ iarg[0] = p->flags; /* l_int */
+ uarg[1] = (intptr_t) p->stack; /* void * */
+ uarg[2] = (intptr_t) p->parent_tidptr; /* void * */
+ uarg[3] = (intptr_t) p->tls; /* void * */
+ uarg[4] = (intptr_t) p->child_tidptr; /* void * */
+ *n_args = 5;
+ break;
+ }
+ /* linux_setdomainname */
+ case 121: {
+ struct linux_setdomainname_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ iarg[1] = p->len; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_newuname */
+ case 122: {
+ struct linux_newuname_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* struct l_new_utsname * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_adjtimex */
+ case 124: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mprotect */
+ case 125: {
+ struct linux_mprotect_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ iarg[1] = p->len; /* int */
+ iarg[2] = p->prot; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_sigprocmask */
+ case 126: {
+ struct linux_sigprocmask_args *p = params;
+ iarg[0] = p->how; /* l_int */
+ uarg[1] = (intptr_t) p->mask; /* l_osigset_t * */
+ uarg[2] = (intptr_t) p->omask; /* l_osigset_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_create_module */
+ case 127: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_init_module */
+ case 128: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_delete_module */
+ case 129: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_get_kernel_syms */
+ case 130: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_quotactl */
+ case 131: {
+ *n_args = 0;
+ break;
+ }
+ /* getpgid */
+ case 132: {
+ struct getpgid_args *p = params;
+ iarg[0] = p->pid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* fchdir */
+ case 133: {
+ struct fchdir_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_bdflush */
+ case 134: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sysfs */
+ case 135: {
+ struct linux_sysfs_args *p = params;
+ iarg[0] = p->option; /* l_int */
+ iarg[1] = p->arg1; /* l_ulong */
+ iarg[2] = p->arg2; /* l_ulong */
+ *n_args = 3;
+ break;
+ }
+ /* linux_personality */
+ case 136: {
+ struct linux_personality_args *p = params;
+ iarg[0] = p->per; /* l_ulong */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setfsuid16 */
+ case 138: {
+ struct linux_setfsuid16_args *p = params;
+ iarg[0] = p->uid; /* l_uid16_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setfsgid16 */
+ case 139: {
+ struct linux_setfsgid16_args *p = params;
+ iarg[0] = p->gid; /* l_gid16_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_llseek */
+ case 140: {
+ struct linux_llseek_args *p = params;
+ iarg[0] = p->fd; /* l_int */
+ iarg[1] = p->ohigh; /* l_ulong */
+ iarg[2] = p->olow; /* l_ulong */
+ uarg[3] = (intptr_t) p->res; /* l_loff_t * */
+ iarg[4] = p->whence; /* l_uint */
+ *n_args = 5;
+ break;
+ }
+ /* linux_getdents */
+ case 141: {
+ struct linux_getdents_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->dent; /* void * */
+ iarg[2] = p->count; /* l_uint */
+ *n_args = 3;
+ break;
+ }
+ /* linux_select */
+ case 142: {
+ struct linux_select_args *p = params;
+ iarg[0] = p->nfds; /* l_int */
+ uarg[1] = (intptr_t) p->readfds; /* l_fd_set * */
+ uarg[2] = (intptr_t) p->writefds; /* l_fd_set * */
+ uarg[3] = (intptr_t) p->exceptfds; /* l_fd_set * */
+ uarg[4] = (intptr_t) p->timeout; /* struct l_timeval * */
+ *n_args = 5;
+ break;
+ }
+ /* flock */
+ case 143: {
+ struct flock_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_msync */
+ case 144: {
+ struct linux_msync_args *p = params;
+ iarg[0] = p->addr; /* l_ulong */
+ iarg[1] = p->len; /* l_size_t */
+ iarg[2] = p->fl; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_readv */
+ case 145: {
+ struct linux_readv_args *p = params;
+ iarg[0] = p->fd; /* l_ulong */
+ uarg[1] = (intptr_t) p->iovp; /* struct l_iovec32 * */
+ iarg[2] = p->iovcnt; /* l_ulong */
+ *n_args = 3;
+ break;
+ }
+ /* linux_writev */
+ case 146: {
+ struct linux_writev_args *p = params;
+ iarg[0] = p->fd; /* l_ulong */
+ uarg[1] = (intptr_t) p->iovp; /* struct l_iovec32 * */
+ iarg[2] = p->iovcnt; /* l_ulong */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getsid */
+ case 147: {
+ struct linux_getsid_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_fdatasync */
+ case 148: {
+ struct linux_fdatasync_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ *n_args = 1;
+ break;
+ }
+ /* linux_sysctl */
+ case 149: {
+ struct linux_sysctl_args *p = params;
+ uarg[0] = (intptr_t) p->args; /* struct l___sysctl_args * */
+ *n_args = 1;
+ break;
+ }
+ /* mlock */
+ case 150: {
+ struct mlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* munlock */
+ case 151: {
+ struct munlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* mlockall */
+ case 152: {
+ struct mlockall_args *p = params;
+ iarg[0] = p->how; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munlockall */
+ case 153: {
+ *n_args = 0;
+ break;
+ }
+ /* sched_setparam */
+ case 154: {
+ struct sched_setparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_getparam */
+ case 155: {
+ struct sched_getparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_sched_setscheduler */
+ case 156: {
+ struct linux_sched_setscheduler_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ iarg[1] = p->policy; /* l_int */
+ uarg[2] = (intptr_t) p->param; /* struct l_sched_param * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_sched_getscheduler */
+ case 157: {
+ struct linux_sched_getscheduler_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* sched_yield */
+ case 158: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sched_get_priority_max */
+ case 159: {
+ struct linux_sched_get_priority_max_args *p = params;
+ iarg[0] = p->policy; /* l_int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_sched_get_priority_min */
+ case 160: {
+ struct linux_sched_get_priority_min_args *p = params;
+ iarg[0] = p->policy; /* l_int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_sched_rr_get_interval */
+ case 161: {
+ struct linux_sched_rr_get_interval_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ uarg[1] = (intptr_t) p->interval; /* struct l_timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_nanosleep */
+ case 162: {
+ struct linux_nanosleep_args *p = params;
+ uarg[0] = (intptr_t) p->rqtp; /* const struct l_timespec * */
+ uarg[1] = (intptr_t) p->rmtp; /* struct l_timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_mremap */
+ case 163: {
+ struct linux_mremap_args *p = params;
+ iarg[0] = p->addr; /* l_ulong */
+ iarg[1] = p->old_len; /* l_ulong */
+ iarg[2] = p->new_len; /* l_ulong */
+ iarg[3] = p->flags; /* l_ulong */
+ iarg[4] = p->new_addr; /* l_ulong */
+ *n_args = 5;
+ break;
+ }
+ /* linux_setresuid16 */
+ case 164: {
+ struct linux_setresuid16_args *p = params;
+ iarg[0] = p->ruid; /* l_uid16_t */
+ iarg[1] = p->euid; /* l_uid16_t */
+ iarg[2] = p->suid; /* l_uid16_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getresuid16 */
+ case 165: {
+ struct linux_getresuid16_args *p = params;
+ uarg[0] = (intptr_t) p->ruid; /* l_uid16_t * */
+ uarg[1] = (intptr_t) p->euid; /* l_uid16_t * */
+ uarg[2] = (intptr_t) p->suid; /* l_uid16_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_query_module */
+ case 167: {
+ *n_args = 0;
+ break;
+ }
+ /* poll */
+ case 168: {
+ struct poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* unsigned int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_nfsservctl */
+ case 169: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_setresgid16 */
+ case 170: {
+ struct linux_setresgid16_args *p = params;
+ iarg[0] = p->rgid; /* l_gid16_t */
+ iarg[1] = p->egid; /* l_gid16_t */
+ iarg[2] = p->sgid; /* l_gid16_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getresgid16 */
+ case 171: {
+ struct linux_getresgid16_args *p = params;
+ uarg[0] = (intptr_t) p->rgid; /* l_gid16_t * */
+ uarg[1] = (intptr_t) p->egid; /* l_gid16_t * */
+ uarg[2] = (intptr_t) p->sgid; /* l_gid16_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_prctl */
+ case 172: {
+ struct linux_prctl_args *p = params;
+ iarg[0] = p->option; /* l_int */
+ iarg[1] = p->arg2; /* l_int */
+ iarg[2] = p->arg3; /* l_int */
+ iarg[3] = p->arg4; /* l_int */
+ iarg[4] = p->arg5; /* l_int */
+ *n_args = 5;
+ break;
+ }
+ /* linux_rt_sigreturn */
+ case 173: {
+ struct linux_rt_sigreturn_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* struct l_ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_rt_sigaction */
+ case 174: {
+ struct linux_rt_sigaction_args *p = params;
+ iarg[0] = p->sig; /* l_int */
+ uarg[1] = (intptr_t) p->act; /* l_sigaction_t * */
+ uarg[2] = (intptr_t) p->oact; /* l_sigaction_t * */
+ iarg[3] = p->sigsetsize; /* l_size_t */
+ *n_args = 4;
+ break;
+ }
+ /* linux_rt_sigprocmask */
+ case 175: {
+ struct linux_rt_sigprocmask_args *p = params;
+ iarg[0] = p->how; /* l_int */
+ uarg[1] = (intptr_t) p->mask; /* l_sigset_t * */
+ uarg[2] = (intptr_t) p->omask; /* l_sigset_t * */
+ iarg[3] = p->sigsetsize; /* l_size_t */
+ *n_args = 4;
+ break;
+ }
+ /* linux_rt_sigpending */
+ case 176: {
+ struct linux_rt_sigpending_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* l_sigset_t * */
+ iarg[1] = p->sigsetsize; /* l_size_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_rt_sigtimedwait */
+ case 177: {
+ struct linux_rt_sigtimedwait_args *p = params;
+ uarg[0] = (intptr_t) p->mask; /* l_sigset_t * */
+ uarg[1] = (intptr_t) p->ptr; /* l_siginfo_t * */
+ uarg[2] = (intptr_t) p->timeout; /* struct l_timeval * */
+ iarg[3] = p->sigsetsize; /* l_size_t */
+ *n_args = 4;
+ break;
+ }
+ /* linux_rt_sigqueueinfo */
+ case 178: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_rt_sigsuspend */
+ case 179: {
+ struct linux_rt_sigsuspend_args *p = params;
+ uarg[0] = (intptr_t) p->newset; /* l_sigset_t * */
+ iarg[1] = p->sigsetsize; /* l_size_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_pread */
+ case 180: {
+ struct linux_pread_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ iarg[2] = p->nbyte; /* l_size_t */
+ iarg[3] = p->offset; /* l_loff_t */
+ *n_args = 4;
+ break;
+ }
+ /* linux_pwrite */
+ case 181: {
+ struct linux_pwrite_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ iarg[2] = p->nbyte; /* l_size_t */
+ iarg[3] = p->offset; /* l_loff_t */
+ *n_args = 4;
+ break;
+ }
+ /* linux_chown16 */
+ case 182: {
+ struct linux_chown16_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* l_uid16_t */
+ iarg[2] = p->gid; /* l_gid16_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getcwd */
+ case 183: {
+ struct linux_getcwd_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* char * */
+ iarg[1] = p->bufsize; /* l_ulong */
+ *n_args = 2;
+ break;
+ }
+ /* linux_capget */
+ case 184: {
+ struct linux_capget_args *p = params;
+ uarg[0] = (intptr_t) p->hdrp; /* struct l_user_cap_header * */
+ uarg[1] = (intptr_t) p->datap; /* struct l_user_cap_data * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_capset */
+ case 185: {
+ struct linux_capset_args *p = params;
+ uarg[0] = (intptr_t) p->hdrp; /* struct l_user_cap_header * */
+ uarg[1] = (intptr_t) p->datap; /* struct l_user_cap_data * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_sigaltstack */
+ case 186: {
+ struct linux_sigaltstack_args *p = params;
+ uarg[0] = (intptr_t) p->uss; /* l_stack_t * */
+ uarg[1] = (intptr_t) p->uoss; /* l_stack_t * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_sendfile */
+ case 187: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_vfork */
+ case 190: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_getrlimit */
+ case 191: {
+ struct linux_getrlimit_args *p = params;
+ iarg[0] = p->resource; /* l_uint */
+ uarg[1] = (intptr_t) p->rlim; /* struct l_rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_mmap2 */
+ case 192: {
+ struct linux_mmap2_args *p = params;
+ iarg[0] = p->addr; /* l_ulong */
+ iarg[1] = p->len; /* l_ulong */
+ iarg[2] = p->prot; /* l_ulong */
+ iarg[3] = p->flags; /* l_ulong */
+ iarg[4] = p->fd; /* l_ulong */
+ iarg[5] = p->pgoff; /* l_ulong */
+ *n_args = 6;
+ break;
+ }
+ /* linux_truncate64 */
+ case 193: {
+ struct linux_truncate64_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->length; /* l_loff_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_ftruncate64 */
+ case 194: {
+ struct linux_ftruncate64_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ iarg[1] = p->length; /* l_loff_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_stat64 */
+ case 195: {
+ struct linux_stat64_args *p = params;
+ uarg[0] = (intptr_t) p->filename; /* const char * */
+ uarg[1] = (intptr_t) p->statbuf; /* struct l_stat64 * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_lstat64 */
+ case 196: {
+ struct linux_lstat64_args *p = params;
+ uarg[0] = (intptr_t) p->filename; /* const char * */
+ uarg[1] = (intptr_t) p->statbuf; /* struct l_stat64 * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_fstat64 */
+ case 197: {
+ struct linux_fstat64_args *p = params;
+ iarg[0] = p->fd; /* l_int */
+ uarg[1] = (intptr_t) p->statbuf; /* struct l_stat64 * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_lchown */
+ case 198: {
+ struct linux_lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* l_uid_t */
+ iarg[2] = p->gid; /* l_gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getuid */
+ case 199: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_getgid */
+ case 200: {
+ *n_args = 0;
+ break;
+ }
+ /* geteuid */
+ case 201: {
+ *n_args = 0;
+ break;
+ }
+ /* getegid */
+ case 202: {
+ *n_args = 0;
+ break;
+ }
+ /* setreuid */
+ case 203: {
+ struct setreuid_args *p = params;
+ uarg[0] = p->ruid; /* uid_t */
+ uarg[1] = p->euid; /* uid_t */
+ *n_args = 2;
+ break;
+ }
+ /* setregid */
+ case 204: {
+ struct setregid_args *p = params;
+ iarg[0] = p->rgid; /* gid_t */
+ iarg[1] = p->egid; /* gid_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_getgroups */
+ case 205: {
+ struct linux_getgroups_args *p = params;
+ iarg[0] = p->gidsetsize; /* l_int */
+ uarg[1] = (intptr_t) p->grouplist; /* l_gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_setgroups */
+ case 206: {
+ struct linux_setgroups_args *p = params;
+ iarg[0] = p->gidsetsize; /* l_int */
+ uarg[1] = (intptr_t) p->grouplist; /* l_gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* fchown */
+ case 207: {
+ *n_args = 0;
+ break;
+ }
+ /* setresuid */
+ case 208: {
+ struct setresuid_args *p = params;
+ uarg[0] = p->ruid; /* uid_t */
+ uarg[1] = p->euid; /* uid_t */
+ uarg[2] = p->suid; /* uid_t */
+ *n_args = 3;
+ break;
+ }
+ /* getresuid */
+ case 209: {
+ struct getresuid_args *p = params;
+ uarg[0] = (intptr_t) p->ruid; /* uid_t * */
+ uarg[1] = (intptr_t) p->euid; /* uid_t * */
+ uarg[2] = (intptr_t) p->suid; /* uid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* setresgid */
+ case 210: {
+ struct setresgid_args *p = params;
+ iarg[0] = p->rgid; /* gid_t */
+ iarg[1] = p->egid; /* gid_t */
+ iarg[2] = p->sgid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* getresgid */
+ case 211: {
+ struct getresgid_args *p = params;
+ uarg[0] = (intptr_t) p->rgid; /* gid_t * */
+ uarg[1] = (intptr_t) p->egid; /* gid_t * */
+ uarg[2] = (intptr_t) p->sgid; /* gid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_chown */
+ case 212: {
+ struct linux_chown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* l_uid_t */
+ iarg[2] = p->gid; /* l_gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* setuid */
+ case 213: {
+ struct setuid_args *p = params;
+ uarg[0] = p->uid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setgid */
+ case 214: {
+ struct setgid_args *p = params;
+ iarg[0] = p->gid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setfsuid */
+ case 215: {
+ struct linux_setfsuid_args *p = params;
+ iarg[0] = p->uid; /* l_uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_setfsgid */
+ case 216: {
+ struct linux_setfsgid_args *p = params;
+ iarg[0] = p->gid; /* l_gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* linux_pivot_root */
+ case 217: {
+ struct linux_pivot_root_args *p = params;
+ uarg[0] = (intptr_t) p->new_root; /* char * */
+ uarg[1] = (intptr_t) p->put_old; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_mincore */
+ case 218: {
+ struct linux_mincore_args *p = params;
+ iarg[0] = p->start; /* l_ulong */
+ iarg[1] = p->len; /* l_size_t */
+ uarg[2] = (intptr_t) p->vec; /* u_char * */
+ *n_args = 3;
+ break;
+ }
+ /* madvise */
+ case 219: {
+ struct madvise_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->behav; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_getdents64 */
+ case 220: {
+ struct linux_getdents64_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ uarg[1] = (intptr_t) p->dirent; /* void * */
+ iarg[2] = p->count; /* l_uint */
+ *n_args = 3;
+ break;
+ }
+ /* linux_fcntl64 */
+ case 221: {
+ struct linux_fcntl64_args *p = params;
+ iarg[0] = p->fd; /* l_uint */
+ iarg[1] = p->cmd; /* l_uint */
+ uarg[2] = p->arg; /* uintptr_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_gettid */
+ case 224: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_setxattr */
+ case 226: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_lsetxattr */
+ case 227: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fsetxattr */
+ case 228: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_getxattr */
+ case 229: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_lgetxattr */
+ case 230: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fgetxattr */
+ case 231: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_listxattr */
+ case 232: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_llistxattr */
+ case 233: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_flistxattr */
+ case 234: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_removexattr */
+ case 235: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_lremovexattr */
+ case 236: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fremovexattr */
+ case 237: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_tkill */
+ case 238: {
+ struct linux_tkill_args *p = params;
+ iarg[0] = p->tid; /* int */
+ iarg[1] = p->sig; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_sys_futex */
+ case 240: {
+ struct linux_sys_futex_args *p = params;
+ uarg[0] = (intptr_t) p->uaddr; /* void * */
+ iarg[1] = p->op; /* int */
+ uarg[2] = p->val; /* uint32_t */
+ uarg[3] = (intptr_t) p->timeout; /* struct l_timespec * */
+ uarg[4] = (intptr_t) p->uaddr2; /* uint32_t * */
+ uarg[5] = p->val3; /* uint32_t */
+ *n_args = 6;
+ break;
+ }
+ /* linux_sched_setaffinity */
+ case 241: {
+ struct linux_sched_setaffinity_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ iarg[1] = p->len; /* l_uint */
+ uarg[2] = (intptr_t) p->user_mask_ptr; /* l_ulong * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_sched_getaffinity */
+ case 242: {
+ struct linux_sched_getaffinity_args *p = params;
+ iarg[0] = p->pid; /* l_pid_t */
+ iarg[1] = p->len; /* l_uint */
+ uarg[2] = (intptr_t) p->user_mask_ptr; /* l_ulong * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_set_thread_area */
+ case 243: {
+ struct linux_set_thread_area_args *p = params;
+ uarg[0] = (intptr_t) p->desc; /* struct l_user_desc * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_fadvise64 */
+ case 250: {
+ struct linux_fadvise64_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* l_loff_t */
+ iarg[2] = p->len; /* l_size_t */
+ iarg[3] = p->advice; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_exit_group */
+ case 252: {
+ struct linux_exit_group_args *p = params;
+ iarg[0] = p->error_code; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* linux_lookup_dcookie */
+ case 253: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_epoll_create */
+ case 254: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_epoll_ctl */
+ case 255: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_epoll_wait */
+ case 256: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_remap_file_pages */
+ case 257: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_set_tid_address */
+ case 258: {
+ struct linux_set_tid_address_args *p = params;
+ uarg[0] = (intptr_t) p->tidptr; /* int * */
+ *n_args = 1;
+ break;
+ }
+ /* linux_timer_create */
+ case 259: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timer_settime */
+ case 260: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timer_gettime */
+ case 261: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timer_getoverrun */
+ case 262: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timer_delete */
+ case 263: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_clock_settime */
+ case 264: {
+ struct linux_clock_settime_args *p = params;
+ iarg[0] = p->which; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct l_timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_clock_gettime */
+ case 265: {
+ struct linux_clock_gettime_args *p = params;
+ iarg[0] = p->which; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct l_timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_clock_getres */
+ case 266: {
+ struct linux_clock_getres_args *p = params;
+ iarg[0] = p->which; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct l_timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_clock_nanosleep */
+ case 267: {
+ struct linux_clock_nanosleep_args *p = params;
+ iarg[0] = p->which; /* clockid_t */
+ iarg[1] = p->flags; /* int */
+ uarg[2] = (intptr_t) p->rqtp; /* struct l_timespec * */
+ uarg[3] = (intptr_t) p->rmtp; /* struct l_timespec * */
+ *n_args = 4;
+ break;
+ }
+ /* linux_statfs64 */
+ case 268: {
+ struct linux_statfs64_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = p->bufsize; /* size_t */
+ uarg[2] = (intptr_t) p->buf; /* struct l_statfs64_buf * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_fstatfs64 */
+ case 269: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_tgkill */
+ case 270: {
+ struct linux_tgkill_args *p = params;
+ iarg[0] = p->tgid; /* int */
+ iarg[1] = p->pid; /* int */
+ iarg[2] = p->sig; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_utimes */
+ case 271: {
+ struct linux_utimes_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct l_timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* linux_fadvise64_64 */
+ case 272: {
+ struct linux_fadvise64_64_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* l_loff_t */
+ iarg[2] = p->len; /* l_loff_t */
+ iarg[3] = p->advice; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_mbind */
+ case 274: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_get_mempolicy */
+ case 275: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_set_mempolicy */
+ case 276: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_open */
+ case 277: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_unlink */
+ case 278: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_timedsend */
+ case 279: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_timedreceive */
+ case 280: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_notify */
+ case 281: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_mq_getsetattr */
+ case 282: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_kexec_load */
+ case 283: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_waitid */
+ case 284: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_add_key */
+ case 286: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_request_key */
+ case 287: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_keyctl */
+ case 288: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_ioprio_set */
+ case 289: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_ioprio_get */
+ case 290: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_inotify_init */
+ case 291: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_inotify_add_watch */
+ case 292: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_inotify_rm_watch */
+ case 293: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_migrate_pages */
+ case 294: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_openat */
+ case 295: {
+ struct linux_openat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* const char * */
+ iarg[2] = p->flags; /* l_int */
+ iarg[3] = p->mode; /* l_int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_mkdirat */
+ case 296: {
+ struct linux_mkdirat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->pathname; /* const char * */
+ iarg[2] = p->mode; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_mknodat */
+ case 297: {
+ struct linux_mknodat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* const char * */
+ iarg[2] = p->mode; /* l_int */
+ iarg[3] = p->dev; /* l_uint */
+ *n_args = 4;
+ break;
+ }
+ /* linux_fchownat */
+ case 298: {
+ struct linux_fchownat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* const char * */
+ iarg[2] = p->uid; /* l_uid16_t */
+ iarg[3] = p->gid; /* l_gid16_t */
+ iarg[4] = p->flag; /* l_int */
+ *n_args = 5;
+ break;
+ }
+ /* linux_futimesat */
+ case 299: {
+ struct linux_futimesat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* char * */
+ uarg[2] = (intptr_t) p->utimes; /* struct l_timeval * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_fstatat64 */
+ case 300: {
+ struct linux_fstatat64_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->pathname; /* char * */
+ uarg[2] = (intptr_t) p->statbuf; /* struct l_stat64 * */
+ iarg[3] = p->flag; /* l_int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_unlinkat */
+ case 301: {
+ struct linux_unlinkat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->pathname; /* const char * */
+ iarg[2] = p->flag; /* l_int */
+ *n_args = 3;
+ break;
+ }
+ /* linux_renameat */
+ case 302: {
+ struct linux_renameat_args *p = params;
+ iarg[0] = p->olddfd; /* l_int */
+ uarg[1] = (intptr_t) p->oldname; /* const char * */
+ iarg[2] = p->newdfd; /* l_int */
+ uarg[3] = (intptr_t) p->newname; /* const char * */
+ *n_args = 4;
+ break;
+ }
+ /* linux_linkat */
+ case 303: {
+ struct linux_linkat_args *p = params;
+ iarg[0] = p->olddfd; /* l_int */
+ uarg[1] = (intptr_t) p->oldname; /* const char * */
+ iarg[2] = p->newdfd; /* l_int */
+ uarg[3] = (intptr_t) p->newname; /* const char * */
+ iarg[4] = p->flag; /* l_int */
+ *n_args = 5;
+ break;
+ }
+ /* linux_symlinkat */
+ case 304: {
+ struct linux_symlinkat_args *p = params;
+ uarg[0] = (intptr_t) p->oldname; /* const char * */
+ iarg[1] = p->newdfd; /* l_int */
+ uarg[2] = (intptr_t) p->newname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_readlinkat */
+ case 305: {
+ struct linux_readlinkat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->path; /* const char * */
+ uarg[2] = (intptr_t) p->buf; /* char * */
+ iarg[3] = p->bufsiz; /* l_int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_fchmodat */
+ case 306: {
+ struct linux_fchmodat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* const char * */
+ iarg[2] = p->mode; /* l_mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* linux_faccessat */
+ case 307: {
+ struct linux_faccessat_args *p = params;
+ iarg[0] = p->dfd; /* l_int */
+ uarg[1] = (intptr_t) p->filename; /* const char * */
+ iarg[2] = p->amode; /* l_int */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* linux_pselect6 */
+ case 308: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_ppoll */
+ case 309: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_unshare */
+ case 310: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_set_robust_list */
+ case 311: {
+ struct linux_set_robust_list_args *p = params;
+ uarg[0] = (intptr_t) p->head; /* struct linux_robust_list_head * */
+ iarg[1] = p->len; /* l_size_t */
+ *n_args = 2;
+ break;
+ }
+ /* linux_get_robust_list */
+ case 312: {
+ struct linux_get_robust_list_args *p = params;
+ iarg[0] = p->pid; /* l_int */
+ uarg[1] = (intptr_t) p->head; /* struct linux_robust_list_head * */
+ uarg[2] = (intptr_t) p->len; /* l_size_t * */
+ *n_args = 3;
+ break;
+ }
+ /* linux_splice */
+ case 313: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sync_file_range */
+ case 314: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_tee */
+ case 315: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_vmsplice */
+ case 316: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_move_pages */
+ case 317: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_getcpu */
+ case 318: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_epoll_pwait */
+ case 319: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_utimensat */
+ case 320: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_signalfd */
+ case 321: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timerfd_create */
+ case 322: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_eventfd */
+ case 323: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fallocate */
+ case 324: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timerfd_settime */
+ case 325: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_timerfd_gettime */
+ case 326: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_signalfd4 */
+ case 327: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_eventfd2 */
+ case 328: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_epoll_create1 */
+ case 329: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_dup3 */
+ case 330: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_pipe2 */
+ case 331: {
+ struct linux_pipe2_args *p = params;
+ uarg[0] = (intptr_t) p->pipefds; /* l_int * */
+ iarg[1] = p->flags; /* l_int */
+ *n_args = 2;
+ break;
+ }
+ /* linux_inotify_init1 */
+ case 332: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_preadv */
+ case 333: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_pwritev */
+ case 334: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_rt_tsigqueueinfo */
+ case 335: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_perf_event_open */
+ case 336: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_recvmmsg */
+ case 337: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fanotify_init */
+ case 338: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_fanotify_mark */
+ case 339: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_prlimit64 */
+ case 340: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_name_to_handle_at */
+ case 341: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_open_by_handle_at */
+ case 342: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_clock_adjtime */
+ case 343: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_syncfs */
+ case 344: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_sendmmsg */
+ case 345: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_setns */
+ case 346: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_process_vm_readv */
+ case 347: {
+ *n_args = 0;
+ break;
+ }
+ /* linux_process_vm_writev */
+ case 348: {
+ *n_args = 0;
+ break;
+ }
+ default:
+ *n_args = 0;
+ break;
+ };
+}
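+
+/*
+ * systrace_entry_setargdesc() looks up the C type name of argument ndx of
+ * syscall sysnum and copies it into the desc buffer (at most descsz bytes);
+ * for out-of-range indexes p stays NULL and no description is produced.
+ */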
+static void
+systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+#define nosys linux_nosys
+ /* sys_exit */
+ case 1:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fork */
+ case 2:
+ break;
+ /* read */
+ case 3:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* write */
+ case 4:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_open */
+ case 5:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* close */
+ case 6:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_waitpid */
+ case 7:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "l_int *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_creat */
+ case 8:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_link */
+ case 9:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_unlink */
+ case 10:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_execve */
+ case 11:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "uint32_t *";
+ break;
+ case 2:
+ p = "uint32_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_chdir */
+ case 12:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_time */
+ case 13:
+ switch(ndx) {
+ case 0:
+ p = "l_time_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mknod */
+ case 14:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_dev_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_chmod */
+ case 15:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lchown16 */
+ case 16:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_uid16_t";
+ break;
+ case 2:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_stat */
+ case 18:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct linux_stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lseek */
+ case 19:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_off_t";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getpid */
+ case 20:
+ break;
+ /* linux_mount */
+ case 21:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "l_ulong";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_oldumount */
+ case 22:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setuid16 */
+ case 23:
+ switch(ndx) {
+ case 0:
+ p = "l_uid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getuid16 */
+ case 24:
+ break;
+ /* linux_stime */
+ case 25:
+ break;
+ /* linux_ptrace */
+ case 26:
+ switch(ndx) {
+ case 0:
+ p = "l_long";
+ break;
+ case 1:
+ p = "l_long";
+ break;
+ case 2:
+ p = "l_long";
+ break;
+ case 3:
+ p = "l_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_alarm */
+ case 27:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pause */
+ case 29:
+ break;
+ /* linux_utime */
+ case 30:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct l_utimbuf *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_access */
+ case 33:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_nice */
+ case 34:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sync */
+ case 36:
+ break;
+ /* linux_kill */
+ case 37:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rename */
+ case 38:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mkdir */
+ case 39:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rmdir */
+ case 40:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* dup */
+ case 41:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pipe */
+ case 42:
+ switch(ndx) {
+ case 0:
+ p = "l_int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_times */
+ case 43:
+ switch(ndx) {
+ case 0:
+ p = "struct l_times_argv *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_brk */
+ case 45:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setgid16 */
+ case 46:
+ switch(ndx) {
+ case 0:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getgid16 */
+ case 47:
+ break;
+ /* linux_signal */
+ case 48:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_handler_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_geteuid16 */
+ case 49:
+ break;
+ /* linux_getegid16 */
+ case 50:
+ break;
+ /* acct */
+ case 51:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_umount */
+ case 52:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_ioctl */
+ case 54:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_uint";
+ break;
+ case 2:
+ p = "uintptr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fcntl */
+ case 55:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_uint";
+ break;
+ case 2:
+ p = "uintptr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setpgid */
+ case 57:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_olduname */
+ case 59:
+ break;
+ /* umask */
+ case 60:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chroot */
+ case 61:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_ustat */
+ case 62:
+ switch(ndx) {
+ case 0:
+ p = "l_dev_t";
+ break;
+ case 1:
+ p = "struct l_ustat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* dup2 */
+ case 63:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getppid */
+ case 64:
+ break;
+ /* getpgrp */
+ case 65:
+ break;
+ /* setsid */
+ case 66:
+ break;
+ /* linux_sigaction */
+ case 67:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_osigaction_t *";
+ break;
+ case 2:
+ p = "l_osigaction_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sgetmask */
+ case 68:
+ break;
+ /* linux_ssetmask */
+ case 69:
+ switch(ndx) {
+ case 0:
+ p = "l_osigset_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setreuid16 */
+ case 70:
+ switch(ndx) {
+ case 0:
+ p = "l_uid16_t";
+ break;
+ case 1:
+ p = "l_uid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setregid16 */
+ case 71:
+ switch(ndx) {
+ case 0:
+ p = "l_gid16_t";
+ break;
+ case 1:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sigsuspend */
+ case 72:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_osigset_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sigpending */
+ case 73:
+ switch(ndx) {
+ case 0:
+ p = "l_osigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sethostname */
+ case 74:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setrlimit */
+ case 75:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_old_getrlimit */
+ case 76:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getrusage */
+ case 77:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct l_rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_gettimeofday */
+ case 78:
+ switch(ndx) {
+ case 0:
+ p = "struct l_timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_settimeofday */
+ case 79:
+ switch(ndx) {
+ case 0:
+ p = "struct l_timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getgroups16 */
+ case 80:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_gid16_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setgroups16 */
+ case 81:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_gid16_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_old_select */
+ case 82:
+ switch(ndx) {
+ case 0:
+ p = "struct l_old_select_argv *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_symlink */
+ case 83:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lstat */
+ case 84:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct linux_lstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_readlink */
+ case 85:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapon */
+ case 87:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_reboot */
+ case 88:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_uint";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_readdir */
+ case 89:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_dirent *";
+ break;
+ case 2:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mmap */
+ case 90:
+ switch(ndx) {
+ case 0:
+ p = "struct l_mmap_argv *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munmap */
+ case 91:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_truncate */
+ case 92:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_ftruncate */
+ case 93:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchmod */
+ case 94:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchown */
+ case 95:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getpriority */
+ case 96:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setpriority */
+ case 97:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_statfs */
+ case 99:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct l_statfs_buf *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fstatfs */
+ case 100:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_statfs_buf *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_socketcall */
+ case 102:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_syslog */
+ case 103:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setitimer */
+ case 104:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "struct l_itimerval *";
+ break;
+ case 2:
+ p = "struct l_itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getitimer */
+ case 105:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "struct l_itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_newstat */
+ case 106:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct l_newstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_newlstat */
+ case 107:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct l_newstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_newfstat */
+ case 108:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_newstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_uname */
+ case 109:
+ break;
+ /* linux_iopl */
+ case 110:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_vhangup */
+ case 111:
+ break;
+ /* linux_wait4 */
+ case 114:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "l_uint *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "struct l_rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_swapoff */
+ case 115:
+ break;
+ /* linux_sysinfo */
+ case 116:
+ switch(ndx) {
+ case 0:
+ p = "struct l_sysinfo *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_ipc */
+ case 117:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "l_int";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ case 5:
+ p = "l_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fsync */
+ case 118:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sigreturn */
+ case 119:
+ switch(ndx) {
+ case 0:
+ p = "struct l_sigframe *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_clone */
+ case 120:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setdomainname */
+ case 121:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_newuname */
+ case 122:
+ switch(ndx) {
+ case 0:
+ p = "struct l_new_utsname *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_adjtimex */
+ case 124:
+ break;
+ /* linux_mprotect */
+ case 125:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sigprocmask */
+ case 126:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_osigset_t *";
+ break;
+ case 2:
+ p = "l_osigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_create_module */
+ case 127:
+ break;
+ /* linux_init_module */
+ case 128:
+ break;
+ /* linux_delete_module */
+ case 129:
+ break;
+ /* linux_get_kernel_syms */
+ case 130:
+ break;
+ /* linux_quotactl */
+ case 131:
+ break;
+ /* getpgid */
+ case 132:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchdir */
+ case 133:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_bdflush */
+ case 134:
+ break;
+ /* linux_sysfs */
+ case 135:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_personality */
+ case 136:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setfsuid16 */
+ case 138:
+ switch(ndx) {
+ case 0:
+ p = "l_uid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setfsgid16 */
+ case 139:
+ switch(ndx) {
+ case 0:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_llseek */
+ case 140:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ case 3:
+ p = "l_loff_t *";
+ break;
+ case 4:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getdents */
+ case 141:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_select */
+ case 142:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_fd_set *";
+ break;
+ case 2:
+ p = "l_fd_set *";
+ break;
+ case 3:
+ p = "l_fd_set *";
+ break;
+ case 4:
+ p = "struct l_timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* flock */
+ case 143:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_msync */
+ case 144:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "l_size_t";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_readv */
+ case 145:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "struct l_iovec32 *";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_writev */
+ case 146:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "struct l_iovec32 *";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getsid */
+ case 147:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fdatasync */
+ case 148:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sysctl */
+ case 149:
+ switch(ndx) {
+ case 0:
+ p = "struct l___sysctl_args *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mlock */
+ case 150:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlock */
+ case 151:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mlockall */
+ case 152:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlockall */
+ case 153:
+ break;
+ /* sched_setparam */
+ case 154:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "const struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_getparam */
+ case 155:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_setscheduler */
+ case 156:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "struct l_sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_getscheduler */
+ case 157:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_yield */
+ case 158:
+ break;
+ /* linux_sched_get_priority_max */
+ case 159:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_get_priority_min */
+ case 160:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_rr_get_interval */
+ case 161:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_nanosleep */
+ case 162:
+ switch(ndx) {
+ case 0:
+ p = "const struct l_timespec *";
+ break;
+ case 1:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mremap */
+ case 163:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ case 3:
+ p = "l_ulong";
+ break;
+ case 4:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setresuid16 */
+ case 164:
+ switch(ndx) {
+ case 0:
+ p = "l_uid16_t";
+ break;
+ case 1:
+ p = "l_uid16_t";
+ break;
+ case 2:
+ p = "l_uid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getresuid16 */
+ case 165:
+ switch(ndx) {
+ case 0:
+ p = "l_uid16_t *";
+ break;
+ case 1:
+ p = "l_uid16_t *";
+ break;
+ case 2:
+ p = "l_uid16_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_query_module */
+ case 167:
+ break;
+ /* poll */
+ case 168:
+ switch(ndx) {
+ case 0:
+ p = "struct pollfd *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_nfsservctl */
+ case 169:
+ break;
+ /* linux_setresgid16 */
+ case 170:
+ switch(ndx) {
+ case 0:
+ p = "l_gid16_t";
+ break;
+ case 1:
+ p = "l_gid16_t";
+ break;
+ case 2:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getresgid16 */
+ case 171:
+ switch(ndx) {
+ case 0:
+ p = "l_gid16_t *";
+ break;
+ case 1:
+ p = "l_gid16_t *";
+ break;
+ case 2:
+ p = "l_gid16_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_prctl */
+ case 172:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "l_int";
+ break;
+ case 4:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigreturn */
+ case 173:
+ switch(ndx) {
+ case 0:
+ p = "struct l_ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigaction */
+ case 174:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_sigaction_t *";
+ break;
+ case 2:
+ p = "l_sigaction_t *";
+ break;
+ case 3:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigprocmask */
+ case 175:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_sigset_t *";
+ break;
+ case 2:
+ p = "l_sigset_t *";
+ break;
+ case 3:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigpending */
+ case 176:
+ switch(ndx) {
+ case 0:
+ p = "l_sigset_t *";
+ break;
+ case 1:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigtimedwait */
+ case 177:
+ switch(ndx) {
+ case 0:
+ p = "l_sigset_t *";
+ break;
+ case 1:
+ p = "l_siginfo_t *";
+ break;
+ case 2:
+ p = "struct l_timeval *";
+ break;
+ case 3:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_rt_sigqueueinfo */
+ case 178:
+ break;
+ /* linux_rt_sigsuspend */
+ case 179:
+ switch(ndx) {
+ case 0:
+ p = "l_sigset_t *";
+ break;
+ case 1:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pread */
+ case 180:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "l_size_t";
+ break;
+ case 3:
+ p = "l_loff_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pwrite */
+ case 181:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "l_size_t";
+ break;
+ case 3:
+ p = "l_loff_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_chown16 */
+ case 182:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_uid16_t";
+ break;
+ case 2:
+ p = "l_gid16_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getcwd */
+ case 183:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_capget */
+ case 184:
+ switch(ndx) {
+ case 0:
+ p = "struct l_user_cap_header *";
+ break;
+ case 1:
+ p = "struct l_user_cap_data *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_capset */
+ case 185:
+ switch(ndx) {
+ case 0:
+ p = "struct l_user_cap_header *";
+ break;
+ case 1:
+ p = "struct l_user_cap_data *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sigaltstack */
+ case 186:
+ switch(ndx) {
+ case 0:
+ p = "l_stack_t *";
+ break;
+ case 1:
+ p = "l_stack_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sendfile */
+ case 187:
+ break;
+ /* linux_vfork */
+ case 190:
+ break;
+ /* linux_getrlimit */
+ case 191:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "struct l_rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mmap2 */
+ case 192:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "l_ulong";
+ break;
+ case 2:
+ p = "l_ulong";
+ break;
+ case 3:
+ p = "l_ulong";
+ break;
+ case 4:
+ p = "l_ulong";
+ break;
+ case 5:
+ p = "l_ulong";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_truncate64 */
+ case 193:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_loff_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_ftruncate64 */
+ case 194:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_loff_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_stat64 */
+ case 195:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct l_stat64 *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lstat64 */
+ case 196:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct l_stat64 *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fstat64 */
+ case 197:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "struct l_stat64 *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lchown */
+ case 198:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_uid_t";
+ break;
+ case 2:
+ p = "l_gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getuid */
+ case 199:
+ break;
+ /* linux_getgid */
+ case 200:
+ break;
+ /* geteuid */
+ case 201:
+ break;
+ /* getegid */
+ case 202:
+ break;
+ /* setreuid */
+ case 203:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setregid */
+ case 204:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ case 1:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getgroups */
+ case 205:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setgroups */
+ case 206:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "l_gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchown */
+ case 207:
+ break;
+ /* setresuid */
+ case 208:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ case 2:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresuid */
+ case 209:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ case 1:
+ p = "uid_t *";
+ break;
+ case 2:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setresgid */
+ case 210:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ case 1:
+ p = "gid_t";
+ break;
+ case 2:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresgid */
+ case 211:
+ switch(ndx) {
+ case 0:
+ p = "gid_t *";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ case 2:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_chown */
+ case 212:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "l_uid_t";
+ break;
+ case 2:
+ p = "l_gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setuid */
+ case 213:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgid */
+ case 214:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setfsuid */
+ case 215:
+ switch(ndx) {
+ case 0:
+ p = "l_uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_setfsgid */
+ case 216:
+ switch(ndx) {
+ case 0:
+ p = "l_gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pivot_root */
+ case 217:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mincore */
+ case 218:
+ switch(ndx) {
+ case 0:
+ p = "l_ulong";
+ break;
+ case 1:
+ p = "l_size_t";
+ break;
+ case 2:
+ p = "u_char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* madvise */
+ case 219:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_getdents64 */
+ case 220:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fcntl64 */
+ case 221:
+ switch(ndx) {
+ case 0:
+ p = "l_uint";
+ break;
+ case 1:
+ p = "l_uint";
+ break;
+ case 2:
+ p = "uintptr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_gettid */
+ case 224:
+ break;
+ /* linux_setxattr */
+ case 226:
+ break;
+ /* linux_lsetxattr */
+ case 227:
+ break;
+ /* linux_fsetxattr */
+ case 228:
+ break;
+ /* linux_getxattr */
+ case 229:
+ break;
+ /* linux_lgetxattr */
+ case 230:
+ break;
+ /* linux_fgetxattr */
+ case 231:
+ break;
+ /* linux_listxattr */
+ case 232:
+ break;
+ /* linux_llistxattr */
+ case 233:
+ break;
+ /* linux_flistxattr */
+ case 234:
+ break;
+ /* linux_removexattr */
+ case 235:
+ break;
+ /* linux_lremovexattr */
+ case 236:
+ break;
+ /* linux_fremovexattr */
+ case 237:
+ break;
+ /* linux_tkill */
+ case 238:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sys_futex */
+ case 240:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "uint32_t";
+ break;
+ case 3:
+ p = "struct l_timespec *";
+ break;
+ case 4:
+ p = "uint32_t *";
+ break;
+ case 5:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_setaffinity */
+ case 241:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "l_uint";
+ break;
+ case 2:
+ p = "l_ulong *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_sched_getaffinity */
+ case 242:
+ switch(ndx) {
+ case 0:
+ p = "l_pid_t";
+ break;
+ case 1:
+ p = "l_uint";
+ break;
+ case 2:
+ p = "l_ulong *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_set_thread_area */
+ case 243:
+ switch(ndx) {
+ case 0:
+ p = "struct l_user_desc *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fadvise64 */
+ case 250:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "l_loff_t";
+ break;
+ case 2:
+ p = "l_size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_exit_group */
+ case 252:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_lookup_dcookie */
+ case 253:
+ break;
+ /* linux_epoll_create */
+ case 254:
+ break;
+ /* linux_epoll_ctl */
+ case 255:
+ break;
+ /* linux_epoll_wait */
+ case 256:
+ break;
+ /* linux_remap_file_pages */
+ case 257:
+ break;
+ /* linux_set_tid_address */
+ case 258:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_timer_create */
+ case 259:
+ break;
+ /* linux_timer_settime */
+ case 260:
+ break;
+ /* linux_timer_gettime */
+ case 261:
+ break;
+ /* linux_timer_getoverrun */
+ case 262:
+ break;
+ /* linux_timer_delete */
+ case 263:
+ break;
+ /* linux_clock_settime */
+ case 264:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_clock_gettime */
+ case 265:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_clock_getres */
+ case 266:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_clock_nanosleep */
+ case 267:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "struct l_timespec *";
+ break;
+ case 3:
+ p = "struct l_timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_statfs64 */
+ case 268:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "struct l_statfs64_buf *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fstatfs64 */
+ case 269:
+ break;
+ /* linux_tgkill */
+ case 270:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_utimes */
+ case 271:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct l_timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fadvise64_64 */
+ case 272:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "l_loff_t";
+ break;
+ case 2:
+ p = "l_loff_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mbind */
+ case 274:
+ break;
+ /* linux_get_mempolicy */
+ case 275:
+ break;
+ /* linux_set_mempolicy */
+ case 276:
+ break;
+ /* linux_mq_open */
+ case 277:
+ break;
+ /* linux_mq_unlink */
+ case 278:
+ break;
+ /* linux_mq_timedsend */
+ case 279:
+ break;
+ /* linux_mq_timedreceive */
+ case 280:
+ break;
+ /* linux_mq_notify */
+ case 281:
+ break;
+ /* linux_mq_getsetattr */
+ case 282:
+ break;
+ /* linux_kexec_load */
+ case 283:
+ break;
+ /* linux_waitid */
+ case 284:
+ break;
+ /* linux_add_key */
+ case 286:
+ break;
+ /* linux_request_key */
+ case 287:
+ break;
+ /* linux_keyctl */
+ case 288:
+ break;
+ /* linux_ioprio_set */
+ case 289:
+ break;
+ /* linux_ioprio_get */
+ case 290:
+ break;
+ /* linux_inotify_init */
+ case 291:
+ break;
+ /* linux_inotify_add_watch */
+ case 292:
+ break;
+ /* linux_inotify_rm_watch */
+ case 293:
+ break;
+ /* linux_migrate_pages */
+ case 294:
+ break;
+ /* linux_openat */
+ case 295:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mkdirat */
+ case 296:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_mknodat */
+ case 297:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "l_uint";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fchownat */
+ case 298:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_uid16_t";
+ break;
+ case 3:
+ p = "l_gid16_t";
+ break;
+ case 4:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_futimesat */
+ case 299:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct l_timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fstatat64 */
+ case 300:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct l_stat64 *";
+ break;
+ case 3:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_unlinkat */
+ case 301:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_renameat */
+ case 302:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_linkat */
+ case 303:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "const char *";
+ break;
+ case 4:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_symlinkat */
+ case 304:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_readlinkat */
+ case 305:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_fchmodat */
+ case 306:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_faccessat */
+ case 307:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "l_int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_pselect6 */
+ case 308:
+ break;
+ /* linux_ppoll */
+ case 309:
+ break;
+ /* linux_unshare */
+ case 310:
+ break;
+ /* linux_set_robust_list */
+ case 311:
+ switch(ndx) {
+ case 0:
+ p = "struct linux_robust_list_head *";
+ break;
+ case 1:
+ p = "l_size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_get_robust_list */
+ case 312:
+ switch(ndx) {
+ case 0:
+ p = "l_int";
+ break;
+ case 1:
+ p = "struct linux_robust_list_head *";
+ break;
+ case 2:
+ p = "l_size_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_splice */
+ case 313:
+ break;
+ /* linux_sync_file_range */
+ case 314:
+ break;
+ /* linux_tee */
+ case 315:
+ break;
+ /* linux_vmsplice */
+ case 316:
+ break;
+ /* linux_move_pages */
+ case 317:
+ break;
+ /* linux_getcpu */
+ case 318:
+ break;
+ /* linux_epoll_pwait */
+ case 319:
+ break;
+ /* linux_utimensat */
+ case 320:
+ break;
+ /* linux_signalfd */
+ case 321:
+ break;
+ /* linux_timerfd_create */
+ case 322:
+ break;
+ /* linux_eventfd */
+ case 323:
+ break;
+ /* linux_fallocate */
+ case 324:
+ break;
+ /* linux_timerfd_settime */
+ case 325:
+ break;
+ /* linux_timerfd_gettime */
+ case 326:
+ break;
+ /* linux_signalfd4 */
+ case 327:
+ break;
+ /* linux_eventfd2 */
+ case 328:
+ break;
+ /* linux_epoll_create1 */
+ case 329:
+ break;
+ /* linux_dup3 */
+ case 330:
+ break;
+ /* linux_pipe2 */
+ case 331:
+ switch(ndx) {
+ case 0:
+ p = "l_int *";
+ break;
+ case 1:
+ p = "l_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linux_inotify_init1 */
+ case 332:
+ break;
+ /* linux_preadv */
+ case 333:
+ break;
+ /* linux_pwritev */
+ case 334:
+ break;
+ /* linux_rt_tsigqueueinfo */
+ case 335:
+ break;
+ /* linux_perf_event_open */
+ case 336:
+ break;
+ /* linux_recvmmsg */
+ case 337:
+ break;
+ /* linux_fanotify_init */
+ case 338:
+ break;
+ /* linux_fanotify_mark */
+ case 339:
+ break;
+ /* linux_prlimit64 */
+ case 340:
+ break;
+ /* linux_name_to_handle_at */
+ case 341:
+ break;
+ /* linux_open_by_handle_at */
+ case 342:
+ break;
+ /* linux_clock_adjtime */
+ case 343:
+ break;
+ /* linux_syncfs */
+ case 344:
+ break;
+ /* linux_sendmmsg */
+ case 345:
+ break;
+ /* linux_setns */
+ case 346:
+ break;
+ /* linux_process_vm_readv */
+ case 347:
+ break;
+ /* linux_process_vm_writev */
+ case 348:
+ break;
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
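
The switch that closes above is generated from the linux32 syscalls.master table and, judging by its sibling below, is presumably named systrace_entry_setargdesc(); it maps a syscall number and argument index to a type string for the DTrace systrace provider. A minimal calling sketch, grounded in the case 37 (linux_kill) entry above (the function name and signature are inferred from the return-side variant that follows):

	char desc[64];

	/* Ask for the type of linux_kill(2)'s second argument (ndx 1). */
	systrace_entry_setargdesc(37, 1, desc, sizeof(desc));
	/* desc now holds "l_int"; unknown sysnum/ndx pairs leave desc unchanged,
	 * because p stays NULL and strlcpy() is never reached. */
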
+static void
+systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+#define nosys linux_nosys
+ /* sys_exit */
+ case 1:
+ if (ndx == 0 || ndx == 1)
+ p = "void";
+ break;
+ /* linux_fork */
+ case 2:
+ /* read */
+ case 3:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* write */
+ case 4:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_open */
+ case 5:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* close */
+ case 6:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_waitpid */
+ case 7:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_creat */
+ case 8:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_link */
+ case 9:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_unlink */
+ case 10:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_execve */
+ case 11:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_chdir */
+ case 12:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_time */
+ case 13:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mknod */
+ case 14:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_chmod */
+ case 15:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lchown16 */
+ case 16:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_stat */
+ case 18:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lseek */
+ case 19:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getpid */
+ case 20:
+ /* linux_mount */
+ case 21:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_oldumount */
+ case 22:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setuid16 */
+ case 23:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getuid16 */
+ case 24:
+ /* linux_stime */
+ case 25:
+ /* linux_ptrace */
+ case 26:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_alarm */
+ case 27:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pause */
+ case 29:
+ /* linux_utime */
+ case 30:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_access */
+ case 33:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_nice */
+ case 34:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sync */
+ case 36:
+ /* linux_kill */
+ case 37:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rename */
+ case 38:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mkdir */
+ case 39:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rmdir */
+ case 40:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* dup */
+ case 41:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pipe */
+ case 42:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_times */
+ case 43:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_brk */
+ case 45:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setgid16 */
+ case 46:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getgid16 */
+ case 47:
+ /* linux_signal */
+ case 48:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_geteuid16 */
+ case 49:
+ /* linux_getegid16 */
+ case 50:
+ /* acct */
+ case 51:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_umount */
+ case 52:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_ioctl */
+ case 54:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fcntl */
+ case 55:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setpgid */
+ case 57:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_olduname */
+ case 59:
+ /* umask */
+ case 60:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chroot */
+ case 61:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_ustat */
+ case 62:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* dup2 */
+ case 63:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getppid */
+ case 64:
+ /* getpgrp */
+ case 65:
+ /* setsid */
+ case 66:
+ /* linux_sigaction */
+ case 67:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sgetmask */
+ case 68:
+ /* linux_ssetmask */
+ case 69:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setreuid16 */
+ case 70:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setregid16 */
+ case 71:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sigsuspend */
+ case 72:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sigpending */
+ case 73:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sethostname */
+ case 74:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setrlimit */
+ case 75:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_old_getrlimit */
+ case 76:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getrusage */
+ case 77:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_gettimeofday */
+ case 78:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_settimeofday */
+ case 79:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getgroups16 */
+ case 80:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setgroups16 */
+ case 81:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_old_select */
+ case 82:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_symlink */
+ case 83:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lstat */
+ case 84:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_readlink */
+ case 85:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapon */
+ case 87:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_reboot */
+ case 88:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_readdir */
+ case 89:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mmap */
+ case 90:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munmap */
+ case 91:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_truncate */
+ case 92:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_ftruncate */
+ case 93:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchmod */
+ case 94:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchown */
+ case 95:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getpriority */
+ case 96:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setpriority */
+ case 97:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_statfs */
+ case 99:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fstatfs */
+ case 100:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_socketcall */
+ case 102:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_syslog */
+ case 103:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setitimer */
+ case 104:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getitimer */
+ case 105:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_newstat */
+ case 106:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_newlstat */
+ case 107:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_newfstat */
+ case 108:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_uname */
+ case 109:
+ /* linux_iopl */
+ case 110:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_vhangup */
+ case 111:
+ /* linux_wait4 */
+ case 114:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_swapoff */
+ case 115:
+ /* linux_sysinfo */
+ case 116:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_ipc */
+ case 117:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fsync */
+ case 118:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sigreturn */
+ case 119:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_clone */
+ case 120:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setdomainname */
+ case 121:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_newuname */
+ case 122:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_adjtimex */
+ case 124:
+ /* linux_mprotect */
+ case 125:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sigprocmask */
+ case 126:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_create_module */
+ case 127:
+ /* linux_init_module */
+ case 128:
+ /* linux_delete_module */
+ case 129:
+ /* linux_get_kernel_syms */
+ case 130:
+ /* linux_quotactl */
+ case 131:
+ /* getpgid */
+ case 132:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchdir */
+ case 133:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_bdflush */
+ case 134:
+ /* linux_sysfs */
+ case 135:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_personality */
+ case 136:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setfsuid16 */
+ case 138:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setfsgid16 */
+ case 139:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_llseek */
+ case 140:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getdents */
+ case 141:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_select */
+ case 142:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* flock */
+ case 143:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_msync */
+ case 144:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_readv */
+ case 145:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_writev */
+ case 146:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getsid */
+ case 147:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fdatasync */
+ case 148:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sysctl */
+ case 149:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mlock */
+ case 150:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlock */
+ case 151:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mlockall */
+ case 152:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlockall */
+ case 153:
+ /* sched_setparam */
+ case 154:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_getparam */
+ case 155:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_setscheduler */
+ case 156:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_getscheduler */
+ case 157:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_yield */
+ case 158:
+ /* linux_sched_get_priority_max */
+ case 159:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_get_priority_min */
+ case 160:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_rr_get_interval */
+ case 161:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_nanosleep */
+ case 162:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mremap */
+ case 163:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setresuid16 */
+ case 164:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getresuid16 */
+ case 165:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_query_module */
+ case 167:
+ /* poll */
+ case 168:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_nfsservctl */
+ case 169:
+ /* linux_setresgid16 */
+ case 170:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getresgid16 */
+ case 171:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_prctl */
+ case 172:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigreturn */
+ case 173:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigaction */
+ case 174:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigprocmask */
+ case 175:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigpending */
+ case 176:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigtimedwait */
+ case 177:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_rt_sigqueueinfo */
+ case 178:
+ /* linux_rt_sigsuspend */
+ case 179:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pread */
+ case 180:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pwrite */
+ case 181:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_chown16 */
+ case 182:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getcwd */
+ case 183:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_capget */
+ case 184:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_capset */
+ case 185:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sigaltstack */
+ case 186:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sendfile */
+ case 187:
+ /* linux_vfork */
+ case 190:
+ /* linux_getrlimit */
+ case 191:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mmap2 */
+ case 192:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_truncate64 */
+ case 193:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_ftruncate64 */
+ case 194:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_stat64 */
+ case 195:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lstat64 */
+ case 196:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fstat64 */
+ case 197:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lchown */
+ case 198:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getuid */
+ case 199:
+ /* linux_getgid */
+ case 200:
+ /* geteuid */
+ case 201:
+ /* getegid */
+ case 202:
+ /* setreuid */
+ case 203:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setregid */
+ case 204:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getgroups */
+ case 205:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setgroups */
+ case 206:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchown */
+ case 207:
+ /* setresuid */
+ case 208:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresuid */
+ case 209:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setresgid */
+ case 210:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresgid */
+ case 211:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_chown */
+ case 212:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setuid */
+ case 213:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgid */
+ case 214:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setfsuid */
+ case 215:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_setfsgid */
+ case 216:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pivot_root */
+ case 217:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mincore */
+ case 218:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* madvise */
+ case 219:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_getdents64 */
+ case 220:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fcntl64 */
+ case 221:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_gettid */
+ case 224:
+ /* linux_setxattr */
+ case 226:
+ /* linux_lsetxattr */
+ case 227:
+ /* linux_fsetxattr */
+ case 228:
+ /* linux_getxattr */
+ case 229:
+ /* linux_lgetxattr */
+ case 230:
+ /* linux_fgetxattr */
+ case 231:
+ /* linux_listxattr */
+ case 232:
+ /* linux_llistxattr */
+ case 233:
+ /* linux_flistxattr */
+ case 234:
+ /* linux_removexattr */
+ case 235:
+ /* linux_lremovexattr */
+ case 236:
+ /* linux_fremovexattr */
+ case 237:
+ /* linux_tkill */
+ case 238:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sys_futex */
+ case 240:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_setaffinity */
+ case 241:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_sched_getaffinity */
+ case 242:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_set_thread_area */
+ case 243:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fadvise64 */
+ case 250:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_exit_group */
+ case 252:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_lookup_dcookie */
+ case 253:
+ /* linux_epoll_create */
+ case 254:
+ /* linux_epoll_ctl */
+ case 255:
+ /* linux_epoll_wait */
+ case 256:
+ /* linux_remap_file_pages */
+ case 257:
+ /* linux_set_tid_address */
+ case 258:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_timer_create */
+ case 259:
+ /* linux_timer_settime */
+ case 260:
+ /* linux_timer_gettime */
+ case 261:
+ /* linux_timer_getoverrun */
+ case 262:
+ /* linux_timer_delete */
+ case 263:
+ /* linux_clock_settime */
+ case 264:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_clock_gettime */
+ case 265:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_clock_getres */
+ case 266:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_clock_nanosleep */
+ case 267:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_statfs64 */
+ case 268:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fstatfs64 */
+ case 269:
+ /* linux_tgkill */
+ case 270:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_utimes */
+ case 271:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fadvise64_64 */
+ case 272:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mbind */
+ case 274:
+ /* linux_get_mempolicy */
+ case 275:
+ /* linux_set_mempolicy */
+ case 276:
+ /* linux_mq_open */
+ case 277:
+ /* linux_mq_unlink */
+ case 278:
+ /* linux_mq_timedsend */
+ case 279:
+ /* linux_mq_timedreceive */
+ case 280:
+ /* linux_mq_notify */
+ case 281:
+ /* linux_mq_getsetattr */
+ case 282:
+ /* linux_kexec_load */
+ case 283:
+ /* linux_waitid */
+ case 284:
+ /* linux_add_key */
+ case 286:
+ /* linux_request_key */
+ case 287:
+ /* linux_keyctl */
+ case 288:
+ /* linux_ioprio_set */
+ case 289:
+ /* linux_ioprio_get */
+ case 290:
+ /* linux_inotify_init */
+ case 291:
+ /* linux_inotify_add_watch */
+ case 292:
+ /* linux_inotify_rm_watch */
+ case 293:
+ /* linux_migrate_pages */
+ case 294:
+ /* linux_openat */
+ case 295:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mkdirat */
+ case 296:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_mknodat */
+ case 297:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fchownat */
+ case 298:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_futimesat */
+ case 299:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fstatat64 */
+ case 300:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_unlinkat */
+ case 301:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_renameat */
+ case 302:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_linkat */
+ case 303:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_symlinkat */
+ case 304:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_readlinkat */
+ case 305:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_fchmodat */
+ case 306:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_faccessat */
+ case 307:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_pselect6 */
+ case 308:
+ /* linux_ppoll */
+ case 309:
+ /* linux_unshare */
+ case 310:
+ /* linux_set_robust_list */
+ case 311:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_get_robust_list */
+ case 312:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_splice */
+ case 313:
+ /* linux_sync_file_range */
+ case 314:
+ /* linux_tee */
+ case 315:
+ /* linux_vmsplice */
+ case 316:
+ /* linux_move_pages */
+ case 317:
+ /* linux_getcpu */
+ case 318:
+ /* linux_epoll_pwait */
+ case 319:
+ /* linux_utimensat */
+ case 320:
+ /* linux_signalfd */
+ case 321:
+ /* linux_timerfd_create */
+ case 322:
+ /* linux_eventfd */
+ case 323:
+ /* linux_fallocate */
+ case 324:
+ /* linux_timerfd_settime */
+ case 325:
+ /* linux_timerfd_gettime */
+ case 326:
+ /* linux_signalfd4 */
+ case 327:
+ /* linux_eventfd2 */
+ case 328:
+ /* linux_epoll_create1 */
+ case 329:
+ /* linux_dup3 */
+ case 330:
+ /* linux_pipe2 */
+ case 331:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linux_inotify_init1 */
+ case 332:
+ /* linux_preadv */
+ case 333:
+ /* linux_pwritev */
+ case 334:
+ /* linux_rt_tsigqueueinfo */
+ case 335:
+ /* linux_perf_event_open */
+ case 336:
+ /* linux_recvmmsg */
+ case 337:
+ /* linux_fanotify_init */
+ case 338:
+ /* linux_fanotify_mark */
+ case 339:
+ /* linux_prlimit64 */
+ case 340:
+ /* linux_name_to_handle_at */
+ case 341:
+ /* linux_open_by_handle_at */
+ case 342:
+ /* linux_clock_adjtime */
+ case 343:
+ /* linux_syncfs */
+ case 344:
+ /* linux_sendmmsg */
+ case 345:
+ /* linux_setns */
+ case 346:
+ /* linux_process_vm_readv */
+ case 347:
+ /* linux_process_vm_writev */
+ case 348:
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
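
The return-side descriptor follows the same shape: for nearly every syscall, indices 0 and 1 (the return slot and its alternate) are described as "int", and syscalls without a matching case fall through with desc untouched. A matching sketch using the same linux_kill entry:

	char desc[64];

	systrace_return_setargdesc(37 /* linux_kill */, 0, desc, sizeof(desc));
	/* desc now holds "int". */
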
diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c
new file mode 100644
index 0000000..42500da
--- /dev/null
+++ b/sys/amd64/linux32/linux32_sysvec.c
@@ -0,0 +1,1205 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2002 Doug Rabson
+ * Copyright (c) 1998-1999 Andrew Gallatin
+ * Copyright (c) 1994-1996 Søren Schmidt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_compat.h"
+
+#ifndef COMPAT_FREEBSD32
+#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
+#endif
+
+#define __ELF_WORD_SIZE 32
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#include <sys/eventhandler.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/specialreg.h>
+
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+#include <compat/linux/linux_emul.h>
+#include <compat/linux/linux_futex.h>
+#include <compat/linux/linux_ioctl.h>
+#include <compat/linux/linux_mib.h>
+#include <compat/linux/linux_misc.h>
+#include <compat/linux/linux_signal.h>
+#include <compat/linux/linux_util.h>
+
+MODULE_VERSION(linux, 1);
+
+MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
+
+#define AUXARGS_ENTRY_32(pos, id, val) \
+ do { \
+ suword32(pos++, id); \
+ suword32(pos++, val); \
+ } while (0)
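
A usage sketch for the aux-vector macro above, assuming pos is an Elf32_Addr * cursor into the new process's user stack (AT_PAGESZ is the standard ELF auxiliary tag; its use here is purely illustrative):

	/* Emit one 32-bit aux entry and advance the cursor twice. */
	AUXARGS_ENTRY_32(pos, AT_PAGESZ, PAGE_SIZE);
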
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
+
+/*
+ * Allow the sendsig functions to use the ldebug() facility
+ * even though they are not syscalls themselves. Map them
+ * to syscall 0. This is slightly less bogus than using
+ * ldebug(sigreturn).
+ */
+#define LINUX_SYS_linux_rt_sendsig 0
+#define LINUX_SYS_linux_sendsig 0
+
+const char *linux_platform = "i686";
+static int linux_szplatform;
+extern char linux_sigcode[];
+extern int linux_szsigcode;
+
+extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
+
+SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
+SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
+
+static int elf_linux_fixup(register_t **stack_base,
+ struct image_params *iparams);
+static register_t *linux_copyout_strings(struct image_params *imgp);
+static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
+static void exec_linux_setregs(struct thread *td,
+ struct image_params *imgp, u_long stack);
+static void linux32_fixlimit(struct rlimit *rl, int which);
+static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
+
+static eventhandler_tag linux_exit_tag;
+static eventhandler_tag linux_exec_tag;
+
+/*
+ * Linux syscalls return negative errnos; FreeBSD uses positive
+ * errnos, so map between the two here.
+ * Reference:
+ *   FreeBSD: src/sys/sys/errno.h
+ *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
+ *            linux-2.6.17.8/include/asm-generic/errno.h
+ */
+static int bsd_to_linux_errno[ELAST + 1] = {
+ -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
+ -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
+ -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+ -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
+ -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
+ -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
+ -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
+ -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
+ -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
+ -72, -67, -71
+};
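+
+/*
+ * The table is indexed by the FreeBSD errno and holds the negated
+ * Linux value; for example, FreeBSD EAGAIN (35) maps to entry -11,
+ * the negative of Linux EAGAIN (11).
+ */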
+
+int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
+ LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
+ LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
+ LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
+ LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
+ LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
+ LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
+ LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
+ 0, LINUX_SIGUSR1, LINUX_SIGUSR2
+};
+
+int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
+ SIGHUP, SIGINT, SIGQUIT, SIGILL,
+ SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
+ SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
+ SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
+ SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
+ SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
+ SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
+ SIGIO, SIGURG, SIGSYS
+};
+
+#define LINUX_T_UNKNOWN 255
+static int _bsd_to_linux_trapcode[] = {
+ LINUX_T_UNKNOWN, /* 0 */
+ 6, /* 1 T_PRIVINFLT */
+ LINUX_T_UNKNOWN, /* 2 */
+ 3, /* 3 T_BPTFLT */
+ LINUX_T_UNKNOWN, /* 4 */
+ LINUX_T_UNKNOWN, /* 5 */
+ 16, /* 6 T_ARITHTRAP */
+ 254, /* 7 T_ASTFLT */
+ LINUX_T_UNKNOWN, /* 8 */
+ 13, /* 9 T_PROTFLT */
+ 1, /* 10 T_TRCTRAP */
+ LINUX_T_UNKNOWN, /* 11 */
+ 14, /* 12 T_PAGEFLT */
+ LINUX_T_UNKNOWN, /* 13 */
+ 17, /* 14 T_ALIGNFLT */
+ LINUX_T_UNKNOWN, /* 15 */
+ LINUX_T_UNKNOWN, /* 16 */
+ LINUX_T_UNKNOWN, /* 17 */
+ 0, /* 18 T_DIVIDE */
+ 2, /* 19 T_NMI */
+ 4, /* 20 T_OFLOW */
+ 5, /* 21 T_BOUND */
+ 7, /* 22 T_DNA */
+ 8, /* 23 T_DOUBLEFLT */
+ 9, /* 24 T_FPOPFLT */
+ 10, /* 25 T_TSSFLT */
+ 11, /* 26 T_SEGNPFLT */
+ 12, /* 27 T_STKFLT */
+ 18, /* 28 T_MCHK */
+ 19, /* 29 T_XMMFLT */
+ 15 /* 30 T_RESERVED */
+};
+#define bsd_to_linux_trapcode(code) \
+	((code) < sizeof(_bsd_to_linux_trapcode) / \
+	    sizeof(*_bsd_to_linux_trapcode) ? \
+	    _bsd_to_linux_trapcode[(code)] : \
+	    LINUX_T_UNKNOWN)
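+
+/*
+ * Out-of-range trap codes fall back to LINUX_T_UNKNOWN; in-range
+ * codes go through the table above, e.g. FreeBSD T_PAGEFLT (12)
+ * becomes Linux trap 14 (the page-fault vector).
+ */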
+
+struct linux32_ps_strings {
+ u_int32_t ps_argvstr; /* first of 0 or more argument strings */
+ u_int ps_nargvstr; /* the number of argument strings */
+ u_int32_t ps_envstr; /* first of 0 or more environment strings */
+ u_int ps_nenvstr; /* the number of environment strings */
+};
+
+/*
+ * If FreeBSD & Linux have a difference of opinion about what a trap
+ * means, deal with it here.
+ *
+ * MPSAFE
+ */
+static int
+translate_traps(int signal, int trap_code)
+{
+	if (signal != SIGBUS)
+		return (signal);
+	switch (trap_code) {
+	case T_PROTFLT:
+	case T_TSSFLT:
+	case T_DOUBLEFLT:
+	case T_PAGEFLT:
+		return (SIGSEGV);
+	default:
+		return (signal);
+	}
+}
+
+static int
+elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
+{
+ Elf32_Auxargs *args;
+ Elf32_Addr *base;
+ Elf32_Addr *pos, *uplatform;
+ struct linux32_ps_strings *arginfo;
+
+ arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
+ uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
+
+ KASSERT(curthread->td_proc == imgp->proc,
+ ("unsafe elf_linux_fixup(), should be curproc"));
+ base = (Elf32_Addr *)*stack_base;
+ args = (Elf32_Auxargs *)imgp->auxargs;
+ pos = base + (imgp->args->argc + imgp->args->envc + 2);
+
+ AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
+
+	/*
+	 * Do not export AT_CLKTCK when emulating Linux kernels prior to
+	 * 2.4.0, as it first appeared in 2.4.0-rc7.  When exported,
+	 * AT_CLKTCK is returned by sysconf(_SC_CLK_TCK); glibc falls back
+	 * to the hard-coded CLK_TCK value when the aux entry is not
+	 * present.
+	 * Also see the linux_times() implementation.
+	 */
+ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
+ AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
+ AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
+ AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
+ AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
+ AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
+ AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
+ AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
+ AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
+ if (args->execfd != -1)
+ AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
+ AUXARGS_ENTRY_32(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ base--;
+ suword32(base, (uint32_t)imgp->args->argc);
+ *stack_base = (register_t *)base;
+	return (0);
+}
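+
+/*
+ * After elf_linux_fixup() the new 32-bit stack, from the final stack
+ * pointer upwards, has the layout Linux binaries expect:
+ *
+ *	argc
+ *	argv[0..argc-1], NULL
+ *	envp[0..envc-1], NULL
+ *	auxv {id, value} pairs, terminated by AT_NULL
+ */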
+
+extern unsigned long linux_sznonrtsigcode;
+
+static void
+linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ struct sigacts *psp;
+ struct trapframe *regs;
+ struct l_rt_sigframe *fp, frame;
+ int oonstack;
+ int sig;
+ int code;
+
+ sig = ksi->ksi_signo;
+ code = ksi->ksi_code;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ psp = p->p_sigacts;
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
+#ifdef DEBUG
+ if (ldebug(rt_sendsig))
+ printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
+ catcher, sig, (void*)mask, code);
+#endif
+ /*
+ * Allocate space for the signal handler context.
+ */
+ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig)) {
+ fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
+ td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
+ } else
+ fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
+ mtx_unlock(&psp->ps_mtx);
+
+ /*
+ * Build the argument list for the signal handler.
+ */
+ if (p->p_sysent->sv_sigtbl)
+ if (sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ bzero(&frame, sizeof(frame));
+
+ frame.sf_handler = PTROUT(catcher);
+ frame.sf_sig = sig;
+ frame.sf_siginfo = PTROUT(&fp->sf_si);
+ frame.sf_ucontext = PTROUT(&fp->sf_sc);
+
+ /* Fill in POSIX parts */
+ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
+
+ /*
+ * Build the signal context to be used by sigreturn.
+ */
+ frame.sf_sc.uc_flags = 0; /* XXX ??? */
+ frame.sf_sc.uc_link = 0; /* XXX ??? */
+
+ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
+ frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
+ frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
+ PROC_UNLOCK(p);
+
+ bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
+
+ frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
+ frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
+ frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
+ frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
+ frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
+ frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
+ frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
+ frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
+ frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
+ frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
+ frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
+ frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
+ frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
+ frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
+ frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
+ frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
+ frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
+ frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
+ frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
+ frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
+
+#ifdef DEBUG
+ if (ldebug(rt_sendsig))
+ printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
+ frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
+ td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
+#endif
+
+ if (copyout(&frame, fp, sizeof(frame)) != 0) {
+ /*
+ * Process has trashed its stack; give it an illegal
+ * instruction to halt it in its tracks.
+ */
+#ifdef DEBUG
+ if (ldebug(rt_sendsig))
+ printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
+ fp, oonstack);
+#endif
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ /*
+ * Build context to run handler in.
+ */
+ regs->tf_rsp = PTROUT(fp);
+ regs->tf_rip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucode32sel;
+ regs->tf_ss = _udatasel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _ufssel;
+ regs->tf_gs = _ugssel;
+ regs->tf_flags = TF_HASSEGS;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
+
+/*
+ * Send an interrupt to the process.
+ *
+ * The stack is set up so that the signal trampoline (sigcode) calls
+ * the handler, followed by a call to the sigreturn routine below.
+ * After sigreturn resets the signal mask, the stack, and the frame
+ * pointer, it returns to the user-specified pc and psl.
+ */
+static void
+linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ struct sigacts *psp;
+ struct trapframe *regs;
+ struct l_sigframe *fp, frame;
+ l_sigset_t lmask;
+ int oonstack, i;
+ int sig, code;
+
+ sig = ksi->ksi_signo;
+ code = ksi->ksi_code;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ psp = p->p_sigacts;
+ mtx_assert(&psp->ps_mtx, MA_OWNED);
+ if (SIGISMEMBER(psp->ps_siginfo, sig)) {
+ /* Signal handler installed with SA_SIGINFO. */
+ linux_rt_sendsig(catcher, ksi, mask);
+ return;
+ }
+
+ regs = td->td_frame;
+ oonstack = sigonstack(regs->tf_rsp);
+
+#ifdef DEBUG
+ if (ldebug(sendsig))
+ printf(ARGS(sendsig, "%p, %d, %p, %u"),
+ catcher, sig, (void*)mask, code);
+#endif
+
+ /*
+ * Allocate space for the signal handler context.
+ */
+ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
+ SIGISMEMBER(psp->ps_sigonstack, sig)) {
+ fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
+ td->td_sigstk.ss_size - sizeof(struct l_sigframe));
+ } else
+ fp = (struct l_sigframe *)regs->tf_rsp - 1;
+ mtx_unlock(&psp->ps_mtx);
+ PROC_UNLOCK(p);
+
+ /*
+ * Build the argument list for the signal handler.
+ */
+ if (p->p_sysent->sv_sigtbl)
+ if (sig <= p->p_sysent->sv_sigsize)
+ sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
+
+ bzero(&frame, sizeof(frame));
+
+ frame.sf_handler = PTROUT(catcher);
+ frame.sf_sig = sig;
+
+ bsd_to_linux_sigset(mask, &lmask);
+
+ /*
+ * Build the signal context to be used by sigreturn.
+ */
+ frame.sf_sc.sc_mask = lmask.__bits[0];
+ frame.sf_sc.sc_gs = regs->tf_gs;
+ frame.sf_sc.sc_fs = regs->tf_fs;
+ frame.sf_sc.sc_es = regs->tf_es;
+ frame.sf_sc.sc_ds = regs->tf_ds;
+ frame.sf_sc.sc_edi = regs->tf_rdi;
+ frame.sf_sc.sc_esi = regs->tf_rsi;
+ frame.sf_sc.sc_ebp = regs->tf_rbp;
+ frame.sf_sc.sc_ebx = regs->tf_rbx;
+ frame.sf_sc.sc_edx = regs->tf_rdx;
+ frame.sf_sc.sc_ecx = regs->tf_rcx;
+ frame.sf_sc.sc_eax = regs->tf_rax;
+ frame.sf_sc.sc_eip = regs->tf_rip;
+ frame.sf_sc.sc_cs = regs->tf_cs;
+ frame.sf_sc.sc_eflags = regs->tf_rflags;
+ frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
+ frame.sf_sc.sc_ss = regs->tf_ss;
+ frame.sf_sc.sc_err = regs->tf_err;
+ frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
+ frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
+
+ for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
+ frame.sf_extramask[i] = lmask.__bits[i+1];
+
+ if (copyout(&frame, fp, sizeof(frame)) != 0) {
+ /*
+ * Process has trashed its stack; give it an illegal
+ * instruction to halt it in its tracks.
+ */
+ PROC_LOCK(p);
+ sigexit(td, SIGILL);
+ }
+
+ /*
+ * Build context to run handler in.
+ */
+ regs->tf_rsp = PTROUT(fp);
+ regs->tf_rip = p->p_sysent->sv_sigcode_base;
+ regs->tf_rflags &= ~(PSL_T | PSL_D);
+ regs->tf_cs = _ucode32sel;
+ regs->tf_ss = _udatasel;
+ regs->tf_ds = _udatasel;
+ regs->tf_es = _udatasel;
+ regs->tf_fs = _ufssel;
+ regs->tf_gs = _ugssel;
+ regs->tf_flags = TF_HASSEGS;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+ PROC_LOCK(p);
+ mtx_lock(&psp->ps_mtx);
+}
+
+/*
+ * System call to clean up state after a signal has been taken.
+ * Reset the signal mask and stack state from the context left by
+ * sendsig (above).  Return to the previous pc and psl as specified
+ * by that context.  Check carefully to make sure that the user has
+ * not modified the psl to gain improper privileges or to cause a
+ * machine fault.
+ */
+int
+linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
+{
+ struct l_sigframe frame;
+ struct trapframe *regs;
+ sigset_t bmask;
+ l_sigset_t lmask;
+ int eflags, i;
+ ksiginfo_t ksi;
+
+ regs = td->td_frame;
+
+#ifdef DEBUG
+ if (ldebug(sigreturn))
+ printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
+#endif
+ /*
+ * The trampoline code hands us the sigframe.
+ * It is unsafe to keep track of it ourselves, in the event that a
+ * program jumps out of a signal handler.
+ */
+ if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
+ return (EFAULT);
+
+ /*
+ * Check for security violations.
+ */
+#define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+ eflags = frame.sf_sc.sc_eflags;
+ /*
+ * XXX do allow users to change the privileged flag PSL_RF. The
+ * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
+ * sometimes set it there too. tf_eflags is kept in the signal
+ * context during signal handling and there is no other place
+ * to remember it, so the PSL_RF bit may be corrupted by the
+ * signal handler without us knowing. Corruption of the PSL_RF
+ * bit at worst causes one more or one less debugger trap, so
+ * allowing it is fairly harmless.
+ */
+ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
+		return (EINVAL);
+
+ /*
+ * Don't allow users to load a valid privileged %cs. Let the
+ * hardware check for invalid selectors, excess privilege in
+ * other selectors, invalid %eip's and invalid %esp's.
+ */
+#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
+ if (!CS_SECURE(frame.sf_sc.sc_cs)) {
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+		return (EINVAL);
+ }
+
+ lmask.__bits[0] = frame.sf_sc.sc_mask;
+ for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
+ lmask.__bits[i+1] = frame.sf_extramask[i];
+ linux_to_bsd_sigset(&lmask, &bmask);
+ kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
+
+ /*
+ * Restore signal context.
+ */
+ regs->tf_rdi = frame.sf_sc.sc_edi;
+ regs->tf_rsi = frame.sf_sc.sc_esi;
+ regs->tf_rbp = frame.sf_sc.sc_ebp;
+ regs->tf_rbx = frame.sf_sc.sc_ebx;
+ regs->tf_rdx = frame.sf_sc.sc_edx;
+ regs->tf_rcx = frame.sf_sc.sc_ecx;
+ regs->tf_rax = frame.sf_sc.sc_eax;
+ regs->tf_rip = frame.sf_sc.sc_eip;
+ regs->tf_cs = frame.sf_sc.sc_cs;
+ regs->tf_ds = frame.sf_sc.sc_ds;
+ regs->tf_es = frame.sf_sc.sc_es;
+ regs->tf_fs = frame.sf_sc.sc_fs;
+ regs->tf_gs = frame.sf_sc.sc_gs;
+ regs->tf_rflags = eflags;
+ regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
+ regs->tf_ss = frame.sf_sc.sc_ss;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+
+ return (EJUSTRETURN);
+}
+
+/*
+ * System call to clean up state after a signal has been taken.
+ * Reset the signal mask and stack state from the context left by
+ * rt_sendsig (above).  Return to the previous pc and psl as
+ * specified by that context.  Check carefully to make sure that the
+ * user has not modified the psl to gain improper privileges or to
+ * cause a machine fault.
+ */
+int
+linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
+{
+ struct l_ucontext uc;
+ struct l_sigcontext *context;
+ sigset_t bmask;
+ l_stack_t *lss;
+ stack_t ss;
+ struct trapframe *regs;
+ int eflags;
+ ksiginfo_t ksi;
+
+ regs = td->td_frame;
+
+#ifdef DEBUG
+ if (ldebug(rt_sigreturn))
+ printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
+#endif
+ /*
+ * The trampoline code hands us the ucontext.
+ * It is unsafe to keep track of it ourselves, in the event that a
+ * program jumps out of a signal handler.
+ */
+ if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
+ return (EFAULT);
+
+ context = &uc.uc_mcontext;
+
+ /*
+ * Check for security violations.
+ */
+#define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+ eflags = context->sc_eflags;
+ /*
+ * XXX do allow users to change the privileged flag PSL_RF. The
+ * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
+ * sometimes set it there too. tf_eflags is kept in the signal
+ * context during signal handling and there is no other place
+ * to remember it, so the PSL_RF bit may be corrupted by the
+ * signal handler without us knowing. Corruption of the PSL_RF
+ * bit at worst causes one more or one less debugger trap, so
+ * allowing it is fairly harmless.
+ */
+ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
+		return (EINVAL);
+
+ /*
+ * Don't allow users to load a valid privileged %cs. Let the
+ * hardware check for invalid selectors, excess privilege in
+ * other selectors, invalid %eip's and invalid %esp's.
+ */
+#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
+ if (!CS_SECURE(context->sc_cs)) {
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGBUS;
+ ksi.ksi_code = BUS_OBJERR;
+ ksi.ksi_trapno = T_PROTFLT;
+ ksi.ksi_addr = (void *)regs->tf_rip;
+ trapsignal(td, &ksi);
+		return (EINVAL);
+ }
+
+ linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
+ kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
+
+ /*
+ * Restore signal context
+ */
+ regs->tf_gs = context->sc_gs;
+ regs->tf_fs = context->sc_fs;
+ regs->tf_es = context->sc_es;
+ regs->tf_ds = context->sc_ds;
+ regs->tf_rdi = context->sc_edi;
+ regs->tf_rsi = context->sc_esi;
+ regs->tf_rbp = context->sc_ebp;
+ regs->tf_rbx = context->sc_ebx;
+ regs->tf_rdx = context->sc_edx;
+ regs->tf_rcx = context->sc_ecx;
+ regs->tf_rax = context->sc_eax;
+ regs->tf_rip = context->sc_eip;
+ regs->tf_cs = context->sc_cs;
+ regs->tf_rflags = eflags;
+ regs->tf_rsp = context->sc_esp_at_signal;
+ regs->tf_ss = context->sc_ss;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+
+	/*
+	 * Call sigaltstack() and ignore the result.
+	 */
+ lss = &uc.uc_stack;
+ ss.ss_sp = PTRIN(lss->ss_sp);
+ ss.ss_size = lss->ss_size;
+ ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
+
+#ifdef DEBUG
+ if (ldebug(rt_sigreturn))
+ printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
+ ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
+#endif
+ (void)kern_sigaltstack(td, &ss, NULL);
+
+ return (EJUSTRETURN);
+}
+
+static int
+linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
+{
+ struct proc *p;
+ struct trapframe *frame;
+
+ p = td->td_proc;
+ frame = td->td_frame;
+
+ sa->args[0] = frame->tf_rbx;
+ sa->args[1] = frame->tf_rcx;
+ sa->args[2] = frame->tf_rdx;
+ sa->args[3] = frame->tf_rsi;
+ sa->args[4] = frame->tf_rdi;
+ sa->args[5] = frame->tf_rbp; /* Unconfirmed */
+ sa->code = frame->tf_rax;
+
+ if (sa->code >= p->p_sysent->sv_size)
+ sa->callp = &p->p_sysent->sv_table[0];
+ else
+ sa->callp = &p->p_sysent->sv_table[sa->code];
+ sa->narg = sa->callp->sy_narg;
+
+ td->td_retval[0] = 0;
+ td->td_retval[1] = frame->tf_rdx;
+
+ return (0);
+}
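+
+/*
+ * This mirrors the i386 Linux syscall convention: the syscall number
+ * arrives in %eax and up to six arguments in %ebx, %ecx, %edx, %esi,
+ * %edi and %ebp (the sixth is marked unconfirmed above); only the
+ * low 32 bits of the 64-bit trapframe registers are meaningful.
+ */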
+
+/*
+ * If a linux binary is exec'ing something, try this image activator
+ * first. We override standard shell script execution in order to
+ * be able to modify the interpreter path. We only do this if a linux
+ * binary is doing the exec, so we do not create an EXEC module for it.
+ */
+static int exec_linux_imgact_try(struct image_params *iparams);
+
+static int
+exec_linux_imgact_try(struct image_params *imgp)
+{
+ const char *head = (const char *)imgp->image_header;
+ char *rpath;
+ int error = -1;
+
+ /*
+ * The interpreter for shell scripts run from a linux binary needs
+ * to be located in /compat/linux if possible in order to recursively
+ * maintain linux path emulation.
+ */
+ if (((const short *)head)[0] == SHELLMAGIC) {
+		/*
+		 * Run our normal shell image activator.  If it succeeds,
+		 * attempt to use the alternate path for the interpreter.
+		 * If an alternate path is found, use our stringspace to
+		 * store it.
+		 */
+ if ((error = exec_shell_imgact(imgp)) == 0) {
+ linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
+ imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
+ AT_FDCWD);
+ if (rpath != NULL)
+ imgp->args->fname_buf =
+ imgp->interpreter_name = rpath;
+ }
+ }
+ return (error);
+}
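+
+/*
+ * For example, a script starting with "#!/bin/sh" exec'ed by a Linux
+ * binary should have its interpreter rewritten by
+ * linux_emul_convpath() to /compat/linux/bin/sh, provided that path
+ * exists; otherwise the native interpreter is used.
+ */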
+
+/*
+ * Clear registers on exec
+ * XXX copied from ia32_signal.c.
+ */
+static void
+exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
+{
+ struct trapframe *regs = td->td_frame;
+ struct pcb *pcb = td->td_pcb;
+
+ mtx_lock(&dt_lock);
+ if (td->td_proc->p_md.md_ldt != NULL)
+ user_ldt_free(td);
+ else
+ mtx_unlock(&dt_lock);
+
+ critical_enter();
+ wrmsr(MSR_FSBASE, 0);
+ wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
+ pcb->pcb_fsbase = 0;
+ pcb->pcb_gsbase = 0;
+ critical_exit();
+ pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
+
+ bzero((char *)regs, sizeof(struct trapframe));
+ regs->tf_rip = imgp->entry_addr;
+ regs->tf_rsp = stack;
+ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
+ regs->tf_gs = _ugssel;
+ regs->tf_fs = _ufssel;
+ regs->tf_es = _udatasel;
+ regs->tf_ds = _udatasel;
+ regs->tf_ss = _udatasel;
+ regs->tf_flags = TF_HASSEGS;
+ regs->tf_cs = _ucode32sel;
+ regs->tf_rbx = imgp->ps_strings;
+
+ fpstate_drop(td);
+
+ /* Do full restore on return so that we can change to a different %cs */
+ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
+ clear_pcb_flags(pcb, PCB_GS32BIT);
+ td->td_retval[1] = 0;
+}
+
+/*
+ * XXX copied from ia32_sysvec.c.
+ */
+static register_t *
+linux_copyout_strings(struct image_params *imgp)
+{
+ int argc, envc;
+ u_int32_t *vectp;
+ char *stringp, *destp;
+ u_int32_t *stack_base;
+ struct linux32_ps_strings *arginfo;
+
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
+ destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
+ roundup((ARG_MAX - imgp->args->stringspace),
+ sizeof(char *));
+
+ /*
+ * Install LINUX_PLATFORM
+ */
+ copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
+ linux_szplatform);
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs) {
+		/*
+		 * Reserve 'LINUX_AT_COUNT * 2' words for the ELF auxargs
+		 * data; this default is kept for backward compatibility.
+		 */
+ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
+ (LINUX_AT_COUNT * 2);
+		/*
+		 * The '+ 2' is for the null pointers at the end of each
+		 * of the arg and env vector sets, and imgp->auxarg_size
+		 * is room for the arguments of the runtime loader.
+		 */
+ vectp = (u_int32_t *) (destp - (imgp->args->argc +
+ imgp->args->envc + 2 + imgp->auxarg_size) *
+ sizeof(u_int32_t));
+
+ } else
+		/*
+		 * The '+ 2' is for the null pointers at the end of each
+		 * of the arg and env vector sets.
+		 */
+ vectp = (u_int32_t *)(destp - (imgp->args->argc +
+ imgp->args->envc + 2) * sizeof(u_int32_t));
+
+ /*
+ * vectp also becomes our initial stack base
+ */
+ stack_base = vectp;
+
+ stringp = imgp->args->begin_argv;
+ argc = imgp->args->argc;
+ envc = imgp->args->envc;
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
+ suword32(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword32(vectp++, (uint32_t)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword32(vectp++, 0);
+
+ suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
+ suword32(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword32(vectp++, (uint32_t)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword32(vectp, 0);
+
+ return ((register_t *)stack_base);
+}
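+
+/*
+ * Resulting layout at the top of the new user stack, from high to
+ * low addresses:
+ *
+ *	linux32_ps_strings structure	(at LINUX32_PS_STRINGS)
+ *	linux_platform string		("i686")
+ *	SPARE_USRSPACE gap
+ *	argument and environment strings
+ *	argv/envp/auxargs vector table	(the returned stack base)
+ */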
+
+static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
+ "32-bit Linux emulation");
+
+static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
+SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
+ &linux32_maxdsiz, 0, "");
+static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
+SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
+ &linux32_maxssiz, 0, "");
+static u_long linux32_maxvmem = LINUX32_MAXVMEM;
+SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
+ &linux32_maxvmem, 0, "");
+
+static void
+linux32_fixlimit(struct rlimit *rl, int which)
+{
+
+ switch (which) {
+ case RLIMIT_DATA:
+ if (linux32_maxdsiz != 0) {
+ if (rl->rlim_cur > linux32_maxdsiz)
+ rl->rlim_cur = linux32_maxdsiz;
+ if (rl->rlim_max > linux32_maxdsiz)
+ rl->rlim_max = linux32_maxdsiz;
+ }
+ break;
+ case RLIMIT_STACK:
+ if (linux32_maxssiz != 0) {
+ if (rl->rlim_cur > linux32_maxssiz)
+ rl->rlim_cur = linux32_maxssiz;
+ if (rl->rlim_max > linux32_maxssiz)
+ rl->rlim_max = linux32_maxssiz;
+ }
+ break;
+ case RLIMIT_VMEM:
+ if (linux32_maxvmem != 0) {
+ if (rl->rlim_cur > linux32_maxvmem)
+ rl->rlim_cur = linux32_maxvmem;
+ if (rl->rlim_max > linux32_maxvmem)
+ rl->rlim_max = linux32_maxvmem;
+ }
+ break;
+ }
+}
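+
+/*
+ * The clamps above are runtime-tunable, e.g.
+ * "sysctl compat.linux32.maxdsiz=536870912" adjusts the data size
+ * limit; setting a knob to 0 disables that clamp entirely.
+ */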
+
+struct sysentvec elf_linux_sysvec = {
+ .sv_size = LINUX_SYS_MAXSYSCALL,
+ .sv_table = linux_sysent,
+ .sv_mask = 0,
+ .sv_sigsize = LINUX_SIGTBLSZ,
+ .sv_sigtbl = bsd_to_linux_signal,
+ .sv_errsize = ELAST + 1,
+ .sv_errtbl = bsd_to_linux_errno,
+ .sv_transtrap = translate_traps,
+ .sv_fixup = elf_linux_fixup,
+ .sv_sendsig = linux_sendsig,
+ .sv_sigcode = linux_sigcode,
+ .sv_szsigcode = &linux_szsigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "Linux ELF32",
+ .sv_coredump = elf32_coredump,
+ .sv_imgact_try = exec_linux_imgact_try,
+ .sv_minsigstksz = LINUX_MINSIGSTKSZ,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = LINUX32_MAXUSER,
+ .sv_usrstack = LINUX32_USRSTACK,
+ .sv_psstrings = LINUX32_PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = linux_copyout_strings,
+ .sv_setregs = exec_linux_setregs,
+ .sv_fixlimit = linux32_fixlimit,
+ .sv_maxssiz = &linux32_maxssiz,
+ .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = linux32_fetch_syscall_args,
+ .sv_syscallnames = NULL,
+ .sv_shared_page_base = LINUX32_SHAREDPAGE,
+ .sv_shared_page_len = PAGE_SIZE,
+ .sv_schedtail = linux_schedtail,
+};
+INIT_SYSENTVEC(elf_sysvec, &elf_linux_sysvec);
+
+static char GNU_ABI_VENDOR[] = "GNU";
+static int GNULINUX_ABI_DESC = 0;
+
+static boolean_t
+linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
+{
+ const Elf32_Word *desc;
+ uintptr_t p;
+
+ p = (uintptr_t)(note + 1);
+ p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
+
+ desc = (const Elf32_Word *)p;
+ if (desc[0] != GNULINUX_ABI_DESC)
+ return (FALSE);
+
+	/*
+	 * For Linux we encode osrel as follows (see linux_mib.c):
+	 * VVVMMMIII (version, major, minor).
+	 */
+ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
+
+ return (TRUE);
+}
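+
+/*
+ * For example, a GNU ABI note for Linux 2.6.18 carries descriptor
+ * words {0, 2, 6, 18}, which encode to an osrel of
+ * 2 * 1000000 + 6 * 1000 + 18 = 2006018.
+ */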
+
+static Elf_Brandnote linux32_brandnote = {
+ .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
+ .hdr.n_descsz = 16, /* XXX at least 16 */
+ .hdr.n_type = 1,
+ .vendor = GNU_ABI_VENDOR,
+ .flags = BN_TRANSLATE_OSREL,
+ .trans_osrel = linux32_trans_osrel
+};
+
+static Elf32_Brandinfo linux_brand = {
+ .brand = ELFOSABI_LINUX,
+ .machine = EM_386,
+ .compat_3_brand = "Linux",
+ .emul_path = "/compat/linux",
+ .interp_path = "/lib/ld-linux.so.1",
+ .sysvec = &elf_linux_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &linux32_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+};
+
+static Elf32_Brandinfo linux_glibc2brand = {
+ .brand = ELFOSABI_LINUX,
+ .machine = EM_386,
+ .compat_3_brand = "Linux",
+ .emul_path = "/compat/linux",
+ .interp_path = "/lib/ld-linux.so.2",
+ .sysvec = &elf_linux_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &linux32_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+};
+
+Elf32_Brandinfo *linux_brandlist[] = {
+ &linux_brand,
+ &linux_glibc2brand,
+ NULL
+};
+
+static int
+linux_elf_modevent(module_t mod, int type, void *data)
+{
+ Elf32_Brandinfo **brandinfo;
+ int error;
+ struct linux_ioctl_handler **lihp;
+ struct linux_device_handler **ldhp;
+
+ error = 0;
+
+	switch (type) {
+ case MOD_LOAD:
+ for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
+ ++brandinfo)
+ if (elf32_insert_brand_entry(*brandinfo) < 0)
+ error = EINVAL;
+ if (error == 0) {
+ SET_FOREACH(lihp, linux_ioctl_handler_set)
+ linux_ioctl_register_handler(*lihp);
+ SET_FOREACH(ldhp, linux_device_handler_set)
+ linux_device_register_handler(*ldhp);
+ mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
+ sx_init(&emul_shared_lock, "emuldata->shared lock");
+ LIST_INIT(&futex_list);
+ mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
+ linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
+ linux_proc_exit, NULL, 1000);
+ linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
+ linux_proc_exec, NULL, 1000);
+ linux_szplatform = roundup(strlen(linux_platform) + 1,
+ sizeof(char *));
+ linux_osd_jail_register();
+ stclohz = (stathz ? stathz : hz);
+ if (bootverbose)
+ printf("Linux ELF exec handler installed\n");
+ } else
+ printf("cannot insert Linux ELF brand handler\n");
+ break;
+ case MOD_UNLOAD:
+ for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
+ ++brandinfo)
+ if (elf32_brand_inuse(*brandinfo))
+ error = EBUSY;
+ if (error == 0) {
+ for (brandinfo = &linux_brandlist[0];
+ *brandinfo != NULL; ++brandinfo)
+ if (elf32_remove_brand_entry(*brandinfo) < 0)
+ error = EINVAL;
+ }
+ if (error == 0) {
+ SET_FOREACH(lihp, linux_ioctl_handler_set)
+ linux_ioctl_unregister_handler(*lihp);
+ SET_FOREACH(ldhp, linux_device_handler_set)
+ linux_device_unregister_handler(*ldhp);
+ mtx_destroy(&emul_lock);
+ sx_destroy(&emul_shared_lock);
+ mtx_destroy(&futex_mtx);
+ EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
+ EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
+ linux_osd_jail_deregister();
+ if (bootverbose)
+ printf("Linux ELF exec handler removed\n");
+ } else
+ printf("Could not deinstall ELF interpreter entry\n");
+ break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (error);
+}
+
+static moduledata_t linux_elf_mod = {
+ "linuxelf",
+ linux_elf_modevent,
+ 0
+};
+
+DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/amd64/linux32/syscalls.conf b/sys/amd64/linux32/syscalls.conf
new file mode 100644
index 0000000..dc0ad82
--- /dev/null
+++ b/sys/amd64/linux32/syscalls.conf
@@ -0,0 +1,11 @@
+# $FreeBSD$
+sysnames="linux32_syscalls.c"
+sysproto="linux32_proto.h"
+sysproto_h=_LINUX_SYSPROTO_H_
+syshdr="linux32_syscall.h"
+syssw="linux32_sysent.c"
+sysmk="/dev/null"
+syscallprefix="LINUX_SYS_"
+switchname="linux_sysent"
+namesname="linux_syscallnames"
+systrace="linux32_systrace_args.c"
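+#
+# These variables are consumed by sys/kern/makesyscalls.sh, which
+# generates the named files from syscalls.master; sysmk points at
+# /dev/null because no syscall.mk file is wanted here.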
diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master
new file mode 100644
index 0000000..c3a10af
--- /dev/null
+++ b/sys/amd64/linux32/syscalls.master
@@ -0,0 +1,561 @@
+ $FreeBSD$
+
+; @(#)syscalls.master 8.1 (Berkeley) 7/19/93
+; System call name/number master file (or rather, slave, from LINUX).
+; Processed to create linux_sysent.c, linux_proto.h and linux_syscall.h.
+
+; Columns: number audit type nargs name alt{name,tag,rtyp}/comments
+; number system call number, must be in order
+; audit the audit event associated with the system call
+; A value of AUE_NULL means no auditing, but it also means that
+; there is no audit event for the call at this time. For the
+; case where the event exists, but we don't want auditing, the
+; event should be #defined to AUE_NULL in audit_kevents.h.
+; type one of STD, OBSOL, UNIMPL
+;	name		pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
+; types:
+; STD always included
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+
+#include "opt_compat.h"
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <compat/linux/linux_sysproto.h>
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+
+; Isn't pretty, but there seems to be no other way to trap nosys
+#define nosys linux_nosys
+
+; #ifdef's, etc. may be included, and are copied to the output files.
+
+0 AUE_NULL UNIMPL setup
+1 AUE_EXIT NOPROTO { void sys_exit(int rval); } exit \
+ sys_exit_args void
+2 AUE_FORK STD { int linux_fork(void); }
+3 AUE_NULL NOPROTO { int read(int fd, char *buf, \
+ u_int nbyte); }
+4 AUE_NULL NOPROTO { int write(int fd, char *buf, \
+ u_int nbyte); }
+5 AUE_OPEN_RWTC STD { int linux_open(char *path, l_int flags, \
+ l_int mode); }
+6 AUE_CLOSE NOPROTO { int close(int fd); }
+7 AUE_WAIT4 STD { int linux_waitpid(l_pid_t pid, \
+ l_int *status, l_int options); }
+8 AUE_CREAT STD { int linux_creat(char *path, \
+ l_int mode); }
+9 AUE_LINK STD { int linux_link(char *path, char *to); }
+10 AUE_UNLINK STD { int linux_unlink(char *path); }
+11 AUE_EXECVE STD { int linux_execve(char *path, uint32_t *argp, \
+ uint32_t *envp); }
+12 AUE_CHDIR STD { int linux_chdir(char *path); }
+13 AUE_NULL STD { int linux_time(l_time_t *tm); }
+14 AUE_MKNOD STD { int linux_mknod(char *path, l_int mode, \
+ l_dev_t dev); }
+15 AUE_CHMOD STD { int linux_chmod(char *path, \
+ l_mode_t mode); }
+16 AUE_LCHOWN STD { int linux_lchown16(char *path, \
+ l_uid16_t uid, l_gid16_t gid); }
+17 AUE_NULL UNIMPL break
+18 AUE_STAT STD { int linux_stat(char *path, \
+ struct linux_stat *up); }
+19 AUE_LSEEK STD { int linux_lseek(l_uint fdes, l_off_t off, \
+ l_int whence); }
+20 AUE_GETPID STD { int linux_getpid(void); }
+21 AUE_MOUNT STD { int linux_mount(char *specialfile, \
+ char *dir, char *filesystemtype, \
+ l_ulong rwflag, void *data); }
+22 AUE_UMOUNT STD { int linux_oldumount(char *path); }
+23 AUE_SETUID STD { int linux_setuid16(l_uid16_t uid); }
+24 AUE_GETUID STD { int linux_getuid16(void); }
+25 AUE_SETTIMEOFDAY STD { int linux_stime(void); }
+26 AUE_PTRACE STD { int linux_ptrace(l_long req, l_long pid, \
+ l_long addr, l_long data); }
+27 AUE_NULL STD { int linux_alarm(l_uint secs); }
+28 AUE_FSTAT UNIMPL fstat
+29 AUE_NULL STD { int linux_pause(void); }
+30 AUE_UTIME STD { int linux_utime(char *fname, \
+ struct l_utimbuf *times); }
+31 AUE_NULL UNIMPL stty
+32 AUE_NULL UNIMPL gtty
+33 AUE_ACCESS STD { int linux_access(char *path, l_int amode); }
+34 AUE_NICE STD { int linux_nice(l_int inc); }
+35 AUE_NULL UNIMPL ftime
+36 AUE_SYNC NOPROTO { int sync(void); }
+37 AUE_KILL STD { int linux_kill(l_int pid, l_int signum); }
+38 AUE_RENAME STD { int linux_rename(char *from, char *to); }
+39 AUE_MKDIR STD { int linux_mkdir(char *path, l_int mode); }
+40 AUE_RMDIR STD { int linux_rmdir(char *path); }
+41 AUE_DUP NOPROTO { int dup(u_int fd); }
+42 AUE_PIPE STD { int linux_pipe(l_int *pipefds); }
+43 AUE_NULL STD { int linux_times(struct l_times_argv *buf); }
+44 AUE_NULL UNIMPL prof
+45 AUE_NULL STD { int linux_brk(l_ulong dsend); }
+46 AUE_SETGID STD { int linux_setgid16(l_gid16_t gid); }
+47 AUE_GETGID STD { int linux_getgid16(void); }
+48 AUE_NULL STD { int linux_signal(l_int sig, \
+ l_handler_t handler); }
+49 AUE_GETEUID STD { int linux_geteuid16(void); }
+50 AUE_GETEGID STD { int linux_getegid16(void); }
+51 AUE_ACCT NOPROTO { int acct(char *path); }
+52 AUE_UMOUNT STD { int linux_umount(char *path, l_int flags); }
+53 AUE_NULL UNIMPL lock
+54 AUE_IOCTL STD { int linux_ioctl(l_uint fd, l_uint cmd, \
+ uintptr_t arg); }
+55 AUE_FCNTL STD { int linux_fcntl(l_uint fd, l_uint cmd, \
+ uintptr_t arg); }
+56 AUE_NULL UNIMPL mpx
+57 AUE_SETPGRP NOPROTO { int setpgid(int pid, int pgid); }
+58 AUE_NULL UNIMPL ulimit
+59 AUE_NULL STD { int linux_olduname(void); }
+60 AUE_UMASK NOPROTO { int umask(int newmask); }
+61 AUE_CHROOT NOPROTO { int chroot(char *path); }
+62 AUE_NULL STD { int linux_ustat(l_dev_t dev, \
+ struct l_ustat *ubuf); }
+63 AUE_DUP2 NOPROTO { int dup2(u_int from, u_int to); }
+64 AUE_GETPPID STD { int linux_getppid(void); }
+65 AUE_GETPGRP NOPROTO { int getpgrp(void); }
+66 AUE_SETSID NOPROTO { int setsid(void); }
+67 AUE_NULL STD { int linux_sigaction(l_int sig, \
+ l_osigaction_t *nsa, \
+ l_osigaction_t *osa); }
+68 AUE_NULL STD { int linux_sgetmask(void); }
+69 AUE_NULL STD { int linux_ssetmask(l_osigset_t mask); }
+70 AUE_SETREUID STD { int linux_setreuid16(l_uid16_t ruid, \
+ l_uid16_t euid); }
+71 AUE_SETREGID STD { int linux_setregid16(l_gid16_t rgid, \
+ l_gid16_t egid); }
+72 AUE_NULL STD { int linux_sigsuspend(l_int hist0, \
+ l_int hist1, l_osigset_t mask); }
+73 AUE_NULL STD { int linux_sigpending(l_osigset_t *mask); }
+74 AUE_SYSCTL STD { int linux_sethostname(char *hostname, \
+ u_int len); }
+75 AUE_SETRLIMIT STD { int linux_setrlimit(l_uint resource, \
+ struct l_rlimit *rlim); }
+76 AUE_GETRLIMIT STD { int linux_old_getrlimit(l_uint resource, \
+ struct l_rlimit *rlim); }
+77 AUE_GETRUSAGE STD { int linux_getrusage(int who, \
+ struct l_rusage *rusage); }
+78 AUE_NULL STD { int linux_gettimeofday( \
+ struct l_timeval *tp, \
+ struct timezone *tzp); }
+79 AUE_SETTIMEOFDAY STD { int linux_settimeofday( \
+ struct l_timeval *tp, \
+ struct timezone *tzp); }
+80 AUE_GETGROUPS STD { int linux_getgroups16(l_uint gidsetsize, \
+ l_gid16_t *gidset); }
+81 AUE_SETGROUPS STD { int linux_setgroups16(l_uint gidsetsize, \
+ l_gid16_t *gidset); }
+82 AUE_SELECT STD { int linux_old_select( \
+ struct l_old_select_argv *ptr); }
+83 AUE_SYMLINK STD { int linux_symlink(char *path, char *to); }
+; 84: oldlstat
+84 AUE_LSTAT STD { int linux_lstat(char *path, struct linux_lstat *up); }
+85 AUE_READLINK STD { int linux_readlink(char *name, char *buf, \
+ l_int count); }
+86 AUE_USELIB UNIMPL linux_uselib
+87 AUE_SWAPON NOPROTO { int swapon(char *name); }
+88 AUE_REBOOT STD { int linux_reboot(l_int magic1, \
+ l_int magic2, l_uint cmd, void *arg); }
+; 89: old_readdir
+89 AUE_GETDIRENTRIES STD { int linux_readdir(l_uint fd, \
+ struct l_dirent *dent, l_uint count); }
+; 90: old_mmap
+90 AUE_MMAP STD { int linux_mmap(struct l_mmap_argv *ptr); }
+91 AUE_MUNMAP NOPROTO { int munmap(caddr_t addr, int len); }
+92 AUE_TRUNCATE STD { int linux_truncate(char *path, \
+ l_ulong length); }
+93 AUE_FTRUNCATE STD { int linux_ftruncate(int fd, long length); }
+94 AUE_FCHMOD NOPROTO { int fchmod(int fd, int mode); }
+95 AUE_FCHOWN NOPROTO { int fchown(int fd, int uid, int gid); }
+96 AUE_GETPRIORITY STD { int linux_getpriority(int which, int who); }
+97 AUE_SETPRIORITY NOPROTO { int setpriority(int which, int who, \
+ int prio); }
+98 AUE_PROFILE UNIMPL profil
+99 AUE_STATFS STD { int linux_statfs(char *path, \
+ struct l_statfs_buf *buf); }
+100 AUE_FSTATFS STD { int linux_fstatfs(l_uint fd, \
+ struct l_statfs_buf *buf); }
+101 AUE_NULL UNIMPL ioperm
+102 AUE_NULL STD { int linux_socketcall(l_int what, \
+ l_ulong args); }
+103 AUE_NULL STD { int linux_syslog(l_int type, char *buf, \
+ l_int len); }
+104 AUE_SETITIMER STD { int linux_setitimer(l_int which, \
+ struct l_itimerval *itv, \
+ struct l_itimerval *oitv); }
+105 AUE_GETITIMER STD { int linux_getitimer(l_int which, \
+ struct l_itimerval *itv); }
+106 AUE_STAT STD { int linux_newstat(char *path, \
+ struct l_newstat *buf); }
+107 AUE_LSTAT STD { int linux_newlstat(char *path, \
+ struct l_newstat *buf); }
+108 AUE_FSTAT STD { int linux_newfstat(l_uint fd, \
+ struct l_newstat *buf); }
+; 109: olduname
+109 AUE_NULL STD { int linux_uname(void); }
+110 AUE_NULL STD { int linux_iopl(l_int level); }
+111 AUE_NULL STD { int linux_vhangup(void); }
+112 AUE_NULL UNIMPL idle
+113 AUE_NULL UNIMPL vm86old
+114 AUE_WAIT4 STD { int linux_wait4(l_pid_t pid, \
+ l_uint *status, l_int options, \
+ struct l_rusage *rusage); }
+115 AUE_SWAPOFF STD { int linux_swapoff(void); }
+116 AUE_NULL STD { int linux_sysinfo(struct l_sysinfo *info); }
+117 AUE_NULL STD { int linux_ipc(l_uint what, l_int arg1, \
+ l_int arg2, l_int arg3, void *ptr, \
+ l_long arg5); }
+118 AUE_FSYNC NOPROTO { int fsync(int fd); }
+119 AUE_SIGRETURN STD { int linux_sigreturn( \
+ struct l_sigframe *sfp); }
+120 AUE_RFORK STD { int linux_clone(l_int flags, void *stack, \
+				    void *parent_tidptr, void *tls, void *child_tidptr); }
+121 AUE_SYSCTL STD { int linux_setdomainname(char *name, \
+ int len); }
+122 AUE_NULL STD { int linux_newuname( \
+ struct l_new_utsname *buf); }
+123 AUE_NULL UNIMPL modify_ldt
+124 AUE_ADJTIME STD { int linux_adjtimex(void); }
+125 AUE_MPROTECT STD { int linux_mprotect(caddr_t addr, int len, \
+ int prot); }
+126 AUE_SIGPROCMASK STD { int linux_sigprocmask(l_int how, \
+ l_osigset_t *mask, l_osigset_t *omask); }
+127 AUE_NULL STD { int linux_create_module(void); }
+128 AUE_NULL STD { int linux_init_module(void); }
+129 AUE_NULL STD { int linux_delete_module(void); }
+130 AUE_NULL STD { int linux_get_kernel_syms(void); }
+131 AUE_QUOTACTL STD { int linux_quotactl(void); }
+132 AUE_GETPGID NOPROTO { int getpgid(int pid); }
+133 AUE_FCHDIR NOPROTO { int fchdir(int fd); }
+134 AUE_BDFLUSH STD { int linux_bdflush(void); }
+135 AUE_NULL STD { int linux_sysfs(l_int option, \
+ l_ulong arg1, l_ulong arg2); }
+136 AUE_PERSONALITY STD { int linux_personality(l_ulong per); }
+137 AUE_NULL UNIMPL afs_syscall
+138 AUE_SETFSUID STD { int linux_setfsuid16(l_uid16_t uid); }
+139 AUE_SETFSGID STD { int linux_setfsgid16(l_gid16_t gid); }
+140 AUE_LSEEK STD { int linux_llseek(l_int fd, l_ulong ohigh, \
+ l_ulong olow, l_loff_t *res, \
+ l_uint whence); }
+141 AUE_GETDIRENTRIES STD { int linux_getdents(l_uint fd, void *dent, \
+ l_uint count); }
+; 142: newselect
+142 AUE_SELECT STD { int linux_select(l_int nfds, \
+ l_fd_set *readfds, l_fd_set *writefds, \
+ l_fd_set *exceptfds, \
+ struct l_timeval *timeout); }
+143 AUE_FLOCK NOPROTO { int flock(int fd, int how); }
+144 AUE_MSYNC STD { int linux_msync(l_ulong addr, \
+ l_size_t len, l_int fl); }
+145 AUE_READV STD { int linux_readv(l_ulong fd, struct l_iovec32 *iovp, \
+ l_ulong iovcnt); }
+146 AUE_WRITEV STD { int linux_writev(l_ulong fd, struct l_iovec32 *iovp, \
+ l_ulong iovcnt); }
+147 AUE_GETSID STD { int linux_getsid(l_pid_t pid); }
+148 AUE_NULL STD { int linux_fdatasync(l_uint fd); }
+149 AUE_SYSCTL STD { int linux_sysctl( \
+ struct l___sysctl_args *args); }
+150 AUE_MLOCK NOPROTO { int mlock(const void *addr, size_t len); }
+151 AUE_MUNLOCK NOPROTO { int munlock(const void *addr, size_t len); }
+152 AUE_MLOCKALL NOPROTO { int mlockall(int how); }
+153 AUE_MUNLOCKALL NOPROTO { int munlockall(void); }
+154 AUE_SCHED_SETPARAM NOPROTO { int sched_setparam(pid_t pid, \
+ const struct sched_param *param); }
+155 AUE_SCHED_GETPARAM NOPROTO { int sched_getparam(pid_t pid, \
+ struct sched_param *param); }
+156 AUE_SCHED_SETSCHEDULER STD { int linux_sched_setscheduler( \
+ l_pid_t pid, l_int policy, \
+ struct l_sched_param *param); }
+157 AUE_SCHED_GETSCHEDULER STD { int linux_sched_getscheduler( \
+ l_pid_t pid); }
+158 AUE_NULL NOPROTO { int sched_yield(void); }
+159 AUE_SCHED_GET_PRIORITY_MAX STD { int linux_sched_get_priority_max( \
+ l_int policy); }
+160 AUE_SCHED_GET_PRIORITY_MIN STD { int linux_sched_get_priority_min( \
+ l_int policy); }
+161 AUE_SCHED_RR_GET_INTERVAL STD { int linux_sched_rr_get_interval(l_pid_t pid, \
+ struct l_timespec *interval); }
+162 AUE_NULL STD { int linux_nanosleep( \
+ const struct l_timespec *rqtp, \
+ struct l_timespec *rmtp); }
+163 AUE_NULL STD { int linux_mremap(l_ulong addr, \
+ l_ulong old_len, l_ulong new_len, \
+ l_ulong flags, l_ulong new_addr); }
+164 AUE_SETRESUID STD { int linux_setresuid16(l_uid16_t ruid, \
+ l_uid16_t euid, l_uid16_t suid); }
+165 AUE_GETRESUID STD { int linux_getresuid16(l_uid16_t *ruid, \
+ l_uid16_t *euid, l_uid16_t *suid); }
+166 AUE_NULL UNIMPL vm86
+167 AUE_NULL STD { int linux_query_module(void); }
+168 AUE_POLL NOPROTO { int poll(struct pollfd *fds, \
+ unsigned int nfds, int timeout); }
+169 AUE_NULL STD { int linux_nfsservctl(void); }
+170 AUE_SETRESGID STD { int linux_setresgid16(l_gid16_t rgid, \
+ l_gid16_t egid, l_gid16_t sgid); }
+171 AUE_GETRESGID STD { int linux_getresgid16(l_gid16_t *rgid, \
+ l_gid16_t *egid, l_gid16_t *sgid); }
+172 AUE_PRCTL STD { int linux_prctl(l_int option, l_int arg2, l_int arg3, \
+ l_int arg4, l_int arg5); }
+173 AUE_NULL STD { int linux_rt_sigreturn( \
+ struct l_ucontext *ucp); }
+174 AUE_NULL STD { int linux_rt_sigaction(l_int sig, \
+ l_sigaction_t *act, l_sigaction_t *oact, \
+ l_size_t sigsetsize); }
+175 AUE_NULL STD { int linux_rt_sigprocmask(l_int how, \
+ l_sigset_t *mask, l_sigset_t *omask, \
+ l_size_t sigsetsize); }
+176 AUE_NULL STD { int linux_rt_sigpending(l_sigset_t *set, \
+ l_size_t sigsetsize); }
+177 AUE_NULL STD { int linux_rt_sigtimedwait(l_sigset_t *mask, \
+ l_siginfo_t *ptr, \
+ struct l_timeval *timeout, \
+ l_size_t sigsetsize); }
+178 AUE_NULL STD { int linux_rt_sigqueueinfo(void); }
+179 AUE_NULL STD { int linux_rt_sigsuspend( \
+ l_sigset_t *newset, \
+ l_size_t sigsetsize); }
+180 AUE_PREAD STD { int linux_pread(l_uint fd, char *buf, \
+ l_size_t nbyte, l_loff_t offset); }
+181 AUE_PWRITE STD { int linux_pwrite(l_uint fd, char *buf, \
+ l_size_t nbyte, l_loff_t offset); }
+182 AUE_CHOWN STD { int linux_chown16(char *path, \
+ l_uid16_t uid, l_gid16_t gid); }
+183 AUE_GETCWD STD { int linux_getcwd(char *buf, \
+ l_ulong bufsize); }
+184 AUE_CAPGET STD { int linux_capget(struct l_user_cap_header *hdrp, \
+ struct l_user_cap_data *datap); }
+185 AUE_CAPSET STD { int linux_capset(struct l_user_cap_header *hdrp, \
+ struct l_user_cap_data *datap); }
+186 AUE_NULL STD { int linux_sigaltstack(l_stack_t *uss, \
+ l_stack_t *uoss); }
+187 AUE_SENDFILE STD { int linux_sendfile(void); }
+188 AUE_GETPMSG UNIMPL getpmsg
+189 AUE_PUTPMSG UNIMPL putpmsg
+190 AUE_VFORK STD { int linux_vfork(void); }
+; 191: ugetrlimit
+191 AUE_GETRLIMIT STD { int linux_getrlimit(l_uint resource, \
+ struct l_rlimit *rlim); }
+192 AUE_MMAP STD { int linux_mmap2(l_ulong addr, l_ulong len, \
+ l_ulong prot, l_ulong flags, l_ulong fd, \
+ l_ulong pgoff); }
+193 AUE_TRUNCATE STD { int linux_truncate64(char *path, \
+ l_loff_t length); }
+194 AUE_FTRUNCATE STD { int linux_ftruncate64(l_uint fd, \
+ l_loff_t length); }
+195 AUE_STAT STD { int linux_stat64(const char *filename, \
+ struct l_stat64 *statbuf); }
+196 AUE_LSTAT STD { int linux_lstat64(const char *filename, \
+ struct l_stat64 *statbuf); }
+197 AUE_FSTAT STD { int linux_fstat64(l_int fd, \
+ struct l_stat64 *statbuf); }
+198 AUE_LCHOWN STD { int linux_lchown(char *path, l_uid_t uid, \
+ l_gid_t gid); }
+199 AUE_GETUID STD { int linux_getuid(void); }
+200 AUE_GETGID STD { int linux_getgid(void); }
+201 AUE_GETEUID NOPROTO { int geteuid(void); }
+202 AUE_GETEGID NOPROTO { int getegid(void); }
+203 AUE_SETREUID NOPROTO { int setreuid(uid_t ruid, uid_t euid); }
+204 AUE_SETREGID NOPROTO { int setregid(gid_t rgid, gid_t egid); }
+205 AUE_GETGROUPS STD { int linux_getgroups(l_int gidsetsize, \
+ l_gid_t *grouplist); }
+206 AUE_SETGROUPS STD { int linux_setgroups(l_int gidsetsize, \
+ l_gid_t *grouplist); }
+207 AUE_FCHOWN NODEF fchown fchown fchown_args int
+208 AUE_SETRESUID NOPROTO { int setresuid(uid_t ruid, uid_t euid, \
+ uid_t suid); }
+209 AUE_GETRESUID NOPROTO { int getresuid(uid_t *ruid, uid_t *euid, \
+ uid_t *suid); }
+210 AUE_SETRESGID NOPROTO { int setresgid(gid_t rgid, gid_t egid, \
+ gid_t sgid); }
+211 AUE_GETRESGID NOPROTO { int getresgid(gid_t *rgid, gid_t *egid, \
+ gid_t *sgid); }
+212 AUE_CHOWN STD { int linux_chown(char *path, l_uid_t uid, \
+ l_gid_t gid); }
+213 AUE_SETUID NOPROTO { int setuid(uid_t uid); }
+214 AUE_SETGID NOPROTO { int setgid(gid_t gid); }
+215 AUE_SETFSUID STD { int linux_setfsuid(l_uid_t uid); }
+216 AUE_SETFSGID STD { int linux_setfsgid(l_gid_t gid); }
+217 AUE_PIVOT_ROOT STD { int linux_pivot_root(char *new_root, \
+ char *put_old); }
+218 AUE_MINCORE STD { int linux_mincore(l_ulong start, \
+ l_size_t len, u_char *vec); }
+219 AUE_MADVISE NOPROTO { int madvise(void *addr, size_t len, \
+ int behav); }
+220 AUE_GETDIRENTRIES STD { int linux_getdents64(l_uint fd, \
+ void *dirent, l_uint count); }
+221 AUE_FCNTL STD { int linux_fcntl64(l_uint fd, l_uint cmd, \
+ uintptr_t arg); }
+222 AUE_NULL UNIMPL
+223 AUE_NULL UNIMPL
+224 AUE_NULL STD { long linux_gettid(void); }
+225 AUE_NULL UNIMPL linux_readahead
+226 AUE_NULL STD { int linux_setxattr(void); }
+227 AUE_NULL STD { int linux_lsetxattr(void); }
+228 AUE_NULL STD { int linux_fsetxattr(void); }
+229 AUE_NULL STD { int linux_getxattr(void); }
+230 AUE_NULL STD { int linux_lgetxattr(void); }
+231 AUE_NULL STD { int linux_fgetxattr(void); }
+232 AUE_NULL STD { int linux_listxattr(void); }
+233 AUE_NULL STD { int linux_llistxattr(void); }
+234 AUE_NULL STD { int linux_flistxattr(void); }
+235 AUE_NULL STD { int linux_removexattr(void); }
+236 AUE_NULL STD { int linux_lremovexattr(void); }
+237 AUE_NULL STD { int linux_fremovexattr(void); }
+238 AUE_NULL STD { int linux_tkill(int tid, int sig); }
+239 AUE_SENDFILE UNIMPL linux_sendfile64
+240 AUE_NULL STD { int linux_sys_futex(void *uaddr, int op, uint32_t val, \
+ struct l_timespec *timeout, uint32_t *uaddr2, uint32_t val3); }
+241 AUE_NULL STD { int linux_sched_setaffinity(l_pid_t pid, l_uint len, \
+ l_ulong *user_mask_ptr); }
+242 AUE_NULL STD { int linux_sched_getaffinity(l_pid_t pid, l_uint len, \
+ l_ulong *user_mask_ptr); }
+243 AUE_NULL STD { int linux_set_thread_area(struct l_user_desc *desc); }
+244 AUE_NULL UNIMPL linux_get_thread_area
+245 AUE_NULL UNIMPL linux_io_setup
+246 AUE_NULL UNIMPL linux_io_destroy
+247 AUE_NULL UNIMPL linux_io_getevents
+248 AUE_NULL UNIMPL linux_io_submit
+249 AUE_NULL UNIMPL linux_io_cancel
+250 AUE_NULL STD { int linux_fadvise64(int fd, l_loff_t offset, \
+ l_size_t len, int advice); }
+251 AUE_NULL UNIMPL
+252 AUE_EXIT STD { int linux_exit_group(int error_code); }
+253 AUE_NULL STD { int linux_lookup_dcookie(void); }
+254 AUE_NULL STD { int linux_epoll_create(void); }
+255 AUE_NULL STD { int linux_epoll_ctl(void); }
+256 AUE_NULL STD { int linux_epoll_wait(void); }
+257 AUE_NULL STD { int linux_remap_file_pages(void); }
+258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); }
+259 AUE_NULL STD { int linux_timer_create(void); }
+260 AUE_NULL STD { int linux_timer_settime(void); }
+261 AUE_NULL STD { int linux_timer_gettime(void); }
+262 AUE_NULL STD { int linux_timer_getoverrun(void); }
+263 AUE_NULL STD { int linux_timer_delete(void); }
+264 AUE_CLOCK_SETTIME STD { int linux_clock_settime(clockid_t which, struct l_timespec *tp); }
+265 AUE_NULL STD { int linux_clock_gettime(clockid_t which, struct l_timespec *tp); }
+266 AUE_NULL STD { int linux_clock_getres(clockid_t which, struct l_timespec *tp); }
+267 AUE_NULL STD { int linux_clock_nanosleep(clockid_t which, int flags, \
+ struct l_timespec *rqtp, struct l_timespec *rmtp); }
+268 AUE_STATFS STD { int linux_statfs64(char *path, size_t bufsize, struct l_statfs64_buf *buf); }
+269 AUE_FSTATFS STD { int linux_fstatfs64(void); }
+270 AUE_NULL STD { int linux_tgkill(int tgid, int pid, int sig); }
+271 AUE_UTIMES STD { int linux_utimes(char *fname, \
+ struct l_timeval *tptr); }
+272 AUE_NULL STD { int linux_fadvise64_64(int fd, \
+ l_loff_t offset, l_loff_t len, \
+ int advice); }
+273 AUE_NULL UNIMPL vserver
+274 AUE_NULL STD { int linux_mbind(void); }
+275 AUE_NULL STD { int linux_get_mempolicy(void); }
+276 AUE_NULL STD { int linux_set_mempolicy(void); }
+; linux 2.6.6:
+277 AUE_NULL STD { int linux_mq_open(void); }
+278 AUE_NULL STD { int linux_mq_unlink(void); }
+279 AUE_NULL STD { int linux_mq_timedsend(void); }
+280 AUE_NULL STD { int linux_mq_timedreceive(void); }
+281 AUE_NULL STD { int linux_mq_notify(void); }
+282 AUE_NULL STD { int linux_mq_getsetattr(void); }
+283 AUE_NULL STD { int linux_kexec_load(void); }
+284 AUE_NULL STD { int linux_waitid(void); }
+285 AUE_NULL UNIMPL
+; linux 2.6.11:
+286 AUE_NULL STD { int linux_add_key(void); }
+287 AUE_NULL STD { int linux_request_key(void); }
+288 AUE_NULL STD { int linux_keyctl(void); }
+; linux 2.6.13:
+289 AUE_NULL STD { int linux_ioprio_set(void); }
+290 AUE_NULL STD { int linux_ioprio_get(void); }
+291 AUE_NULL STD { int linux_inotify_init(void); }
+292 AUE_NULL STD { int linux_inotify_add_watch(void); }
+293 AUE_NULL STD { int linux_inotify_rm_watch(void); }
+; linux 2.6.16:
+294 AUE_NULL STD { int linux_migrate_pages(void); }
+295 AUE_OPEN_RWTC STD { int linux_openat(l_int dfd, const char *filename, \
+ l_int flags, l_int mode); }
+296 AUE_MKDIRAT STD { int linux_mkdirat(l_int dfd, const char *pathname, \
+ l_int mode); }
+297 AUE_MKNODAT STD { int linux_mknodat(l_int dfd, const char *filename, \
+ l_int mode, l_uint dev); }
+298 AUE_FCHOWNAT STD { int linux_fchownat(l_int dfd, const char *filename, \
+ l_uid16_t uid, l_gid16_t gid, l_int flag); }
+299 AUE_FUTIMESAT STD { int linux_futimesat(l_int dfd, char *filename, \
+ struct l_timeval *utimes); }
+300 AUE_FSTATAT STD { int linux_fstatat64(l_int dfd, char *pathname, \
+ struct l_stat64 *statbuf, l_int flag); }
+301 AUE_UNLINKAT STD { int linux_unlinkat(l_int dfd, const char *pathname, \
+ l_int flag); }
+302 AUE_RENAMEAT STD { int linux_renameat(l_int olddfd, const char *oldname, \
+ l_int newdfd, const char *newname); }
+303 AUE_LINKAT STD { int linux_linkat(l_int olddfd, const char *oldname, \
+ l_int newdfd, const char *newname, l_int flag); }
+304 AUE_SYMLINKAT STD { int linux_symlinkat(const char *oldname, l_int newdfd, \
+ const char *newname); }
+305 AUE_READLINKAT STD { int linux_readlinkat(l_int dfd, const char *path, \
+ char *buf, l_int bufsiz); }
+306 AUE_FCHMODAT STD { int linux_fchmodat(l_int dfd, const char *filename, \
+ l_mode_t mode); }
+307 AUE_FACCESSAT STD { int linux_faccessat(l_int dfd, const char *filename, l_int amode, int flag); }
+308 AUE_NULL STD { int linux_pselect6(void); }
+309 AUE_NULL STD { int linux_ppoll(void); }
+310 AUE_NULL STD { int linux_unshare(void); }
+; linux 2.6.17:
+311 AUE_NULL STD { int linux_set_robust_list(struct linux_robust_list_head *head, \
+ l_size_t len); }
+312 AUE_NULL STD { int linux_get_robust_list(l_int pid, struct linux_robust_list_head *head, \
+ l_size_t *len); }
+313 AUE_NULL STD { int linux_splice(void); }
+314 AUE_NULL STD { int linux_sync_file_range(void); }
+315 AUE_NULL STD { int linux_tee(void); }
+316 AUE_NULL STD { int linux_vmsplice(void); }
+; linux 2.6.18:
+317 AUE_NULL STD { int linux_move_pages(void); }
+; linux 2.6.19:
+318 AUE_NULL STD { int linux_getcpu(void); }
+319 AUE_NULL STD { int linux_epoll_pwait(void); }
+; linux 2.6.22:
+320 AUE_NULL STD { int linux_utimensat(void); }
+321 AUE_NULL STD { int linux_signalfd(void); }
+322 AUE_NULL STD { int linux_timerfd_create(void); }
+323 AUE_NULL STD { int linux_eventfd(void); }
+; linux 2.6.23:
+324 AUE_NULL STD { int linux_fallocate(void); }
+; linux 2.6.25:
+325 AUE_NULL STD { int linux_timerfd_settime(void); }
+326 AUE_NULL STD { int linux_timerfd_gettime(void); }
+; linux 2.6.27:
+327 AUE_NULL STD { int linux_signalfd4(void); }
+328 AUE_NULL STD { int linux_eventfd2(void); }
+329 AUE_NULL STD { int linux_epoll_create1(void); }
+330 AUE_NULL STD { int linux_dup3(void); }
+331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); }
+332 AUE_NULL STD { int linux_inotify_init1(void); }
+; linux 2.6.30:
+333 AUE_NULL STD { int linux_preadv(void); }
+334 AUE_NULL STD { int linux_pwritev(void); }
+; linux 2.6.31:
+335	AUE_NULL	STD	{ int linux_rt_tgsigqueueinfo(void); }
+336 AUE_NULL STD { int linux_perf_event_open(void); }
+; linux 2.6.33:
+337 AUE_NULL STD { int linux_recvmmsg(void); }
+338 AUE_NULL STD { int linux_fanotify_init(void); }
+339 AUE_NULL STD { int linux_fanotify_mark(void); }
+; linux 2.6.36:
+340 AUE_NULL STD { int linux_prlimit64(void); }
+; later:
+341 AUE_NULL STD { int linux_name_to_handle_at(void); }
+342 AUE_NULL STD { int linux_open_by_handle_at(void); }
+343 AUE_NULL STD { int linux_clock_adjtime(void); }
+344 AUE_NULL STD { int linux_syncfs(void); }
+345 AUE_NULL STD { int linux_sendmmsg(void); }
+346 AUE_NULL STD { int linux_setns(void); }
+347 AUE_NULL STD { int linux_process_vm_readv(void); }
+348 AUE_NULL STD { int linux_process_vm_writev(void); }
diff --git a/sys/amd64/pci/pci_cfgreg.c b/sys/amd64/pci/pci_cfgreg.c
new file mode 100644
index 0000000..90d9087
--- /dev/null
+++ b/sys/amd64/pci/pci_cfgreg.c
@@ -0,0 +1,370 @@
+/*-
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * Copyright (c) 2000, Michael Smith <msmith@freebsd.org>
+ * Copyright (c) 2000, BSDi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/pci_cfgreg.h>
+
+enum {
+ CFGMECH_NONE = 0,
+ CFGMECH_1,
+ CFGMECH_PCIE,
+};
+
+static uint32_t pci_docfgregread(int bus, int slot, int func, int reg,
+ int bytes);
+static int pciereg_cfgread(int bus, unsigned slot, unsigned func,
+ unsigned reg, unsigned bytes);
+static void pciereg_cfgwrite(int bus, unsigned slot, unsigned func,
+ unsigned reg, int data, unsigned bytes);
+static int pcireg_cfgread(int bus, int slot, int func, int reg, int bytes);
+static void pcireg_cfgwrite(int bus, int slot, int func, int reg, int data, int bytes);
+
+SYSCTL_DECL(_hw_pci);
+
+static int cfgmech;
+static vm_offset_t pcie_base;
+static int pcie_minbus, pcie_maxbus;
+static uint32_t pcie_badslots;
+static struct mtx pcicfg_mtx;
+static int mcfg_enable = 1;
+TUNABLE_INT("hw.pci.mcfg", &mcfg_enable);
+SYSCTL_INT(_hw_pci, OID_AUTO, mcfg, CTLFLAG_RDTUN, &mcfg_enable, 0,
+ "Enable support for PCI-e memory mapped config access");
+
+/*
+ * Initialise access to PCI configuration space
+ */
+int
+pci_cfgregopen(void)
+{
+ static int once = 0;
+ uint64_t pciebar;
+ uint16_t did, vid;
+
+ if (!once) {
+ mtx_init(&pcicfg_mtx, "pcicfg", NULL, MTX_SPIN);
+ once = 1;
+ }
+
+ if (cfgmech != CFGMECH_NONE)
+ return (1);
+ cfgmech = CFGMECH_1;
+
+ /*
+ * Grope around in the PCI config space to see if this is a
+ * chipset that is capable of doing memory-mapped config cycles.
+ * This also implies that it can do PCIe extended config cycles.
+ */
+
+ /* Check for supported chipsets */
+ vid = pci_cfgregread(0, 0, 0, PCIR_VENDOR, 2);
+ did = pci_cfgregread(0, 0, 0, PCIR_DEVICE, 2);
+ switch (vid) {
+ case 0x8086:
+ switch (did) {
+ case 0x3590:
+ case 0x3592:
+ /* Intel 7520 or 7320 */
+ pciebar = pci_cfgregread(0, 0, 0, 0xce, 2) << 16;
+ pcie_cfgregopen(pciebar, 0, 255);
+ break;
+ case 0x2580:
+ case 0x2584:
+ case 0x2590:
+ /* Intel 915, 925, or 915GM */
+ pciebar = pci_cfgregread(0, 0, 0, 0x48, 4);
+ pcie_cfgregopen(pciebar, 0, 255);
+ break;
+ }
+ }
+
+ return (1);
+}
+
+static uint32_t
+pci_docfgregread(int bus, int slot, int func, int reg, int bytes)
+{
+
+ if (cfgmech == CFGMECH_PCIE &&
+ (bus >= pcie_minbus && bus <= pcie_maxbus) &&
+ (bus != 0 || !(1 << slot & pcie_badslots)))
+ return (pciereg_cfgread(bus, slot, func, reg, bytes));
+ else
+ return (pcireg_cfgread(bus, slot, func, reg, bytes));
+}
+
+/*
+ * Read configuration space register
+ */
+u_int32_t
+pci_cfgregread(int bus, int slot, int func, int reg, int bytes)
+{
+ uint32_t line;
+
+ /*
+ * Some BIOS writers seem to want to ignore the spec and put
+ * 0 in the intline rather than 255 to indicate none. Some use
+ * numbers in the range 128-254 to indicate something strange and
+ * apparently undocumented anywhere. Assume these are completely bogus
+ * and map them to 255, which the rest of the PCI code recognizes as
+ * an invalid IRQ.
+ */
+ if (reg == PCIR_INTLINE && bytes == 1) {
+ line = pci_docfgregread(bus, slot, func, PCIR_INTLINE, 1);
+ if (line == 0 || line >= 128)
+ line = PCI_INVALID_IRQ;
+ return (line);
+ }
+ return (pci_docfgregread(bus, slot, func, reg, bytes));
+}
+
+/*
+ * Write configuration space register
+ */
+void
+pci_cfgregwrite(int bus, int slot, int func, int reg, u_int32_t data, int bytes)
+{
+
+ if (cfgmech == CFGMECH_PCIE &&
+ (bus >= pcie_minbus && bus <= pcie_maxbus) &&
+ (bus != 0 || !(1 << slot & pcie_badslots)))
+ pciereg_cfgwrite(bus, slot, func, reg, data, bytes);
+ else
+ pcireg_cfgwrite(bus, slot, func, reg, data, bytes);
+}
+
+/*
+ * Configuration space access using direct register operations
+ */
+
+/* enable configuration space accesses and return data port address */
+static int
+pci_cfgenable(unsigned bus, unsigned slot, unsigned func, int reg, int bytes)
+{
+ int dataport = 0;
+
+ if (bus <= PCI_BUSMAX && slot <= PCI_SLOTMAX && func <= PCI_FUNCMAX &&
+ (unsigned)reg <= PCI_REGMAX && bytes != 3 &&
+ (unsigned)bytes <= 4 && (reg & (bytes - 1)) == 0) {
+ outl(CONF1_ADDR_PORT, (1 << 31) | (bus << 16) | (slot << 11)
+ | (func << 8) | (reg & ~0x03));
+ dataport = CONF1_DATA_PORT + (reg & 0x03);
+ }
+ return (dataport);
+}
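+
+/*
+ * Worked example (illustrative, not part of the interface): for bus 0,
+ * slot 31, func 0, reg 0x04, pci_cfgenable() writes
+ *	(1 << 31) | (31 << 11) | 0x04 = 0x8000f804
+ * to CONF1_ADDR_PORT (0xcf8) and returns CONF1_DATA_PORT + 0 (0xcfc)
+ * as the data port.
+ */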
+
+/* disable configuration space accesses */
+static void
+pci_cfgdisable(void)
+{
+
+ /*
+ * Do nothing. Writing a 0 to the address port can apparently
+ * confuse some bridges and cause spurious access failures.
+ */
+}
+
+static int
+pcireg_cfgread(int bus, int slot, int func, int reg, int bytes)
+{
+ int data = -1;
+ int port;
+
+ mtx_lock_spin(&pcicfg_mtx);
+ port = pci_cfgenable(bus, slot, func, reg, bytes);
+ if (port != 0) {
+ switch (bytes) {
+ case 1:
+ data = inb(port);
+ break;
+ case 2:
+ data = inw(port);
+ break;
+ case 4:
+ data = inl(port);
+ break;
+ }
+ pci_cfgdisable();
+ }
+ mtx_unlock_spin(&pcicfg_mtx);
+ return (data);
+}
+
+static void
+pcireg_cfgwrite(int bus, int slot, int func, int reg, int data, int bytes)
+{
+ int port;
+
+ mtx_lock_spin(&pcicfg_mtx);
+ port = pci_cfgenable(bus, slot, func, reg, bytes);
+ if (port != 0) {
+ switch (bytes) {
+ case 1:
+ outb(port, data);
+ break;
+ case 2:
+ outw(port, data);
+ break;
+ case 4:
+ outl(port, data);
+ break;
+ }
+ pci_cfgdisable();
+ }
+ mtx_unlock_spin(&pcicfg_mtx);
+}
+
+int
+pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus)
+{
+ uint32_t val1, val2;
+ int slot;
+
+ if (!mcfg_enable)
+ return (0);
+
+ if (minbus != 0)
+ return (0);
+
+ if (bootverbose)
+ printf("PCIe: Memory Mapped configuration base @ 0x%lx\n",
+ base);
+
+ /* XXX: We should make sure this really fits into the direct map. */
+ pcie_base = (vm_offset_t)pmap_mapdev(base, (maxbus + 1) << 20);
+ pcie_minbus = minbus;
+ pcie_maxbus = maxbus;
+ cfgmech = CFGMECH_PCIE;
+
+ /*
+ * On some AMD systems, some of the devices on bus 0 are
+ * inaccessible using memory-mapped PCI config access. Walk
+ * bus 0 looking for such devices. For these devices, we will
+ * fall back to using type 1 config access instead.
+ */
+ if (pci_cfgregopen() != 0) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ val1 = pcireg_cfgread(0, slot, 0, 0, 4);
+ if (val1 == 0xffffffff)
+ continue;
+
+ val2 = pciereg_cfgread(0, slot, 0, 0, 4);
+ if (val2 != val1)
+ pcie_badslots |= (1 << slot);
+ }
+ }
+
+ return (1);
+}
+
+#define PCIE_VADDR(base, reg, bus, slot, func) \
+ ((base) + \
+ ((((bus) & 0xff) << 20) | \
+ (((slot) & 0x1f) << 15) | \
+ (((func) & 0x7) << 12) | \
+ ((reg) & 0xfff)))
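+
+/*
+ * Worked example (illustrative): for bus 3, slot 2, func 1, reg 0x100,
+ * the offset into the ECAM window is
+ *	(3 << 20) | (2 << 15) | (1 << 12) | 0x100 = 0x311100,
+ * so the access hits pcie_base + 0x311100.
+ */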
+
+/*
+ * AMD BIOS and Kernel Developer's Guides for CPU families starting with 10h
+ * have a requirement that all accesses to the memory mapped PCI configuration
+ * space are done using the AX class of registers.
+ * Since other vendors do not currently have any contradicting requirements,
+ * the AMD access pattern is applied universally.
+ */
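+
+/*
+ * The "a" register constraints in the inline assembly below are what
+ * produce that access pattern: they pin the data operand to
+ * %eax/%ax/%al depending on the access width.
+ */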
+
+static int
+pciereg_cfgread(int bus, unsigned slot, unsigned func, unsigned reg,
+ unsigned bytes)
+{
+ vm_offset_t va;
+ int data = -1;
+
+ if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX ||
+ func > PCI_FUNCMAX || reg > PCIE_REGMAX)
+ return (-1);
+
+ va = PCIE_VADDR(pcie_base, reg, bus, slot, func);
+
+ switch (bytes) {
+ case 4:
+ __asm("movl %1, %0" : "=a" (data)
+ : "m" (*(volatile uint32_t *)va));
+ break;
+ case 2:
+ __asm("movzwl %1, %0" : "=a" (data)
+ : "m" (*(volatile uint16_t *)va));
+ break;
+ case 1:
+ __asm("movzbl %1, %0" : "=a" (data)
+ : "m" (*(volatile uint8_t *)va));
+ break;
+ }
+
+ return (data);
+}
+
+static void
+pciereg_cfgwrite(int bus, unsigned slot, unsigned func, unsigned reg, int data,
+ unsigned bytes)
+{
+ vm_offset_t va;
+
+ if (bus < pcie_minbus || bus > pcie_maxbus || slot > PCI_SLOTMAX ||
+ func > PCI_FUNCMAX || reg > PCIE_REGMAX)
+ return;
+
+ va = PCIE_VADDR(pcie_base, reg, bus, slot, func);
+
+ switch (bytes) {
+ case 4:
+ __asm("movl %1, %0" : "=m" (*(volatile uint32_t *)va)
+ : "a" (data));
+ break;
+ case 2:
+ __asm("movw %1, %0" : "=m" (*(volatile uint16_t *)va)
+ : "a" ((uint16_t)data));
+ break;
+ case 1:
+ __asm("movb %1, %0" : "=m" (*(volatile uint8_t *)va)
+ : "a" ((uint8_t)data));
+ break;
+ }
+}
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 0000000..dc071d3
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
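+
+/*
+ * Illustrative example: on a processor advertising both superpage sizes,
+ * page_sizes_mask ends up as (1UL << 12) | (1UL << 21) | (1UL << 30),
+ * i.e. 4KB, 2MB and 1GB mappings are all usable.
+ */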
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+	 * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
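+	/*
+	 * Illustrative example: with 'spok' set and 2MB superpages
+	 * supported, gpa = hpa = 0x200000 and length = 0x400000 settle
+	 * on spshift = 21, so the caller's loop installs the range as
+	 * two 2MB leaf mappings.
+	 */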
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+		 * XXX should we enforce this memory type by setting the
+		 * ignore PAT bit to 1?
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1);
+}
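+
+/*
+ * Illustrative example: a 2MB leaf (EPT_PG_SUPERPAGE set at ptpshift 21)
+ * whose address bits are 0x80000000 translates gpa 0x40234567 to
+ * 0x80000000 | (0x40234567 & 0x1fffff) = 0x80034567.
+ */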
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
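+
+/*
+ * Illustrative example: assuming PAT_WRITE_BACK == 6, a PML4 table at
+ * physical address 0x123000 yields
+ *	EPTP(0x123000) == 0x123000 | (3 << 3) | 6 == 0x12301e.
+ */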
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+	 * sign bit. As it so happens, the upper 16 bits are reserved (i.e.
+	 * set to 0) in the encodings for the VMCS, so we are free to use the
+ * sign bit.
+ */
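+	/*
+	 * For example (illustrative), vmcs_getreg(vmcs,
+	 * VMCS_IDENT(VMCS_GUEST_ACTIVITY), &val) works because
+	 * VMCS_IDENT() sets bit 31, making 'ident' negative; the mask
+	 * below then recovers the raw encoding.
+	 */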
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
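+
+/*
+ * Illustrative usage (hypothetical caller, not part of this file): with
+ * a page-backed array of 'struct msr_entry' named 'guest_msrs' holding
+ * two entries, the areas would be wired up as
+ *	error = vmcs_set_msr_save(vmcs, vtophys(guest_msrs), 2);
+ */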
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
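+	/*
+	 * With the standard specialreg.h PAT encodings (WB == 6, WT == 4,
+	 * UC- == 7, UC == 0) the value built above is 0x0007040600070406,
+	 * the architectural power-on default for IA32_PAT.
+	 */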
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define	VMCS_HOST_RIP			0x00006C16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..fb41074
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1867 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
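+
+/*
+ * The ONE/ZERO setting pairs above are negotiated against the VMX
+ * capability MSRs by vmx_set_ctlreg() (defined in vmx_msr.c). A rough
+ * sketch of that negotiation, per Intel SDM Vol. 3C, Appendix A:
+ *
+ *	val = rdmsr(cap_msr);
+ *	must_be_one = (uint32_t)val;		(bits that must be 1)
+ *	may_be_one  = (uint32_t)(val >> 32);	(bits allowed to be 1)
+ *	fail if (one_setting & ~may_be_one) != 0;
+ *	fail if (zero_setting & must_be_one) != 0;
+ */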
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMI to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
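+
+/*
+ * The runtime detection mentioned above would look roughly like this
+ * sketch (we do not bother and gate on STI_BLOCKING unconditionally):
+ *
+ *	if (exit_reason == EXIT_REASON_INVAL_VMCS &&
+ *	    exit_qual == EXIT_QUAL_NMI_WHILE_STI_BLOCKING)
+ *		(this processor requires STI_BLOCKING gating)
+ */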
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static void __inline
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
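+
+/*
+ * Illustrative use of the fixup routines above (a sketch, not actual
+ * driver code): a caller constructing an initial guest %cr0/%cr4 value
+ * can pass it through the fixup to satisfy the FIXED0/FIXED1 MSR
+ * constraints, e.g.
+ *
+ *	cr0 = vmx_fix_cr0(CR0_NE);	(fixed-1 bits are or'ed in)
+ *	cr4 = vmx_fix_cr4(0);		(CR4_VMXE comes back set)
+ */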
+
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+		 * VMXON and VMXOFF are not required to invalidate any TLB
+		 * caching structures, so we invalidate all mappings here
+		 * explicitly to prevent potential retention of cached
+		 * information in the TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+	/* CPUID.1:ECX[bit 5] must be 1 for the processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+ (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is
+ * acceptable as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value, shadow_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ shadow_value = cr0_ones_mask;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ shadow_value = cr4_ones_mask;
+ }
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
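+
+/*
+ * With the mask and shadow programmed as above, the value a guest sees
+ * on a 'mov %cr0, %reg' is, in effect (a sketch):
+ *
+ *	guest_view = (shadow & mask) | (real_cr0 & ~mask);
+ *
+ * Reads of masked bits come back from the read shadow, while a guest
+ * write that would make a masked bit differ from its shadow traps with
+ * EXIT_REASON_CR_ACCESS and is emulated by vmx_emulate_cr_access().
+ */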
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
+ * how they are saved/restored so can be directly accessed by the
+ * guest.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+	 * MSR_PAT is saved and restored in the guest VMCS area on a VM
+	 * exit and entry respectively. It is also restored from the host
+	 * VMCS area on a VM exit. However, if running on a system with no
+	 * MSR_PAT save/restore support, leave access disabled so accesses
+	 * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+	 * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+	 * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+	 * call vmclear(vmcs) as a side-effect, which we certainly don't want.
+ */
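+	/* Bits 11:8 of the exit qualification encode the source GPR (SDM). */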
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual, gla, gpa, cr3, intr_info;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
+ ecx = vmxctx->guest_rcx;
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_WRMSR:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ /*
+ * If there is an event waiting to be injected then there is
+ * no need to 'hlt'.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+ if (error)
+ panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+ if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+ handled = 1;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+ } else
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ return (1);
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+		 * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
+ gla = vmcs_gla();
+ gpa = vmcs_gpa();
+ cr3 = vmcs_guest_cr3();
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.gpa = gpa;
+ }
+ break;
+ default:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+	 * We do this every time because we may set up the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /* enable interrupts */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+	 * If a VM exit has been handled then the exitcode must be BOGUS.
+	 * If a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ if (!handled)
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to clear the VMCS active on the other vcpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint64_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ /*
+ * If there is already an exception pending to be delivered to the
+ * vcpu then just return.
+ */
+ error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+ if (error)
+ return (error);
+
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return (EAGAIN);
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+			 * Update the optional cached copy of the flags and
+			 * record the new setting.
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
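+
+/*
+ * Typical use of this machinery, sketched from the loop in vmx_run():
+ *
+ *	rc = vmx_setjmp(vmxctx);
+ *	switch (rc) {
+ *	case VMX_RETURN_DIRECT:
+ *		vmx_launch(vmxctx);	(or vmx_resume(); does not return)
+ *	case VMX_RETURN_LONGJMP:
+ *		(process the vm exit)
+ *	...
+ *	}
+ */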
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
+static __inline void
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static __inline void
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
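+
+/*
+ * VMPTRLD() and VMCLEAR() are intended to bracket the period during which a
+ * vmcs is current on this cpu: VMPTRLD() enters a critical section before
+ * making the vmcs current so the thread cannot migrate to another cpu, and
+ * VMCLEAR() exits that critical section once the vmcs is no longer current.
+ * A sketch of the expected usage:
+ *
+ *	VMPTRLD(vmcs);
+ *	... vmread()/vmwrite() against the current vmcs ...
+ *	VMCLEAR(vmcs);
+ */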
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static __inline void
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
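+
+/*
+ * Usage sketch: to flush all mappings tagged with a single vpid (e.g. when
+ * a vpid is recycled for a different vcpu) one would do something like:
+ *
+ *	struct invvpid_desc desc = { 0 };
+ *	desc.vpid = vpid;
+ *	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
+ */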
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static __inline void
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
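+
+/*
+ * The ASSYM() entries above are processed by the genassym mechanism into the
+ * generated vmx_assym.s, which vmx_support.S includes so that assembly code
+ * can reference structure offsets and constants symbolically, e.g.:
+ *
+ *	movq	%r15,VMXCTX_HOST_R15(%rdi)
+ */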
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
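+
+/*
+ * Usage sketch (the masks are illustrative): a caller that requires some
+ * processor-based controls to be 1 and others to be 0, with the remaining
+ * bits left at their defaults, would do:
+ *
+ *	uint32_t procbased_ctls;
+ *
+ *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ *	    MSR_VMX_TRUE_PROCBASED_CTLS, ones_mask, zeros_mask,
+ *	    &procbased_ctls);
+ *	if (error)
+ *		-- the cpu cannot support the requested settings --
+ */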
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
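+
+/*
+ * Usage sketch: after msr_bitmap_initialize() every bit is 1 and all MSR
+ * accesses cause a vm exit. To let the guest read and write an MSR (here
+ * MSR_GSBASE, for illustration) without exiting:
+ *
+ *	msr_bitmap_initialize(bitmap);
+ *	error = msr_bitmap_change_access(bitmap, MSR_GSBASE,
+ *	    MSR_BITMAP_ACCESS_RW);
+ */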
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all VMCSs that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack, so we don't want an interrupt handler to trash it.
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is VMX_RETURN_DIRECT ('0') when it returns directly from
+ * here, and one of the other VMX_RETURN_* values (e.g. VMX_RETURN_LONGJMP,
+ * '1', after a vm exit) when it returns indirectly through vmx_return().
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
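+
+/*
+ * Sketch of the expected control flow from the C side (illustrative only):
+ *
+ *	rc = vmx_setjmp(vmxctx);
+ *	switch (rc) {
+ *	case VMX_RETURN_DIRECT:
+ *		vmx_launch(vmxctx);	-- or vmx_resume(vmxctx)
+ *		break;			-- not reached if the guest is entered
+ *	case VMX_RETURN_LONGJMP:	-- a vm exit occurred
+ *	case VMX_RETURN_VMRESUME:	-- vmresume failed, see launch_error
+ *	case VMX_RETURN_VMLAUNCH:	-- vmlaunch failed, see launch_error
+ *	case VMX_RETURN_AST:		-- an ast was pending
+ *	}
+ */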
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of VMX_RETURN_VMRESUME.
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of VMX_RETURN_VMLAUNCH.
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define VTD_IIR_IVT (1UL << 63) /* invalidate IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when the Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found an unused id */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context entry are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+ * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+ if (gpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
+
+ if (hpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
+
+ if (len & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned len 0x%0lx", len);
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
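+
+/*
+ * Worked example for the superpage selection above: mapping a 2MB chunk
+ * whose gpa and hpa are both 2MB-aligned, on hardware that reports 2MB
+ * superpage support (bit 0 of dom->spsmask), leaves the loop with
+ * spshift = 21. The page table walk then stops at ptpshift = 21, a single
+ * leaf entry with VTD_PTE_SUPERPAGE set is written, and 2MB (1UL << 21) is
+ * returned. Without superpage support spshift decays to 12 and the region
+ * is mapped one 4KB page per call.
+ */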
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+static void
+vtd_invalidate_tlb(void *dom)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ /*
+ * Invalidate the IOTLB.
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
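+
+ /*
+ * Worked example: with maxaddr = 4GB the loop above terminates with
+ * gaw = 33, so res = (33 - 12) % 9 = 3 and agaw = 33 + 9 - 3 = 39,
+ * i.e. a 3-level page table if the hardware supports a 39-bit AGAW.
+ */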
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 0000000..c8447cc
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline uint64_t
+IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->remove_mapping)(domain, gpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_INVALIDATE_TLB(void *domain)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->invalidate_tlb)(domain);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = vmm_mem_maxaddr();
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+ IOMMU_ENABLE();
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
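+
+/*
+ * The create_mapping/remove_mapping ops map or unmap only a prefix of the
+ * requested range - as large a chunk as page/superpage size and alignment
+ * allow - and return the number of bytes actually handled. That is why
+ * iommu_create_mapping() above and iommu_remove_mapping() below loop until
+ * 'remaining' reaches zero.
+ */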
+
+void
+iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
+{
+ uint64_t unmapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
+ gpa += unmapped;
+ remaining -= unmapped;
+ }
+}
+
+void *
+iommu_host_domain(void)
+{
+
+ return (host_domain);
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_invalidate_tlb(void *domain)
+{
+
+ IOMMU_INVALIDATE_TLB(domain);
+}
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 0000000..d5c1d6e
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
+ uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+typedef void (*iommu_invalidate_tlb_t)(void *dom);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_remove_mapping_t remove_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+ iommu_invalidate_tlb_t invalidate_tlb;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_host_domain(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+void iommu_invalidate_tlb(void *domain);
+#endif
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 0000000..5aedaf2
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,594 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+/* XXX locking */
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MSIMSGS 32
+
+/*
+ * If the MSI-X table is located in the middle of a BAR then that MMIO
+ * region gets split into two segments - one segment above the MSI-X table
+ * and the other segment below the MSI-X table - with a hole in place of
+ * the MSI-X table so accesses to it can be trapped and emulated.
+ *
+ * So, allocate an MMIO segment for each BAR register + 1 additional segment.
+ */
+#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)
+
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int vec;
+ int vcpu;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
+} pptdevs[64];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device, a device must:
+ * - be allowed by the administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+ int rid;
+ struct resource *res;
+ void *cookie;
+
+ rid = ppt->msix.startrid + idx;
+ res = ppt->msix.res[idx];
+ cookie = ppt->msix.cookie[idx];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msix.res[idx] = NULL;
+ ppt->msix.cookie[idx] = NULL;
+}
+
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+ int i;
+
+ if (ppt->msix.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msix.num_msgs; i++)
+ ppt_teardown_msix_intr(ppt, i);
+
+ if (ppt->msix.msix_table_res) {
+ bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+ ppt->msix.msix_table_rid,
+ ppt->msix.msix_table_res);
+ ppt->msix.msix_table_res = NULL;
+ ppt->msix.msix_table_rid = 0;
+ }
+
+ free(ppt->msix.res, M_PPTMSIX);
+ free(ppt->msix.cookie, M_PPTMSIX);
+ free(ppt->msix.arg, M_PPTMSIX);
+
+ pci_release_msi(ppt->dev);
+
+ ppt->msix.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = pptarg->vec;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ ppt_teardown_msi(ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+		if (error)
+			return (error);
+		if (tmp != numvec) {
+			pci_release_msi(ppt->dev);
+			return (ENOSPC);
+		}
+ }
+
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].vec = vector + i;
+ ppt->msi.arg[i].vcpu = destcpu;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ ppt_teardown_msi(ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct pptdev *ppt;
+ struct pci_devinfo *dinfo;
+ int numvec, alloced, rid, error;
+ size_t res_size, cookie_size, arg_size;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ dinfo = device_get_ivars(ppt->dev);
+ if (!dinfo)
+ return (ENXIO);
+
+ /*
+ * First-time configuration:
+ * Allocate the MSI-X table
+ * Allocate the IRQ resources
+ * Set up some variables in ppt->msix
+ */
+ if (ppt->msix.num_msgs == 0) {
+ numvec = pci_msix_count(ppt->dev);
+ if (numvec <= 0)
+ return (EINVAL);
+
+ ppt->msix.startrid = 1;
+ ppt->msix.num_msgs = numvec;
+
+ res_size = numvec * sizeof(ppt->msix.res[0]);
+ cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+ arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+ ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+ ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+ M_WAITOK | M_ZERO);
+ ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+
+ rid = dinfo->cfg.msix.msix_table_bar;
+ ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+
+ if (ppt->msix.msix_table_res == NULL) {
+ ppt_teardown_msix(ppt);
+ return (ENOSPC);
+ }
+ ppt->msix.msix_table_rid = rid;
+
+ alloced = numvec;
+ error = pci_alloc_msix(ppt->dev, &alloced);
+ if (error || alloced != numvec) {
+ ppt_teardown_msix(ppt);
+			return (error == 0 ? ENOSPC : error);
+ }
+ }
+
+ if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* Tear down the IRQ if it's already set up */
+ ppt_teardown_msix_intr(ppt, idx);
+
+ /* Allocate the IRQ resource */
+ ppt->msix.cookie[idx] = NULL;
+ rid = ppt->msix.startrid + idx;
+ ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, RF_ACTIVE);
+ if (ppt->msix.res[idx] == NULL)
+ return (ENXIO);
+
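+		/*
+		 * Decode the guest-programmed MSI-X address/data pair: bits
+		 * 19:12 of the message address hold the APIC destination ID
+		 * (used here as the target vcpu) and the message data holds
+		 * the vector.
+		 */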
+ ppt->msix.arg[idx].pptdev = ppt;
+ ppt->msix.arg[idx].vec = msg;
+ ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+ /* Setup the MSI-X interrupt */
+ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msix.arg[idx],
+ &ppt->msix.cookie[idx]);
+
+		if (error != 0) {
+			/*
+			 * bus_setup_intr() failed, so there is no handler to
+			 * tear down; just release the IRQ resource.
+			 */
+			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid,
+			    ppt->msix.res[idx]);
+			ppt->msix.cookie[idx] = NULL;
+			ppt->msix.res[idx] = NULL;
+			return (ENXIO);
+		}
+ } else {
+ /* Masked, tear it down if it's already been set up */
+ ppt_teardown_msix_intr(ppt, idx);
+ }
+
+ return (0);
+}
+
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 0000000..63c8228
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 0000000..cd6c5d1
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+ int rc;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+		default:
+			rc = 0;
+			break;
+ }
+		if (rc) {
+			printf("vdev %s: event %d failed, rc=%d\n",
+			    vd->ops->name, event, rc);
+			return rc;
+		}
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+ // FIXME: we should verify we are in the context the current
+ // vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 0000000..6feeba8
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
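+
+/*
+ * Illustrative usage sketch (hypothetical device, not part of this
+ * interface): a device supplies a vdev_ops, then registers itself and
+ * its MMIO window, after which vdev_memread()/vdev_memwrite() dispatch
+ * guest accesses to its callbacks.
+ *
+ *	static struct vdev_ops mydev_ops = {
+ *		.name = "mydev",
+ *		.init = mydev_init, .reset = mydev_reset, .halt = mydev_halt,
+ *		.memread = mydev_memread, .memwrite = mydev_memwrite,
+ *	};
+ *
+ *	vdev_register(&mydev_ops, &mydev_softc);
+ *	vdev_register_region(&mydev_ops, &mydev_softc, &mydev_mmio);
+ */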
+
+#endif /* _VDEV_H_ */
+
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 0000000..a56a36e
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,907 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/clock.h>
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
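+/* The priority class of a vector is its high nibble. */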
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
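+	 *
+	 * For example (illustrative): after vectors 0x30 and then 0x45
+	 * are accepted, the stack holds { 0, 0x30, 0x45 } and ISRV is
+	 * 0x45; the EOI for 0x45 pops it, making ISRV 0x30 again.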
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+};
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
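+	/* Bits 0, 1 and 3 of the DCR encode the divisor; bit 2 is reserved. */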
+ switch (dcr & 0xB) {
+ case APIC_TDCR_1:
+ return (1);
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+ default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+}
+
+static int
+vlapic_op_reset(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+ vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+ return 0;
+}
+
+static int
+vlapic_op_init(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_unregister_region(vlapic, vlapic->mmio);
+ return 0;
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+ panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+		 * This can happen when the guest tries to run its local
+		 * apic timer at a rate higher than the setting of 'hz'
+		 * in the host.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+ panic("vlapic_get_lvt: invalid LVT\n");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
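+ *
+ * Worked example: with TPR 0x40 and ISRV 0x51, PRIO(TPR) = 4 is less
+ * than PRIO(ISRV) = 5, so PPR becomes 0x51 & 0xf0 = 0x50.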
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
+ vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpuset_t dmask;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+
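+	/*
+	 * The destination field lives in the upper half of the ICR: the
+	 * full 32 bits in x2APIC mode, or only the top byte (bits 63:56)
+	 * in xAPIC mode.
+	 */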
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ CPU_SETOF(dest, &dmask);
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ }
+
+ while ((i = cpusetobj_ffs(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ vlapic2->boot_state = BS_RUNNING;
+ vm_activate_cpu(vlapic2->vm, dest);
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);	/* offset into the APIC page */
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ if (x2apic(vlapic))
+ *data = vlapic->vcpuid;
+ else
+ *data = vlapic->vcpuid << 24;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);	/* offset into the APIC page */
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ if (!x2apic(vlapic)) {
+ data &= 0xffffffff;
+ data |= (uint64_t)lapic->icr_hi << 32;
+ }
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ if (!x2apic(vlapic)) {
+ retval = 0;
+ lapic->icr_hi = data;
+ }
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
+int
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic, fired;
+ uint32_t ccr;
+ uint32_t decrement, leftover;
+
+restart:
+ curticks = ticks;
+ delta = curticks - vlapic->ccr_ticks;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return (-1);
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return (-1);
+
+	/*
+	 * If 'curticks' and 'ccr_ticks' are out of sync by more than
+	 * 2^31 ticks then 'delta' will be negative; we deal with this
+	 * by restarting the timer.
+	 */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ goto restart;
+ }
+
+ fired = 0;
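+	/* Timer counts consumed per host clock tick at the current divisor. */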
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+
+ vlapic->ccr_ticks = curticks;
+ ccr = vlapic->apic.ccr_timer;
+
+ while (delta-- > 0) {
+ if (ccr > decrement) {
+ ccr -= decrement;
+ continue;
+ }
+
+ /* Trigger the local apic timer interrupt */
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ leftover = decrement - ccr;
+ vlapic_start_timer(vlapic, leftover);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ }
+ fired = 1;
+ break;
+ }
+
+ vlapic->apic.ccr_timer = ccr;
+
+ if (!fired)
+ return ((ccr / decrement) + 1);
+ else
+ return (0);
+}
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vlapic_op_halt(vlapic);
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+void
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+{
+ int err;
+ enum x2apic_state state;
+
+ err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
+ if (err)
+ panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
+
+ if (state == X2APIC_DISABLED)
+ val &= ~APICBASE_X2APIC;
+
+ vlapic->msr_apicbase = val;
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 0000000..00de019
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
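+/* The extra slot is index 0, which always holds 0 ("no vector in service"). */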
+
+enum x2apic_state;
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+int vlapic_timer_tick(struct vlapic *vlapic);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+
+#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 0000000..f21bddd
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,992 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ enum x2apic_state x2apic_state;
+ int nmi_pending;
+};
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+};
+
+static int vmm_initialized;
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
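+/* CR0.TS makes the next FPU access trap, enabling lazy guest/host FPU switching. */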
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_host_state_init();
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ if (error == 0)
+ vmm_initialized = 1;
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ }
+ vmm_initialized = 0;
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * vmm initialization has the following dependencies:
+ *
+ * - iommu initialization must happen after the pci passthru driver has had
+ * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
+ *
+ * - VT-x initialization requires smp_rendezvous() and therefore must happen
+ * after SMP is fully functional (after SI_SUB_SMP).
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+int
+vm_create(const char *name, struct vm **retvm)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ /*
+ * If vmm.ko could not be successfully initialized then don't attempt
+ * to create the virtual machine.
+ */
+ if (!vmm_initialized)
+ return (ENXIO);
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (EINVAL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ *retvm = vm;
+ return (0);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+ host_domain = iommu_host_domain();
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+			panic("vm_free_mem_seg: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+ /*
+ * Remove the 'gpa' to 'hpa' mapping in VMs domain.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ host_domain = iommu_host_domain();
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+ * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid, sleepticks, t;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval, rip;
+ struct vm_exit *vme;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vmrun->vm_exit;
+ rip = vmrun->rip;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+
+ vcpu->hostcpu = curcpu;
+ error = VMRUN(vm->cookie, vcpuid, rip);
+ vcpu->hostcpu = NOCPU;
+
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ /* copy the exit information */
+ bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
+
+ critical_exit();
+
+ /*
+ * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
+ * is ready to run.
+ */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+
+ vcpu_unlock(vcpu);
+
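+		/* Resume the guest at the instruction following the 'hlt'. */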
+ rip = vme->rip + vme->inst_length;
+ goto restart;
+ }
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vm_interrupt_hostcpu(vm, vcpuid);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, i, n;
+ int b, s, f;
+ char *val, *cp, *cp2;
+
+ /*
+ * XXX
+	 * The length of an environment variable is limited to 128 bytes, which
+ * puts an upper limit on the number of passthru devices that may be
+ * specified using a single environment variable.
+ *
+ * Work around this by scanning multiple environment variable
+ * names instead of a single one - yuck!
+ */
+ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
+
+ /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
+ found = 0;
+ for (i = 0; names[i] != NULL && !found; i++) {
+ cp = val = getenv(names[i]);
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ }
+ return (found);
+}
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> RUNNING -> IDLE
+ * IDLE -> CANNOT_RUN -> IDLE
+ */
+ if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
+ (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
+ error = 0;
+ vcpu->state = state;
+ } else {
+ error = EBUSY;
+ }
+
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, &vm->active_cpus);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (hostcpu == NOCPU) {
+ /*
+ * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
+ * the host thread must be sleeping waiting for an event to
+ * kick the vcpu out of 'hlt'.
+ *
+ * XXX this is racy because the condition exists right before
+ * and after calling VMRUN() in vm_run(). The wakeup() is
+ * benign in this case.
+ */
+ if (vcpu->state == VCPU_RUNNING)
+ wakeup_one(vcpu);
+ } else {
+ if (vcpu->state != VCPU_RUNNING)
+ panic("invalid vcpu state %d", vcpu->state);
+ if (hostcpu != curcpu)
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ vcpu_unlock(vcpu);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 0000000..7608d5c
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,526 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_mem.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+ return (cdev->si_drv1);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ error = ENXIO;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+ * read(2) blocks of data to simulate the lseek(2).
+ */
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu, state_changed;
+ enum vcpu_state new_state;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+ struct vm_seg_desc* vmsegdesc;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+ struct vm_x2apic *x2apic;
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ return (ENXIO);
+
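+	/* state_changed: 1 => a single vcpu was frozen, 2 => all vcpus were */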
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (cmd == VM_RUN)
+ new_state = VCPU_RUNNING;
+ else
+ new_state = VCPU_CANNOT_RUN;
+
+ error = vcpu_set_state(sc->vm, vcpu, new_state);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+
+ case VM_MAP_PPTDEV_MMIO:
+ case VM_BIND_PPTDEV:
+ case VM_UNBIND_PPTDEV:
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+ switch(cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+ error = vm_run(sc->vm, vmrun);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic->cpuid, x2apic->state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic->cpuid, &x2apic->state);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ }
+
+done:
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
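+	/* A return value of -1 tells the caller that no valid mapping exists. */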
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
+{
+
+ /*
+	 * XXX must stop virtual machine instances that may still be
+	 * running and clean up their state.
+ */
+ if (sc->cdev)
+ destroy_dev(sc->cdev);
+
+ if (sc->vm)
+ vm_destroy(sc->vm);
+
+ if (unlink) {
+ mtx_lock(&vmmdev_mtx);
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ mtx_unlock(&vmmdev_mtx);
+ }
+
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * XXX TODO if any process has this device open then fail
+ */
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc->cdev->si_drv1 = NULL;
+ mtx_unlock(&vmmdev_mtx);
+
+ vmmdev_destroy(sc, TRUE);
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc, *sc2;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ mtx_unlock(&vmmdev_mtx);
+ if (sc != NULL)
+ return (EEXIST);
+
+ error = vm_create(buf, &vm);
+ if (error != 0)
+ return (error);
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+
+ /*
+	 * Look up the name again just in case somebody sneaked in when we
+ * dropped the lock.
+ */
+ mtx_lock(&vmmdev_mtx);
+ sc2 = vmmdev_lookup(buf);
+ if (sc2 == NULL)
+ SLIST_INSERT_HEAD(&head, sc, link);
+ mtx_unlock(&vmmdev_mtx);
+
+ if (sc2 != NULL) {
+ vmmdev_destroy(sc, FALSE);
+ return (EEXIST);
+ }
+
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ return (error);
+}
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
new file mode 100644
index 0000000..8dfef73
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+	return (GSEL(GDATA_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+ return (GSEL(GCODE_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
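+	/* The host kernel does not use a non-zero %fs base. */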
+ return (0);
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+ return (r_idt.rd_base);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
new file mode 100644
index 0000000..839f54a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef _KERNEL
+#error "no user-servicable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+ return ((uint64_t)PCPU_GET(tssp));
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+ return ((uint64_t)&__pcpu[curcpu]);
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000..7b480bd
--- /dev/null
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,867 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x88] = {
+ .op_byte = 0x88,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode - not just AND */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_AND,
+ .op_flags = VIE_OP_F_IMM,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
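+/* Mask of significant bits for each operand size, indexed by size in bytes. */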
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, rshift;
+ enum vm_reg_name reg;
+
+ rshift = 0;
+ reg = gpr_map[vie->reg];
+
+ /*
+ * 64-bit mode imposes limitations on accessing legacy byte registers.
+ *
+ * The legacy high-byte registers cannot be addressed if the REX
+ * prefix is present. In this case the values 4, 5, 6 and 7 of the
+ * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
+ *
+ * If the REX prefix is not present then the values 4, 5, 6 and 7
+ * of the 'ModRM:reg' field address the legacy high-byte registers,
+ * %ah, %ch, %dh and %bh respectively.
+ */
+ if (!vie->rex_present) {
+ if (vie->reg & 0x4) {
+ /*
+ * Obtain the value of %ah by reading %rax and shifting
+ * right by 8 bits (same for %bh, %ch and %dh).
+ */
+ rshift = 8;
+ reg = gpr_map[vie->reg & 0x3];
+ }
+ }
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, &val);
+ *rval = val >> rshift;
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ *   - default address size is 64-bits
+ *   - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint8_t byte;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x88:
+ /*
+ * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 88/r: mov r/m8, r8
+ * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
+ */
+ size = 1;
+ error = vie_read_bytereg(vm, vcpuid, vie, &byte);
+ if (error == 0)
+ error = memwrite(vm, vcpuid, gpa, byte, size, arg);
+ break;
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ case 0x81:
+ /*
+		 * AND mem (ModRM:r/m) with immediate and store the
+		 * result in mem.
+		 *
+		 * 81 /4		and r/m32, imm32
+		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
+ *
+ * Currently, only the AND operation of the 0x81 opcode
+ * is implemented (ModRM:reg = b100).
+ */
+ if ((vie->reg & 7) != 4)
+ break;
+
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ val1 &= vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
+static void
+vie_init(struct vie *vie)
+{
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+}
+
+static int
+gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
+ uint64_t *gpa, uint64_t *gpaend)
+{
+ vm_paddr_t hpa;
+ int nlevels, ptpshift, ptpindex;
+ uint64_t *ptpbase, pte, pgsize;
+
+ /*
+ * XXX assumes 64-bit guest with 4 page walk levels
+ */
+ nlevels = 4;
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
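+		/* (equivalent to ptpphys &= 0x000ffffffffff000UL) */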
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
+ if (hpa == -1)
+ goto error;
+
+ ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0)
+ goto error;
+
+ if (pte & PG_PS) {
+ if (pgsize > 1 * GB)
+ goto error;
+ else
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
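+	/* (i.e. keep only bits [51:ptpshift] of the pte) */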
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+ *gpaend = pte + pgsize;
+ return (0);
+
+error:
+ return (-1);
+}
+
+int
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
+ uint64_t cr3, struct vie *vie)
+{
+ int n, err;
+ uint64_t hpa, gpa, gpaend, off;
+
+ /*
+ * XXX cache previously fetched instructions using 'rip' as the tag
+ */
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ vie_init(vie);
+
+ /* Copy the instruction into 'vie' */
+ while (vie->num_valid < inst_length) {
+ err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
+ if (err)
+ break;
+
+ off = gpa & PAGE_MASK;
+ n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
+
+ hpa = vm_gpa2hpa(vm, gpa, n);
+ if (hpa == -1)
+ break;
+
+ bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+
+ rip += n;
+ vie->num_valid += n;
+ }
+
+ if (vie->num_valid == inst_length)
+ return (0);
+ else
+ return (-1);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static int
+decode_rex(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
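+	/* REX prefix byte: 0100WRXB (0x40 - 0x4F) */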
+ if (x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
+
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+
+ vie_advance(vie);
+ }
+
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+/*
+ * XXX assuming 32-bit or 64-bit guest
+ */
+static int
+decode_modrm(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
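+	/* ModRM byte layout: mod[7:6], reg[5:3], r/m[2:0] */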
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ vie->base_register = VM_REG_LAST; /* no base */
+ }
+ break;
+ }
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
+ vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+		 * Special case: the base register is unused when mod = 0
+		 * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_immediate: invalid imm_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches our instruction decoding.
+ */
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ /* Skip 'gla' verification */
+ if (gla == VIE_INVALID_GLA)
+ return (0);
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+
+ if (decode_rex(vie))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 0000000..643d326
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+int vmm_ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+ * registers in the trapframe. This is overkill for our use case
+ * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ vmm_ipinum = idx;
+ setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (vmm_ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", vmm_ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (vmm_ipinum != IPI_AST)
+ setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 0000000..91552e3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+extern int vmm_ipinum;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 0000000..e691c61
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
+#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 0000000..d024b71
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+int
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_timer_tick(vlapic));
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
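+	/* Each x2APIC MSR maps to a 16-byte aligned xAPIC register offset. */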
+ return ((msr - 0x800) << 4);
+}
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ vlapic_set_apicbase(vlapic, val);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
+}
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 0000000..a79912e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+int lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signal to the LAPIC that an interrupt at 'vector' needs to be generated
+ * for 'cpu'; the pending state is recorded in the IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
+
+#endif
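
The comments above describe a two-step delivery handshake: poll for a
deliverable vector, then acknowledge it only once the guest can actually
take the interrupt. A sketch of that flow, assuming a hypothetical
vcpu_interruptible() predicate for the RFLAGS.IF and interrupt-shadow
checks:

    static void
    vcpu_try_inject(struct vm *vm, int cpu)
    {
            int vector;

            vector = lapic_pending_intr(vm, cpu);
            if (vector < 0)
                    return;                 /* nothing deliverable */

            if (!vcpu_interruptible(vm, cpu))  /* hypothetical predicate */
                    return;                 /* retry on a later exit */

            /* The vector moves from IRR to ISR only on acceptance. */
            lapic_intr_accepted(vm, cpu, vector);
            /* CPU-specific code would now program the event injection. */
    }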
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 0000000..04f99b1
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+ &pages_allocated, 0, "4KB pages allocated");
+
+static void
+update_pages_allocated(int howmany)
+{
+ pages_allocated += howmany; /* XXX locking? */
+}
+
+int
+vmm_mem_init(void)
+{
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int flags;
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO;
+
+ while (1) {
+ /*
+ * XXX need policy to determine when to back off the allocation
+ */
+ m = vm_page_alloc(NULL, 0, flags);
+ if (m == NULL)
+ VM_WAIT;
+ else
+ break;
+ }
+
+ pa = VM_PAGE_TO_PHYS(m);
+
+ if ((m->flags & PG_ZERO) == 0)
+ pagezero((void *)PHYS_TO_DMAP(pa));
+ m->valid = VM_PAGE_BITS_ALL;
+
+ update_pages_allocated(1);
+
+ return (pa);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ vm_page_t m;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE)
+ panic("vmm_mem_free: invalid length %lu", length);
+
+ m = PHYS_TO_VM_PAGE(base);
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+
+ update_pages_allocated(-1);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptoa(Maxmem));
+}
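
The "XXX locking?" note above flags a real race: concurrent calls to
vmm_mem_alloc() and vmm_mem_free() perform an unlocked read-modify-write
on pages_allocated. One possible fix, sketched here as an assumption
rather than what this patch does, is the <machine/atomic.h> primitive:

    static void
    update_pages_allocated(int howmany)
    {
            /* atomic RMW; negative deltas wrap correctly modulo 2^64 */
            atomic_add_long(&pages_allocated, (u_long)howmany);
    }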
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 0000000..7d45c74
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 0000000..d97c819
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
+ { MSR_BIOS_SIGN, VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+#define invalid_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_PAT:
+ guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int idx;
+ uint64_t *guest_msrs;
+
+ if (lapic_msr(num))
+ return (lapic_wrmsr(vm, cpu, num, val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx))
+ return (EINVAL);
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ return (0);
+}
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ if (lapic_msr(num)) {
+ error = lapic_rdmsr(vm, cpu, num, &result);
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+ * If this is not an emulated MSR, make sure that the processor
+ * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ error = 0;
+
+done:
+ if (error == 0) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (error);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
+
+/*
+ * Must be called by the CPU-specific code before any guests are
+ * created.
+ */
+void
+guest_msr_valid(int msr)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (vmm_msr[i].num == msr && invalid_msr(i)) {
+ vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
+ }
+ }
+}
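
restore_guest_msrs() and restore_host_msrs() bracket guest execution:
guest values are loaded into the hardware MSRs before entry and the
cached host values are put back after exit, while emulated MSRs such as
MSR_PAT never reach the hardware at all. A hedged sketch of the expected
calling sequence; hw_vmentry() stands in for the CPU-specific VMX/SVM
entry path and is not a function defined by this patch:

    static int
    vcpu_run_once(struct vm *vm, int cpu)
    {
            int error;

            restore_guest_msrs(vm, cpu);    /* hardware holds guest MSRs */
            error = hw_vmentry(vm, cpu);    /* hypothetical guest entry */
            restore_host_msrs(vm, cpu);     /* cached host values back */

            return (error);
    }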
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 0000000..8a1fda3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void guest_msr_valid(int msr);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 0000000..2143d25
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,130 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_util.h"
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel())
+ return;
+
+ if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd())
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+ printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
+
+/* global statistics */
+VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
+VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt");
+VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted");
+VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted");
+VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted");
+VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted");
+VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits");
+VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted");
+VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening");
+VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening");
+VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted");
+VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted");
+VMM_STAT(VMEXIT_EPT_FAULT, "vm exits due to nested page fault");
+VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
+VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
+VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 0000000..93c7e87
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+enum vmm_stat_scope {
+ VMM_STAT_SCOPE_ANY,
+ VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */
+ VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */
+};
+
+struct vmm_stat_type {
+ int index; /* position in the stats buffer */
+ const char *desc; /* description of statistic */
+ enum vmm_stat_scope scope;
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc, scope) \
+ struct vmm_stat_type type[1] = { \
+ { -1, desc, scope } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+
+#define VMM_STAT_DECLARE(type) \
+ extern struct vmm_stat_type type[1]
+
+#define VMM_STAT(type, desc) \
+ VMM_STAT_DEFINE(type, desc, VMM_STAT_SCOPE_ANY)
+#define VMM_STAT_INTEL(type, desc) \
+ VMM_STAT_DEFINE(type, desc, VMM_STAT_SCOPE_INTEL)
+#define VMM_STAT_AMD(type, desc) \
+ VMM_STAT_DEFINE(type, desc, VMM_STAT_SCOPE_AMD)
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' must be large enough to hold 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static __inline void
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+VMM_STAT_DECLARE(VCPU_MIGRATIONS);
+VMM_STAT_DECLARE(VMEXIT_COUNT);
+VMM_STAT_DECLARE(VMEXIT_EXTINT);
+VMM_STAT_DECLARE(VMEXIT_HLT);
+VMM_STAT_DECLARE(VMEXIT_CR_ACCESS);
+VMM_STAT_DECLARE(VMEXIT_RDMSR);
+VMM_STAT_DECLARE(VMEXIT_WRMSR);
+VMM_STAT_DECLARE(VMEXIT_MTRAP);
+VMM_STAT_DECLARE(VMEXIT_PAUSE);
+VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_INOUT);
+VMM_STAT_DECLARE(VMEXIT_CPUID);
+VMM_STAT_DECLARE(VMEXIT_EPT_FAULT);
+VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
+VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
+VMM_STAT_DECLARE(VMEXIT_USERSPACE);
+#endif
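
Putting the pieces together: VMM_STAT() both defines a counter and
registers it through SYSINIT at module load, after which vmm_stat_incr()
indexes the per-vcpu buffer by the position assigned in vmm_stat_init().
A short usage sketch; VMEXIT_EXAMPLE is a made-up name, and the increment
compiles to nothing unless VMM_KEEP_STATS is defined:

    VMM_STAT(VMEXIT_EXAMPLE, "illustrative exit counter");

    static void
    count_example_exit(struct vm *vm, int vcpu)
    {
            /* no-op unless the kernel is built with VMM_KEEP_STATS */
            vmm_stat_incr(vm, vcpu, VMEXIT_EXAMPLE, 1);
    }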
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 0000000..2afc608
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
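
The justreturn vector lets a host CPU be kicked out of guest mode by an
IPI whose handler does nothing beyond acknowledging itself. Its body is
equivalent to the following C sketch, assuming 'lapic' points at the
memory-mapped local APIC page and LA_EOI is the 0xB0 EOI register offset:

    static void
    justreturn_eoi(volatile char *lapic_base)
    {
            /* any value written to the EOI register acks the interrupt */
            *(volatile uint32_t *)(lapic_base + LA_EOI) = 0;
    }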
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 0000000..f245f92
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 0000000..7f82332
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 0000000..fa2eabc
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,219 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/clock.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "BHyVE BHyVE ";
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ func = *eax;
+
+ /*
+ * The general approach for CPU topology is to advertise a
+ * flat topology in which each vCPU is its own package, with
+ * no multi-core or SMT.
+ */
+ switch (func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_000A:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0007:
+ cpuid_count(*eax, *ecx, regs);
+ /*
+ * If the host TSCs are not synchronized across
+ * physical cpus then we cannot advertise an
+ * invariant tsc to a vcpu.
+ *
+ * XXX This still falls short because the vcpu
+ * can observe the TSC moving backwards as it
+ * migrates across physical cpus. But at least
+ * it should discourage the guest from using the
+ * TSC to keep track of time.
+ */
+ if (!smp_tsc)
+ regs[3] &= ~AMDPM_TSC_INVARIANT;
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+ * Don't expose VMX, SpeedStep or TME capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ regs[0] |= 0x04008000;
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+ bcopy(bhyve_id, &regs[2], 4);
+ bcopy(bhyve_id, &regs[3], 4);
+ break;
+ default:
+ /* XXX: Leaf 5? */
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
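
A caller is expected to pull the guest's %eax and %ecx into the four
register slots, let x86_emulate_cpuid() rewrite them, and push the
results back before resuming the guest. A sketch assuming hypothetical
guest_get_reg()/guest_set_reg() accessors (the real register plumbing
lives in the CPU-specific backends):

    static void
    handle_cpuid_exit(struct vm *vm, int vcpu)
    {
            uint32_t eax, ebx, ecx, edx;

            ebx = edx = 0;          /* outputs only; inputs are eax/ecx */
            eax = guest_get_reg(vm, vcpu, VM_REG_GUEST_RAX);
            ecx = guest_get_reg(vm, vcpu, VM_REG_GUEST_RCX);

            (void)x86_emulate_cpuid(vm, vcpu, &eax, &ebx, &ecx, &edx);

            guest_set_reg(vm, vcpu, VM_REG_GUEST_RAX, eax);
            guest_set_reg(vm, vcpu, VM_REG_GUEST_RBX, ebx);
            guest_set_reg(vm, vcpu, VM_REG_GUEST_RCX, ecx);
            guest_set_reg(vm, vcpu, VM_REG_GUEST_RDX, edx);
    }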
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 0000000..368e967
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif