-rw-r--r-- | sys/amd64/conf/GENERIC | 1
-rw-r--r-- | sys/conf/files.amd64 | 9
-rw-r--r-- | sys/conf/files.i386 | 9
-rw-r--r-- | sys/conf/options | 1
-rw-r--r-- | sys/dev/acpica/acpi_pci.c | 29
-rw-r--r-- | sys/i386/conf/NOTES | 1
-rw-r--r-- | sys/x86/include/busdma_impl.h | 1
-rw-r--r-- | sys/x86/iommu/busdma_dmar.c | 752
-rw-r--r-- | sys/x86/iommu/busdma_dmar.h | 65
-rw-r--r-- | sys/x86/iommu/intel_ctx.c | 631
-rw-r--r-- | sys/x86/iommu/intel_dmar.h | 435
-rw-r--r-- | sys/x86/iommu/intel_drv.c | 1182
-rw-r--r-- | sys/x86/iommu/intel_fault.c | 315
-rw-r--r-- | sys/x86/iommu/intel_gas.c | 724
-rw-r--r-- | sys/x86/iommu/intel_idpgtbl.c | 783
-rw-r--r-- | sys/x86/iommu/intel_qi.c | 414
-rw-r--r-- | sys/x86/iommu/intel_quirks.c | 195
-rw-r--r-- | sys/x86/iommu/intel_reg.h | 330
-rw-r--r-- | sys/x86/iommu/intel_utils.c | 563 |
19 files changed, 6439 insertions, 1 deletion
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 3b48f0f..6b3f648 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -85,6 +85,7 @@ device cpufreq # Bus support. device acpi +options ACPI_DMAR device pci # Floppy drives diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index e1d1857..33c4297 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -531,6 +531,15 @@ x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq +x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci +x86/iommu/intel_ctx.c optional acpi acpi_dmar pci +x86/iommu/intel_drv.c optional acpi acpi_dmar pci +x86/iommu/intel_fault.c optional acpi acpi_dmar pci +x86/iommu/intel_gas.c optional acpi acpi_dmar pci +x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci +x86/iommu/intel_qi.c optional acpi acpi_dmar pci +x86/iommu/intel_quirks.c optional acpi acpi_dmar pci +x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic isa x86/isa/atrtc.c standard x86/isa/clock.c standard diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index d946425..7e6e54a 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -566,6 +566,15 @@ x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/smist.c optional cpufreq +x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci +x86/iommu/intel_ctx.c optional acpi acpi_dmar pci +x86/iommu/intel_drv.c optional acpi acpi_dmar pci +x86/iommu/intel_fault.c optional acpi acpi_dmar pci +x86/iommu/intel_gas.c optional acpi acpi_dmar pci +x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci +x86/iommu/intel_qi.c optional acpi acpi_dmar pci +x86/iommu/intel_quirks.c optional acpi acpi_dmar pci +x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic x86/isa/atrtc.c optional native x86/isa/clock.c optional native diff --git a/sys/conf/options b/sys/conf/options index a4c785e..642064d 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -688,6 +688,7 @@ OPENSOLARIS_WITNESS opt_global.h ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h +ACPI_DMAR opt_acpi.h # ISA support DEV_ISA opt_isa.h diff --git a/sys/dev/acpica/acpi_pci.c b/sys/dev/acpica/acpi_pci.c index 39fba88..78d8639 100644 --- a/sys/dev/acpica/acpi_pci.c +++ b/sys/dev/acpica/acpi_pci.c @@ -29,6 +29,8 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_acpi.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> @@ -80,6 +82,7 @@ static ACPI_STATUS acpi_pci_save_handle(ACPI_HANDLE handle, UINT32 level, static int acpi_pci_set_powerstate_method(device_t dev, device_t child, int state); static void acpi_pci_update_device(ACPI_HANDLE handle, device_t pci_child); +static bus_dma_tag_t acpi_pci_get_dma_tag(device_t bus, device_t child); static device_method_t acpi_pci_methods[] = { /* Device interface */ @@ -90,6 +93,7 @@ static device_method_t acpi_pci_methods[] = { DEVMETHOD(bus_read_ivar, acpi_pci_read_ivar), DEVMETHOD(bus_write_ivar, acpi_pci_write_ivar), DEVMETHOD(bus_child_location_str, acpi_pci_child_location_str_method), + DEVMETHOD(bus_get_dma_tag, acpi_pci_get_dma_tag), /* PCI interface */ DEVMETHOD(pci_set_powerstate, acpi_pci_set_powerstate_method), @@ -308,3 +312,28 @@ acpi_pci_attach(device_t dev) return (bus_generic_attach(dev)); } + +#ifdef ACPI_DMAR +bus_dma_tag_t dmar_get_dma_tag(device_t dev, device_t 
child); +static bus_dma_tag_t +acpi_pci_get_dma_tag(device_t bus, device_t child) +{ + bus_dma_tag_t tag; + + if (device_get_parent(child) == bus) { + /* try dmar and return if it works */ + tag = dmar_get_dma_tag(bus, child); + } else + tag = NULL; + if (tag == NULL) + tag = pci_get_dma_tag(bus, child); + return (tag); +} +#else +static bus_dma_tag_t +acpi_pci_get_dma_tag(device_t bus, device_t child) +{ + + return (pci_get_dma_tag(bus, child)); +} +#endif diff --git a/sys/i386/conf/NOTES b/sys/i386/conf/NOTES index e236f10..212dbd9 100644 --- a/sys/i386/conf/NOTES +++ b/sys/i386/conf/NOTES @@ -491,6 +491,7 @@ device tdfx_linux # Enable Linuxulator support device acpi options ACPI_DEBUG +options ACPI_DMAR # ACPI WMI Mapping driver device acpi_wmi diff --git a/sys/x86/include/busdma_impl.h b/sys/x86/include/busdma_impl.h index 70c9a6f..b851e7a 100644 --- a/sys/x86/include/busdma_impl.h +++ b/sys/x86/include/busdma_impl.h @@ -82,7 +82,6 @@ struct bus_dma_impl { bus_dmasync_op_t op); }; -void busdma_lock_mutex(void *arg, bus_dma_lock_op_t op); void bus_dma_dflt_lock(void *arg, bus_dma_lock_op_t op); int bus_dma_run_filter(struct bus_dma_tag_common *dmat, bus_addr_t paddr); int common_bus_dma_tag_create(struct bus_dma_tag_common *parent, diff --git a/sys/x86/iommu/busdma_dmar.c b/sys/x86/iommu/busdma_dmar.c new file mode 100644 index 0000000..cada29d --- /dev/null +++ b/sys/x86/iommu/busdma_dmar.c @@ -0,0 +1,752 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/memdesc.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <sys/uio.h> +#include <dev/pci/pcivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +/* + * busdma_dmar.c, the implementation of the busdma(9) interface using + * DMAR units from Intel VT-d. + */ + +static bool +dmar_bus_dma_is_dev_disabled(device_t dev) +{ + char str[128], *env; + int domain, bus, slot, func; + + domain = pci_get_domain(dev); + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d.bounce", + domain, bus, slot, func); + env = getenv(str); + if (env == NULL) + return (false); + freeenv(env); + return (true); +} + +struct dmar_ctx * +dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev, bool rmrr) +{ + struct dmar_ctx *ctx; + bool disabled; + + /* + * If the user requested the IOMMU disabled for the device, we + * cannot disable the DMAR, due to possibility of other + * devices on the same DMAR still requiring translation. + * Instead provide the identity mapping for the device + * context. + */ + disabled = dmar_bus_dma_is_dev_disabled(dev); + ctx = dmar_get_ctx(dmar, dev, disabled, rmrr); + if (ctx == NULL) + return (NULL); + ctx->ctx_tag.owner = dev; + if (disabled) { + /* + * Keep the first reference on context, release the + * later refs. + */ + DMAR_LOCK(dmar); + if ((ctx->flags & DMAR_CTX_DISABLED) == 0) { + ctx->flags |= DMAR_CTX_DISABLED; + DMAR_UNLOCK(dmar); + } else { + dmar_free_ctx_locked(dmar, ctx); + } + ctx = NULL; + } + return (ctx); +} + +bus_dma_tag_t +dmar_get_dma_tag(device_t dev, device_t child) +{ + struct dmar_unit *dmar; + struct dmar_ctx *ctx; + bus_dma_tag_t res; + + dmar = dmar_find(child); + /* Not in scope of any DMAR ? */ + if (dmar == NULL) + return (NULL); + dmar_quirks_pre_use(dmar); + dmar_instantiate_rmrr_ctxs(dmar); + + ctx = dmar_instantiate_ctx(dmar, child, false); + res = ctx == NULL ? NULL : (bus_dma_tag_t)&ctx->ctx_tag; + return (res); +} + +static MALLOC_DEFINE(M_DMAR_DMAMAP, "dmar_dmamap", "Intel DMAR DMA Map"); + +static void dmar_bus_schedule_dmamap(struct dmar_unit *unit, + struct bus_dmamap_dmar *map); + +static int +dmar_bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment, + bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr, + bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize, + int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc, + void *lockfuncarg, bus_dma_tag_t *dmat) +{ + struct bus_dma_tag_dmar *newtag, *oldtag; + int error; + + *dmat = NULL; + error = common_bus_dma_tag_create(parent != NULL ? 
+ &((struct bus_dma_tag_dmar *)parent)->common : NULL, alignment, + boundary, lowaddr, highaddr, filter, filterarg, maxsize, + nsegments, maxsegsz, flags, lockfunc, lockfuncarg, + sizeof(struct bus_dma_tag_dmar), (void **)&newtag); + if (error != 0) + goto out; + + oldtag = (struct bus_dma_tag_dmar *)parent; + newtag->common.impl = &bus_dma_dmar_impl; + newtag->ctx = oldtag->ctx; + newtag->owner = oldtag->owner; + + *dmat = (bus_dma_tag_t)newtag; +out: + CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d", + __func__, newtag, (newtag != NULL ? newtag->common.flags : 0), + error); + return (error); +} + +static int +dmar_bus_dma_tag_destroy(bus_dma_tag_t dmat1) +{ + struct bus_dma_tag_dmar *dmat, *dmat_copy, *parent; + int error; + + error = 0; + dmat_copy = dmat = (struct bus_dma_tag_dmar *)dmat1; + + if (dmat != NULL) { + if (dmat->map_count != 0) { + error = EBUSY; + goto out; + } + while (dmat != NULL) { + parent = (struct bus_dma_tag_dmar *)dmat->common.parent; + if (atomic_fetchadd_int(&dmat->common.ref_count, -1) == + 1) { + if (dmat == &dmat->ctx->ctx_tag) + dmar_free_ctx(dmat->ctx); + free(dmat->segments, M_DMAR_DMAMAP); + free(dmat, M_DEVBUF); + dmat = parent; + } else + dmat = NULL; + } + } +out: + CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error); + return (error); +} + +static int +dmar_bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = malloc(sizeof(*map), M_DMAR_DMAMAP, M_NOWAIT | M_ZERO); + if (map == NULL) { + *mapp = NULL; + return (ENOMEM); + } + if (tag->segments == NULL) { + tag->segments = malloc(sizeof(bus_dma_segment_t) * + tag->common.nsegments, M_DMAR_DMAMAP, M_NOWAIT); + if (tag->segments == NULL) { + free(map, M_DMAR_DMAMAP); + *mapp = NULL; + return (ENOMEM); + } + } + TAILQ_INIT(&map->map_entries); + map->tag = tag; + map->locked = true; + map->cansleep = false; + tag->map_count++; + *mapp = (bus_dmamap_t)map; + + return (0); +} + +static int +dmar_bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map1) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + if (map != NULL) { + DMAR_CTX_LOCK(tag->ctx); + if (!TAILQ_EMPTY(&map->map_entries)) { + DMAR_CTX_UNLOCK(tag->ctx); + return (EBUSY); + } + DMAR_CTX_UNLOCK(tag->ctx); + free(map, M_DMAR_DMAMAP); + } + tag->map_count--; + return (0); +} + + +static int +dmar_bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags, + bus_dmamap_t *mapp) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + int error, mflags; + vm_memattr_t attr; + + error = dmar_bus_dmamap_create(dmat, flags, mapp); + if (error != 0) + return (error); + + mflags = (flags & BUS_DMA_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK; + mflags |= (flags & BUS_DMA_ZERO) != 0 ? M_ZERO : 0; + attr = (flags & BUS_DMA_NOCACHE) != 0 ? 
VM_MEMATTR_UNCACHEABLE : + VM_MEMATTR_DEFAULT; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)*mapp; + + if (tag->common.maxsize < PAGE_SIZE && + tag->common.alignment <= tag->common.maxsize && + attr == VM_MEMATTR_DEFAULT) { + *vaddr = malloc(tag->common.maxsize, M_DEVBUF, mflags); + map->flags |= BUS_DMAMAP_DMAR_MALLOC; + } else { + *vaddr = (void *)kmem_alloc_attr(kernel_arena, + tag->common.maxsize, mflags, 0ul, BUS_SPACE_MAXADDR, + attr); + map->flags |= BUS_DMAMAP_DMAR_KMEM_ALLOC; + } + if (*vaddr == NULL) { + dmar_bus_dmamap_destroy(dmat, *mapp); + *mapp = NULL; + return (ENOMEM); + } + return (0); +} + +static void +dmar_bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map1) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + + if ((map->flags & BUS_DMAMAP_DMAR_MALLOC) != 0) { + free(vaddr, M_DEVBUF); + map->flags &= ~BUS_DMAMAP_DMAR_MALLOC; + } else { + KASSERT((map->flags & BUS_DMAMAP_DMAR_KMEM_ALLOC) != 0, + ("dmar_bus_dmamem_free for non alloced map %p", map)); + kmem_free(kernel_arena, (vm_offset_t)vaddr, tag->common.maxsize); + map->flags &= ~BUS_DMAMAP_DMAR_KMEM_ALLOC; + } + + dmar_bus_dmamap_destroy(dmat, map1); +} + +static int +dmar_bus_dmamap_load_something1(struct bus_dma_tag_dmar *tag, + struct bus_dmamap_dmar *map, vm_page_t *ma, int offset, bus_size_t buflen, + int flags, bus_dma_segment_t *segs, int *segp, + struct dmar_map_entries_tailq *unroll_list) +{ + struct dmar_ctx *ctx; + struct dmar_map_entry *entry; + dmar_gaddr_t size; + bus_size_t buflen1; + int error, idx, gas_flags, seg; + + if (segs == NULL) + segs = tag->segments; + ctx = tag->ctx; + seg = *segp; + error = 0; + idx = 0; + while (buflen > 0) { + seg++; + if (seg >= tag->common.nsegments) { + error = EFBIG; + break; + } + buflen1 = buflen > tag->common.maxsegsz ? + tag->common.maxsegsz : buflen; + buflen -= buflen1; + size = round_page(offset + buflen1); + + /* + * (Too) optimistically allow split if there are more + * then one segments left. + */ + gas_flags = map->cansleep ? 
DMAR_GM_CANWAIT : 0; + if (seg + 1 < tag->common.nsegments) + gas_flags |= DMAR_GM_CANSPLIT; + + error = dmar_gas_map(ctx, &tag->common, size, + DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE, + gas_flags, ma + idx, &entry); + if (error != 0) + break; + if ((gas_flags & DMAR_GM_CANSPLIT) != 0) { + KASSERT(size >= entry->end - entry->start, + ("split increased entry size %jx %jx %jx", + (uintmax_t)size, (uintmax_t)entry->start, + (uintmax_t)entry->end)); + size = entry->end - entry->start; + if (buflen1 > size) + buflen1 = size; + } else { + KASSERT(entry->end - entry->start == size, + ("no split allowed %jx %jx %jx", + (uintmax_t)size, (uintmax_t)entry->start, + (uintmax_t)entry->end)); + } + + KASSERT(((entry->start + offset) & (tag->common.alignment - 1)) + == 0, + ("alignment failed: ctx %p start 0x%jx offset %x " + "align 0x%jx", ctx, (uintmax_t)entry->start, offset, + (uintmax_t)tag->common.alignment)); + KASSERT(entry->end <= tag->common.lowaddr || + entry->start >= tag->common.highaddr, + ("entry placement failed: ctx %p start 0x%jx end 0x%jx " + "lowaddr 0x%jx highaddr 0x%jx", ctx, + (uintmax_t)entry->start, (uintmax_t)entry->end, + (uintmax_t)tag->common.lowaddr, + (uintmax_t)tag->common.highaddr)); + KASSERT(dmar_test_boundary(entry->start, entry->end - + entry->start, tag->common.boundary), + ("boundary failed: ctx %p start 0x%jx end 0x%jx " + "boundary 0x%jx", ctx, (uintmax_t)entry->start, + (uintmax_t)entry->end, (uintmax_t)tag->common.boundary)); + KASSERT(buflen1 <= tag->common.maxsegsz, + ("segment too large: ctx %p start 0x%jx end 0x%jx " + "maxsegsz 0x%jx", ctx, (uintmax_t)entry->start, + (uintmax_t)entry->end, (uintmax_t)tag->common.maxsegsz)); + + DMAR_CTX_LOCK(ctx); + TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link); + entry->flags |= DMAR_MAP_ENTRY_MAP; + DMAR_CTX_UNLOCK(ctx); + TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link); + + segs[seg].ds_addr = entry->start + offset; + segs[seg].ds_len = buflen1; + + idx += OFF_TO_IDX(trunc_page(offset + buflen1)); + offset += buflen1; + offset &= DMAR_PAGE_MASK; + } + if (error == 0) + *segp = seg; + return (error); +} + +static int +dmar_bus_dmamap_load_something(struct bus_dma_tag_dmar *tag, + struct bus_dmamap_dmar *map, vm_page_t *ma, int offset, bus_size_t buflen, + int flags, bus_dma_segment_t *segs, int *segp) +{ + struct dmar_ctx *ctx; + struct dmar_map_entry *entry, *entry1; + struct dmar_map_entries_tailq unroll_list; + int error; + + ctx = tag->ctx; + atomic_add_long(&ctx->loads, 1); + + TAILQ_INIT(&unroll_list); + error = dmar_bus_dmamap_load_something1(tag, map, ma, offset, + buflen, flags, segs, segp, &unroll_list); + if (error != 0) { + /* + * The busdma interface does not allow us to report + * partial buffer load, so unfortunately we have to + * revert all work done. + */ + DMAR_CTX_LOCK(ctx); + TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link, + entry1) { + /* + * No entries other than what we have created + * during the failed run might have been + * inserted there in between, since we own ctx + * pglock. 
+ */ + TAILQ_REMOVE(&map->map_entries, entry, dmamap_link); + TAILQ_REMOVE(&unroll_list, entry, unroll_link); + TAILQ_INSERT_TAIL(&ctx->unload_entries, entry, + dmamap_link); + } + DMAR_CTX_UNLOCK(ctx); + taskqueue_enqueue(ctx->dmar->delayed_taskqueue, + &ctx->unload_task); + } + + if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 && + !map->cansleep) + error = EINPROGRESS; + if (error == EINPROGRESS) + dmar_bus_schedule_dmamap(ctx->dmar, map); + return (error); +} + +static int +dmar_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map1, + struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags, + bus_dma_segment_t *segs, int *segp) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + return (dmar_bus_dmamap_load_something(tag, map, ma, ma_offs, tlen, + flags, segs, segp)); +} + +static int +dmar_bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map1, + vm_paddr_t buf, bus_size_t buflen, int flags, bus_dma_segment_t *segs, + int *segp) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + vm_page_t *ma; + vm_paddr_t pstart, pend; + int error, i, ma_cnt, offset; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + pstart = trunc_page(buf); + pend = round_page(buf + buflen); + offset = buf & PAGE_MASK; + ma_cnt = OFF_TO_IDX(pend - pstart); + ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, map->cansleep ? + M_WAITOK : M_NOWAIT); + if (ma == NULL) + return (ENOMEM); + for (i = 0; i < ma_cnt; i++) + ma[i] = PHYS_TO_VM_PAGE(pstart + i * PAGE_SIZE); + error = dmar_bus_dmamap_load_something(tag, map, ma, offset, buflen, + flags, segs, segp); + free(ma, M_DEVBUF); + return (error); +} + +static int +dmar_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map1, void *buf, + bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs, + int *segp) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + vm_page_t *ma, fma; + vm_paddr_t pstart, pend, paddr; + int error, i, ma_cnt, offset; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + pstart = trunc_page((vm_offset_t)buf); + pend = round_page((vm_offset_t)buf + buflen); + offset = (vm_offset_t)buf & PAGE_MASK; + ma_cnt = OFF_TO_IDX(pend - pstart); + ma = malloc(sizeof(vm_page_t) * ma_cnt, M_DEVBUF, map->cansleep ? + M_WAITOK : M_NOWAIT); + if (ma == NULL) + return (ENOMEM); + if (dumping) { + /* + * If dumping, do not attempt to call + * PHYS_TO_VM_PAGE() at all. It may return non-NULL + * but the vm_page returned might be not initialized, + * e.g. for the kernel itself. + */ + KASSERT(pmap == kernel_pmap, ("non-kernel address write")); + fma = malloc(sizeof(struct vm_page) * ma_cnt, M_DEVBUF, + M_ZERO | (map->cansleep ? 
M_WAITOK : M_NOWAIT)); + if (fma == NULL) { + free(ma, M_DEVBUF); + return (ENOMEM); + } + for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) { + paddr = pmap_kextract(pstart); + vm_page_initfake(&fma[i], paddr, VM_MEMATTR_DEFAULT); + ma[i] = &fma[i]; + } + } else { + fma = NULL; + for (i = 0; i < ma_cnt; i++, pstart += PAGE_SIZE) { + if (pmap == kernel_pmap) + paddr = pmap_kextract(pstart); + else + paddr = pmap_extract(pmap, pstart); + ma[i] = PHYS_TO_VM_PAGE(paddr); + KASSERT(VM_PAGE_TO_PHYS(ma[i]) == paddr, + ("PHYS_TO_VM_PAGE failed %jx %jx m %p", + (uintmax_t)paddr, (uintmax_t)VM_PAGE_TO_PHYS(ma[i]), + ma[i])); + } + } + error = dmar_bus_dmamap_load_something(tag, map, ma, offset, buflen, + flags, segs, segp); + free(ma, M_DEVBUF); + free(fma, M_DEVBUF); + return (error); +} + +static void +dmar_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map1, + struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg) +{ + struct bus_dmamap_dmar *map; + + if (map1 == NULL) + return; + map = (struct bus_dmamap_dmar *)map1; + map->mem = *mem; + map->tag = (struct bus_dma_tag_dmar *)dmat; + map->callback = callback; + map->callback_arg = callback_arg; +} + +static bus_dma_segment_t * +dmar_bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map1, + bus_dma_segment_t *segs, int nsegs, int error) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + + if (!map->locked) { + KASSERT(map->cansleep, + ("map not locked and not sleepable context %p", map)); + + /* + * We are called from the delayed context. Relock the + * driver. + */ + (tag->common.lockfunc)(tag->common.lockfuncarg, BUS_DMA_LOCK); + map->locked = true; + } + + if (segs == NULL) + segs = tag->segments; + return (segs); +} + +/* + * The limitations of busdma KPI forces the dmar to perform the actual + * unload, consisting of the unmapping of the map entries page tables, + * from the delayed context on i386, since page table page mapping + * might require a sleep to be successfull. The unfortunate + * consequence is that the DMA requests can be served some time after + * the bus_dmamap_unload() call returned. + * + * On amd64, we assume that sf allocation cannot fail. 
+ */ +static void +dmar_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map1) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + struct dmar_ctx *ctx; +#if defined(__amd64__) + struct dmar_map_entries_tailq entries; +#endif + + tag = (struct bus_dma_tag_dmar *)dmat; + map = (struct bus_dmamap_dmar *)map1; + ctx = tag->ctx; + atomic_add_long(&ctx->unloads, 1); + +#if defined(__i386__) + DMAR_CTX_LOCK(ctx); + TAILQ_CONCAT(&ctx->unload_entries, &map->map_entries, dmamap_link); + DMAR_CTX_UNLOCK(ctx); + taskqueue_enqueue(ctx->dmar->delayed_taskqueue, &ctx->unload_task); +#else /* defined(__amd64__) */ + TAILQ_INIT(&entries); + DMAR_CTX_LOCK(ctx); + TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link); + DMAR_CTX_UNLOCK(ctx); + THREAD_NO_SLEEPING(); + dmar_ctx_unload(ctx, &entries, false); + THREAD_SLEEPING_OK(); + KASSERT(TAILQ_EMPTY(&entries), ("lazy dmar_ctx_unload %p", ctx)); +#endif +} + +static void +dmar_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map, + bus_dmasync_op_t op) +{ +} + +struct bus_dma_impl bus_dma_dmar_impl = { + .tag_create = dmar_bus_dma_tag_create, + .tag_destroy = dmar_bus_dma_tag_destroy, + .map_create = dmar_bus_dmamap_create, + .map_destroy = dmar_bus_dmamap_destroy, + .mem_alloc = dmar_bus_dmamem_alloc, + .mem_free = dmar_bus_dmamem_free, + .load_phys = dmar_bus_dmamap_load_phys, + .load_buffer = dmar_bus_dmamap_load_buffer, + .load_ma = dmar_bus_dmamap_load_ma, + .map_waitok = dmar_bus_dmamap_waitok, + .map_complete = dmar_bus_dmamap_complete, + .map_unload = dmar_bus_dmamap_unload, + .map_sync = dmar_bus_dmamap_sync +}; + +static void +dmar_bus_task_dmamap(void *arg, int pending) +{ + struct bus_dma_tag_dmar *tag; + struct bus_dmamap_dmar *map; + struct dmar_unit *unit; + struct dmar_ctx *ctx; + + unit = arg; + DMAR_LOCK(unit); + while ((map = TAILQ_FIRST(&unit->delayed_maps)) != NULL) { + TAILQ_REMOVE(&unit->delayed_maps, map, delay_link); + DMAR_UNLOCK(unit); + tag = map->tag; + ctx = map->tag->ctx; + map->cansleep = true; + map->locked = false; + bus_dmamap_load_mem((bus_dma_tag_t)tag, (bus_dmamap_t)map, + &map->mem, map->callback, map->callback_arg, + BUS_DMA_WAITOK); + map->cansleep = false; + if (map->locked) { + (tag->common.lockfunc)(tag->common.lockfuncarg, + BUS_DMA_UNLOCK); + } else + map->locked = true; + map->cansleep = false; + DMAR_LOCK(unit); + } + DMAR_UNLOCK(unit); +} + +static void +dmar_bus_schedule_dmamap(struct dmar_unit *unit, struct bus_dmamap_dmar *map) +{ + struct dmar_ctx *ctx; + + ctx = map->tag->ctx; + map->locked = false; + DMAR_LOCK(unit); + TAILQ_INSERT_TAIL(&unit->delayed_maps, map, delay_link); + DMAR_UNLOCK(unit); + taskqueue_enqueue(unit->delayed_taskqueue, &unit->dmamap_load_task); +} + +int +dmar_init_busdma(struct dmar_unit *unit) +{ + + TAILQ_INIT(&unit->delayed_maps); + TASK_INIT(&unit->dmamap_load_task, 0, dmar_bus_task_dmamap, unit); + unit->delayed_taskqueue = taskqueue_create("dmar", M_WAITOK, + taskqueue_thread_enqueue, &unit->delayed_taskqueue); + taskqueue_start_threads(&unit->delayed_taskqueue, 1, PI_DISK, + "dmar%d busdma taskq", unit->unit); + return (0); +} + +void +dmar_fini_busdma(struct dmar_unit *unit) +{ + + if (unit->delayed_taskqueue == NULL) + return; + + taskqueue_drain(unit->delayed_taskqueue, &unit->dmamap_load_task); + taskqueue_free(unit->delayed_taskqueue); + unit->delayed_taskqueue = NULL; +} diff --git a/sys/x86/iommu/busdma_dmar.h b/sys/x86/iommu/busdma_dmar.h new file mode 100644 index 0000000..60ea6bc --- /dev/null +++ b/sys/x86/iommu/busdma_dmar.h @@ -0,0 +1,65 @@ 
+/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __X86_IOMMU_BUSDMA_DMAR_H +#define __X86_IOMMU_BUSDMA_DMAR_H + +struct dmar_map_entry; +TAILQ_HEAD(dmar_map_entries_tailq, dmar_map_entry); + +struct bus_dma_tag_dmar { + struct bus_dma_tag_common common; + struct dmar_ctx *ctx; + device_t owner; + int map_count; + bus_dma_segment_t *segments; +}; + +struct bus_dmamap_dmar { + struct bus_dma_tag_dmar *tag; + struct memdesc mem; + bus_dmamap_callback_t *callback; + void *callback_arg; + struct dmar_map_entries_tailq map_entries; + TAILQ_ENTRY(bus_dmamap_dmar) delay_link; + bool locked; + bool cansleep; + int flags; +}; + +#define BUS_DMAMAP_DMAR_MALLOC 0x0001 +#define BUS_DMAMAP_DMAR_KMEM_ALLOC 0x0002 + +extern struct bus_dma_impl bus_dma_dmar_impl; + +bus_dma_tag_t dmar_get_dma_tag(device_t dev, device_t child); + +#endif diff --git a/sys/x86/iommu/intel_ctx.c b/sys/x86/iommu/intel_ctx.c new file mode 100644 index 0000000..fc5fe09 --- /dev/null +++ b/sys/x86/iommu/intel_ctx.c @@ -0,0 +1,631 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/memdesc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/rman.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <sys/uio.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_map.h> +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> +#include <dev/pci/pcivar.h> + +static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context"); + +static void dmar_ctx_unload_task(void *arg, int pending); + +static void +dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus) +{ + struct sf_buf *sf; + dmar_root_entry_t *re; + vm_page_t ctxm; + + /* + * Allocated context page must be linked. + */ + ctxm = dmar_pgalloc(dmar->ctx_obj, 1 + bus, DMAR_PGF_NOALLOC); + if (ctxm != NULL) + return; + + /* + * Page not present, allocate and link. Note that other + * thread might execute this sequence in parallel. This + * should be safe, because the context entries written by both + * threads are equal. 
+ */ + TD_PREP_PINNED_ASSERT; + ctxm = dmar_pgalloc(dmar->ctx_obj, 1 + bus, DMAR_PGF_ZERO | + DMAR_PGF_WAITOK); + re = dmar_map_pgtbl(dmar->ctx_obj, 0, DMAR_PGF_NOALLOC, &sf); + re += bus; + dmar_pte_store(&re->r1, DMAR_ROOT_R1_P | (DMAR_ROOT_R1_CTP_MASK & + VM_PAGE_TO_PHYS(ctxm))); + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(dmar)); + TD_PINNED_ASSERT; +} + +static dmar_ctx_entry_t * +dmar_map_ctx_entry(struct dmar_ctx *ctx, struct sf_buf **sfp) +{ + dmar_ctx_entry_t *ctxp; + + ctxp = dmar_map_pgtbl(ctx->dmar->ctx_obj, 1 + ctx->bus, + DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp); + ctxp += ((ctx->slot & 0x1f) << 3) + (ctx->func & 0x7); + return (ctxp); +} + +static void +ctx_tag_init(struct dmar_ctx *ctx) +{ + bus_addr_t maxaddr; + + maxaddr = MIN(ctx->end, BUS_SPACE_MAXADDR); + ctx->ctx_tag.common.ref_count = 1; /* Prevent free */ + ctx->ctx_tag.common.impl = &bus_dma_dmar_impl; + ctx->ctx_tag.common.boundary = PCI_DMA_BOUNDARY; + ctx->ctx_tag.common.lowaddr = maxaddr; + ctx->ctx_tag.common.highaddr = maxaddr; + ctx->ctx_tag.common.maxsize = maxaddr; + ctx->ctx_tag.common.nsegments = BUS_SPACE_UNRESTRICTED; + ctx->ctx_tag.common.maxsegsz = maxaddr; + ctx->ctx_tag.ctx = ctx; + /* XXXKIB initialize tag further */ +} + +static void +ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp) +{ + struct dmar_unit *unit; + vm_page_t ctx_root; + + unit = ctx->dmar; + KASSERT(ctxp->ctx1 == 0 && ctxp->ctx2 == 0, + ("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx", + unit->unit, ctx->bus, ctx->slot, ctx->func, ctxp->ctx1, + ctxp->ctx2)); + ctxp->ctx2 = DMAR_CTX2_DID(ctx->domain); + ctxp->ctx2 |= ctx->awlvl; + if ((ctx->flags & DMAR_CTX_IDMAP) != 0 && + (unit->hw_ecap & DMAR_ECAP_PT) != 0) { + KASSERT(ctx->pgtbl_obj == NULL, + ("ctx %p non-null pgtbl_obj", ctx)); + dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P); + } else { + ctx_root = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_NOALLOC); + dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_UNTR | + (DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) | + DMAR_CTX1_P); + } +} + +static int +ctx_init_rmrr(struct dmar_ctx *ctx, device_t dev) +{ + struct dmar_map_entries_tailq rmrr_entries; + struct dmar_map_entry *entry, *entry1; + vm_page_t *ma; + dmar_gaddr_t start, end; + vm_pindex_t size, i; + int error, error1; + + error = 0; + TAILQ_INIT(&rmrr_entries); + dmar_ctx_parse_rmrr(ctx, dev, &rmrr_entries); + TAILQ_FOREACH_SAFE(entry, &rmrr_entries, unroll_link, entry1) { + /* + * VT-d specification requires that the start of an + * RMRR entry is 4k-aligned. Buggy BIOSes put + * anything into the start and end fields. Truncate + * and round as neccesary. + * + * We also allow the overlapping RMRR entries, see + * dmar_gas_alloc_region(). + */ + start = entry->start; + end = entry->end; + entry->start = trunc_page(start); + entry->end = round_page(end); + size = OFF_TO_IDX(entry->end - entry->start); + ma = malloc(sizeof(vm_page_t) * size, M_TEMP, M_WAITOK); + for (i = 0; i < size; i++) { + ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i, + VM_MEMATTR_DEFAULT); + } + error1 = dmar_gas_map_region(ctx, entry, DMAR_MAP_ENTRY_READ | + DMAR_MAP_ENTRY_WRITE, DMAR_GM_CANWAIT, ma); + /* + * Non-failed RMRR entries are owned by context rb + * tree. Get rid of the failed entry, but do not stop + * the loop. Rest of the parsed RMRR entries are + * loaded and removed on the context destruction. 
+ */ + if (error1 == 0 && entry->end != entry->start) { + DMAR_LOCK(ctx->dmar); + ctx->flags |= DMAR_CTX_RMRR; + DMAR_UNLOCK(ctx->dmar); + } else { + if (error1 != 0) { + device_printf(dev, + "dmar%d failed to map RMRR region (%jx, %jx) %d\n", + ctx->dmar->unit, start, end, error1); + error = error1; + } + TAILQ_REMOVE(&rmrr_entries, entry, unroll_link); + dmar_gas_free_entry(ctx, entry); + } + for (i = 0; i < size; i++) + vm_page_putfake(ma[i]); + free(ma, M_TEMP); + } + return (error); +} + +static struct dmar_ctx * +dmar_get_ctx_alloc(struct dmar_unit *dmar, int bus, int slot, int func) +{ + struct dmar_ctx *ctx; + + ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO); + RB_INIT(&ctx->rb_root); + TAILQ_INIT(&ctx->unload_entries); + TASK_INIT(&ctx->unload_task, 0, dmar_ctx_unload_task, ctx); + mtx_init(&ctx->lock, "dmarctx", NULL, MTX_DEF); + ctx->dmar = dmar; + ctx->bus = bus; + ctx->slot = slot; + ctx->func = func; + return (ctx); +} + +static void +dmar_ctx_dtr(struct dmar_ctx *ctx, bool gas_inited, bool pgtbl_inited) +{ + + if (gas_inited) { + DMAR_CTX_LOCK(ctx); + dmar_gas_fini_ctx(ctx); + DMAR_CTX_UNLOCK(ctx); + } + if (pgtbl_inited) { + if (ctx->pgtbl_obj != NULL) + DMAR_CTX_PGLOCK(ctx); + ctx_free_pgtbl(ctx); + } + mtx_destroy(&ctx->lock); + free(ctx, M_DMAR_CTX); +} + +struct dmar_ctx * +dmar_get_ctx(struct dmar_unit *dmar, device_t dev, bool id_mapped, bool rmrr_init) +{ + struct dmar_ctx *ctx, *ctx1; + dmar_ctx_entry_t *ctxp; + struct sf_buf *sf; + int bus, slot, func, error, mgaw; + bool enable; + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + enable = false; + TD_PREP_PINNED_ASSERT; + DMAR_LOCK(dmar); + ctx = dmar_find_ctx_locked(dmar, bus, slot, func); + error = 0; + if (ctx == NULL) { + /* + * Perform the allocations which require sleep or have + * higher chance to succeed if the sleep is allowed. + */ + DMAR_UNLOCK(dmar); + dmar_ensure_ctx_page(dmar, bus); + ctx1 = dmar_get_ctx_alloc(dmar, bus, slot, func); + + if (id_mapped) { + /* + * For now, use the maximal usable physical + * address of the installed memory to + * calculate the mgaw. It is useful for the + * identity mapping, and less so for the + * virtualized bus address space. + */ + ctx1->end = ptoa(Maxmem); + mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, false); + error = ctx_set_agaw(ctx1, mgaw); + if (error != 0) { + dmar_ctx_dtr(ctx1, false, false); + TD_PINNED_ASSERT; + return (NULL); + } + } else { + ctx1->end = BUS_SPACE_MAXADDR; + mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, true); + error = ctx_set_agaw(ctx1, mgaw); + if (error != 0) { + dmar_ctx_dtr(ctx1, false, false); + TD_PINNED_ASSERT; + return (NULL); + } + /* Use all supported address space for remapping. 
*/ + ctx1->end = 1ULL << (ctx1->agaw - 1); + } + + + dmar_gas_init_ctx(ctx1); + if (id_mapped) { + if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) { + ctx1->pgtbl_obj = ctx_get_idmap_pgtbl(ctx1, + ctx1->end); + } + ctx1->flags |= DMAR_CTX_IDMAP; + } else { + error = ctx_alloc_pgtbl(ctx1); + if (error != 0) { + dmar_ctx_dtr(ctx1, true, false); + TD_PINNED_ASSERT; + return (NULL); + } + /* Disable local apic region access */ + error = dmar_gas_reserve_region(ctx1, 0xfee00000, + 0xfeefffff + 1); + if (error != 0) { + dmar_ctx_dtr(ctx1, true, true); + TD_PINNED_ASSERT; + return (NULL); + } + error = ctx_init_rmrr(ctx1, dev); + if (error != 0) { + dmar_ctx_dtr(ctx1, true, true); + TD_PINNED_ASSERT; + return (NULL); + } + } + ctxp = dmar_map_ctx_entry(ctx1, &sf); + DMAR_LOCK(dmar); + + /* + * Recheck the contexts, other thread might have + * already allocated needed one. + */ + ctx = dmar_find_ctx_locked(dmar, bus, slot, func); + if (ctx == NULL) { + ctx = ctx1; + ctx->domain = alloc_unrl(dmar->domids); + if (ctx->domain == -1) { + DMAR_UNLOCK(dmar); + dmar_unmap_pgtbl(sf, true); + dmar_ctx_dtr(ctx, true, true); + TD_PINNED_ASSERT; + return (NULL); + } + ctx_tag_init(ctx); + + /* + * This is the first activated context for the + * DMAR unit. Enable the translation after + * everything is set up. + */ + if (LIST_EMPTY(&dmar->contexts)) + enable = true; + LIST_INSERT_HEAD(&dmar->contexts, ctx, link); + ctx_id_entry_init(ctx, ctxp); + device_printf(dev, + "dmar%d pci%d:%d:%d:%d domain %d mgaw %d agaw %d\n", + dmar->unit, dmar->segment, bus, slot, + func, ctx->domain, ctx->mgaw, ctx->agaw); + } else { + dmar_ctx_dtr(ctx1, true, true); + } + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(dmar)); + } + ctx->refs++; + if ((ctx->flags & DMAR_CTX_RMRR) != 0) + ctx->refs++; /* XXXKIB */ + + /* + * If dmar declares Caching Mode as Set, follow 11.5 "Caching + * Mode Consideration" and do the (global) invalidation of the + * negative TLB entries. + */ + if ((dmar->hw_cap & DMAR_CAP_CM) != 0 || enable) { + if (dmar->qi_enabled) { + dmar_qi_invalidate_ctx_glob_locked(dmar); + if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0) + dmar_qi_invalidate_iotlb_glob_locked(dmar); + } else { + error = dmar_inv_ctx_glob(dmar); + if (error == 0 && + (dmar->hw_ecap & DMAR_ECAP_DI) != 0) + error = dmar_inv_iotlb_glob(dmar); + if (error != 0) { + dmar_free_ctx_locked(dmar, ctx); + TD_PINNED_ASSERT; + return (NULL); + } + } + } + + /* + * The dmar lock was potentially dropped between check for the + * empty context list and now. Recheck the state of GCMD_TE + * to avoid unneeded command. + */ + if (enable && !rmrr_init && (dmar->hw_gcmd & DMAR_GCMD_TE) == 0) { + error = dmar_enable_translation(dmar); + if (error != 0) { + dmar_free_ctx_locked(dmar, ctx); + TD_PINNED_ASSERT; + return (NULL); + } + } + DMAR_UNLOCK(dmar); + TD_PINNED_ASSERT; + return (ctx); +} + +void +dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx) +{ + struct sf_buf *sf; + dmar_ctx_entry_t *ctxp; + + DMAR_ASSERT_LOCKED(dmar); + KASSERT(ctx->refs >= 1, + ("dmar %p ctx %p refs %u", dmar, ctx, ctx->refs)); + + /* + * If our reference is not last, only the dereference should + * be performed. + */ + if (ctx->refs > 1) { + ctx->refs--; + DMAR_UNLOCK(dmar); + return; + } + + KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0, + ("lost ref on RMRR ctx %p", ctx)); + KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0, + ("lost ref on disabled ctx %p", ctx)); + + /* + * Otherwise, the context entry must be cleared before the + * page table is destroyed. 
The mapping of the context + * entries page could require sleep, unlock the dmar. + */ + DMAR_UNLOCK(dmar); + TD_PREP_PINNED_ASSERT; + ctxp = dmar_map_ctx_entry(ctx, &sf); + DMAR_LOCK(dmar); + KASSERT(ctx->refs >= 1, + ("dmar %p ctx %p refs %u", dmar, ctx, ctx->refs)); + + /* + * Other thread might have referenced the context, in which + * case again only the dereference should be performed. + */ + if (ctx->refs > 1) { + ctx->refs--; + DMAR_UNLOCK(dmar); + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(dmar)); + TD_PINNED_ASSERT; + return; + } + + KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0, + ("lost ref on RMRR ctx %p", ctx)); + KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0, + ("lost ref on disabled ctx %p", ctx)); + + /* + * Clear the context pointer and flush the caches. + * XXXKIB: cannot do this if any RMRR entries are still present. + */ + dmar_pte_clear(&ctxp->ctx1); + ctxp->ctx2 = 0; + dmar_inv_ctx_glob(dmar); + if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0) { + if (dmar->qi_enabled) + dmar_qi_invalidate_iotlb_glob_locked(dmar); + else + dmar_inv_iotlb_glob(dmar); + } + LIST_REMOVE(ctx, link); + DMAR_UNLOCK(dmar); + + /* + * The rest of the destruction is invisible for other users of + * the dmar unit. + */ + taskqueue_drain(dmar->delayed_taskqueue, &ctx->unload_task); + KASSERT(TAILQ_EMPTY(&ctx->unload_entries), + ("unfinished unloads %p", ctx)); + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(dmar)); + free_unr(dmar->domids, ctx->domain); + dmar_ctx_dtr(ctx, true, true); + TD_PINNED_ASSERT; +} + +void +dmar_free_ctx(struct dmar_ctx *ctx) +{ + struct dmar_unit *dmar; + + dmar = ctx->dmar; + DMAR_LOCK(dmar); + dmar_free_ctx_locked(dmar, ctx); +} + +struct dmar_ctx * +dmar_find_ctx_locked(struct dmar_unit *dmar, int bus, int slot, int func) +{ + struct dmar_ctx *ctx; + + DMAR_ASSERT_LOCKED(dmar); + + LIST_FOREACH(ctx, &dmar->contexts, link) { + if (ctx->bus == bus && ctx->slot == slot && ctx->func == func) + return (ctx); + } + return (NULL); +} + +void +dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free) +{ + struct dmar_ctx *ctx; + + ctx = entry->ctx; + DMAR_CTX_LOCK(ctx); + if ((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0) + dmar_gas_free_region(ctx, entry); + else + dmar_gas_free_space(ctx, entry); + DMAR_CTX_UNLOCK(ctx); + if (free) + dmar_gas_free_entry(ctx, entry); + else + entry->flags = 0; +} + +void +dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free) +{ + struct dmar_unit *unit; + + unit = entry->ctx->dmar; + if (unit->qi_enabled) { + DMAR_LOCK(unit); + dmar_qi_invalidate_locked(entry->ctx, entry->start, + entry->end - entry->start, &entry->gseq); + if (!free) + entry->flags |= DMAR_MAP_ENTRY_QI_NF; + TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link); + DMAR_UNLOCK(unit); + } else { + ctx_flush_iotlb_sync(entry->ctx, entry->start, entry->end - + entry->start); + dmar_ctx_free_entry(entry, free); + } +} + +void +dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries, + bool cansleep) +{ + struct dmar_unit *unit; + struct dmar_map_entry *entry, *entry1; + struct dmar_qi_genseq gseq; + int error; + + unit = ctx->dmar; + + TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) { + KASSERT((entry->flags & DMAR_MAP_ENTRY_MAP) != 0, + ("not mapped entry %p %p", ctx, entry)); + error = ctx_unmap_buf(ctx, entry->start, entry->end - + entry->start, cansleep ? 
DMAR_PGF_WAITOK : 0); + KASSERT(error == 0, ("unmap %p error %d", ctx, error)); + if (!unit->qi_enabled) { + ctx_flush_iotlb_sync(ctx, entry->start, + entry->end - entry->start); + TAILQ_REMOVE(entries, entry, dmamap_link); + dmar_ctx_free_entry(entry, true); + } + } + if (TAILQ_EMPTY(entries)) + return; + + KASSERT(unit->qi_enabled, ("loaded entry left")); + DMAR_LOCK(unit); + TAILQ_FOREACH(entry, entries, dmamap_link) { + entry->gseq.gen = 0; + entry->gseq.seq = 0; + dmar_qi_invalidate_locked(ctx, entry->start, entry->end - + entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ? + &gseq : NULL); + } + TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) { + entry->gseq = gseq; + TAILQ_REMOVE(entries, entry, dmamap_link); + TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link); + } + DMAR_UNLOCK(unit); +} + +static void +dmar_ctx_unload_task(void *arg, int pending) +{ + struct dmar_ctx *ctx; + struct dmar_map_entries_tailq entries; + + ctx = arg; + TAILQ_INIT(&entries); + + for (;;) { + DMAR_CTX_LOCK(ctx); + TAILQ_SWAP(&ctx->unload_entries, &entries, dmar_map_entry, + dmamap_link); + DMAR_CTX_UNLOCK(ctx); + if (TAILQ_EMPTY(&entries)) + break; + dmar_ctx_unload(ctx, &entries, true); + } +} diff --git a/sys/x86/iommu/intel_dmar.h b/sys/x86/iommu/intel_dmar.h new file mode 100644 index 0000000..994e5e1 --- /dev/null +++ b/sys/x86/iommu/intel_dmar.h @@ -0,0 +1,435 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __X86_IOMMU_INTEL_DMAR_H +#define __X86_IOMMU_INTEL_DMAR_H + +/* Host or physical memory address, after translation. */ +typedef uint64_t dmar_haddr_t; +/* Guest or bus address, before translation. 
*/ +typedef uint64_t dmar_gaddr_t; + +struct dmar_qi_genseq { + u_int gen; + uint32_t seq; +}; + +struct dmar_map_entry { + dmar_gaddr_t start; + dmar_gaddr_t end; + dmar_gaddr_t free_after; /* Free space after the entry */ + dmar_gaddr_t free_down; /* Max free space below the + current R/B tree node */ + u_int flags; + TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */ + RB_ENTRY(dmar_map_entry) rb_entry; /* Links for ctx entries */ + TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after + dmamap_load failure */ + struct dmar_ctx *ctx; + struct dmar_qi_genseq gseq; +}; + +RB_HEAD(dmar_gas_entries_tree, dmar_map_entry); +RB_PROTOTYPE(dmar_gas_entries_tree, dmar_map_entry, rb_entry, + dmar_gas_cmp_entries); + +#define DMAR_MAP_ENTRY_PLACE 0x0001 /* Fake entry */ +#define DMAR_MAP_ENTRY_RMRR 0x0002 /* Permanent, not linked by + dmamap_link */ +#define DMAR_MAP_ENTRY_MAP 0x0004 /* Busdma created, linked by + dmamap_link */ +#define DMAR_MAP_ENTRY_UNMAPPED 0x0010 /* No backing pages */ +#define DMAR_MAP_ENTRY_QI_NF 0x0020 /* qi task, do not free entry */ +#define DMAR_MAP_ENTRY_READ 0x1000 /* Read permitted */ +#define DMAR_MAP_ENTRY_WRITE 0x2000 /* Write permitted */ +#define DMAR_MAP_ENTRY_SNOOP 0x4000 /* Snoop */ +#define DMAR_MAP_ENTRY_TM 0x8000 /* Transient */ + +struct dmar_ctx { + int bus; /* pci bus/slot/func */ + int slot; + int func; + int domain; /* DID */ + int mgaw; /* Real max address width */ + int agaw; /* Adjusted guest address width */ + int pglvl; /* The pagelevel */ + int awlvl; /* The pagelevel as the bitmask, to set in + context entry */ + dmar_gaddr_t end;/* Highest address + 1 in the guest AS */ + u_int refs; /* References to the context, from tags */ + struct dmar_unit *dmar; + struct bus_dma_tag_dmar ctx_tag; /* Root tag */ + struct mtx lock; + LIST_ENTRY(dmar_ctx) link; /* Member in the dmar list */ + vm_object_t pgtbl_obj; /* Page table pages */ + u_int flags; /* Protected by dmar lock */ + uint64_t last_fault_rec[2]; /* Last fault reported */ + u_int entries_cnt; + u_long loads; + u_long unloads; + struct dmar_gas_entries_tree rb_root; + struct dmar_map_entries_tailq unload_entries; /* Entries to unload */ + struct dmar_map_entry *first_place, *last_place; + struct task unload_task; +}; + +/* struct dmar_ctx flags */ +#define DMAR_CTX_FAULTED 0x0001 /* Fault was reported, + last_fault_rec is valid */ +#define DMAR_CTX_IDMAP 0x0002 /* Context uses identity page table */ +#define DMAR_CTX_RMRR 0x0004 /* Context contains RMRR entry, + cannot be turned off */ +#define DMAR_CTX_DISABLED 0x0008 /* Device is disabled, the + ephemeral reference is kept + to prevent context destruction */ + +#define DMAR_CTX_PGLOCK(ctx) VM_OBJECT_WLOCK((ctx)->pgtbl_obj) +#define DMAR_CTX_PGTRYLOCK(ctx) VM_OBJECT_TRYWLOCK((ctx)->pgtbl_obj) +#define DMAR_CTX_PGUNLOCK(ctx) VM_OBJECT_WUNLOCK((ctx)->pgtbl_obj) +#define DMAR_CTX_ASSERT_PGLOCKED(ctx) \ + VM_OBJECT_ASSERT_WLOCKED((ctx)->pgtbl_obj) + +#define DMAR_CTX_LOCK(ctx) mtx_lock(&(ctx)->lock) +#define DMAR_CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->lock) +#define DMAR_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->lock, MA_OWNED) + +struct dmar_msi_data { + int irq; + int irq_rid; + struct resource *irq_res; + void *intr_handle; + int (*handler)(void *); + int msi_data_reg; + int msi_addr_reg; + int msi_uaddr_reg; + void (*enable_intr)(struct dmar_unit *); + void (*disable_intr)(struct dmar_unit *); + const char *name; +}; + +#define DMAR_INTR_FAULT 0 +#define DMAR_INTR_QI 1 +#define DMAR_INTR_TOTAL 2 + +struct dmar_unit { 
+ device_t dev; + int unit; + uint16_t segment; + uint64_t base; + + /* Resources */ + int reg_rid; + struct resource *regs; + + struct dmar_msi_data intrs[DMAR_INTR_TOTAL]; + + /* Hardware registers cache */ + uint32_t hw_ver; + uint64_t hw_cap; + uint64_t hw_ecap; + uint32_t hw_gcmd; + + /* Data for being a dmar */ + struct mtx lock; + LIST_HEAD(, dmar_ctx) contexts; + struct unrhdr *domids; + vm_object_t ctx_obj; + u_int barrier_flags; + + /* Fault handler data */ + struct mtx fault_lock; + uint64_t *fault_log; + int fault_log_head; + int fault_log_tail; + int fault_log_size; + struct task fault_task; + struct taskqueue *fault_taskqueue; + + /* QI */ + int qi_enabled; + vm_offset_t inv_queue; + vm_size_t inv_queue_size; + uint32_t inv_queue_avail; + uint32_t inv_queue_tail; + volatile uint32_t inv_waitd_seq_hw; /* hw writes there on wait + descr completion */ + uint64_t inv_waitd_seq_hw_phys; + uint32_t inv_waitd_seq; /* next sequence number to use for wait descr */ + u_int inv_waitd_gen; /* seq number generation AKA seq overflows */ + u_int inv_seq_waiters; /* count of waiters for seq */ + u_int inv_queue_full; /* informational counter */ + + /* Delayed freeing of map entries queue processing */ + struct dmar_map_entries_tailq tlb_flush_entries; + struct task qi_task; + struct taskqueue *qi_taskqueue; + + /* Busdma delayed map load */ + struct task dmamap_load_task; + TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps; + struct taskqueue *delayed_taskqueue; +}; + +#define DMAR_LOCK(dmar) mtx_lock(&(dmar)->lock) +#define DMAR_UNLOCK(dmar) mtx_unlock(&(dmar)->lock) +#define DMAR_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->lock, MA_OWNED) + +#define DMAR_FAULT_LOCK(dmar) mtx_lock_spin(&(dmar)->fault_lock) +#define DMAR_FAULT_UNLOCK(dmar) mtx_unlock_spin(&(dmar)->fault_lock) +#define DMAR_FAULT_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->fault_lock, MA_OWNED) + +#define DMAR_IS_COHERENT(dmar) (((dmar)->hw_ecap & DMAR_ECAP_C) != 0) +#define DMAR_HAS_QI(dmar) (((dmar)->hw_ecap & DMAR_ECAP_QI) != 0) + +/* Barrier ids */ +#define DMAR_BARRIER_RMRR 0 +#define DMAR_BARRIER_USEQ 1 + +struct dmar_unit *dmar_find(device_t dev); + +u_int dmar_nd2mask(u_int nd); +bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl); +int ctx_set_agaw(struct dmar_ctx *ctx, int mgaw); +int dmar_maxaddr2mgaw(struct dmar_unit* unit, dmar_gaddr_t maxaddr, + bool allow_less); +vm_pindex_t pglvl_max_pages(int pglvl); +int ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl); +dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl); +dmar_gaddr_t ctx_page_size(struct dmar_ctx *ctx, int lvl); +int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size, + dmar_gaddr_t *isizep); +struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags); +void dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags); +void *dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags, + struct sf_buf **sf); +void dmar_unmap_pgtbl(struct sf_buf *sf, bool coherent); +int dmar_load_root_entry_ptr(struct dmar_unit *unit); +int dmar_inv_ctx_glob(struct dmar_unit *unit); +int dmar_inv_iotlb_glob(struct dmar_unit *unit); +int dmar_flush_write_bufs(struct dmar_unit *unit); +int dmar_enable_translation(struct dmar_unit *unit); +int dmar_disable_translation(struct dmar_unit *unit); +bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id); +void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id); + +int dmar_fault_intr(void *arg); +void dmar_enable_fault_intr(struct dmar_unit *unit); +void dmar_disable_fault_intr(struct 
dmar_unit *unit); +int dmar_init_fault_log(struct dmar_unit *unit); +void dmar_fini_fault_log(struct dmar_unit *unit); + +int dmar_qi_intr(void *arg); +void dmar_enable_qi_intr(struct dmar_unit *unit); +void dmar_disable_qi_intr(struct dmar_unit *unit); +int dmar_init_qi(struct dmar_unit *unit); +void dmar_fini_qi(struct dmar_unit *unit); +void dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t start, + dmar_gaddr_t size, struct dmar_qi_genseq *pseq); +void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit); +void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit); + +vm_object_t ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr); +void put_idmap_pgtbl(vm_object_t obj); +int ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, + vm_page_t *ma, uint64_t pflags, int flags); +int ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, + int flags); +void ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, + dmar_gaddr_t size); +int ctx_alloc_pgtbl(struct dmar_ctx *ctx); +void ctx_free_pgtbl(struct dmar_ctx *ctx); + +struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev, + bool rmrr); +struct dmar_ctx *dmar_get_ctx(struct dmar_unit *dmar, device_t dev, + bool id_mapped, bool rmrr_init); +void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx); +void dmar_free_ctx(struct dmar_ctx *ctx); +struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, int bus, + int slot, int func); +void dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free); +void dmar_ctx_unload(struct dmar_ctx *ctx, + struct dmar_map_entries_tailq *entries, bool cansleep); +void dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free); + +int dmar_init_busdma(struct dmar_unit *unit); +void dmar_fini_busdma(struct dmar_unit *unit); + +void dmar_gas_init_ctx(struct dmar_ctx *ctx); +void dmar_gas_fini_ctx(struct dmar_ctx *ctx); +struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags); +void dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry); +void dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry); +int dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common, + dmar_gaddr_t size, u_int eflags, u_int flags, vm_page_t *ma, + struct dmar_map_entry **res); +void dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry); +int dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, + u_int eflags, u_int flags, vm_page_t *ma); +int dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start, + dmar_gaddr_t end); + +void dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev, + struct dmar_map_entries_tailq *rmrr_entries); +int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar); + +void dmar_quirks_post_ident(struct dmar_unit *dmar); +void dmar_quirks_pre_use(struct dmar_unit *dmar); + +#define DMAR_GM_CANWAIT 0x0001 +#define DMAR_GM_CANSPLIT 0x0002 + +#define DMAR_PGF_WAITOK 0x0001 +#define DMAR_PGF_ZERO 0x0002 +#define DMAR_PGF_ALLOC 0x0004 +#define DMAR_PGF_NOALLOC 0x0008 +#define DMAR_PGF_OBJL 0x0010 + +extern dmar_haddr_t dmar_high; +extern int haw; +extern int dmar_tbl_pagecnt; +extern int dmar_match_verbose; +extern int dmar_check_free; + +static inline uint32_t +dmar_read4(const struct dmar_unit *unit, int reg) +{ + + return (bus_read_4(unit->regs, reg)); +} + +static inline uint64_t +dmar_read8(const struct dmar_unit *unit, int reg) +{ +#ifdef __i386__ + uint32_t high, low; + 
+ low = bus_read_4(unit->regs, reg); + high = bus_read_4(unit->regs, reg + 4); + return (low | ((uint64_t)high << 32)); +#else + return (bus_read_8(unit->regs, reg)); +#endif +} + +static inline void +dmar_write4(const struct dmar_unit *unit, int reg, uint32_t val) +{ + + KASSERT(reg != DMAR_GCMD_REG || (val & DMAR_GCMD_TE) == + (unit->hw_gcmd & DMAR_GCMD_TE), + ("dmar%d clearing TE 0x%08x 0x%08x", unit->unit, + unit->hw_gcmd, val)); + bus_write_4(unit->regs, reg, val); +} + +static inline void +dmar_write8(const struct dmar_unit *unit, int reg, uint64_t val) +{ + + KASSERT(reg != DMAR_GCMD_REG, ("8byte GCMD write")); +#ifdef __i386__ + uint32_t high, low; + + low = val; + high = val >> 32; + bus_write_4(unit->regs, reg, low); + bus_write_4(unit->regs, reg + 4, high); +#else + bus_write_8(unit->regs, reg, val); +#endif +} + +/* + * dmar_pte_store and dmar_pte_clear ensure that on i386, 32bit writes + * are issued in the correct order. For store, the lower word, + * containing the P or R and W bits, is set only after the high word + * is written. For clear, the P bit is cleared first, then the high + * word is cleared. + */ +static inline void +dmar_pte_store(volatile uint64_t *dst, uint64_t val) +{ + + KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx", + dst, (uintmax_t)*dst, (uintmax_t)val)); +#ifdef __i386__ + volatile uint32_t *p; + uint32_t hi, lo; + + hi = val >> 32; + lo = val; + p = (volatile uint32_t *)dst; + *(p + 1) = hi; + *p = lo; +#else + *dst = val; +#endif +} + +static inline void +dmar_pte_clear(volatile uint64_t *dst) +{ +#ifdef __i386__ + volatile uint32_t *p; + + p = (volatile uint32_t *)dst; + *p = 0; + *(p + 1) = 0; +#else + *dst = 0; +#endif +} + +static inline bool +dmar_test_boundary(dmar_gaddr_t start, dmar_gaddr_t size, + dmar_gaddr_t boundary) +{ + + if (boundary == 0) + return (true); + return (start + size <= ((start + boundary) & ~(boundary - 1))); +} + +#ifdef INVARIANTS +#define TD_PREP_PINNED_ASSERT \ + int old_td_pinned; \ + old_td_pinned = curthread->td_pinned +#define TD_PINNED_ASSERT \ + KASSERT(curthread->td_pinned == old_td_pinned, \ + ("pin count leak: %d %d %s:%d", curthread->td_pinned, \ + old_td_pinned, __FILE__, __LINE__)) +#else +#define TD_PREP_PINNED_ASSERT +#define TD_PINNED_ASSERT +#endif + +#endif diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c new file mode 100644 index 0000000..a846b92 --- /dev/null +++ b/sys/x86/iommu/intel_drv.c @@ -0,0 +1,1182 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#if defined(__amd64__) /* || defined(__ia64__) */ +#define DEV_APIC +#else +#include "opt_apic.h" +#endif +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <machine/bus.h> +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_map.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> +#include <dev/pci/pcivar.h> + +#ifdef DEV_APIC +#include "pcib_if.h" +#endif + +#define DMAR_FAULT_IRQ_RID 0 +#define DMAR_QI_IRQ_RID 1 +#define DMAR_REG_RID 2 + +static devclass_t dmar_devclass; +static device_t *dmar_devs; +static int dmar_devcnt; + +typedef int (*dmar_iter_t)(ACPI_DMAR_HEADER *, void *); + +static void +dmar_iterate_tbl(dmar_iter_t iter, void *arg) +{ + ACPI_TABLE_DMAR *dmartbl; + ACPI_DMAR_HEADER *dmarh; + char *ptr, *ptrend; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_DMAR, 1, (ACPI_TABLE_HEADER **)&dmartbl); + if (ACPI_FAILURE(status)) + return; + ptr = (char *)dmartbl + sizeof(*dmartbl); + ptrend = (char *)dmartbl + dmartbl->Header.Length; + for (;;) { + if (ptr >= ptrend) + break; + dmarh = (ACPI_DMAR_HEADER *)ptr; + if (dmarh->Length <= 0) { + printf("dmar_identify: corrupted DMAR table, l %d\n", + dmarh->Length); + break; + } + ptr += dmarh->Length; + if (!iter(dmarh, arg)) + break; + } +} + +struct find_iter_args { + int i; + ACPI_DMAR_HARDWARE_UNIT *res; +}; + +static int +dmar_find_iter(ACPI_DMAR_HEADER *dmarh, void *arg) +{ + struct find_iter_args *fia; + + if (dmarh->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + return (1); + + fia = arg; + if (fia->i == 0) { + fia->res = (ACPI_DMAR_HARDWARE_UNIT *)dmarh; + return (0); + } + fia->i--; + return (1); +} + +static ACPI_DMAR_HARDWARE_UNIT * +dmar_find_by_index(int idx) +{ + struct find_iter_args fia; + + fia.i = idx; + fia.res = NULL; + dmar_iterate_tbl(dmar_find_iter, &fia); + return (fia.res); +} + +static int +dmar_count_iter(ACPI_DMAR_HEADER *dmarh, void *arg) +{ + + if (dmarh->Type == ACPI_DMAR_TYPE_HARDWARE_UNIT) + dmar_devcnt++; + return (1); +} + +static int dmar_enable = 0; +static void +dmar_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_DMAR *dmartbl; + ACPI_DMAR_HARDWARE_UNIT *dmarh; + ACPI_STATUS status; + int i, error; + + if (acpi_disabled("dmar")) + return; + TUNABLE_INT_FETCH("hw.dmar.enable", &dmar_enable); + if (!dmar_enable) + return; +#ifdef INVARIANTS + TUNABLE_INT_FETCH("hw.dmar.check_free", 
&dmar_check_free); +#endif + TUNABLE_INT_FETCH("hw.dmar.match_verbose", &dmar_match_verbose); + status = AcpiGetTable(ACPI_SIG_DMAR, 1, (ACPI_TABLE_HEADER **)&dmartbl); + if (ACPI_FAILURE(status)) + return; + haw = dmartbl->Width + 1; + if ((1ULL << (haw + 1)) > BUS_SPACE_MAXADDR) + dmar_high = BUS_SPACE_MAXADDR; + else + dmar_high = 1ULL << (haw + 1); + if (bootverbose) { + printf("DMAR HAW=%d flags=<%b>\n", dmartbl->Width, + (unsigned)dmartbl->Flags, + "\020\001INTR_REMAP\002X2APIC_OPT_OUT"); + } + + dmar_iterate_tbl(dmar_count_iter, NULL); + if (dmar_devcnt == 0) + return; + dmar_devs = malloc(sizeof(device_t) * dmar_devcnt, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0; i < dmar_devcnt; i++) { + dmarh = dmar_find_by_index(i); + if (dmarh == NULL) { + printf("dmar_identify: cannot find HWUNIT %d\n", i); + continue; + } + dmar_devs[i] = BUS_ADD_CHILD(parent, 1, "dmar", i); + if (dmar_devs[i] == NULL) { + printf("dmar_identify: cannot create instance %d\n", i); + continue; + } + error = bus_set_resource(dmar_devs[i], SYS_RES_MEMORY, + DMAR_REG_RID, dmarh->Address, PAGE_SIZE); + if (error != 0) { + printf( + "dmar%d: unable to alloc register window at 0x%08jx: error %d\n", + i, (uintmax_t)dmarh->Address, error); + device_delete_child(parent, dmar_devs[i]); + dmar_devs[i] = NULL; + } + } +} + +static int +dmar_probe(device_t dev) +{ + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + device_set_desc(dev, "DMA remap"); + return (BUS_PROBE_NOWILDCARD); +} + +static void +dmar_release_intr(device_t dev, struct dmar_unit *unit, int idx) +{ + struct dmar_msi_data *dmd; + + dmd = &unit->intrs[idx]; + if (dmd->irq == -1) + return; + bus_teardown_intr(dev, dmd->irq_res, dmd->intr_handle); + bus_release_resource(dev, SYS_RES_IRQ, dmd->irq_rid, dmd->irq_res); + bus_delete_resource(dev, SYS_RES_IRQ, dmd->irq_rid); + PCIB_RELEASE_MSIX(device_get_parent(device_get_parent(dev)), + dev, dmd->irq); + dmd->irq = -1; +} + +static void +dmar_release_resources(device_t dev, struct dmar_unit *unit) +{ + int i; + + dmar_fini_busdma(unit); + dmar_fini_qi(unit); + dmar_fini_fault_log(unit); + for (i = 0; i < DMAR_INTR_TOTAL; i++) + dmar_release_intr(dev, unit, i); + if (unit->regs != NULL) { + bus_deactivate_resource(dev, SYS_RES_MEMORY, unit->reg_rid, + unit->regs); + bus_release_resource(dev, SYS_RES_MEMORY, unit->reg_rid, + unit->regs); + unit->regs = NULL; + } + if (unit->domids != NULL) { + delete_unrhdr(unit->domids); + unit->domids = NULL; + } + if (unit->ctx_obj != NULL) { + vm_object_deallocate(unit->ctx_obj); + unit->ctx_obj = NULL; + } +} + +static int +dmar_alloc_irq(device_t dev, struct dmar_unit *unit, int idx) +{ + device_t pcib; + struct dmar_msi_data *dmd; + uint64_t msi_addr; + uint32_t msi_data; + int error; + + dmd = &unit->intrs[idx]; + pcib = device_get_parent(device_get_parent(dev)); /* Really not pcib */ + error = PCIB_ALLOC_MSIX(pcib, dev, &dmd->irq); + if (error != 0) { + device_printf(dev, "cannot allocate %s interrupt, %d\n", + dmd->name, error); + goto err1; + } + error = bus_set_resource(dev, SYS_RES_IRQ, dmd->irq_rid, + dmd->irq, 1); + if (error != 0) { + device_printf(dev, "cannot set %s interrupt resource, %d\n", + dmd->name, error); + goto err2; + } + dmd->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &dmd->irq_rid, RF_ACTIVE); + if (dmd->irq_res == NULL) { + device_printf(dev, + "cannot allocate resource for %s interrupt\n", dmd->name); + error = ENXIO; + goto err3; + } + error = bus_setup_intr(dev, dmd->irq_res, INTR_TYPE_MISC, + dmd->handler, NULL, unit, 
&dmd->intr_handle); + if (error != 0) { + device_printf(dev, "cannot setup %s interrupt, %d\n", + dmd->name, error); + goto err4; + } + bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, dmd->name); + error = PCIB_MAP_MSI(pcib, dev, dmd->irq, &msi_addr, &msi_data); + if (error != 0) { + device_printf(dev, "cannot map %s interrupt, %d\n", + dmd->name, error); + goto err5; + } + dmar_write4(unit, dmd->msi_data_reg, msi_data); + dmar_write4(unit, dmd->msi_addr_reg, msi_addr); + /* Only for xAPIC mode */ + dmar_write4(unit, dmd->msi_uaddr_reg, msi_addr >> 32); + return (0); + +err5: + bus_teardown_intr(dev, dmd->irq_res, dmd->intr_handle); +err4: + bus_release_resource(dev, SYS_RES_IRQ, dmd->irq_rid, dmd->irq_res); +err3: + bus_delete_resource(dev, SYS_RES_IRQ, dmd->irq_rid); +err2: + PCIB_RELEASE_MSIX(pcib, dev, dmd->irq); + dmd->irq = -1; +err1: + return (error); +} + +#ifdef DEV_APIC +static int +dmar_remap_intr(device_t dev, device_t child, u_int irq) +{ + struct dmar_unit *unit; + struct dmar_msi_data *dmd; + uint64_t msi_addr; + uint32_t msi_data; + int i, error; + + unit = device_get_softc(dev); + for (i = 0; i < DMAR_INTR_TOTAL; i++) { + dmd = &unit->intrs[i]; + if (irq == dmd->irq) { + error = PCIB_MAP_MSI(device_get_parent( + device_get_parent(dev)), + dev, irq, &msi_addr, &msi_data); + if (error != 0) + return (error); + DMAR_LOCK(unit); + (dmd->disable_intr)(unit); + dmar_write4(unit, dmd->msi_data_reg, msi_data); + dmar_write4(unit, dmd->msi_addr_reg, msi_addr); + dmar_write4(unit, dmd->msi_uaddr_reg, msi_addr >> 32); + (dmd->enable_intr)(unit); + DMAR_UNLOCK(unit); + return (0); + } + } + return (ENOENT); +} +#endif + +static void +dmar_print_caps(device_t dev, struct dmar_unit *unit, + ACPI_DMAR_HARDWARE_UNIT *dmaru) +{ + uint32_t caphi, ecaphi; + + device_printf(dev, "regs@0x%08jx, ver=%d.%d, seg=%d, flags=<%b>\n", + (uintmax_t)dmaru->Address, DMAR_MAJOR_VER(unit->hw_ver), + DMAR_MINOR_VER(unit->hw_ver), dmaru->Segment, + dmaru->Flags, "\020\001INCLUDE_ALL_PCI"); + caphi = unit->hw_cap >> 32; + device_printf(dev, "cap=%b,", (u_int)unit->hw_cap, + "\020\004AFL\005WBF\006PLMR\007PHMR\010CM\027ZLR\030ISOCH"); + printf("%b, ", caphi, "\020\010PSI\027DWD\030DRD"); + printf("ndoms=%d, sagaw=%d, mgaw=%d, fro=%d, nfr=%d, superp=%d", + DMAR_CAP_ND(unit->hw_cap), DMAR_CAP_SAGAW(unit->hw_cap), + DMAR_CAP_MGAW(unit->hw_cap), DMAR_CAP_FRO(unit->hw_cap), + DMAR_CAP_NFR(unit->hw_cap), DMAR_CAP_SPS(unit->hw_cap)); + if ((unit->hw_cap & DMAR_CAP_PSI) != 0) + printf(", mamv=%d", DMAR_CAP_MAMV(unit->hw_cap)); + printf("\n"); + ecaphi = unit->hw_ecap >> 32; + device_printf(dev, "ecap=%b,", (u_int)unit->hw_ecap, + "\020\001C\002QI\003DI\004IR\005EIM\007PT\010SC"); + printf("%b, ", ecaphi, "\020"); + printf("mhmw=%d, iro=%d\n", DMAR_ECAP_MHMV(unit->hw_ecap), + DMAR_ECAP_IRO(unit->hw_ecap)); +} + +static int +dmar_attach(device_t dev) +{ + struct dmar_unit *unit; + ACPI_DMAR_HARDWARE_UNIT *dmaru; + int i, error; + + unit = device_get_softc(dev); + unit->dev = dev; + unit->unit = device_get_unit(dev); + dmaru = dmar_find_by_index(unit->unit); + if (dmaru == NULL) + return (EINVAL); + unit->segment = dmaru->Segment; + unit->base = dmaru->Address; + unit->reg_rid = DMAR_REG_RID; + unit->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &unit->reg_rid, RF_ACTIVE); + if (unit->regs == NULL) { + device_printf(dev, "cannot allocate register window\n"); + return (ENOMEM); + } + unit->hw_ver = dmar_read4(unit, DMAR_VER_REG); + unit->hw_cap = dmar_read8(unit, DMAR_CAP_REG); + unit->hw_ecap = 
dmar_read8(unit, DMAR_ECAP_REG); + if (bootverbose) + dmar_print_caps(dev, unit, dmaru); + dmar_quirks_post_ident(unit); + + for (i = 0; i < DMAR_INTR_TOTAL; i++) + unit->intrs[i].irq = -1; + + unit->intrs[DMAR_INTR_FAULT].name = "fault"; + unit->intrs[DMAR_INTR_FAULT].irq_rid = DMAR_FAULT_IRQ_RID; + unit->intrs[DMAR_INTR_FAULT].handler = dmar_fault_intr; + unit->intrs[DMAR_INTR_FAULT].msi_data_reg = DMAR_FEDATA_REG; + unit->intrs[DMAR_INTR_FAULT].msi_addr_reg = DMAR_FEADDR_REG; + unit->intrs[DMAR_INTR_FAULT].msi_uaddr_reg = DMAR_FEUADDR_REG; + unit->intrs[DMAR_INTR_FAULT].enable_intr = dmar_enable_fault_intr; + unit->intrs[DMAR_INTR_FAULT].disable_intr = dmar_disable_fault_intr; + error = dmar_alloc_irq(dev, unit, DMAR_INTR_FAULT); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } + if (DMAR_HAS_QI(unit)) { + unit->intrs[DMAR_INTR_QI].name = "qi"; + unit->intrs[DMAR_INTR_QI].irq_rid = DMAR_QI_IRQ_RID; + unit->intrs[DMAR_INTR_QI].handler = dmar_qi_intr; + unit->intrs[DMAR_INTR_QI].msi_data_reg = DMAR_IEDATA_REG; + unit->intrs[DMAR_INTR_QI].msi_addr_reg = DMAR_IEADDR_REG; + unit->intrs[DMAR_INTR_QI].msi_uaddr_reg = DMAR_IEUADDR_REG; + unit->intrs[DMAR_INTR_QI].enable_intr = dmar_enable_qi_intr; + unit->intrs[DMAR_INTR_QI].disable_intr = dmar_disable_qi_intr; + error = dmar_alloc_irq(dev, unit, DMAR_INTR_QI); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } + } + + mtx_init(&unit->lock, "dmarhw", NULL, MTX_DEF); + unit->domids = new_unrhdr(0, dmar_nd2mask(DMAR_CAP_ND(unit->hw_cap)), + &unit->lock); + + /* + * 9.2 "Context Entry": + * When Caching Mode (CM) field is reported as Set, the + * domain-id value of zero is architecturally reserved. + * Software must not use domain-id value of zero + * when CM is Set. + */ + if ((unit->hw_cap & DMAR_CAP_CM) != 0) + alloc_unr_specific(unit->domids, 0); + + unit->ctx_obj = vm_pager_allocate(OBJT_PHYS, NULL, IDX_TO_OFF(1 + + DMAR_CTX_CNT), 0, 0, NULL); + + /* + * Allocate and load the root entry table pointer. Enable the + * address translation after the required invalidations are + * done. 
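+	 *
+	 * Page 0 of ctx_obj backs the root entry table itself; the
+	 * remaining DMAR_CTX_CNT pages are reserved for the context
+	 * tables that the root entries point to.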
+ */ + dmar_pgalloc(unit->ctx_obj, 0, DMAR_PGF_WAITOK | DMAR_PGF_ZERO); + DMAR_LOCK(unit); + error = dmar_load_root_entry_ptr(unit); + if (error != 0) { + DMAR_UNLOCK(unit); + dmar_release_resources(dev, unit); + return (error); + } + error = dmar_inv_ctx_glob(unit); + if (error != 0) { + DMAR_UNLOCK(unit); + dmar_release_resources(dev, unit); + return (error); + } + if ((unit->hw_ecap & DMAR_ECAP_DI) != 0) { + error = dmar_inv_iotlb_glob(unit); + if (error != 0) { + DMAR_UNLOCK(unit); + dmar_release_resources(dev, unit); + return (error); + } + } + + DMAR_UNLOCK(unit); + error = dmar_init_fault_log(unit); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } + error = dmar_init_qi(unit); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } + error = dmar_init_busdma(unit); + if (error != 0) { + dmar_release_resources(dev, unit); + return (error); + } + +#ifdef NOTYET + DMAR_LOCK(unit); + error = dmar_enable_translation(unit); + if (error != 0) { + DMAR_UNLOCK(unit); + dmar_release_resources(dev, unit); + return (error); + } + DMAR_UNLOCK(unit); +#endif + + return (0); +} + +static int +dmar_detach(device_t dev) +{ + + return (EBUSY); +} + +static int +dmar_suspend(device_t dev) +{ + + return (0); +} + +static int +dmar_resume(device_t dev) +{ + + /* XXXKIB */ + return (0); +} + +static device_method_t dmar_methods[] = { + DEVMETHOD(device_identify, dmar_identify), + DEVMETHOD(device_probe, dmar_probe), + DEVMETHOD(device_attach, dmar_attach), + DEVMETHOD(device_detach, dmar_detach), + DEVMETHOD(device_suspend, dmar_suspend), + DEVMETHOD(device_resume, dmar_resume), +#ifdef DEV_APIC + DEVMETHOD(bus_remap_intr, dmar_remap_intr), +#endif + DEVMETHOD_END +}; + +static driver_t dmar_driver = { + "dmar", + dmar_methods, + sizeof(struct dmar_unit), +}; + +DRIVER_MODULE(dmar, acpi, dmar_driver, dmar_devclass, 0, 0); +MODULE_DEPEND(dmar, acpi, 1, 1, 1); + +static void +dmar_print_path(device_t dev, const char *banner, int busno, int depth, + const ACPI_DMAR_PCI_PATH *path) +{ + int i; + + device_printf(dev, "%s [%d, ", banner, busno); + for (i = 0; i < depth; i++) { + if (i != 0) + printf(", "); + printf("(%d, %d)", path[i].Device, path[i].Function); + } + printf("]\n"); +} + +static int +dmar_dev_depth(device_t child) +{ + devclass_t pci_class; + device_t bus, pcib; + int depth; + + pci_class = devclass_find("pci"); + for (depth = 1; ; depth++) { + bus = device_get_parent(child); + pcib = device_get_parent(bus); + if (device_get_devclass(device_get_parent(pcib)) != + pci_class) + return (depth); + child = pcib; + } +} + +static void +dmar_dev_path(device_t child, int *busno, ACPI_DMAR_PCI_PATH *path, int depth) +{ + devclass_t pci_class; + device_t bus, pcib; + + pci_class = devclass_find("pci"); + for (depth--; depth != -1; depth--) { + path[depth].Device = pci_get_slot(child); + path[depth].Function = pci_get_function(child); + bus = device_get_parent(child); + pcib = device_get_parent(bus); + if (device_get_devclass(device_get_parent(pcib)) != + pci_class) { + /* reached a host bridge */ + *busno = pcib_get_bus(bus); + return; + } + child = pcib; + } + panic("wrong depth"); +} + +static int +dmar_match_pathes(int busno1, const ACPI_DMAR_PCI_PATH *path1, int depth1, + int busno2, const ACPI_DMAR_PCI_PATH *path2, int depth2, + enum AcpiDmarScopeType scope_type) +{ + int i, depth; + + if (busno1 != busno2) + return (0); + if (scope_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && depth1 != depth2) + return (0); + depth = depth1; + if (depth2 < depth) + 
depth = depth2; + for (i = 0; i < depth; i++) { + if (path1[i].Device != path2[i].Device || + path1[i].Function != path2[i].Function) + return (0); + } + return (1); +} + +static int +dmar_match_devscope(ACPI_DMAR_DEVICE_SCOPE *devscope, device_t dev, + int dev_busno, const ACPI_DMAR_PCI_PATH *dev_path, int dev_path_len) +{ + ACPI_DMAR_PCI_PATH *path; + int path_len; + + if (devscope->Length < sizeof(*devscope)) { + printf("dmar_find: corrupted DMAR table, dl %d\n", + devscope->Length); + return (-1); + } + if (devscope->EntryType != ACPI_DMAR_SCOPE_TYPE_ENDPOINT && + devscope->EntryType != ACPI_DMAR_SCOPE_TYPE_BRIDGE) + return (0); + path_len = devscope->Length - sizeof(*devscope); + if (path_len % 2 != 0) { + printf("dmar_find_bsf: corrupted DMAR table, dl %d\n", + devscope->Length); + return (-1); + } + path_len /= 2; + path = (ACPI_DMAR_PCI_PATH *)(devscope + 1); + if (path_len == 0) { + printf("dmar_find: corrupted DMAR table, dl %d\n", + devscope->Length); + return (-1); + } + if (dmar_match_verbose) + dmar_print_path(dev, "DMAR", devscope->Bus, path_len, path); + + return (dmar_match_pathes(devscope->Bus, path, path_len, dev_busno, + dev_path, dev_path_len, devscope->EntryType)); +} + +struct dmar_unit * +dmar_find(device_t dev) +{ + device_t dmar_dev; + ACPI_DMAR_HARDWARE_UNIT *dmarh; + ACPI_DMAR_DEVICE_SCOPE *devscope; + char *ptr, *ptrend; + int i, match, dev_domain, dev_busno, dev_path_len; + + dmar_dev = NULL; + dev_domain = pci_get_domain(dev); + dev_path_len = dmar_dev_depth(dev); + ACPI_DMAR_PCI_PATH dev_path[dev_path_len]; + dmar_dev_path(dev, &dev_busno, dev_path, dev_path_len); + if (dmar_match_verbose) + dmar_print_path(dev, "PCI", dev_busno, dev_path_len, dev_path); + + for (i = 0; i < dmar_devcnt; i++) { + if (dmar_devs[i] == NULL) + continue; + dmarh = dmar_find_by_index(i); + if (dmarh == NULL) + continue; + if (dmarh->Segment != dev_domain) + continue; + if ((dmarh->Flags & ACPI_DMAR_INCLUDE_ALL) != 0) { + dmar_dev = dmar_devs[i]; + if (dmar_match_verbose) { + device_printf(dev, + "pci%d:%d:%d:%d matched dmar%d INCLUDE_ALL\n", + dev_domain, pci_get_bus(dev), + pci_get_slot(dev), + pci_get_function(dev), + ((struct dmar_unit *)device_get_softc( + dmar_dev))->unit); + } + goto found; + } + ptr = (char *)dmarh + sizeof(*dmarh); + ptrend = (char *)dmarh + dmarh->Header.Length; + for (;;) { + if (ptr >= ptrend) + break; + devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr; + ptr += devscope->Length; + if (dmar_match_verbose) { + device_printf(dev, + "pci%d:%d:%d:%d matching dmar%d\n", + dev_domain, pci_get_bus(dev), + pci_get_slot(dev), + pci_get_function(dev), + ((struct dmar_unit *)device_get_softc( + dmar_devs[i]))->unit); + } + match = dmar_match_devscope(devscope, dev, dev_busno, + dev_path, dev_path_len); + if (dmar_match_verbose) { + if (match == -1) + printf("table error\n"); + else if (match == 0) + printf("not matched\n"); + else + printf("matched\n"); + } + if (match == -1) + return (NULL); + else if (match == 1) { + dmar_dev = dmar_devs[i]; + goto found; + } + } + } + return (NULL); +found: + return (device_get_softc(dmar_dev)); +} + +struct rmrr_iter_args { + struct dmar_ctx *ctx; + device_t dev; + int dev_domain; + int dev_busno; + ACPI_DMAR_PCI_PATH *dev_path; + int dev_path_len; + struct dmar_map_entries_tailq *rmrr_entries; +}; + +static int +dmar_rmrr_iter(ACPI_DMAR_HEADER *dmarh, void *arg) +{ + struct rmrr_iter_args *ria; + ACPI_DMAR_RESERVED_MEMORY *resmem; + ACPI_DMAR_DEVICE_SCOPE *devscope; + struct dmar_map_entry *entry; + char *ptr, *ptrend; + int match; + 
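+	/*
+	 * dmar_iterate_tbl() callback: for each RMRR entry in the
+	 * device's segment, compare the reported device scopes against
+	 * the device path and queue a map entry for every matching
+	 * range on ria->rmrr_entries.
+	 */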
+ if (dmarh->Type != ACPI_DMAR_TYPE_RESERVED_MEMORY) + return (1); + + ria = arg; + resmem = (ACPI_DMAR_RESERVED_MEMORY *)dmarh; + if (dmar_match_verbose) { + printf("RMRR [%jx,%jx] segment %d\n", + (uintmax_t)resmem->BaseAddress, + (uintmax_t)resmem->EndAddress, + resmem->Segment); + } + if (resmem->Segment != ria->dev_domain) + return (1); + + ptr = (char *)resmem + sizeof(*resmem); + ptrend = (char *)resmem + resmem->Header.Length; + for (;;) { + if (ptr >= ptrend) + break; + devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr; + ptr += devscope->Length; + match = dmar_match_devscope(devscope, ria->dev, ria->dev_busno, + ria->dev_path, ria->dev_path_len); + if (match == 1) { + if (dmar_match_verbose) + printf("matched\n"); + entry = dmar_gas_alloc_entry(ria->ctx, DMAR_PGF_WAITOK); + entry->start = resmem->BaseAddress; + /* The RMRR entry end address is inclusive. */ + entry->end = resmem->EndAddress; + TAILQ_INSERT_TAIL(ria->rmrr_entries, entry, + unroll_link); + } else if (dmar_match_verbose) { + printf("not matched, err %d\n", match); + } + } + + return (1); +} + +void +dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev, + struct dmar_map_entries_tailq *rmrr_entries) +{ + struct rmrr_iter_args ria; + + ria.dev_domain = pci_get_domain(dev); + ria.dev_path_len = dmar_dev_depth(dev); + ACPI_DMAR_PCI_PATH dev_path[ria.dev_path_len]; + dmar_dev_path(dev, &ria.dev_busno, dev_path, ria.dev_path_len); + + if (dmar_match_verbose) { + device_printf(dev, "parsing RMRR entries for "); + dmar_print_path(dev, "PCI", ria.dev_busno, ria.dev_path_len, + dev_path); + } + + ria.ctx = ctx; + ria.dev = dev; + ria.dev_path = dev_path; + ria.rmrr_entries = rmrr_entries; + dmar_iterate_tbl(dmar_rmrr_iter, &ria); +} + +struct inst_rmrr_iter_args { + struct dmar_unit *dmar; +}; + +static device_t +dmar_path_dev(int segment, int path_len, int busno, + const ACPI_DMAR_PCI_PATH *path) +{ + devclass_t pci_class; + device_t bus, pcib, dev; + int i; + + pci_class = devclass_find("pci"); + dev = NULL; + for (i = 0; i < path_len; i++, path++) { + dev = pci_find_dbsf(segment, busno, path->Device, + path->Function); + if (dev == NULL) + break; + if (i != path_len - 1) { + bus = device_get_parent(dev); + pcib = device_get_parent(bus); + if (device_get_devclass(device_get_parent(pcib)) != + pci_class) + return (NULL); + } + busno = pcib_get_bus(dev); + } + return (dev); +} + +static int +dmar_inst_rmrr_iter(ACPI_DMAR_HEADER *dmarh, void *arg) +{ + const ACPI_DMAR_RESERVED_MEMORY *resmem; + const ACPI_DMAR_DEVICE_SCOPE *devscope; + struct inst_rmrr_iter_args *iria; + const char *ptr, *ptrend; + struct dmar_unit *dev_dmar; + device_t dev; + + if (dmarh->Type != ACPI_DMAR_TYPE_RESERVED_MEMORY) + return (1); + + iria = arg; + resmem = (ACPI_DMAR_RESERVED_MEMORY *)dmarh; + if (resmem->Segment != iria->dmar->segment) + return (1); + if (dmar_match_verbose) { + printf("dmar%d: RMRR [%jx,%jx]\n", iria->dmar->unit, + (uintmax_t)resmem->BaseAddress, + (uintmax_t)resmem->EndAddress); + } + + ptr = (const char *)resmem + sizeof(*resmem); + ptrend = (const char *)resmem + resmem->Header.Length; + for (;;) { + if (ptr >= ptrend) + break; + devscope = (const ACPI_DMAR_DEVICE_SCOPE *)ptr; + ptr += devscope->Length; + /* XXXKIB bridge */ + if (devscope->EntryType != ACPI_DMAR_SCOPE_TYPE_ENDPOINT) + continue; + if (dmar_match_verbose) { + dmar_print_path(iria->dmar->dev, "RMRR scope", + devscope->Bus, (devscope->Length - + sizeof(ACPI_DMAR_DEVICE_SCOPE)) / 2, + (const ACPI_DMAR_PCI_PATH *)(devscope + 1)); + } + dev = 
dmar_path_dev(resmem->Segment, (devscope->Length - + sizeof(ACPI_DMAR_DEVICE_SCOPE)) / 2, devscope->Bus, + (const ACPI_DMAR_PCI_PATH *)(devscope + 1)); + if (dev == NULL) { + if (dmar_match_verbose) + printf("null dev\n"); + continue; + } + dev_dmar = dmar_find(dev); + if (dev_dmar != iria->dmar) { + if (dmar_match_verbose) { + printf("dmar%d matched, skipping\n", + dev_dmar->unit); + } + continue; + } + if (dmar_match_verbose) + printf("matched, instantiating RMRR context\n"); + dmar_instantiate_ctx(iria->dmar, dev, true); + } + + return (1); + +} + +/* + * Pre-create all contexts for the DMAR which have RMRR entries. + */ +int +dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar) +{ + struct inst_rmrr_iter_args iria; + int error; + + if (!dmar_barrier_enter(dmar, DMAR_BARRIER_RMRR)) + return (0); + + error = 0; + iria.dmar = dmar; + if (dmar_match_verbose) + printf("dmar%d: instantiating RMRR contexts\n", dmar->unit); + dmar_iterate_tbl(dmar_inst_rmrr_iter, &iria); + DMAR_LOCK(dmar); + if (!LIST_EMPTY(&dmar->contexts)) { + KASSERT((dmar->hw_gcmd & DMAR_GCMD_TE) == 0, + ("dmar%d: RMRR not handled but translation is already enabled", + dmar->unit)); + error = dmar_enable_translation(dmar); + } + dmar_barrier_exit(dmar, DMAR_BARRIER_RMRR); + return (error); +} + +#ifdef DDB +#include <ddb/ddb.h> +#include <ddb/db_lex.h> + +static void +dmar_print_ctx_entry(const struct dmar_map_entry *entry) +{ + struct dmar_map_entry *l, *r; + + db_printf( + " start %jx end %jx free_after %jx free_down %jx flags %x ", + entry->start, entry->end, entry->free_after, entry->free_down, + entry->flags); + db_printf("left "); + l = RB_LEFT(entry, rb_entry); + if (l == NULL) + db_printf("NULL "); + else + db_printf("%jx ", l->start); + db_printf("right "); + r = RB_RIGHT(entry, rb_entry); + if (r == NULL) + db_printf("NULL"); + else + db_printf("%jx", r->start); + db_printf("\n"); +} + +static void +dmar_print_ctx(struct dmar_ctx *ctx, bool show_mappings) +{ + struct dmar_map_entry *entry; + + db_printf( + " @%p pci%d:%d:%d dom %d mgaw %d agaw %d pglvl %d end %jx\n" + " refs %d flags %x pgobj %p map_ents %u loads %lu unloads %lu\n", + ctx, ctx->bus, ctx->slot, ctx->func, ctx->domain, ctx->mgaw, + ctx->agaw, ctx->pglvl, (uintmax_t)ctx->end, ctx->refs, + ctx->flags, ctx->pgtbl_obj, ctx->entries_cnt, ctx->loads, + ctx->unloads); + if (!show_mappings) + return; + db_printf(" mapped:\n"); + RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) { + dmar_print_ctx_entry(entry); + if (db_pager_quit) + break; + } + if (db_pager_quit) + return; + db_printf(" unloading:\n"); + TAILQ_FOREACH(entry, &ctx->unload_entries, dmamap_link) { + dmar_print_ctx_entry(entry); + if (db_pager_quit) + break; + } +} + +DB_FUNC(dmar_ctx, db_dmar_print_ctx, db_show_table, CS_OWN, NULL) +{ + struct dmar_unit *unit; + struct dmar_ctx *ctx; + bool show_mappings, valid; + int domain, bus, device, function, i, t; + db_expr_t radix; + + valid = false; + radix = db_radix; + db_radix = 10; + t = db_read_token(); + if (t == tSLASH) { + t = db_read_token(); + if (t != tIDENT) { + db_printf("Bad modifier\n"); + db_radix = radix; + db_skip_to_eol(); + return; + } + show_mappings = strchr(db_tok_string, 'm') != NULL; + t = db_read_token(); + } else { + show_mappings = false; + } + if (t == tNUMBER) { + domain = db_tok_number; + t = db_read_token(); + if (t == tNUMBER) { + bus = db_tok_number; + t = db_read_token(); + if (t == tNUMBER) { + device = db_tok_number; + t = db_read_token(); + if (t == tNUMBER) { + function = db_tok_number; + valid = true; + } 
+ } + } + } + db_radix = radix; + db_skip_to_eol(); + if (!valid) { + db_printf("usage: show dmar_ctx [/m] " + "<domain> <bus> <device> <func>\n"); + return; + } + for (i = 0; i < dmar_devcnt; i++) { + unit = device_get_softc(dmar_devs[i]); + LIST_FOREACH(ctx, &unit->contexts, link) { + if (domain == unit->segment && bus == ctx->bus && + device == ctx->slot && function == ctx->func) { + dmar_print_ctx(ctx, show_mappings); + goto out; + } + } + } +out:; +} + +static void +dmar_print_one(int idx, bool show_ctxs, bool show_mappings) +{ + struct dmar_unit *unit; + struct dmar_ctx *ctx; + int i, frir; + + unit = device_get_softc(dmar_devs[idx]); + db_printf("dmar%d at %p, root at 0x%jx, ver 0x%x\n", unit->unit, unit, + dmar_read8(unit, DMAR_RTADDR_REG), dmar_read4(unit, DMAR_VER_REG)); + db_printf("cap 0x%jx ecap 0x%jx gsts 0x%x fsts 0x%x fectl 0x%x\n", + (uintmax_t)dmar_read8(unit, DMAR_CAP_REG), + (uintmax_t)dmar_read8(unit, DMAR_ECAP_REG), + dmar_read4(unit, DMAR_GSTS_REG), + dmar_read4(unit, DMAR_FSTS_REG), + dmar_read4(unit, DMAR_FECTL_REG)); + db_printf("fed 0x%x fea 0x%x feua 0x%x\n", + dmar_read4(unit, DMAR_FEDATA_REG), + dmar_read4(unit, DMAR_FEADDR_REG), + dmar_read4(unit, DMAR_FEUADDR_REG)); + db_printf("primary fault log:\n"); + for (i = 0; i < DMAR_CAP_NFR(unit->hw_cap); i++) { + frir = (DMAR_CAP_FRO(unit->hw_cap) + i) * 16; + db_printf(" %d at 0x%x: %jx %jx\n", i, frir, + (uintmax_t)dmar_read8(unit, frir), + (uintmax_t)dmar_read8(unit, frir + 8)); + } + if (DMAR_HAS_QI(unit)) { + db_printf("ied 0x%x iea 0x%x ieua 0x%x\n", + dmar_read4(unit, DMAR_IEDATA_REG), + dmar_read4(unit, DMAR_IEADDR_REG), + dmar_read4(unit, DMAR_IEUADDR_REG)); + if (unit->qi_enabled) { + db_printf("qi is enabled: queue @0x%jx (IQA 0x%jx) " + "size 0x%jx\n" + " head 0x%x tail 0x%x avail 0x%x status 0x%x ctrl 0x%x\n" + " hw compl 0x%x@%p/phys@%jx next seq 0x%x gen 0x%x\n", + (uintmax_t)unit->inv_queue, + (uintmax_t)dmar_read8(unit, DMAR_IQA_REG), + (uintmax_t)unit->inv_queue_size, + dmar_read4(unit, DMAR_IQH_REG), + dmar_read4(unit, DMAR_IQT_REG), + unit->inv_queue_avail, + dmar_read4(unit, DMAR_ICS_REG), + dmar_read4(unit, DMAR_IECTL_REG), + unit->inv_waitd_seq_hw, + &unit->inv_waitd_seq_hw, + (uintmax_t)unit->inv_waitd_seq_hw_phys, + unit->inv_waitd_seq, + unit->inv_waitd_gen); + } else { + db_printf("qi is disabled\n"); + } + } + if (show_ctxs) { + db_printf("contexts:\n"); + LIST_FOREACH(ctx, &unit->contexts, link) { + dmar_print_ctx(ctx, show_mappings); + if (db_pager_quit) + break; + } + } +} + +DB_SHOW_COMMAND(dmar, db_dmar_print) +{ + bool show_ctxs, show_mappings; + + show_ctxs = strchr(modif, 'c') != NULL; + show_mappings = strchr(modif, 'm') != NULL; + if (!have_addr) { + db_printf("usage: show dmar [/c] [/m] index\n"); + return; + } + dmar_print_one((int)addr, show_ctxs, show_mappings); +} + +DB_SHOW_ALL_COMMAND(dmars, db_show_all_dmars) +{ + int i; + bool show_ctxs, show_mappings; + + show_ctxs = strchr(modif, 'c') != NULL; + show_mappings = strchr(modif, 'm') != NULL; + + for (i = 0; i < dmar_devcnt; i++) { + dmar_print_one(i, show_ctxs, show_mappings); + if (db_pager_quit) + break; + } +} +#endif diff --git a/sys/x86/iommu/intel_fault.c b/sys/x86/iommu/intel_fault.c new file mode 100644 index 0000000..18f8fef --- /dev/null +++ b/sys/x86/iommu/intel_fault.c @@ -0,0 +1,315 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <machine/bus.h> +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +/* + * Fault interrupt handling for DMARs. If advanced fault logging is + * not implemented by hardware, the code emulates it. Fast interrupt + * handler flushes the fault registers into circular buffer at + * unit->fault_log, and schedules a task. + * + * The fast handler is used since faults usually come in bursts, and + * number of fault log registers is limited, e.g. down to one for 5400 + * MCH. We are trying to reduce the latency for clearing the fault + * register file. The task is usually long-running, since printf() is + * slow, but this is not problematic because bursts are rare. + * + * For the same reason, each translation unit task is executed in its + * own thread. + * + * XXXKIB It seems there is no hardware available which implements + * advanced fault logging, so the code to handle AFL is not written. 
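+ *
+ * The fault log is a flat array of 64-bit words: each record
+ * occupies two words, so the head and tail indices advance in steps
+ * of two and wrap at fault_log_size, leaving room for
+ * fault_log_size / 2 records.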
+ */ + +static int +dmar_fault_next(struct dmar_unit *unit, int faultp) +{ + + faultp += 2; + if (faultp == unit->fault_log_size) + faultp = 0; + return (faultp); +} + +static void +dmar_fault_intr_clear(struct dmar_unit *unit, uint32_t fsts) +{ + uint32_t clear; + + clear = 0; + if ((fsts & DMAR_FSTS_ITE) != 0) { + printf("DMAR%d: Invalidation timed out\n", unit->unit); + clear |= DMAR_FSTS_ITE; + } + if ((fsts & DMAR_FSTS_ICE) != 0) { + printf("DMAR%d: Invalidation completion error\n", + unit->unit); + clear |= DMAR_FSTS_ICE; + } + if ((fsts & DMAR_FSTS_IQE) != 0) { + printf("DMAR%d: Invalidation queue error\n", + unit->unit); + clear |= DMAR_FSTS_IQE; + } + if ((fsts & DMAR_FSTS_APF) != 0) { + printf("DMAR%d: Advanced pending fault\n", unit->unit); + clear |= DMAR_FSTS_APF; + } + if ((fsts & DMAR_FSTS_AFO) != 0) { + printf("DMAR%d: Advanced fault overflow\n", unit->unit); + clear |= DMAR_FSTS_AFO; + } + if (clear != 0) + dmar_write4(unit, DMAR_FSTS_REG, clear); +} + +int +dmar_fault_intr(void *arg) +{ + struct dmar_unit *unit; + uint64_t fault_rec[2]; + uint32_t fsts; + int fri, frir, faultp; + bool enqueue; + + unit = arg; + enqueue = false; + fsts = dmar_read4(unit, DMAR_FSTS_REG); + dmar_fault_intr_clear(unit, fsts); + + if ((fsts & DMAR_FSTS_PPF) == 0) + goto done; + + fri = DMAR_FSTS_FRI(fsts); + for (;;) { + frir = (DMAR_CAP_FRO(unit->hw_cap) + fri) * 16; + fault_rec[1] = dmar_read8(unit, frir + 8); + if ((fault_rec[1] & DMAR_FRCD2_F) == 0) + break; + fault_rec[0] = dmar_read8(unit, frir); + dmar_write4(unit, frir + 12, DMAR_FRCD2_F32); + DMAR_FAULT_LOCK(unit); + faultp = unit->fault_log_head; + if (dmar_fault_next(unit, faultp) == unit->fault_log_tail) { + /* XXXKIB log overflow */ + } else { + unit->fault_log[faultp] = fault_rec[0]; + unit->fault_log[faultp + 1] = fault_rec[1]; + unit->fault_log_head = dmar_fault_next(unit, faultp); + enqueue = true; + } + DMAR_FAULT_UNLOCK(unit); + fri += 1; + if (fri >= DMAR_CAP_NFR(unit->hw_cap)) + fri = 0; + } + +done: + /* + * On SandyBridge, due to errata BJ124, IvyBridge errata + * BV100, and Haswell errata HSD40, "Spurious Intel VT-d + * Interrupts May Occur When the PFO Bit is Set". Handle the + * cases by clearing overflow bit even if no fault is + * reported. + * + * On IvyBridge, errata BV30 states that clearing clear + * DMAR_FRCD2_F bit in the fault register causes spurious + * interrupt. Do nothing. 
+ * + */ + if ((fsts & DMAR_FSTS_PFO) != 0) { + printf("DMAR%d: Fault Overflow\n", unit->unit); + dmar_write4(unit, DMAR_FSTS_REG, DMAR_FSTS_PFO); + } + + if (enqueue) { + taskqueue_enqueue_fast(unit->fault_taskqueue, + &unit->fault_task); + } + return (FILTER_HANDLED); +} + +static void +dmar_fault_task(void *arg, int pending __unused) +{ + struct dmar_unit *unit; + struct dmar_ctx *ctx; + uint64_t fault_rec[2]; + int sid, bus, slot, func, faultp; + + unit = arg; + DMAR_FAULT_LOCK(unit); + for (;;) { + faultp = unit->fault_log_tail; + if (faultp == unit->fault_log_head) + break; + + fault_rec[0] = unit->fault_log[faultp]; + fault_rec[1] = unit->fault_log[faultp + 1]; + unit->fault_log_tail = dmar_fault_next(unit, faultp); + DMAR_FAULT_UNLOCK(unit); + + sid = DMAR_FRCD2_SID(fault_rec[1]); + bus = (sid >> 8) & 0xf; + slot = (sid >> 3) & 0x1f; + func = sid & 0x7; + printf("DMAR%d: ", unit->unit); + DMAR_LOCK(unit); + ctx = dmar_find_ctx_locked(unit, bus, slot, func); + if (ctx == NULL) { + printf("<unknown dev>:"); + } else { + ctx->flags |= DMAR_CTX_FAULTED; + ctx->last_fault_rec[0] = fault_rec[0]; + ctx->last_fault_rec[1] = fault_rec[1]; + device_print_prettyname(ctx->ctx_tag.owner); + } + DMAR_UNLOCK(unit); + printf( + "pci%d:%d:%d fault acc %x adt 0x%x reason 0x%x addr %jx\n", + bus, slot, func, DMAR_FRCD2_T(fault_rec[1]), + DMAR_FRCD2_AT(fault_rec[1]), DMAR_FRCD2_FR(fault_rec[1]), + (uintmax_t)fault_rec[0]); + DMAR_FAULT_LOCK(unit); + } + DMAR_FAULT_UNLOCK(unit); +} + +static void +dmar_clear_faults(struct dmar_unit *unit) +{ + uint32_t frec, frir, fsts; + int i; + + for (i = 0; i < DMAR_CAP_NFR(unit->hw_cap); i++) { + frir = (DMAR_CAP_FRO(unit->hw_cap) + i) * 16; + frec = dmar_read4(unit, frir + 12); + if ((frec & DMAR_FRCD2_F32) == 0) + continue; + dmar_write4(unit, frir + 12, DMAR_FRCD2_F32); + } + fsts = dmar_read4(unit, DMAR_FSTS_REG); + dmar_write4(unit, DMAR_FSTS_REG, fsts); +} + +int +dmar_init_fault_log(struct dmar_unit *unit) +{ + + mtx_init(&unit->fault_lock, "dmarflt", NULL, MTX_SPIN); + unit->fault_log_size = 256; /* 128 fault log entries */ + TUNABLE_INT_FETCH("hw.dmar.fault_log_size", &unit->fault_log_size); + if (unit->fault_log_size % 2 != 0) + panic("hw.dmar_fault_log_size must be even"); + unit->fault_log = malloc(sizeof(uint64_t) * unit->fault_log_size, + M_DEVBUF, M_WAITOK | M_ZERO); + + TASK_INIT(&unit->fault_task, 0, dmar_fault_task, unit); + unit->fault_taskqueue = taskqueue_create_fast("dmar", M_WAITOK, + taskqueue_thread_enqueue, &unit->fault_taskqueue); + taskqueue_start_threads(&unit->fault_taskqueue, 1, PI_AV, + "dmar%d fault taskq", unit->unit); + + DMAR_LOCK(unit); + dmar_disable_fault_intr(unit); + dmar_clear_faults(unit); + dmar_enable_fault_intr(unit); + DMAR_UNLOCK(unit); + + return (0); +} + +void +dmar_fini_fault_log(struct dmar_unit *unit) +{ + + DMAR_LOCK(unit); + dmar_disable_fault_intr(unit); + DMAR_UNLOCK(unit); + + if (unit->fault_taskqueue == NULL) + return; + + taskqueue_drain(unit->fault_taskqueue, &unit->fault_task); + taskqueue_free(unit->fault_taskqueue); + unit->fault_taskqueue = NULL; + mtx_destroy(&unit->fault_lock); + + free(unit->fault_log, M_DEVBUF); + unit->fault_log = NULL; + unit->fault_log_head = unit->fault_log_tail = 0; +} + +void +dmar_enable_fault_intr(struct dmar_unit *unit) +{ + uint32_t fectl; + + DMAR_ASSERT_LOCKED(unit); + fectl = dmar_read4(unit, DMAR_FECTL_REG); + fectl &= ~DMAR_FECTL_IM; + dmar_write4(unit, DMAR_FECTL_REG, fectl); +} + +void +dmar_disable_fault_intr(struct dmar_unit *unit) +{ + uint32_t fectl; + + 
DMAR_ASSERT_LOCKED(unit); + fectl = dmar_read4(unit, DMAR_FECTL_REG); + dmar_write4(unit, DMAR_FECTL_REG, fectl | DMAR_FECTL_IM); +} diff --git a/sys/x86/iommu/intel_gas.c b/sys/x86/iommu/intel_gas.c new file mode 100644 index 0000000..51ad151 --- /dev/null +++ b/sys/x86/iommu/intel_gas.c @@ -0,0 +1,724 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#define RB_AUGMENT(entry) dmar_gas_augment_entry(entry) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/memdesc.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <sys/uio.h> +#include <dev/pci/pcivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +/* + * Guest Address Space management. + */ + +static uma_zone_t dmar_map_entry_zone; + +static void +intel_gas_init(void) +{ + + dmar_map_entry_zone = uma_zcreate("DMAR_MAP_ENTRY", + sizeof(struct dmar_map_entry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); +} +SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL); + +struct dmar_map_entry * +dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags) +{ + struct dmar_map_entry *res; + + KASSERT((flags & ~(DMAR_PGF_WAITOK)) == 0, + ("unsupported flags %x", flags)); + + res = uma_zalloc(dmar_map_entry_zone, ((flags & DMAR_PGF_WAITOK) != + 0 ? 
M_WAITOK : M_NOWAIT) | M_ZERO); + if (res != NULL) { + res->ctx = ctx; + atomic_add_int(&ctx->entries_cnt, 1); + } + return (res); +} + +void +dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + + KASSERT(ctx == entry->ctx, + ("mismatched free ctx %p entry %p entry->ctx %p", ctx, + entry, entry->ctx)); + atomic_subtract_int(&ctx->entries_cnt, 1); + uma_zfree(dmar_map_entry_zone, entry); +} + +static int +dmar_gas_cmp_entries(struct dmar_map_entry *a, struct dmar_map_entry *b) +{ + + /* Last entry have zero size, so <= */ + KASSERT(a->start <= a->end, ("inverted entry %p (%jx, %jx)", + a, (uintmax_t)a->start, (uintmax_t)a->end)); + KASSERT(b->start <= b->end, ("inverted entry %p (%jx, %jx)", + b, (uintmax_t)b->start, (uintmax_t)b->end)); + KASSERT(a->end <= b->start || b->end <= a->start || + a->end == a->start || b->end == b->start, + ("overlapping entries %p (%jx, %jx) %p (%jx, %jx)", + a, (uintmax_t)a->start, (uintmax_t)a->end, + b, (uintmax_t)b->start, (uintmax_t)b->end)); + + if (a->end < b->end) + return (-1); + else if (b->end < a->end) + return (1); + return (0); +} + +static void +dmar_gas_augment_entry(struct dmar_map_entry *entry) +{ + struct dmar_map_entry *l, *r; + + for (; entry != NULL; entry = RB_PARENT(entry, rb_entry)) { + l = RB_LEFT(entry, rb_entry); + r = RB_RIGHT(entry, rb_entry); + if (l == NULL && r == NULL) { + entry->free_down = entry->free_after; + } else if (l == NULL && r != NULL) { + entry->free_down = MAX(entry->free_after, r->free_down); + } else if (/*l != NULL && */ r == NULL) { + entry->free_down = MAX(entry->free_after, l->free_down); + } else /* if (l != NULL && r != NULL) */ { + entry->free_down = MAX(entry->free_after, l->free_down); + entry->free_down = MAX(entry->free_down, r->free_down); + } + } +} + +RB_GENERATE(dmar_gas_entries_tree, dmar_map_entry, rb_entry, + dmar_gas_cmp_entries); + +static void +dmar_gas_fix_free(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + struct dmar_map_entry *next; + + next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + entry->free_after = (next != NULL ? 
next->start : ctx->end) - + entry->end; + dmar_gas_augment_entry(entry); +} + +#ifdef INVARIANTS +static void +dmar_gas_check_free(struct dmar_ctx *ctx) +{ + struct dmar_map_entry *entry, *next, *l, *r; + dmar_gaddr_t v; + + RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) { + KASSERT(ctx == entry->ctx, + ("mismatched free ctx %p entry %p entry->ctx %p", ctx, + entry, entry->ctx)); + next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + if (next == NULL) { + MPASS(entry->free_after == ctx->end - entry->end); + } else { + MPASS(entry->free_after = next->start - entry->end); + MPASS(entry->end <= next->start); + } + l = RB_LEFT(entry, rb_entry); + r = RB_RIGHT(entry, rb_entry); + if (l == NULL && r == NULL) { + MPASS(entry->free_down == entry->free_after); + } else if (l == NULL && r != NULL) { + MPASS(entry->free_down = MAX(entry->free_after, + r->free_down)); + } else if (r == NULL) { + MPASS(entry->free_down = MAX(entry->free_after, + l->free_down)); + } else { + v = MAX(entry->free_after, l->free_down); + v = MAX(entry->free_down, r->free_down); + MPASS(entry->free_down == v); + } + } +} +#endif + +static bool +dmar_gas_rb_insert(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + struct dmar_map_entry *prev, *found; + + found = RB_INSERT(dmar_gas_entries_tree, &ctx->rb_root, entry); + dmar_gas_fix_free(ctx, entry); + prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); + if (prev != NULL) + dmar_gas_fix_free(ctx, prev); + return (found == NULL); +} + +static void +dmar_gas_rb_remove(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + struct dmar_map_entry *prev; + + prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); + RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); + if (prev != NULL) + dmar_gas_fix_free(ctx, prev); +} + +void +dmar_gas_init_ctx(struct dmar_ctx *ctx) +{ + struct dmar_map_entry *begin, *end; + + begin = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); + end = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); + + DMAR_CTX_LOCK(ctx); + KASSERT(ctx->entries_cnt == 2, ("dirty ctx %p", ctx)); + KASSERT(RB_EMPTY(&ctx->rb_root), ("non-empty entries %p", ctx)); + + begin->start = 0; + begin->end = DMAR_PAGE_SIZE; + begin->free_after = ctx->end - begin->end; + begin->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED; + dmar_gas_rb_insert(ctx, begin); + + end->start = ctx->end; + end->end = ctx->end; + end->free_after = 0; + end->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED; + dmar_gas_rb_insert(ctx, end); + + ctx->first_place = begin; + ctx->last_place = end; + DMAR_CTX_UNLOCK(ctx); +} + +void +dmar_gas_fini_ctx(struct dmar_ctx *ctx) +{ + struct dmar_map_entry *entry, *entry1; + + DMAR_CTX_ASSERT_LOCKED(ctx); + KASSERT(ctx->entries_cnt == 2, ("ctx still in use %p", ctx)); + + entry = RB_MIN(dmar_gas_entries_tree, &ctx->rb_root); + KASSERT(entry->start == 0, ("start entry start %p", ctx)); + KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", ctx)); + KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE, + ("start entry flags %p", ctx)); + RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); + dmar_gas_free_entry(ctx, entry); + + entry = RB_MAX(dmar_gas_entries_tree, &ctx->rb_root); + KASSERT(entry->start == ctx->end, ("end entry start %p", ctx)); + KASSERT(entry->end == ctx->end, ("end entry end %p", ctx)); + KASSERT(entry->free_after == 0, ("end entry free_after%p", ctx)); + KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE, + ("end entry flags %p", ctx)); + RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); + 
dmar_gas_free_entry(ctx, entry); + + RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &ctx->rb_root, entry1) { + KASSERT((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0, + ("non-RMRR entry left %p", ctx)); + RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry); + dmar_gas_free_entry(ctx, entry); + } +} + +struct dmar_gas_match_args { + struct dmar_ctx *ctx; + dmar_gaddr_t size; + const struct bus_dma_tag_common *common; + u_int gas_flags; + struct dmar_map_entry *entry; +}; + +static bool +dmar_gas_match_one(struct dmar_gas_match_args *a, struct dmar_map_entry *prev, + dmar_gaddr_t end) +{ + dmar_gaddr_t bs, start; + + if (a->entry->start + a->size > end) + return (false); + + /* DMAR_PAGE_SIZE to create gap after new entry. */ + if (a->entry->start < prev->end + DMAR_PAGE_SIZE || + a->entry->start + a->size + DMAR_PAGE_SIZE > prev->end + + prev->free_after) + return (false); + + /* No boundary crossing. */ + if (dmar_test_boundary(a->entry->start, a->size, a->common->boundary)) + return (true); + + /* + * The start to start + size region crosses the boundary. + * Check if there is enough space after the next boundary + * after the prev->end. + */ + bs = (a->entry->start + a->common->boundary) & ~(a->common->boundary + - 1); + start = roundup2(bs, a->common->alignment); + /* DMAR_PAGE_SIZE to create gap after new entry. */ + if (start + a->size + DMAR_PAGE_SIZE <= prev->end + prev->free_after && + start + a->size <= end) { + a->entry->start = start; + return (true); + } + + /* + * Not enough space to align at boundary, but allowed to split. + * We already checked that start + size does not overlap end. + * + * XXXKIB. It is possible that bs is exactly at the start of + * the next entry, then we do not have gap. Ignore for now. + */ + if ((a->gas_flags & DMAR_GM_CANSPLIT) != 0) { + a->size = bs - a->entry->start; + return (true); + } + + return (false); +} + +static void +dmar_gas_match_insert(struct dmar_gas_match_args *a, + struct dmar_map_entry *prev) +{ + struct dmar_map_entry *next; + bool found; + + /* + * The prev->end is always aligned on the page size, which + * causes page alignment for the entry->start too. The size + * is checked to be multiple of the page size. + * + * The page sized gap is created between consequent + * allocations to ensure that out-of-bounds accesses fault. 
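+	 *
+	 * Linking the entry via dmar_gas_rb_insert() below recomputes
+	 * free_after for the new entry and its predecessor, which keeps
+	 * the free_down augmentation used by the allocation search
+	 * consistent.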
+ */ + a->entry->end = a->entry->start + a->size; + + next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev); + KASSERT(next->start >= a->entry->end && + next->start - a->entry->start >= a->size, + ("dmar_gas_match_insert hole failed %p prev (%jx, %jx) " + "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->ctx, + (uintmax_t)prev->start, (uintmax_t)prev->end, + (uintmax_t)prev->free_after, + (uintmax_t)next->start, (uintmax_t)next->end, + (uintmax_t)a->entry->start, (uintmax_t)a->entry->end)); + + prev->free_after = a->entry->start - prev->end; + a->entry->free_after = next->start - a->entry->end; + + found = dmar_gas_rb_insert(a->ctx, a->entry); + KASSERT(found, ("found dup %p start %jx size %jx", + a->ctx, (uintmax_t)a->entry->start, (uintmax_t)a->size)); + a->entry->flags = DMAR_MAP_ENTRY_MAP; + + KASSERT(RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, + a->entry) == prev, + ("entry %p prev %p inserted prev %p", a->entry, prev, + RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry))); + KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, + a->entry) == next, + ("entry %p next %p inserted next %p", a->entry, next, + RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry))); +} + +static int +dmar_gas_lowermatch(struct dmar_gas_match_args *a, struct dmar_map_entry *prev) +{ + struct dmar_map_entry *l; + int ret; + + if (prev->end < a->common->lowaddr) { + a->entry->start = roundup2(prev->end + DMAR_PAGE_SIZE, + a->common->alignment); + if (dmar_gas_match_one(a, prev, a->common->lowaddr)) { + dmar_gas_match_insert(a, prev); + return (0); + } + } + if (prev->free_down < a->size + DMAR_PAGE_SIZE) + return (ENOMEM); + l = RB_LEFT(prev, rb_entry); + if (l != NULL) { + ret = dmar_gas_lowermatch(a, l); + if (ret == 0) + return (0); + } + l = RB_RIGHT(prev, rb_entry); + if (l != NULL) + return (dmar_gas_lowermatch(a, l)); + return (ENOMEM); +} + +static int +dmar_gas_uppermatch(struct dmar_gas_match_args *a) +{ + struct dmar_map_entry *next, *prev, find_entry; + + find_entry.start = a->common->highaddr; + next = RB_NFIND(dmar_gas_entries_tree, &a->ctx->rb_root, &find_entry); + if (next == NULL) + return (ENOMEM); + prev = RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, next); + KASSERT(prev != NULL, ("no prev %p %jx", a->ctx, + (uintmax_t)find_entry.start)); + for (;;) { + a->entry->start = prev->start + DMAR_PAGE_SIZE; + if (a->entry->start < a->common->highaddr) + a->entry->start = a->common->highaddr; + a->entry->start = roundup2(a->entry->start, + a->common->alignment); + if (dmar_gas_match_one(a, prev, a->ctx->end)) { + dmar_gas_match_insert(a, prev); + return (0); + } + + /* + * XXXKIB. This falls back to linear iteration over + * the free space in the high region. But high + * regions are almost unused, the code should be + * enough to cover the case, although in the + * non-optimal way. 
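+		 *
+		 * The walk stops with ENOMEM once the next entry reaches
+		 * ctx->end, i.e. when it hits the last_place sentinel.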
+ */ + prev = next; + next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev); + KASSERT(next != NULL, ("no next %p %jx", a->ctx, + (uintmax_t)find_entry.start)); + if (next->end >= a->ctx->end) + return (ENOMEM); + } +} + +static int +dmar_gas_find_space(struct dmar_ctx *ctx, + const struct bus_dma_tag_common *common, dmar_gaddr_t size, + u_int flags, struct dmar_map_entry *entry) +{ + struct dmar_gas_match_args a; + int error; + + DMAR_CTX_ASSERT_LOCKED(ctx); + KASSERT(entry->flags == 0, ("dirty entry %p %p", ctx, entry)); + KASSERT((size & DMAR_PAGE_MASK) == 0, ("size %jx", (uintmax_t)size)); + + a.ctx = ctx; + a.size = size; + a.common = common; + a.gas_flags = flags; + a.entry = entry; + + /* Handle lower region. */ + if (common->lowaddr > 0) { + error = dmar_gas_lowermatch(&a, RB_ROOT(&ctx->rb_root)); + if (error == 0) + return (0); + KASSERT(error == ENOMEM, + ("error %d from dmar_gas_lowermatch", error)); + } + /* Handle upper region. */ + if (common->highaddr >= ctx->end) + return (ENOMEM); + error = dmar_gas_uppermatch(&a); + KASSERT(error == ENOMEM, + ("error %d from dmar_gas_uppermatch", error)); + return (error); +} + +static int +dmar_gas_alloc_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, + u_int flags) +{ + struct dmar_map_entry *next, *prev; + bool found; + + DMAR_CTX_ASSERT_LOCKED(ctx); + + if ((entry->start & DMAR_PAGE_MASK) != 0 || + (entry->end & DMAR_PAGE_MASK) != 0) + return (EINVAL); + if (entry->start >= entry->end) + return (EINVAL); + if (entry->end >= ctx->end) + return (EINVAL); + + next = RB_NFIND(dmar_gas_entries_tree, &ctx->rb_root, entry); + KASSERT(next != NULL, ("next must be non-null %p %jx", ctx, + (uintmax_t)entry->start)); + prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, next); + /* prev could be NULL */ + + /* + * Adapt to broken BIOSes which specify overlapping RMRR + * entries. + * + * XXXKIB: this does not handle a case when prev or next + * entries are completely covered by the current one, which + * extends both ways. + */ + if (prev != NULL && prev->end > entry->start && + (prev->flags & DMAR_MAP_ENTRY_PLACE) == 0) { + if ((prev->flags & DMAR_MAP_ENTRY_RMRR) == 0) + return (EBUSY); + entry->start = prev->end; + } + if (next != NULL && next->start < entry->end && + (next->flags & DMAR_MAP_ENTRY_PLACE) == 0) { + if ((next->flags & DMAR_MAP_ENTRY_RMRR) == 0) + return (EBUSY); + entry->end = next->start; + } + if (entry->end == entry->start) + return (0); + + if (prev != NULL && prev->end > entry->start) { + /* This assumes that prev is the placeholder entry. */ + dmar_gas_rb_remove(ctx, prev); + prev = NULL; + } + if (next != NULL && next->start < entry->end) { + dmar_gas_rb_remove(ctx, next); + next = NULL; + } + + found = dmar_gas_rb_insert(ctx, entry); + KASSERT(found, ("found RMRR dup %p start %jx end %jx", + ctx, (uintmax_t)entry->start, (uintmax_t)entry->end)); + entry->flags = DMAR_MAP_ENTRY_RMRR; + +#ifdef INVARIANTS + struct dmar_map_entry *ip, *in; + ip = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); + in = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + KASSERT(prev == NULL || ip == prev, + ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)", + entry, entry->start, entry->end, prev, + prev == NULL ? 0 : prev->start, prev == NULL ? 0 : prev->end, + ip, ip == NULL ? 0 : ip->start, ip == NULL ? 0 : ip->end)); + KASSERT(next == NULL || in == next, + ("RMRR %p (%jx %jx) next %p (%jx %jx) ins next %p (%jx %jx)", + entry, entry->start, entry->end, next, + next == NULL ? 
0 : next->start, next == NULL ? 0 : next->end, + in, in == NULL ? 0 : in->start, in == NULL ? 0 : in->end)); +#endif + + return (0); +} + +void +dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + + DMAR_CTX_ASSERT_LOCKED(ctx); + KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR | + DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_MAP, + ("permanent entry %p %p", ctx, entry)); + + dmar_gas_rb_remove(ctx, entry); + entry->flags &= ~DMAR_MAP_ENTRY_MAP; +#ifdef INVARIANTS + if (dmar_check_free) + dmar_gas_check_free(ctx); +#endif +} + +void +dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry) +{ + struct dmar_map_entry *next, *prev; + + DMAR_CTX_ASSERT_LOCKED(ctx); + KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR | + DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_RMRR, + ("non-RMRR entry %p %p", ctx, entry)); + + prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry); + next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry); + dmar_gas_rb_remove(ctx, entry); + entry->flags &= ~DMAR_MAP_ENTRY_RMRR; + + if (prev == NULL) + dmar_gas_rb_insert(ctx, ctx->first_place); + if (next == NULL) + dmar_gas_rb_insert(ctx, ctx->last_place); +} + +int +dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common, + dmar_gaddr_t size, u_int eflags, u_int flags, vm_page_t *ma, + struct dmar_map_entry **res) +{ + struct dmar_map_entry *entry; + int error; + + KASSERT((flags & ~(DMAR_GM_CANWAIT | DMAR_GM_CANSPLIT)) == 0, + ("invalid flags 0x%x", flags)); + + entry = dmar_gas_alloc_entry(ctx, (flags & DMAR_GM_CANWAIT) != 0 ? + DMAR_PGF_WAITOK : 0); + if (entry == NULL) + return (ENOMEM); + DMAR_CTX_LOCK(ctx); + error = dmar_gas_find_space(ctx, common, size, flags, entry); + if (error == ENOMEM) { + DMAR_CTX_UNLOCK(ctx); + dmar_gas_free_entry(ctx, entry); + return (error); + } +#ifdef INVARIANTS + if (dmar_check_free) + dmar_gas_check_free(ctx); +#endif + KASSERT(error == 0, + ("unexpected error %d from dmar_gas_find_entry", error)); + KASSERT(entry->end < ctx->end, ("allocated GPA %jx, max GPA %jx", + (uintmax_t)entry->end, (uintmax_t)ctx->end)); + entry->flags |= eflags; + DMAR_CTX_UNLOCK(ctx); + + error = ctx_map_buf(ctx, entry->start, size, ma, + ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) | + ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) | + ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) | + ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0), + (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0); + if (error == ENOMEM) { + dmar_ctx_unload_entry(entry, true); + return (error); + } + KASSERT(error == 0, + ("unexpected error %d from ctx_map_buf", error)); + + *res = entry; + return (0); +} + +int +dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry, + u_int eflags, u_int flags, vm_page_t *ma) +{ + dmar_gaddr_t start; + int error; + + KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", ctx, + entry, entry->flags)); + KASSERT((flags & ~(DMAR_GM_CANWAIT)) == 0, + ("invalid flags 0x%x", flags)); + + start = entry->start; + DMAR_CTX_LOCK(ctx); + error = dmar_gas_alloc_region(ctx, entry, flags); + if (error != 0) { + DMAR_CTX_UNLOCK(ctx); + return (error); + } + entry->flags |= eflags; + DMAR_CTX_UNLOCK(ctx); + if (entry->end == entry->start) + return (0); + + error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, + ma + OFF_TO_IDX(start - entry->start), + ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) | + ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? 
DMAR_PTE_W : 0) | + ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) | + ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0), + (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0); + if (error == ENOMEM) { + dmar_ctx_unload_entry(entry, false); + return (error); + } + KASSERT(error == 0, + ("unexpected error %d from ctx_map_buf", error)); + + return (0); +} + +int +dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start, + dmar_gaddr_t end) +{ + struct dmar_map_entry *entry; + int error; + + entry = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK); + entry->start = start; + entry->end = end; + DMAR_CTX_LOCK(ctx); + error = dmar_gas_alloc_region(ctx, entry, DMAR_GM_CANWAIT); + if (error == 0) + entry->flags |= DMAR_MAP_ENTRY_UNMAPPED; + DMAR_CTX_UNLOCK(ctx); + if (error != 0) + dmar_gas_free_entry(ctx, entry); + return (error); +} diff --git a/sys/x86/iommu/intel_idpgtbl.c b/sys/x86/iommu/intel_idpgtbl.c new file mode 100644 index 0000000..b1a8c8f --- /dev/null +++ b/sys/x86/iommu/intel_idpgtbl.c @@ -0,0 +1,783 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/memdesc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/rman.h> +#include <sys/sf_buf.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <sys/uio.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_map.h> +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/cpu.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +static int ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, + dmar_gaddr_t size, int flags); + +/* + * The cache of the identity mapping page tables for the DMARs. Using + * the cache saves significant amount of memory for page tables by + * reusing the page tables, since usually DMARs are identical and have + * the same capabilities. Still, cache records the information needed + * to match DMAR capabilities and page table format, to correctly + * handle different DMARs. + */ + +struct idpgtbl { + dmar_gaddr_t maxaddr; /* Page table covers the guest address + range [0..maxaddr) */ + int pglvl; /* Total page table levels ignoring + superpages */ + int leaf; /* The last materialized page table + level, it is non-zero if superpages + are supported */ + vm_object_t pgtbl_obj; /* The page table pages */ + LIST_ENTRY(idpgtbl) link; +}; + +static struct sx idpgtbl_lock; +SX_SYSINIT(idpgtbl, &idpgtbl_lock, "idpgtbl"); +static LIST_HEAD(, idpgtbl) idpgtbls = LIST_HEAD_INITIALIZER(idpgtbls); +static MALLOC_DEFINE(M_DMAR_IDPGTBL, "dmar_idpgtbl", + "Intel DMAR Identity mappings cache elements"); + +/* + * Build the next level of the page tables for the identity mapping. + * - lvl is the level to build; + * - idx is the index of the page table page in the pgtbl_obj, which is + * being allocated filled now; + * - addr is the starting address in the bus address space which is + * mapped by the page table page. 
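The page-table pages live in a single VM object whose pindexes follow a radix layout: page 0 is the root, and the children of the page at index idx occupy indexes idx * DMAR_NPTEPG + 1 through idx * DMAR_NPTEPG + DMAR_NPTEPG, which is what the "base" computation in the function below implements. The per-level mapping size is assumed to reduce to the following sketch, shown only as an illustration of what pglvl_page_size() returns:

	static dmar_gaddr_t
	example_pglvl_page_size(int pglvl, int lvl)
	{

		/* 4 KB pages, 512 PTEs per page table page. */
		return ((dmar_gaddr_t)DMAR_PAGE_SIZE <<
		    ((pglvl - lvl - 1) * DMAR_NPTEPGSHIFT));
	}

With a 3-level table this gives 1 GB per root PTE, 2 MB per second-level PTE and 4 KB per leaf PTE.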
+ */ +static void +ctx_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx, + dmar_gaddr_t addr) +{ + vm_page_t m, m1; + dmar_pte_t *pte; + struct sf_buf *sf; + dmar_gaddr_t f, pg_sz; + vm_pindex_t base; + int i; + + VM_OBJECT_ASSERT_LOCKED(tbl->pgtbl_obj); + if (addr >= tbl->maxaddr) + return; + m = dmar_pgalloc(tbl->pgtbl_obj, idx, DMAR_PGF_OBJL | DMAR_PGF_WAITOK | + DMAR_PGF_ZERO); + base = idx * DMAR_NPTEPG + 1; /* Index of the first child page of idx */ + pg_sz = pglvl_page_size(tbl->pglvl, lvl); + if (lvl != tbl->leaf) { + for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz) + ctx_idmap_nextlvl(tbl, lvl + 1, base + i, f); + } + VM_OBJECT_WUNLOCK(tbl->pgtbl_obj); + pte = dmar_map_pgtbl(tbl->pgtbl_obj, idx, DMAR_PGF_WAITOK, &sf); + if (lvl == tbl->leaf) { + for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz) { + if (f >= tbl->maxaddr) + break; + pte[i].pte = (DMAR_PTE_ADDR_MASK & f) | + DMAR_PTE_R | DMAR_PTE_W; + } + } else { + for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz) { + if (f >= tbl->maxaddr) + break; + m1 = dmar_pgalloc(tbl->pgtbl_obj, base + i, + DMAR_PGF_NOALLOC); + KASSERT(m1 != NULL, ("lost page table page")); + pte[i].pte = (DMAR_PTE_ADDR_MASK & + VM_PAGE_TO_PHYS(m1)) | DMAR_PTE_R | DMAR_PTE_W; + } + } + /* ctx_get_idmap_pgtbl flushes CPU cache if needed. */ + dmar_unmap_pgtbl(sf, true); + VM_OBJECT_WLOCK(tbl->pgtbl_obj); +} + +/* + * Find a ready and compatible identity-mapping page table in the + * cache. If not found, populate the identity-mapping page table for + * the context, up to the maxaddr. The maxaddr byte is allowed to be + * not mapped, which is aligned with the definition of Maxmem as the + * highest usable physical address + 1. If superpages are used, the + * maxaddr is typically mapped. + */ +vm_object_t +ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr) +{ + struct dmar_unit *unit; + struct idpgtbl *tbl; + vm_object_t res; + vm_page_t m; + int leaf, i; + + leaf = 0; /* silence gcc */ + + /* + * First, determine where to stop the paging structures. + */ + for (i = 0; i < ctx->pglvl; i++) { + if (i == ctx->pglvl - 1 || ctx_is_sp_lvl(ctx, i)) { + leaf = i; + break; + } + } + + /* + * Search the cache for a compatible page table. Qualified + * page table must map up to maxaddr, its level must be + * supported by the DMAR and leaf should be equal to the + * calculated value. The later restriction could be lifted + * but I believe it is currently impossible to have any + * deviations for existing hardware. + */ + sx_slock(&idpgtbl_lock); + LIST_FOREACH(tbl, &idpgtbls, link) { + if (tbl->maxaddr >= maxaddr && + dmar_pglvl_supported(ctx->dmar, tbl->pglvl) && + tbl->leaf == leaf) { + res = tbl->pgtbl_obj; + vm_object_reference(res); + sx_sunlock(&idpgtbl_lock); + ctx->pglvl = tbl->pglvl; /* XXXKIB ? */ + goto end; + } + } + + /* + * Not found in cache, relock the cache into exclusive mode to + * be able to add element, and recheck cache again after the + * relock. + */ + sx_sunlock(&idpgtbl_lock); + sx_xlock(&idpgtbl_lock); + LIST_FOREACH(tbl, &idpgtbls, link) { + if (tbl->maxaddr >= maxaddr && + dmar_pglvl_supported(ctx->dmar, tbl->pglvl) && + tbl->leaf == leaf) { + res = tbl->pgtbl_obj; + vm_object_reference(res); + sx_xunlock(&idpgtbl_lock); + ctx->pglvl = tbl->pglvl; /* XXXKIB ? */ + return (res); + } + } + + /* + * Still not found, create new page table. 
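The drop-and-relock above is the usual pattern for an sx-protected cache: the shared lock is not upgraded in place, so between sx_sunlock() and sx_xlock() another thread may insert an equivalent table, and the lookup has to be repeated before a new one is created. Roughly, with hypothetical helpers and reference management omitted:

	sx_slock(&idpgtbl_lock);
	tbl = example_cache_lookup(maxaddr, leaf);	/* may be NULL */
	sx_sunlock(&idpgtbl_lock);
	if (tbl == NULL) {
		sx_xlock(&idpgtbl_lock);
		/* Recheck: someone may have inserted it meanwhile. */
		tbl = example_cache_lookup(maxaddr, leaf);
		if (tbl == NULL) {
			tbl = example_create(maxaddr, leaf);
			example_cache_insert(tbl);
		}
		sx_xunlock(&idpgtbl_lock);
	}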
+ */ + tbl = malloc(sizeof(*tbl), M_DMAR_IDPGTBL, M_WAITOK); + tbl->pglvl = ctx->pglvl; + tbl->leaf = leaf; + tbl->maxaddr = maxaddr; + tbl->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL, + IDX_TO_OFF(pglvl_max_pages(tbl->pglvl)), 0, 0, NULL); + VM_OBJECT_WLOCK(tbl->pgtbl_obj); + ctx_idmap_nextlvl(tbl, 0, 0, 0); + VM_OBJECT_WUNLOCK(tbl->pgtbl_obj); + LIST_INSERT_HEAD(&idpgtbls, tbl, link); + res = tbl->pgtbl_obj; + vm_object_reference(res); + sx_xunlock(&idpgtbl_lock); + +end: + /* + * Table was found or created. + * + * If DMAR does not snoop paging structures accesses, flush + * CPU cache to memory. Note that dmar_unmap_pgtbl() coherent + * argument was possibly invalid at the time of the identity + * page table creation, since DMAR which was passed at the + * time of creation could be coherent, while current DMAR is + * not. + * + * If DMAR cannot look into the chipset write buffer, flush it + * as well. + */ + unit = ctx->dmar; + if (!DMAR_IS_COHERENT(unit)) { + VM_OBJECT_WLOCK(res); + for (m = vm_page_lookup(res, 0); m != NULL; + m = vm_page_next(m)) + pmap_invalidate_cache_pages(&m, 1); + VM_OBJECT_WUNLOCK(res); + } + if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) { + DMAR_LOCK(unit); + dmar_flush_write_bufs(unit); + DMAR_UNLOCK(unit); + } + + return (res); +} + +/* + * Return a reference to the identity mapping page table to the cache. + */ +void +put_idmap_pgtbl(vm_object_t obj) +{ + struct idpgtbl *tbl, *tbl1; + vm_object_t rmobj; + + sx_slock(&idpgtbl_lock); + KASSERT(obj->ref_count >= 2, ("lost cache reference")); + vm_object_deallocate(obj); + + /* + * Cache always owns one last reference on the page table object. + * If there is an additional reference, object must stay. + */ + if (obj->ref_count > 1) { + sx_sunlock(&idpgtbl_lock); + return; + } + + /* + * Cache reference is the last, remove cache element and free + * page table object, returning the page table pages to the + * system. + */ + sx_sunlock(&idpgtbl_lock); + sx_xlock(&idpgtbl_lock); + LIST_FOREACH_SAFE(tbl, &idpgtbls, link, tbl1) { + rmobj = tbl->pgtbl_obj; + if (rmobj->ref_count == 1) { + LIST_REMOVE(tbl, link); + atomic_subtract_int(&dmar_tbl_pagecnt, + rmobj->resident_page_count); + vm_object_deallocate(rmobj); + free(tbl, M_DMAR_IDPGTBL); + } + } + sx_xunlock(&idpgtbl_lock); +} + +/* + * The core routines to map and unmap host pages at the given guest + * address. Support superpages. + */ + +/* + * Index of the pte for the guest address base in the page table at + * the level lvl. + */ +static int +ctx_pgtbl_pte_off(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl) +{ + + base >>= DMAR_PAGE_SHIFT + (ctx->pglvl - lvl - 1) * DMAR_NPTEPGSHIFT; + return (base & DMAR_PTEMASK); +} + +/* + * Returns the page index of the page table page in the page table + * object, which maps the given address base at the page table level + * lvl. 
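The index is produced by walking the same radix layout used for the identity tables: starting from pindex 0 for the root, each step descends to child number ctx_pgtbl_pte_off(ctx, base, i), i.e. idx = off + parent_idx * DMAR_NPTEPG + 1, as the loop below shows. As a worked example, for base = 0 and a 4-level table (DMAR_NPTEPG == 512) the pindexes returned for lvl 0 through 3 are 0, 1, 513 and 262657.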
+ */ +static vm_pindex_t +ctx_pgtbl_get_pindex(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl) +{ + vm_pindex_t idx, pidx; + int i; + + KASSERT(lvl >= 0 && lvl < ctx->pglvl, ("wrong lvl %p %d", ctx, lvl)); + + for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) + idx = ctx_pgtbl_pte_off(ctx, base, i) + pidx * DMAR_NPTEPG + 1; + return (idx); +} + +static dmar_pte_t * +ctx_pgtbl_map_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags, + vm_pindex_t *idxp, struct sf_buf **sf) +{ + vm_page_t m; + struct sf_buf *sfp; + dmar_pte_t *pte, *ptep; + vm_pindex_t idx, idx1; + + DMAR_CTX_ASSERT_PGLOCKED(ctx); + KASSERT((flags & DMAR_PGF_OBJL) != 0, ("lost PGF_OBJL")); + + idx = ctx_pgtbl_get_pindex(ctx, base, lvl); + if (*sf != NULL && idx == *idxp) { + pte = (dmar_pte_t *)sf_buf_kva(*sf); + } else { + if (*sf != NULL) + dmar_unmap_pgtbl(*sf, DMAR_IS_COHERENT(ctx->dmar)); + *idxp = idx; +retry: + pte = dmar_map_pgtbl(ctx->pgtbl_obj, idx, flags, sf); + if (pte == NULL) { + KASSERT(lvl > 0, ("lost root page table page %p", ctx)); + /* + * Page table page does not exists, allocate + * it and create pte in the up level. + */ + m = dmar_pgalloc(ctx->pgtbl_obj, idx, flags | + DMAR_PGF_ZERO); + if (m == NULL) + return (NULL); + + /* + * Prevent potential free while pgtbl_obj is + * unlocked in the recursive call to + * ctx_pgtbl_map_pte(), if other thread did + * pte write and clean while the lock if + * dropped. + */ + m->wire_count++; + + sfp = NULL; + ptep = ctx_pgtbl_map_pte(ctx, base, lvl - 1, flags, + &idx1, &sfp); + if (ptep == NULL) { + KASSERT(m->pindex != 0, + ("loosing root page %p", ctx)); + m->wire_count--; + dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags); + return (NULL); + } + dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W | + VM_PAGE_TO_PHYS(m)); + sf_buf_page(sfp)->wire_count += 1; + m->wire_count--; + dmar_unmap_pgtbl(sfp, DMAR_IS_COHERENT(ctx->dmar)); + /* Only executed once. */ + goto retry; + } + } + pte += ctx_pgtbl_pte_off(ctx, base, lvl); + return (pte); +} + +static int +ctx_map_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, + vm_page_t *ma, uint64_t pflags, int flags) +{ + dmar_pte_t *pte; + struct sf_buf *sf; + dmar_gaddr_t pg_sz, base1, size1; + vm_pindex_t pi, c, idx, run_sz; + int lvl; + bool superpage; + + DMAR_CTX_ASSERT_PGLOCKED(ctx); + + base1 = base; + size1 = size; + flags |= DMAR_PGF_OBJL; + TD_PREP_PINNED_ASSERT; + + for (sf = NULL, pi = 0; size > 0; base += pg_sz, size -= pg_sz, + pi += run_sz) { + for (lvl = 0, c = 0, superpage = false;; lvl++) { + pg_sz = ctx_page_size(ctx, lvl); + run_sz = pg_sz >> DMAR_PAGE_SHIFT; + if (lvl == ctx->pglvl - 1) + break; + /* + * Check if the current base suitable for the + * superpage mapping. First, verify the level. + */ + if (!ctx_is_sp_lvl(ctx, lvl)) + continue; + /* + * Next, look at the size of the mapping and + * alignment of both guest and host addresses. + */ + if (size < pg_sz || (base & (pg_sz - 1)) != 0 || + (VM_PAGE_TO_PHYS(ma[pi]) & (pg_sz - 1)) != 0) + continue; + /* All passed, check host pages contiguouty. 
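Putting the checks in this loop together, a superpage PTE of size pg_sz is used only when, roughly:

	sp_ok = ctx_is_sp_lvl(ctx, lvl) &&	/* level allows superpages */
	    size >= pg_sz &&			/* enough left to map */
	    (base & (pg_sz - 1)) == 0 &&	/* guest address aligned */
	    (VM_PAGE_TO_PHYS(ma[pi]) & (pg_sz - 1)) == 0 && /* host aligned */
	    host_pages_contiguous(ma + pi, pg_sz >> DMAR_PAGE_SHIFT);

where host_pages_contiguous() only stands in for the inline adjacency check that follows; otherwise the loop continues to the next, smaller, page size.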
*/ + if (c == 0) { + for (c = 1; c < run_sz; c++) { + if (VM_PAGE_TO_PHYS(ma[pi + c]) != + VM_PAGE_TO_PHYS(ma[pi + c - 1]) + + PAGE_SIZE) + break; + } + } + if (c >= run_sz) { + superpage = true; + break; + } + } + KASSERT(size >= pg_sz, + ("mapping loop overflow %p %jx %jx %jx", ctx, + (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz)); + pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf); + if (pte == NULL) { + KASSERT((flags & DMAR_PGF_WAITOK) == 0, + ("failed waitable pte alloc %p", ctx)); + if (sf != NULL) { + dmar_unmap_pgtbl(sf, + DMAR_IS_COHERENT(ctx->dmar)); + } + ctx_unmap_buf_locked(ctx, base1, base - base1, flags); + TD_PINNED_ASSERT; + return (ENOMEM); + } + dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags | + (superpage ? DMAR_PTE_SP : 0)); + sf_buf_page(sf)->wire_count += 1; + } + if (sf != NULL) + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(ctx->dmar)); + TD_PINNED_ASSERT; + return (0); +} + +int +ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, + vm_page_t *ma, uint64_t pflags, int flags) +{ + struct dmar_unit *unit; + int error; + + unit = ctx->dmar; + + KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0, + ("modifying idmap pagetable ctx %p", ctx)); + KASSERT((base & DMAR_PAGE_MASK) == 0, + ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT((size & DMAR_PAGE_MASK) == 0, + ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT(size > 0, ("zero size %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT(base < (1ULL << ctx->agaw), + ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, + (uintmax_t)size, ctx->agaw)); + KASSERT(base + size < (1ULL << ctx->agaw), + ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, + (uintmax_t)size, ctx->agaw)); + KASSERT(base + size > base, + ("size overflow %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT((pflags & (DMAR_PTE_R | DMAR_PTE_W)) != 0, + ("neither read nor write %jx", (uintmax_t)pflags)); + KASSERT((pflags & ~(DMAR_PTE_R | DMAR_PTE_W | DMAR_PTE_SNP | + DMAR_PTE_TM)) == 0, + ("invalid pte flags %jx", (uintmax_t)pflags)); + KASSERT((pflags & DMAR_PTE_SNP) == 0 || + (unit->hw_ecap & DMAR_ECAP_SC) != 0, + ("PTE_SNP for dmar without snoop control %p %jx", + ctx, (uintmax_t)pflags)); + KASSERT((pflags & DMAR_PTE_TM) == 0 || + (unit->hw_ecap & DMAR_ECAP_DI) != 0, + ("PTE_TM for dmar without DIOTLB %p %jx", + ctx, (uintmax_t)pflags)); + KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags)); + + DMAR_CTX_PGLOCK(ctx); + error = ctx_map_buf_locked(ctx, base, size, ma, pflags, flags); + DMAR_CTX_PGUNLOCK(ctx); + if (error != 0) + return (error); + + if ((unit->hw_cap & DMAR_CAP_CM) != 0) + ctx_flush_iotlb_sync(ctx, base, size); + else if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) { + /* See 11.1 Write Buffer Flushing. 
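With caching mode (DMAR_CAP_CM) the hardware may cache not-present entries, so freshly created mappings must be made visible with an IOTLB invalidation; without caching mode, hardware that sets DMAR_CAP_RWBF only needs its internal write buffer flushed before it observes the updated page-table memory.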
*/ + DMAR_LOCK(unit); + dmar_flush_write_bufs(unit); + DMAR_UNLOCK(unit); + } + return (0); +} + +static void ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, + int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_fs); + +static void +ctx_free_pgtbl_pde(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags) +{ + struct sf_buf *sf; + dmar_pte_t *pde; + vm_pindex_t idx; + + sf = NULL; + pde = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf); + ctx_unmap_clear_pte(ctx, base, lvl, flags, pde, &sf, true); +} + +static void +ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, + int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_sf) +{ + vm_page_t m; + + dmar_pte_clear(&pte->pte); + m = sf_buf_page(*sf); + if (free_sf) { + dmar_unmap_pgtbl(*sf, DMAR_IS_COHERENT(ctx->dmar)); + *sf = NULL; + } + m->wire_count--; + if (m->wire_count != 0) + return; + KASSERT(lvl != 0, + ("lost reference (lvl) on root pg ctx %p base %jx lvl %d", + ctx, (uintmax_t)base, lvl)); + KASSERT(m->pindex != 0, + ("lost reference (idx) on root pg ctx %p base %jx lvl %d", + ctx, (uintmax_t)base, lvl)); + dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags); + ctx_free_pgtbl_pde(ctx, base, lvl - 1, flags); +} + +/* + * Assumes that the unmap is never partial. + */ +static int +ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, + dmar_gaddr_t size, int flags) +{ + dmar_pte_t *pte; + struct sf_buf *sf; + vm_pindex_t idx; + dmar_gaddr_t pg_sz, base1, size1; + int lvl; + + DMAR_CTX_ASSERT_PGLOCKED(ctx); + if (size == 0) + return (0); + + KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0, + ("modifying idmap pagetable ctx %p", ctx)); + KASSERT((base & DMAR_PAGE_MASK) == 0, + ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT((size & DMAR_PAGE_MASK) == 0, + ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT(base < (1ULL << ctx->agaw), + ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, + (uintmax_t)size, ctx->agaw)); + KASSERT(base + size < (1ULL << ctx->agaw), + ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base, + (uintmax_t)size, ctx->agaw)); + KASSERT(base + size > base, + ("size overflow %p %jx %jx", ctx, (uintmax_t)base, + (uintmax_t)size)); + KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags)); + + pg_sz = 0; /* silence gcc */ + base1 = base; + size1 = size; + flags |= DMAR_PGF_OBJL; + TD_PREP_PINNED_ASSERT; + + for (sf = NULL; size > 0; base += pg_sz, size -= pg_sz) { + for (lvl = 0; lvl < ctx->pglvl; lvl++) { + if (lvl != ctx->pglvl - 1 && !ctx_is_sp_lvl(ctx, lvl)) + continue; + pg_sz = ctx_page_size(ctx, lvl); + if (pg_sz > size) + continue; + pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, + &idx, &sf); + KASSERT(pte != NULL, + ("sleeping or page missed %p %jx %d 0x%x", + ctx, (uintmax_t)base, lvl, flags)); + if ((pte->pte & DMAR_PTE_SP) != 0 || + lvl == ctx->pglvl - 1) { + ctx_unmap_clear_pte(ctx, base, lvl, flags, + pte, &sf, false); + break; + } + } + KASSERT(size >= pg_sz, + ("unmapping loop overflow %p %jx %jx %jx", ctx, + (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz)); + } + if (sf != NULL) + dmar_unmap_pgtbl(sf, DMAR_IS_COHERENT(ctx->dmar)); + /* + * See 11.1 Write Buffer Flushing for an explanation why RWBF + * can be ignored there. 
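Presumably this is because the callers always follow an unmap with an IOTLB invalidation, and on DMAR_CAP_RWBF hardware the write buffer is flushed implicitly as part of completing that invalidation.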
+ */ + + TD_PINNED_ASSERT; + return (0); +} + +int +ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size, + int flags) +{ + int error; + + DMAR_CTX_PGLOCK(ctx); + error = ctx_unmap_buf_locked(ctx, base, size, flags); + DMAR_CTX_PGUNLOCK(ctx); + return (error); +} + +int +ctx_alloc_pgtbl(struct dmar_ctx *ctx) +{ + vm_page_t m; + + KASSERT(ctx->pgtbl_obj == NULL, ("already initialized %p", ctx)); + + ctx->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL, + IDX_TO_OFF(pglvl_max_pages(ctx->pglvl)), 0, 0, NULL); + DMAR_CTX_PGLOCK(ctx); + m = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_WAITOK | + DMAR_PGF_ZERO | DMAR_PGF_OBJL); + /* No implicit free of the top level page table page. */ + m->wire_count = 1; + DMAR_CTX_PGUNLOCK(ctx); + return (0); +} + +void +ctx_free_pgtbl(struct dmar_ctx *ctx) +{ + vm_object_t obj; + vm_page_t m; + + obj = ctx->pgtbl_obj; + if (obj == NULL) { + KASSERT((ctx->dmar->hw_ecap & DMAR_ECAP_PT) != 0 && + (ctx->flags & DMAR_CTX_IDMAP) != 0, + ("lost pagetable object ctx %p", ctx)); + return; + } + DMAR_CTX_ASSERT_PGLOCKED(ctx); + ctx->pgtbl_obj = NULL; + + if ((ctx->flags & DMAR_CTX_IDMAP) != 0) { + put_idmap_pgtbl(obj); + ctx->flags &= ~DMAR_CTX_IDMAP; + return; + } + + /* Obliterate wire_counts */ + VM_OBJECT_ASSERT_WLOCKED(obj); + for (m = vm_page_lookup(obj, 0); m != NULL; m = vm_page_next(m)) + m->wire_count = 0; + VM_OBJECT_WUNLOCK(obj); + vm_object_deallocate(obj); +} + +static inline uint64_t +ctx_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro) +{ + uint64_t iotlbr; + + dmar_write8(unit, iro + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT | + DMAR_IOTLB_DR | DMAR_IOTLB_DW | wt); + for (;;) { + iotlbr = dmar_read8(unit, iro + DMAR_IOTLB_REG_OFF); + if ((iotlbr & DMAR_IOTLB_IVT) == 0) + break; + cpu_spinwait(); + } + return (iotlbr); +} + +void +ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size) +{ + struct dmar_unit *unit; + dmar_gaddr_t isize; + uint64_t iotlbr; + int am, iro; + + unit = ctx->dmar; + KASSERT(!unit->qi_enabled, ("dmar%d: sync iotlb flush call", + unit->unit)); + iro = DMAR_ECAP_IRO(unit->hw_ecap) * 16; + DMAR_LOCK(unit); + if ((unit->hw_cap & DMAR_CAP_PSI) == 0 || size > 2 * 1024 * 1024) { + iotlbr = ctx_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM | + DMAR_IOTLB_DID(ctx->domain), iro); + KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) != + DMAR_IOTLB_IAIG_INVLD, + ("dmar%d: invalidation failed %jx", unit->unit, + (uintmax_t)iotlbr)); + } else { + for (; size > 0; base += isize, size -= isize) { + am = calc_am(unit, base, size, &isize); + dmar_write8(unit, iro, base | am); + iotlbr = ctx_wait_iotlb_flush(unit, + DMAR_IOTLB_IIRG_PAGE | DMAR_IOTLB_DID(ctx->domain), + iro); + KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) != + DMAR_IOTLB_IAIG_INVLD, + ("dmar%d: PSI invalidation failed " + "iotlbr 0x%jx base 0x%jx size 0x%jx am %d", + unit->unit, (uintmax_t)iotlbr, + (uintmax_t)base, (uintmax_t)size, am)); + /* + * Any non-page granularity covers whole guest + * address space for the domain. + */ + if ((iotlbr & DMAR_IOTLB_IAIG_MASK) != + DMAR_IOTLB_IAIG_PAGE) + break; + } + } + DMAR_UNLOCK(unit); +} diff --git a/sys/x86/iommu/intel_qi.c b/sys/x86/iommu/intel_qi.c new file mode 100644 index 0000000..a682c93 --- /dev/null +++ b/sys/x86/iommu/intel_qi.c @@ -0,0 +1,414 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <machine/bus.h> +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <machine/cpu.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +static bool +dmar_qi_seq_processed(const struct dmar_unit *unit, + const struct dmar_qi_genseq *pseq) +{ + + return (pseq->gen < unit->inv_waitd_gen || + (pseq->gen == unit->inv_waitd_gen && + pseq->seq <= unit->inv_waitd_seq_hw)); +} + +static int +dmar_enable_qi(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd |= DMAR_GCMD_QIE; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) == 0) + cpu_spinwait(); + return (0); +} + +static int +dmar_disable_qi(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd &= ~DMAR_GCMD_QIE; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) != 0) + cpu_spinwait(); + return (0); +} + +static void +dmar_qi_advance_tail(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + dmar_write4(unit, DMAR_IQT_REG, unit->inv_queue_tail); +} + +static void +dmar_qi_ensure(struct dmar_unit *unit, int descr_count) +{ + uint32_t head; + int bytes; + + DMAR_ASSERT_LOCKED(unit); + bytes = descr_count << DMAR_IQ_DESCR_SZ_SHIFT; + for (;;) { + if (bytes <= unit->inv_queue_avail) + break; + /* refill */ + head = dmar_read4(unit, DMAR_IQH_REG); + head &= DMAR_IQH_MASK; + unit->inv_queue_avail = head - unit->inv_queue_tail - + DMAR_IQ_DESCR_SZ; + if (head <= unit->inv_queue_tail) + unit->inv_queue_avail += unit->inv_queue_size; + if 
(bytes <= unit->inv_queue_avail) + break; + + /* + * No space in the queue, do busy wait. Hardware must + * make a progress. But first advance the tail to + * inform the descriptor streamer about entries we + * might have already filled, otherwise they could + * clog the whole queue.. + */ + dmar_qi_advance_tail(unit); + unit->inv_queue_full++; + cpu_spinwait(); + } + unit->inv_queue_avail -= bytes; +} + +static void +dmar_qi_emit(struct dmar_unit *unit, uint64_t data1, uint64_t data2) +{ + + DMAR_ASSERT_LOCKED(unit); + *(volatile uint64_t *)(unit->inv_queue + unit->inv_queue_tail) = data1; + unit->inv_queue_tail += DMAR_IQ_DESCR_SZ / 2; + KASSERT(unit->inv_queue_tail <= unit->inv_queue_size, + ("tail overflow 0x%x 0x%jx", unit->inv_queue_tail, + (uintmax_t)unit->inv_queue_size)); + unit->inv_queue_tail &= unit->inv_queue_size - 1; + *(volatile uint64_t *)(unit->inv_queue + unit->inv_queue_tail) = data2; + unit->inv_queue_tail += DMAR_IQ_DESCR_SZ / 2; + KASSERT(unit->inv_queue_tail <= unit->inv_queue_size, + ("tail overflow 0x%x 0x%jx", unit->inv_queue_tail, + (uintmax_t)unit->inv_queue_size)); + unit->inv_queue_tail &= unit->inv_queue_size - 1; +} + +static void +dmar_qi_emit_wait_descr(struct dmar_unit *unit, uint32_t seq, bool intr, + bool memw, bool fence) +{ + + DMAR_ASSERT_LOCKED(unit); + dmar_qi_emit(unit, DMAR_IQ_DESCR_WAIT_ID | + (intr ? DMAR_IQ_DESCR_WAIT_IF : 0) | + (memw ? DMAR_IQ_DESCR_WAIT_SW : 0) | + (fence ? DMAR_IQ_DESCR_WAIT_FN : 0) | + (memw ? DMAR_IQ_DESCR_WAIT_SD(seq) : 0), + memw ? unit->inv_waitd_seq_hw_phys : 0); +} + +static void +dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq) +{ + struct dmar_qi_genseq gsec; + uint32_t seq; + + KASSERT(pseq != NULL, ("wait descriptor with no place for seq")); + DMAR_ASSERT_LOCKED(unit); + if (unit->inv_waitd_seq == 0xffffffff) { + gsec.gen = unit->inv_waitd_gen; + gsec.seq = unit->inv_waitd_seq; + dmar_qi_ensure(unit, 1); + dmar_qi_emit_wait_descr(unit, gsec.seq, false, true, false); + dmar_qi_advance_tail(unit); + while (!dmar_qi_seq_processed(unit, &gsec)) + cpu_spinwait(); + unit->inv_waitd_gen++; + unit->inv_waitd_seq = 1; + } + seq = unit->inv_waitd_seq++; + pseq->gen = unit->inv_waitd_gen; + pseq->seq = seq; + dmar_qi_emit_wait_descr(unit, seq, true, true, false); +} + +static void +dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq) +{ + + DMAR_ASSERT_LOCKED(unit); + unit->inv_seq_waiters++; + while (!dmar_qi_seq_processed(unit, gseq)) { + if (cold) { + cpu_spinwait(); + } else { + msleep(&unit->inv_seq_waiters, &unit->lock, 0, + "dmarse", hz); + } + } + unit->inv_seq_waiters--; +} + +void +dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, + dmar_gaddr_t size, struct dmar_qi_genseq *pseq) +{ + struct dmar_unit *unit; + dmar_gaddr_t isize; + int am; + + unit = ctx->dmar; + DMAR_ASSERT_LOCKED(unit); + for (; size > 0; base += isize, size -= isize) { + am = calc_am(unit, base, size, &isize); + dmar_qi_ensure(unit, 1); + dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | + DMAR_IQ_DESCR_IOTLB_PAGE | DMAR_IQ_DESCR_IOTLB_DW | + DMAR_IQ_DESCR_IOTLB_DR | + DMAR_IQ_DESCR_IOTLB_DID(ctx->domain), + base | am); + } + if (pseq != NULL) { + dmar_qi_ensure(unit, 1); + dmar_qi_emit_wait_seq(unit, pseq); + } + dmar_qi_advance_tail(unit); +} + +void +dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit) +{ + struct dmar_qi_genseq gseq; + + DMAR_ASSERT_LOCKED(unit); + dmar_qi_ensure(unit, 2); + dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0); + 
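	/*
	 * The wait descriptor below asks the hardware to write the
	 * sequence number back to inv_waitd_seq_hw once everything
	 * queued so far has been processed; dmar_qi_wait_for_seq()
	 * then spins (or sleeps) on that word, which makes the global
	 * context-cache invalidation synchronous.
	 */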
dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_advance_tail(unit); + dmar_qi_wait_for_seq(unit, &gseq); +} + +void +dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit) +{ + struct dmar_qi_genseq gseq; + + DMAR_ASSERT_LOCKED(unit); + dmar_qi_ensure(unit, 2); + dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB | + DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0); + dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_advance_tail(unit); + dmar_qi_wait_for_seq(unit, &gseq); +} + +int +dmar_qi_intr(void *arg) +{ + struct dmar_unit *unit; + + unit = arg; + KASSERT(unit->qi_enabled, ("dmar%d: QI is not enabled", unit->unit)); + taskqueue_enqueue_fast(unit->qi_taskqueue, &unit->qi_task); + return (FILTER_HANDLED); +} + +static void +dmar_qi_task(void *arg, int pending __unused) +{ + struct dmar_unit *unit; + struct dmar_map_entry *entry; + uint32_t ics; + + unit = arg; + + DMAR_LOCK(unit); + for (;;) { + entry = TAILQ_FIRST(&unit->tlb_flush_entries); + if (entry == NULL) + break; + if ((entry->gseq.gen == 0 && entry->gseq.seq == 0) || + !dmar_qi_seq_processed(unit, &entry->gseq)) + break; + TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link); + DMAR_UNLOCK(unit); + dmar_ctx_free_entry(entry, (entry->flags & + DMAR_MAP_ENTRY_QI_NF) == 0); + DMAR_LOCK(unit); + } + ics = dmar_read4(unit, DMAR_ICS_REG); + if ((ics & DMAR_ICS_IWC) != 0) { + ics = DMAR_ICS_IWC; + dmar_write4(unit, DMAR_ICS_REG, ics); + } + if (unit->inv_seq_waiters > 0) + wakeup(&unit->inv_seq_waiters); + DMAR_UNLOCK(unit); +} + +int +dmar_init_qi(struct dmar_unit *unit) +{ + uint64_t iqa; + uint32_t ics; + int qi_sz; + + if (!DMAR_HAS_QI(unit) || (unit->hw_cap & DMAR_CAP_CM) != 0) + return (0); + unit->qi_enabled = 1; + TUNABLE_INT_FETCH("hw.dmar.qi", &unit->qi_enabled); + if (!unit->qi_enabled) + return (0); + + TAILQ_INIT(&unit->tlb_flush_entries); + TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit); + unit->qi_taskqueue = taskqueue_create_fast("dmar", M_WAITOK, + taskqueue_thread_enqueue, &unit->qi_taskqueue); + taskqueue_start_threads(&unit->qi_taskqueue, 1, PI_AV, + "dmar%d qi taskq", unit->unit); + + unit->inv_waitd_gen = 0; + unit->inv_waitd_seq = 1; + + qi_sz = DMAR_IQA_QS_DEF; + TUNABLE_INT_FETCH("hw.dmar.qi_size", &qi_sz); + if (qi_sz > DMAR_IQA_QS_MAX) + qi_sz = DMAR_IQA_QS_MAX; + unit->inv_queue_size = (1ULL << qi_sz) * PAGE_SIZE; + /* Reserve one descriptor to prevent wraparound. */ + unit->inv_queue_avail = unit->inv_queue_size - DMAR_IQ_DESCR_SZ; + + /* The invalidation queue reads by DMARs are always coherent. 
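The ring is sized as (1 << qi_sz) 4 KB pages of 16-byte descriptors, and the same qi_sz value is later programmed into the low bits of DMAR_IQA_REG alongside the physical base of the queue. For instance, qi_sz == 0 gives a single page holding 256 descriptors:

	ndescr = ((dmar_gaddr_t)PAGE_SIZE << qi_sz) / DMAR_IQ_DESCR_SZ;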
*/ + unit->inv_queue = kmem_alloc_contig(kernel_arena, unit->inv_queue_size, + M_WAITOK | M_ZERO, 0, dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + unit->inv_waitd_seq_hw_phys = pmap_kextract( + (vm_offset_t)&unit->inv_waitd_seq_hw); + + DMAR_LOCK(unit); + dmar_write8(unit, DMAR_IQT_REG, 0); + iqa = pmap_kextract(unit->inv_queue); + iqa |= qi_sz; + dmar_write8(unit, DMAR_IQA_REG, iqa); + dmar_enable_qi(unit); + ics = dmar_read4(unit, DMAR_ICS_REG); + if ((ics & DMAR_ICS_IWC) != 0) { + ics = DMAR_ICS_IWC; + dmar_write4(unit, DMAR_ICS_REG, ics); + } + DMAR_UNLOCK(unit); + + return (0); +} + +void +dmar_fini_qi(struct dmar_unit *unit) +{ + struct dmar_qi_genseq gseq; + + if (unit->qi_enabled) + return; + taskqueue_drain(unit->qi_taskqueue, &unit->qi_task); + taskqueue_free(unit->qi_taskqueue); + unit->qi_taskqueue = NULL; + + DMAR_LOCK(unit); + /* quisce */ + dmar_qi_ensure(unit, 1); + dmar_qi_emit_wait_seq(unit, &gseq); + dmar_qi_advance_tail(unit); + dmar_qi_wait_for_seq(unit, &gseq); + /* only after the quisce, disable queue */ + dmar_disable_qi(unit); + KASSERT(unit->inv_seq_waiters == 0, + ("dmar%d: waiters on disabled queue", unit->unit)); + DMAR_UNLOCK(unit); + + kmem_free(kernel_arena, unit->inv_queue, unit->inv_queue_size); + unit->inv_queue = 0; + unit->inv_queue_size = 0; + unit->qi_enabled = 0; +} + +void +dmar_enable_qi_intr(struct dmar_unit *unit) +{ + uint32_t iectl; + + DMAR_ASSERT_LOCKED(unit); + KASSERT(DMAR_HAS_QI(unit), ("dmar%d: QI is not supported", unit->unit)); + iectl = dmar_read4(unit, DMAR_IECTL_REG); + iectl &= ~DMAR_IECTL_IM; + dmar_write4(unit, DMAR_IECTL_REG, iectl); +} + +void +dmar_disable_qi_intr(struct dmar_unit *unit) +{ + uint32_t iectl; + + DMAR_ASSERT_LOCKED(unit); + KASSERT(DMAR_HAS_QI(unit), ("dmar%d: QI is not supported", unit->unit)); + iectl = dmar_read4(unit, DMAR_IECTL_REG); + dmar_write4(unit, DMAR_IECTL_REG, iectl | DMAR_IECTL_IM); +} diff --git a/sys/x86/iommu/intel_quirks.c b/sys/x86/iommu/intel_quirks.c new file mode 100644 index 0000000..7c35ae6 --- /dev/null +++ b/sys/x86/iommu/intel_quirks.c @@ -0,0 +1,195 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/smp.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <machine/bus.h> +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_map.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> +#include <dev/pci/pcivar.h> + +typedef void (*dmar_quirk_fun)(struct dmar_unit *); + +struct intel_dmar_quirk_cpu { + u_int ext_family; + u_int ext_model; + u_int family_code; + u_int model; + u_int stepping; + dmar_quirk_fun quirk; + const char *descr; +}; + +struct intel_dmar_quirk_nb { + u_int dev_id; + u_int rev_no; + dmar_quirk_fun quirk; + const char *descr; +}; + +static void +dmar_match_quirks(struct dmar_unit *dmar, + const struct intel_dmar_quirk_nb *nb_quirks, int nb_quirks_len, + const struct intel_dmar_quirk_cpu *cpu_quirks, int cpu_quirks_len) +{ + device_t nb; + const struct intel_dmar_quirk_nb *nb_quirk; + const struct intel_dmar_quirk_cpu *cpu_quirk; + u_int p[4]; + u_int dev_id, rev_no; + u_int ext_family, ext_model, family_code, model, stepping; + int i; + + if (nb_quirks != NULL) { + nb = pci_find_bsf(0, 0, 0); + if (nb != NULL) { + dev_id = pci_get_device(nb); + rev_no = pci_get_revid(nb); + for (i = 0; i < nb_quirks_len; i++) { + nb_quirk = &nb_quirks[i]; + if (nb_quirk->dev_id == dev_id && + nb_quirk->rev_no == rev_no) { + if (bootverbose) { + device_printf(dmar->dev, + "NB IOMMU quirk %s\n", + nb_quirk->descr); + } + nb_quirk->quirk(dmar); + } + } + } else { + device_printf(dmar->dev, "cannot find northbridge\n"); + } + } + if (cpu_quirks != NULL) { + do_cpuid(1, p); + ext_family = (p[0] & CPUID_EXT_FAMILY) >> 20; + ext_model = (p[0] & CPUID_EXT_MODEL) >> 16; + family_code = (p[0] & CPUID_FAMILY) >> 8; + model = (p[0] & CPUID_MODEL) >> 4; + stepping = p[0] & CPUID_STEPPING; + for (i = 0; i < cpu_quirks_len; i++) { + cpu_quirk = &cpu_quirks[i]; + if (cpu_quirk->ext_family == ext_family && + cpu_quirk->ext_model == ext_model && + cpu_quirk->family_code == family_code && + cpu_quirk->model == model && + (cpu_quirk->stepping == -1 || + cpu_quirk->stepping == stepping)) { + if (bootverbose) { + device_printf(dmar->dev, + "CPU IOMMU quirk %s\n", + cpu_quirk->descr); + } + cpu_quirk->quirk(dmar); + } + } + } +} + +static void +nb_5400_no_low_high_prot_mem(struct dmar_unit *unit) +{ + + unit->hw_cap &= ~(DMAR_CAP_PHMR | DMAR_CAP_PLMR); +} + +static const struct intel_dmar_quirk_nb pre_use_nb[] = { + { + .dev_id = 0x4001, .rev_no = 0x20, + .quirk 
= nb_5400_no_low_high_prot_mem, + .descr = "5400 E23" /* no low/high protected memory */ + }, + { + .dev_id = 0x4003, .rev_no = 0x20, + .quirk = nb_5400_no_low_high_prot_mem, + .descr = "5400 E23" /* no low/high protected memory */ + }, +}; + +static void +cpu_e5_am9(struct dmar_unit *unit) +{ + + unit->hw_cap &= ~(0x3fULL << 48); + unit->hw_cap |= (9ULL << 48); +} + +static const struct intel_dmar_quirk_cpu post_ident_cpu[] = { + { + .ext_family = 0, .ext_model = 2, .family_code = 6, .model = 13, + .stepping = 6, .quirk = cpu_e5_am9, + .descr = "E5 BT176" /* AM should be at most 9 */ + }, +}; + +void +dmar_quirks_pre_use(struct dmar_unit *dmar) +{ + + if (!dmar_barrier_enter(dmar, DMAR_BARRIER_USEQ)) + return; + DMAR_LOCK(dmar); + dmar_match_quirks(dmar, pre_use_nb, nitems(pre_use_nb), + NULL, 0); + dmar_barrier_exit(dmar, DMAR_BARRIER_USEQ); +} + +void +dmar_quirks_post_ident(struct dmar_unit *dmar) +{ + + dmar_match_quirks(dmar, NULL, 0, post_ident_cpu, + nitems(post_ident_cpu)); +} diff --git a/sys/x86/iommu/intel_reg.h b/sys/x86/iommu/intel_reg.h new file mode 100644 index 0000000..4c266de --- /dev/null +++ b/sys/x86/iommu/intel_reg.h @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef __X86_IOMMU_INTEL_REG_H +#define __X86_IOMMU_INTEL_REG_H + +#define DMAR_PAGE_SIZE PAGE_SIZE +#define DMAR_PAGE_MASK (DMAR_PAGE_SIZE - 1) +#define DMAR_PAGE_SHIFT PAGE_SHIFT +#define DMAR_NPTEPG (DMAR_PAGE_SIZE / sizeof(dmar_pte_t)) +#define DMAR_NPTEPGSHIFT 9 +#define DMAR_PTEMASK (DMAR_NPTEPG - 1) + +typedef struct dmar_root_entry { + uint64_t r1; + uint64_t r2; +} dmar_root_entry_t; +#define DMAR_ROOT_R1_P 1 /* Present */ +#define DMAR_ROOT_R1_CTP_MASK 0xfffffffffffff000 /* Mask for Context-Entry + Table Pointer */ + +#define DMAR_CTX_CNT (DMAR_PAGE_SIZE / sizeof(dmar_root_entry_t)) + +typedef struct dmar_ctx_entry { + uint64_t ctx1; + uint64_t ctx2; +} dmar_ctx_entry_t; +#define DMAR_CTX1_P 1 /* Present */ +#define DMAR_CTX1_FPD 2 /* Fault Processing Disable */ + /* Translation Type: */ +#define DMAR_CTX1_T_UNTR 0 /* only Untranslated */ +#define DMAR_CTX1_T_TR 4 /* both Untranslated + and Translated */ +#define DMAR_CTX1_T_PASS 8 /* Pass-Through */ +#define DMAR_CTX1_ASR_MASK 0xfffffffffffff000 /* Mask for the Address + Space Root */ +#define DMAR_CTX2_AW_2LVL 0 /* 2-level page tables */ +#define DMAR_CTX2_AW_3LVL 1 /* 3-level page tables */ +#define DMAR_CTX2_AW_4LVL 2 /* 4-level page tables */ +#define DMAR_CTX2_AW_5LVL 3 /* 5-level page tables */ +#define DMAR_CTX2_AW_6LVL 4 /* 6-level page tables */ +#define DMAR_CTX2_DID(x) ((x) << 8) /* Domain Identifier */ + +typedef struct dmar_pte { + uint64_t pte; +} dmar_pte_t; +#define DMAR_PTE_R 1 /* Read */ +#define DMAR_PTE_W (1 << 1) /* Write */ +#define DMAR_PTE_SP (1 << 7) /* Super Page */ +#define DMAR_PTE_SNP (1 << 11) /* Snoop Behaviour */ +#define DMAR_PTE_ADDR_MASK 0xffffffffff000 /* Address Mask */ +#define DMAR_PTE_TM (1ULL << 62) /* Transient Mapping */ + +/* Version register */ +#define DMAR_VER_REG 0 +#define DMAR_MAJOR_VER(x) (((x) >> 4) & 0xf) +#define DMAR_MINOR_VER(x) ((x) & 0xf) + +/* Capabilities register */ +#define DMAR_CAP_REG 0x8 +#define DMAR_CAP_DRD (1ULL << 55) /* DMA Read Draining */ +#define DMAR_CAP_DWD (1ULL << 54) /* DMA Write Draining */ +#define DMAR_CAP_MAMV(x) ((u_int)(((x) >> 48) & 0x3f)) + /* Maximum Address Mask */ +#define DMAR_CAP_NFR(x) ((u_int)(((x) >> 40) & 0xff) + 1) + /* Num of Fault-recording regs */ +#define DMAR_CAP_PSI (1ULL << 39) /* Page Selective Invalidation */ +#define DMAR_CAP_SPS(x) ((u_int)(((x) >> 34) & 0xf)) /* Super-Page Support */ +#define DMAR_CAP_SPS_2M 0x1 +#define DMAR_CAP_SPS_1G 0x2 +#define DMAR_CAP_SPS_512G 0x4 +#define DMAR_CAP_SPS_1T 0x8 +#define DMAR_CAP_FRO(x) ((u_int)(((x) >> 24) & 0x1ff)) + /* Fault-recording reg offset */ +#define DMAR_CAP_ISOCH (1 << 23) /* Isochrony */ +#define DMAR_CAP_ZLR (1 << 22) /* Zero-length reads */ +#define DMAR_CAP_MGAW(x) ((u_int)(((x) >> 16) & 0x3f)) + /* Max Guest Address Width */ +#define DMAR_CAP_SAGAW(x) ((u_int)(((x) >> 8) & 0x1f)) + /* Adjusted Guest Address Width */ +#define DMAR_CAP_SAGAW_2LVL 0x01 +#define DMAR_CAP_SAGAW_3LVL 0x02 +#define DMAR_CAP_SAGAW_4LVL 0x04 +#define DMAR_CAP_SAGAW_5LVL 0x08 +#define DMAR_CAP_SAGAW_6LVL 0x10 +#define DMAR_CAP_CM (1 << 7) /* Caching mode */ +#define DMAR_CAP_PHMR (1 << 6) /* Protected High-mem Region */ +#define DMAR_CAP_PLMR (1 << 5) /* Protected Low-mem Region */ +#define DMAR_CAP_RWBF (1 << 4) /* Required Write-Buffer Flushing */ +#define DMAR_CAP_AFL (1 << 3) /* Advanced Fault Logging */ +#define DMAR_CAP_ND(x) ((u_int)((x) & 0x3)) /* Number of domains */ + +/* Extended Capabilities register */ +#define DMAR_ECAP_REG 0x10 +#define DMAR_ECAP_MHMV(x) 
((u_int)(((x) >> 20) & 0xf)) + /* Maximum Handle Mask Value */ +#define DMAR_ECAP_IRO(x) ((u_int)(((x) >> 8) & 0x3ff)) + /* IOTLB Register Offset */ +#define DMAR_ECAP_SC (1 << 7) /* Snoop Control */ +#define DMAR_ECAP_PT (1 << 6) /* Pass Through */ +#define DMAR_ECAP_EIM (1 << 4) /* Extended Interrupt Mode */ +#define DMAR_ECAP_IR (1 << 3) /* Interrupt Remapping */ +#define DMAR_ECAP_DI (1 << 2) /* Device IOTLB */ +#define DMAR_ECAP_QI (1 << 1) /* Queued Invalidation */ +#define DMAR_ECAP_C (1 << 0) /* Coherency */ + +/* Global Command register */ +#define DMAR_GCMD_REG 0x18 +#define DMAR_GCMD_TE (1 << 31) /* Translation Enable */ +#define DMAR_GCMD_SRTP (1 << 30) /* Set Root Table Pointer */ +#define DMAR_GCMD_SFL (1 << 29) /* Set Fault Log */ +#define DMAR_GCMD_EAFL (1 << 28) /* Enable Advanced Fault Logging */ +#define DMAR_GCMD_WBF (1 << 27) /* Write Buffer Flush */ +#define DMAR_GCMD_QIE (1 << 26) /* Queued Invalidation Enable */ +#define DMAR_GCMD_IRE (1 << 25) /* Interrupt Remapping Enable */ +#define DMAR_GCMD_SIRTP (1 << 24) /* Set Interrupt Remap Table Pointer */ +#define DMAR_GCMD_CFI (1 << 23) /* Compatibility Format Interrupt */ + +/* Global Status register */ +#define DMAR_GSTS_REG 0x1c +#define DMAR_GSTS_TES (1 << 31) /* Translation Enable Status */ +#define DMAR_GSTS_RTPS (1 << 30) /* Root Table Pointer Status */ +#define DMAR_GSTS_FLS (1 << 29) /* Fault Log Status */ +#define DMAR_GSTS_AFLS (1 << 28) /* Advanced Fault Logging Status */ +#define DMAR_GSTS_WBFS (1 << 27) /* Write Buffer Flush Status */ +#define DMAR_GSTS_QIES (1 << 26) /* Queued Invalidation Enable Status */ +#define DMAR_GSTS_IRES (1 << 25) /* Interrupt Remapping Enable Status */ +#define DMAR_GSTS_IRTPS (1 << 24) /* Interrupt Remapping Table + Pointer Status */ +#define DMAR_GSTS_CFIS (1 << 23) /* Compatibility Format + Interrupt Status */ + +/* Root-Entry Table Address register */ +#define DMAR_RTADDR_REG 0x20 + +/* Context Command register */ +#define DMAR_CCMD_REG 0x28 +#define DMAR_CCMD_ICC (1ULL << 63) /* Invalidate Context-Cache */ +#define DMAR_CCMD_ICC32 (1 << 31) +#define DMAR_CCMD_CIRG_MASK (0x3ULL << 61) /* Context Invalidation + Request Granularity */ +#define DMAR_CCMD_CIRG_GLOB (0x1ULL << 61) /* Global */ +#define DMAR_CCMD_CIRG_DOM (0x2ULL << 61) /* Domain */ +#define DMAR_CCMD_CIRG_DEV (0x3ULL << 61) /* Device */ +#define DMAR_CCMD_CAIG(x) (((x) >> 59) & 0x3) /* Context Actual + Invalidation Granularity */ +#define DMAR_CCMD_CAIG_GLOB 0x1 /* Global */ +#define DMAR_CCMD_CAIG_DOM 0x2 /* Domain */ +#define DMAR_CCMD_CAIG_DEV 0x3 /* Device */ +#define DMAR_CCMD_FM (0x3UUL << 32) /* Function Mask */ +#define DMAR_CCMD_SID(x) (((x) & 0xffff) << 16) /* Source-ID */ +#define DMAR_CCMD_DID(x) ((x) & 0xffff) /* Domain-ID */ + +/* Invalidate Address register */ +#define DMAR_IVA_REG_OFF 0 +#define DMAR_IVA_IH (1 << 6) /* Invalidation Hint */ +#define DMAR_IVA_AM(x) ((x) & 0x1f) /* Address Mask */ +#define DMAR_IVA_ADDR(x) ((x) & ~0xfffULL) /* Address */ + +/* IOTLB Invalidate register */ +#define DMAR_IOTLB_REG_OFF 0x8 +#define DMAR_IOTLB_IVT (1ULL << 63) /* Invalidate IOTLB */ +#define DMAR_IOTLB_IVT32 (1 << 31) +#define DMAR_IOTLB_IIRG_MASK (0x3ULL << 60) /* Invalidation Request + Granularity */ +#define DMAR_IOTLB_IIRG_GLB (0x1ULL << 60) /* Global */ +#define DMAR_IOTLB_IIRG_DOM (0x2ULL << 60) /* Domain-selective */ +#define DMAR_IOTLB_IIRG_PAGE (0x3ULL << 60) /* Page-selective */ +#define DMAR_IOTLB_IAIG_MASK (0x3ULL << 57) /* Actual Invalidation + Granularity */ +#define DMAR_IOTLB_IAIG_INVLD 0 
/* Hw detected error */ +#define DMAR_IOTLB_IAIG_GLB (0x1ULL << 57) /* Global */ +#define DMAR_IOTLB_IAIG_DOM (0x2ULL << 57) /* Domain-selective */ +#define DMAR_IOTLB_IAIG_PAGE (0x3ULL << 57) /* Page-selective */ +#define DMAR_IOTLB_DR (0x1ULL << 49) /* Drain Reads */ +#define DMAR_IOTLB_DW (0x1ULL << 48) /* Drain Writes */ +#define DMAR_IOTLB_DID(x) (((uint64_t)(x) & 0xffff) << 32) /* Domain Id */ + +/* Fault Status register */ +#define DMAR_FSTS_REG 0x34 +#define DMAR_FSTS_FRI(x) (((x) >> 8) & 0xff) /* Fault Record Index */ +#define DMAR_FSTS_ITE (1 << 6) /* Invalidation Time-out */ +#define DMAR_FSTS_ICE (1 << 5) /* Invalidation Completion */ +#define DMAR_FSTS_IQE (1 << 4) /* Invalidation Queue */ +#define DMAR_FSTS_APF (1 << 3) /* Advanced Pending Fault */ +#define DMAR_FSTS_AFO (1 << 2) /* Advanced Fault Overflow */ +#define DMAR_FSTS_PPF (1 << 1) /* Primary Pending Fault */ +#define DMAR_FSTS_PFO 1 /* Fault Overflow */ + +/* Fault Event Control register */ +#define DMAR_FECTL_REG 0x38 +#define DMAR_FECTL_IM (1 << 31) /* Interrupt Mask */ +#define DMAR_FECTL_IP (1 << 30) /* Interrupt Pending */ + +/* Fault Event Data register */ +#define DMAR_FEDATA_REG 0x3c + +/* Fault Event Address register */ +#define DMAR_FEADDR_REG 0x40 + +/* Fault Event Upper Address register */ +#define DMAR_FEUADDR_REG 0x44 + +/* Advanced Fault Log register */ +#define DMAR_AFLOG_REG 0x58 + +/* Fault Recording Register, also usable for Advanced Fault Log records */ +#define DMAR_FRCD2_F (1ULL << 63) /* Fault */ +#define DMAR_FRCD2_F32 (1 << 31) +#define DMAR_FRCD2_T(x) ((int)((x >> 62) & 1)) /* Type */ +#define DMAR_FRCD2_T_W 0 /* Write request */ +#define DMAR_FRCD2_T_R 1 /* Read or AtomicOp */ +#define DMAR_FRCD2_AT(x) ((int)((x >> 60) & 0x3)) /* Address Type */ +#define DMAR_FRCD2_FR(x) ((int)((x >> 32) & 0xff)) /* Fault Reason */ +#define DMAR_FRCD2_SID(x) ((int)(x & 0xffff)) /* Source Identifier */ +#define DMAR_FRCS1_FI_MASK 0xffffffffff000 /* Fault Info, Address Mask */ + +/* Protected Memory Enable register */ +#define DMAR_PMEN_REG 0x64 +#define DMAR_PMEN_EPM (1 << 31) /* Enable Protected Memory */ +#define DMAR_PMEN_PRS 1 /* Protected Region Status */ + +/* Protected Low-Memory Base register */ +#define DMAR_PLMBASE_REG 0x68 + +/* Protected Low-Memory Limit register */ +#define DMAR_PLMLIMIT_REG 0x6c + +/* Protected High-Memory Base register */ +#define DMAR_PHMBASE_REG 0x70 + +/* Protected High-Memory Limit register */ +#define DMAR_PHMLIMIT_REG 0x78 + +/* Queued Invalidation Descriptors */ +#define DMAR_IQ_DESCR_SZ_SHIFT 4 /* Shift for descriptor count + to ring offset */ +#define DMAR_IQ_DESCR_SZ (1 << DMAR_IQ_DESCR_SZ_SHIFT) + /* Descriptor size */ + +#define DMAR_IQ_DESCR_CTX_INV 0x1 /* Context-cache Invalidate + Descriptor */ +#define DMAR_IQ_DESCR_CTX_GLOB (0x1 << 4) /* Granularity: Global */ +#define DMAR_IQ_DESCR_CTX_DOM (0x2 << 4) /* Granularity: Domain */ +#define DMAR_IQ_DESCR_CTX_DEV (0x3 << 4) /* Granularity: Device */ +#define DMAR_IQ_DESCR_CTX_DID(x) (((uint32_t)(x)) << 16) /* Domain Id */ +#define DMAR_IQ_DESCR_CTX_SRC(x) (((uint64_t)(x)) << 32) /* Source Id */ +#define DMAR_IQ_DESCR_CTX_FM(x) (((uint64_t)(x)) << 48) /* Function Mask */ + +#define DMAR_IQ_DESCR_IOTLB_INV 0x2 /* IOTLB Invalidate Descriptor */ +#define DMAR_IQ_DESCR_IOTLB_GLOB (0x1 << 4) /* Granularity: Global */ +#define DMAR_IQ_DESCR_IOTLB_DOM (0x2 << 4) /* Granularity: Domain */ +#define DMAR_IQ_DESCR_IOTLB_PAGE (0x3 << 4) /* Granularity: Page */ +#define DMAR_IQ_DESCR_IOTLB_DW (1 << 6) /* Drain Writes */ +#define 
DMAR_IQ_DESCR_IOTLB_DR (1 << 7) /* Drain Reads */ +#define DMAR_IQ_DESCR_IOTLB_DID(x) (((uint32_t)(x)) << 16) /* Domain Id */ + +#define DMAR_IQ_DESCR_WAIT_ID 0x5 /* Invalidation Wait Descriptor */ +#define DMAR_IQ_DESCR_WAIT_IF (1 << 4) /* Interrupt Flag */ +#define DMAR_IQ_DESCR_WAIT_SW (1 << 5) /* Status Write */ +#define DMAR_IQ_DESCR_WAIT_FN (1 << 6) /* Fence */ +#define DMAR_IQ_DESCR_WAIT_SD(x) (((uint64_t)(x)) << 32) /* Status Data */ + +/* Invalidation Queue Head register */ +#define DMAR_IQH_REG 0x80 +#define DMAR_IQH_MASK 0x7fff0 /* Next cmd index mask */ + +/* Invalidation Queue Tail register */ +#define DMAR_IQT_REG 0x88 +#define DMAR_IQT_MASK 0x7fff0 + +/* Invalidation Queue Address register */ +#define DMAR_IQA_REG 0x90 +#define DMAR_IQA_IQA_MASK 0xfffffffffffff000 /* Invalidation Queue + Base Address mask */ +#define DMAR_IQA_QS_MASK 0x7 /* Queue Size in pages */ +#define DMAR_IQA_QS_MAX 0x7 /* Max Queue size */ +#define DMAR_IQA_QS_DEF 3 + + /* Invalidation Completion Status register */ +#define DMAR_ICS_REG 0x9c +#define DMAR_ICS_IWC 1 /* Invalidation Wait + Descriptor Complete */ + +/* Invalidation Event Control register */ +#define DMAR_IECTL_REG 0xa0 +#define DMAR_IECTL_IM (1 << 31) /* Interrupt Mask */ +#define DMAR_IECTL_IP (1 << 30) /* Interrupt Pending */ + +/* Invalidation Event Data register */ +#define DMAR_IEDATA_REG 0xa4 + +/* Invalidation Event Address register */ +#define DMAR_IEADDR_REG 0xa8 + +/* Invalidation Event Upper Address register */ +#define DMAR_IEUADDR_REG 0xac + +/* Interrupt Remapping Table Address register */ +#define DMAR_IRTA_REG 0xb8 + +#endif diff --git a/sys/x86/iommu/intel_utils.c b/sys/x86/iommu/intel_utils.c new file mode 100644 index 0000000..d81ec04 --- /dev/null +++ b/sys/x86/iommu/intel_utils.c @@ -0,0 +1,563 @@ +/*- + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memdesc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/rman.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/sf_buf.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> +#include <sys/tree.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_pageout.h> +#include <machine/bus.h> +#include <machine/cpu.h> +#include <x86/include/busdma_impl.h> +#include <x86/iommu/intel_reg.h> +#include <x86/iommu/busdma_dmar.h> +#include <x86/iommu/intel_dmar.h> + +u_int +dmar_nd2mask(u_int nd) +{ + static const u_int masks[] = { + 0x000f, /* nd == 0 */ + 0x002f, /* nd == 1 */ + 0x00ff, /* nd == 2 */ + 0x02ff, /* nd == 3 */ + 0x0fff, /* nd == 4 */ + 0x2fff, /* nd == 5 */ + 0xffff, /* nd == 6 */ + 0x0000, /* nd == 7 reserved */ + }; + + KASSERT(nd <= 6, ("number of domains %d", nd)); + return (masks[nd]); +} + +static const struct sagaw_bits_tag { + int agaw; + int cap; + int awlvl; + int pglvl; +} sagaw_bits[] = { + {.agaw = 30, .cap = DMAR_CAP_SAGAW_2LVL, .awlvl = DMAR_CTX2_AW_2LVL, + .pglvl = 2}, + {.agaw = 39, .cap = DMAR_CAP_SAGAW_3LVL, .awlvl = DMAR_CTX2_AW_3LVL, + .pglvl = 3}, + {.agaw = 48, .cap = DMAR_CAP_SAGAW_4LVL, .awlvl = DMAR_CTX2_AW_4LVL, + .pglvl = 4}, + {.agaw = 57, .cap = DMAR_CAP_SAGAW_5LVL, .awlvl = DMAR_CTX2_AW_5LVL, + .pglvl = 5}, + {.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL, + .pglvl = 6} +}; +#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0])) + +bool +dmar_pglvl_supported(struct dmar_unit *unit, int pglvl) +{ + int i; + + for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + if (sagaw_bits[i].pglvl != pglvl) + continue; + if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0) + return (true); + } + return (false); +} + +int +ctx_set_agaw(struct dmar_ctx *ctx, int mgaw) +{ + int sagaw, i; + + ctx->mgaw = mgaw; + sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap); + for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + if (sagaw_bits[i].agaw >= mgaw) { + ctx->agaw = sagaw_bits[i].agaw; + ctx->pglvl = sagaw_bits[i].pglvl; + ctx->awlvl = sagaw_bits[i].awlvl; + return (0); + } + } + device_printf(ctx->dmar->dev, + "context request mgaw %d for pci%d:%d:%d:%d, " + "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment, ctx->bus, + ctx->slot, ctx->func, sagaw); + return (EINVAL); +} + +/* + * Find a best fit mgaw for the given maxaddr: + * - if allow_less is false, must find sagaw which maps all requested + * addresses (used by identity mappings); + * - if allow_less is true, and no supported sagaw can map all requested + * address space, accept the biggest sagaw, whatever is it. 
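The AGAW selection logic above (ctx_set_agaw() walking sagaw_bits[], and the best-fit policy this comment describes for dmar_maxaddr2mgaw()) can be illustrated with a small standalone sketch. Everything in it is invented for illustration: struct agaw_entry, agaw_tbl[], pick_agaw() and the capability mask in main() only mirror the shape of sagaw_bits[] and DMAR_CAP_SAGAW(); none of it is part of the commit.

/*
 * Standalone illustration of the AGAW selection policy: pick the smallest
 * supported address width that covers maxaddr, optionally falling back to
 * the biggest supported width.  All names and values here are made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct agaw_entry {
	int agaw;		/* guest address width, in bits */
	unsigned int cap;	/* corresponding SAGAW capability bit */
};

static const struct agaw_entry agaw_tbl[] = {
	{ 30, 1u << 0 },
	{ 39, 1u << 1 },
	{ 48, 1u << 2 },
	{ 57, 1u << 3 },
	{ 64, 1u << 4 },
};
#define	AGAW_TBL_SZ	(sizeof(agaw_tbl) / sizeof(agaw_tbl[0]))

static int
pick_agaw(unsigned int sagaw_cap, uint64_t maxaddr, bool allow_less)
{
	size_t i;

	/* Smallest supported width that maps all of [0, maxaddr). */
	for (i = 0; i < AGAW_TBL_SZ; i++) {
		if ((sagaw_cap & agaw_tbl[i].cap) == 0)
			continue;
		if (agaw_tbl[i].agaw >= 64 ||
		    (1ULL << agaw_tbl[i].agaw) >= maxaddr)
			return (agaw_tbl[i].agaw);
	}
	/* Nothing covers maxaddr; optionally take the biggest supported. */
	if (allow_less) {
		for (i = AGAW_TBL_SZ; i > 0; i--) {
			if ((sagaw_cap & agaw_tbl[i - 1].cap) != 0)
				return (agaw_tbl[i - 1].agaw);
		}
	}
	return (-1);
}

int
main(void)
{
	/* Hypothetical unit that reports only 3- and 4-level support. */
	unsigned int cap = (1u << 1) | (1u << 2);

	printf("4 GiB -> agaw %d\n", pick_agaw(cap, 1ULL << 32, false));
	printf("1 PiB -> agaw %d\n", pick_agaw(cap, 1ULL << 50, true));
	printf("1 PiB -> agaw %d (strict)\n",
	    pick_agaw(cap, 1ULL << 50, false));
	return (0);
}

With a unit that only advertises 3- and 4-level tables, a 4 GiB limit selects the 39-bit width, while a 1 PiB limit falls back to 48 bits only when allow_less is set and fails otherwise.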
+ */ +int +dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr, bool allow_less) +{ + int i; + + for (i = 0; i < SIZEOF_SAGAW_BITS; i++) { + if ((1ULL << sagaw_bits[i].agaw) >= maxaddr && + (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0) + break; + } + if (allow_less && i == SIZEOF_SAGAW_BITS) { + do { + i--; + } while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) + == 0); + } + if (i < SIZEOF_SAGAW_BITS) + return (sagaw_bits[i].agaw); + KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d", + (uintmax_t) maxaddr, allow_less)); + return (-1); +} + +/* + * Calculate the total amount of page table pages needed to map the + * whole bus address space on the context with the selected agaw. + */ +vm_pindex_t +pglvl_max_pages(int pglvl) +{ + vm_pindex_t res; + int i; + + for (res = 0, i = pglvl; i > 0; i--) { + res *= DMAR_NPTEPG; + res++; + } + return (res); +} + +/* + * Return true if the page table level lvl supports the superpage for + * the context ctx. + */ +int +ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl) +{ + int alvl, cap_sps; + static const int sagaw_sp[] = { + DMAR_CAP_SPS_2M, + DMAR_CAP_SPS_1G, + DMAR_CAP_SPS_512G, + DMAR_CAP_SPS_1T + }; + + alvl = ctx->pglvl - lvl - 1; + cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap); + return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) && + (sagaw_sp[alvl] & cap_sps) != 0); +} + +dmar_gaddr_t +pglvl_page_size(int total_pglvl, int lvl) +{ + int rlvl; + static const dmar_gaddr_t pg_sz[] = { + (dmar_gaddr_t)DMAR_PAGE_SIZE, + (dmar_gaddr_t)DMAR_PAGE_SIZE << DMAR_NPTEPGSHIFT, + (dmar_gaddr_t)DMAR_PAGE_SIZE << (2 * DMAR_NPTEPGSHIFT), + (dmar_gaddr_t)DMAR_PAGE_SIZE << (3 * DMAR_NPTEPGSHIFT), + (dmar_gaddr_t)DMAR_PAGE_SIZE << (4 * DMAR_NPTEPGSHIFT), + (dmar_gaddr_t)DMAR_PAGE_SIZE << (5 * DMAR_NPTEPGSHIFT) + }; + + KASSERT(lvl >= 0 && lvl < total_pglvl, + ("total %d lvl %d", total_pglvl, lvl)); + rlvl = total_pglvl - lvl - 1; + KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]), + ("sizeof pg_sz lvl %d", lvl)); + return (pg_sz[rlvl]); +} + +dmar_gaddr_t +ctx_page_size(struct dmar_ctx *ctx, int lvl) +{ + + return (pglvl_page_size(ctx->pglvl, lvl)); +} + +int +calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size, + dmar_gaddr_t *isizep) +{ + dmar_gaddr_t isize; + int am; + + for (am = DMAR_CAP_MAMV(unit->hw_cap);; am--) { + isize = 1ULL << (am + DMAR_PAGE_SHIFT); + if ((base & (isize - 1)) == 0 && size >= isize) + break; + if (am == 0) + break; + } + *isizep = isize; + return (am); +} + +dmar_haddr_t dmar_high; +int haw; +int dmar_tbl_pagecnt; + +vm_page_t +dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags) +{ + vm_page_t m; + int zeroed; + + zeroed = (flags & DMAR_PGF_ZERO) != 0 ? 
VM_ALLOC_ZERO : 0; + for (;;) { + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WLOCK(obj); + m = vm_page_lookup(obj, idx); + if ((flags & DMAR_PGF_NOALLOC) != 0 || m != NULL) { + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WUNLOCK(obj); + break; + } + m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY | + VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0, + dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WUNLOCK(obj); + if (m != NULL) { + if (zeroed && (m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + atomic_add_int(&dmar_tbl_pagecnt, 1); + break; + } + if ((flags & DMAR_PGF_WAITOK) == 0) + break; + if ((flags & DMAR_PGF_OBJL) != 0) + VM_OBJECT_WUNLOCK(obj); + VM_WAIT; + if ((flags & DMAR_PGF_OBJL) != 0) + VM_OBJECT_WLOCK(obj); + } + return (m); +} + +void +dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags) +{ + vm_page_t m; + + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WLOCK(obj); + m = vm_page_lookup(obj, idx); + if (m != NULL) { + vm_page_free(m); + atomic_subtract_int(&dmar_tbl_pagecnt, 1); + } + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WUNLOCK(obj); +} + +void * +dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags, + struct sf_buf **sf) +{ + vm_page_t m; + bool allocated; + + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WLOCK(obj); + m = vm_page_lookup(obj, idx); + if (m == NULL && (flags & DMAR_PGF_ALLOC) != 0) { + m = dmar_pgalloc(obj, idx, flags | DMAR_PGF_OBJL); + allocated = true; + } else + allocated = false; + if (m == NULL) { + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WUNLOCK(obj); + return (NULL); + } + /* Sleepable allocations cannot fail. */ + if ((flags & DMAR_PGF_WAITOK) != 0) + VM_OBJECT_WUNLOCK(obj); + sched_pin(); + *sf = sf_buf_alloc(m, SFB_CPUPRIVATE | ((flags & DMAR_PGF_WAITOK) + == 0 ? SFB_NOWAIT : 0)); + if (*sf == NULL) { + sched_unpin(); + if (allocated) { + VM_OBJECT_ASSERT_WLOCKED(obj); + dmar_pgfree(obj, m->pindex, flags | DMAR_PGF_OBJL); + } + if ((flags & DMAR_PGF_OBJL) == 0) + VM_OBJECT_WUNLOCK(obj); + return (NULL); + } + if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) == + (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) + VM_OBJECT_WLOCK(obj); + else if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) == 0) + VM_OBJECT_WUNLOCK(obj); + return ((void *)sf_buf_kva(*sf)); +} + +void +dmar_unmap_pgtbl(struct sf_buf *sf, bool coherent) +{ + vm_page_t m; + + m = sf_buf_page(sf); + sf_buf_free(sf); + sched_unpin(); + + /* + * If DMAR does not snoop paging structures accesses, flush + * CPU cache to memory. + */ + if (!coherent) + pmap_invalidate_cache_pages(&m, 1); +} + +/* + * Load the root entry pointer into the hardware, busily waiting for + * the completion. + */ +int +dmar_load_root_entry_ptr(struct dmar_unit *unit) +{ + vm_page_t root_entry; + + /* + * Access to the GCMD register must be serialized while the + * command is submitted. + */ + DMAR_ASSERT_LOCKED(unit); + + /* VM_OBJECT_RLOCK(unit->ctx_obj); */ + VM_OBJECT_WLOCK(unit->ctx_obj); + root_entry = vm_page_lookup(unit->ctx_obj, 0); + /* VM_OBJECT_RUNLOCK(unit->ctx_obj); */ + VM_OBJECT_WUNLOCK(unit->ctx_obj); + dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry)); + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0) + cpu_spinwait(); + return (0); +} + +/* + * Globally invalidate the context entries cache, busily waiting for + * the completion. 
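dmar_load_root_entry_ptr() above shows the handshake used by every routine in this file that touches the global command register: write the command bit to DMAR_GCMD_REG, then spin with cpu_spinwait() until the matching status bit appears in DMAR_GSTS_REG. The XXXKIB comments note that these loops have no timeout; the fragment below is a minimal sketch of what a bounded variant could look like, written as if it sat in intel_utils.c next to the functions above. The helper name dmar_wait_gsts_sketch() and the roughly one-second budget are invented; DELAY(9) and ETIMEDOUT are standard kernel facilities, not something this commit adds.

/*
 * Sketch only: a bounded version of the GCMD/GSTS handshake.  The
 * committed code simply calls cpu_spinwait() in an unbounded loop.
 */
static int
dmar_wait_gsts_sketch(struct dmar_unit *unit, uint32_t bit, bool set)
{
	uint32_t gsts;
	int i;

	for (i = 0; i < 100000; i++) {
		gsts = dmar_read4(unit, DMAR_GSTS_REG);
		if (((gsts & bit) != 0) == set)
			return (0);
		DELAY(10);	/* 100000 * 10us is roughly one second. */
	}
	return (ETIMEDOUT);
}

With such a helper, the SRTP sequence would write DMAR_GCMD_SRTP exactly as above and then return the result of dmar_wait_gsts_sketch(unit, DMAR_GSTS_RTPS, true), propagating an error instead of spinning forever on broken hardware.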
+ */ +int +dmar_inv_ctx_glob(struct dmar_unit *unit) +{ + + /* + * Access to the CCMD register must be serialized while the + * command is submitted. + */ + DMAR_ASSERT_LOCKED(unit); + KASSERT(!unit->qi_enabled, ("QI enabled")); + + /* + * The DMAR_CCMD_ICC bit in the upper dword should be written + * after the low dword write is completed. Amd64 + * dmar_write8() does not have this issue, i386 dmar_write8() + * writes the upper dword last. + */ + dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0) + cpu_spinwait(); + return (0); +} + +/* + * Globally invalidate the IOTLB, busily waiting for the completion. + */ +int +dmar_inv_iotlb_glob(struct dmar_unit *unit) +{ + int reg; + + DMAR_ASSERT_LOCKED(unit); + KASSERT(!unit->qi_enabled, ("QI enabled")); + + reg = 16 * DMAR_ECAP_IRO(unit->hw_ecap); + /* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. */ + dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT | + DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) & + DMAR_IOTLB_IVT32) != 0) + cpu_spinwait(); + return (0); +} + +/* + * Flush the chipset write buffers. See 11.1 "Write Buffer Flushing" + * in the architecture specification. + */ +int +dmar_flush_write_bufs(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + + /* + * DMAR_GCMD_WBF is only valid when CAP_RWBF is reported. + */ + KASSERT((unit->hw_cap & DMAR_CAP_RWBF) != 0, + ("dmar%d: no RWBF", unit->unit)); + + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0) + cpu_spinwait(); + return (0); +} + +int +dmar_enable_translation(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd |= DMAR_GCMD_TE; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0) + cpu_spinwait(); + return (0); +} + +int +dmar_disable_translation(struct dmar_unit *unit) +{ + + DMAR_ASSERT_LOCKED(unit); + unit->hw_gcmd &= ~DMAR_GCMD_TE; + dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd); + /* XXXKIB should have a timeout */ + while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0) + cpu_spinwait(); + return (0); +} + +#define BARRIER_F \ + u_int f_done, f_inproc, f_wakeup; \ + \ + f_done = 1 << (barrier_id * 3); \ + f_inproc = 1 << (barrier_id * 3 + 1); \ + f_wakeup = 1 << (barrier_id * 3 + 2) + +bool +dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id) +{ + BARRIER_F; + + DMAR_LOCK(dmar); + if ((dmar->barrier_flags & f_done) != 0) { + DMAR_UNLOCK(dmar); + return (false); + } + + if ((dmar->barrier_flags & f_inproc) != 0) { + while ((dmar->barrier_flags & f_inproc) != 0) { + dmar->barrier_flags |= f_wakeup; + msleep(&dmar->barrier_flags, &dmar->lock, 0, + "dmarb", 0); + } + KASSERT((dmar->barrier_flags & f_done) != 0, + ("dmar%d barrier %d missing done", dmar->unit, barrier_id)); + DMAR_UNLOCK(dmar); + return (false); + } + + dmar->barrier_flags |= f_inproc; + DMAR_UNLOCK(dmar); + return (true); +} + +void +dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id) +{ + BARRIER_F; + + DMAR_ASSERT_LOCKED(dmar); + KASSERT((dmar->barrier_flags & (f_done | f_inproc)) == f_inproc, + ("dmar%d barrier %d missed entry", dmar->unit, barrier_id)); + dmar->barrier_flags |= f_done; + if 
((dmar->barrier_flags & f_wakeup) != 0) + wakeup(&dmar->barrier_flags); + dmar->barrier_flags &= ~(f_inproc | f_wakeup); + DMAR_UNLOCK(dmar); +} + +int dmar_match_verbose; + +static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, + ""); +SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN, + &dmar_tbl_pagecnt, 0, + "Count of pages used for DMAR pagetables"); +SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN, + &dmar_match_verbose, 0, + "Verbose matching of the PCI devices to DMAR paths"); +#ifdef INVARIANTS +int dmar_check_free; +SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN, + &dmar_check_free, 0, + "Check the GPA RBtree for free_down and free_after validity"); +#endif +
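The pair dmar_barrier_enter()/dmar_barrier_exit() above implements a per-unit execute-once gate: the first caller claims the barrier and runs the one-time work, later callers sleep on barrier_flags until completion is published and then see false from the enter routine. A minimal usage sketch follows; DMAR_BARRIER_EXAMPLE and dmar_init_once_sketch() are hypothetical, and only the locking protocol (enter returns with the unit unlocked, exit expects DMAR_LOCK() to be held) mirrors the code in this file.

/*
 * Usage sketch for the barrier pair above.  The barrier id and the
 * function are invented; the one-time work itself is elided.
 */
#define	DMAR_BARRIER_EXAMPLE	0	/* hypothetical barrier id */

static void
dmar_init_once_sketch(struct dmar_unit *dmar)
{

	if (!dmar_barrier_enter(dmar, DMAR_BARRIER_EXAMPLE))
		return;		/* Another thread already completed it. */

	/* One-time initialization would go here, unlocked. */

	DMAR_LOCK(dmar);	/* dmar_barrier_exit() asserts the lock. */
	dmar_barrier_exit(dmar, DMAR_BARRIER_EXAMPLE);
}

The DMAR lock is dropped while the one-time work runs, so concurrent callers block in msleep() rather than spinning on the unit lock.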
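Returning to the queued-invalidation descriptor macros added in intel_reg.h, their encoding is easiest to see in use. The sketch below assembles an Invalidation Wait Descriptor, the two 64-bit-word format whose second word carries the physical address for the status write; qi_emit_wait_sketch() and its arguments are invented for illustration, and actually placing the descriptor into the invalidation queue and advancing DMAR_IQT_REG is left out.

/*
 * Sketch: assembling an Invalidation Wait Descriptor from the
 * DMAR_IQ_DESCR_WAIT_* macros.  The function and its callers are
 * hypothetical.
 */
static void
qi_emit_wait_sketch(uint64_t *slot, uint32_t seq, vm_paddr_t status_pa)
{

	/* Word 0: type 0x5, request a status write, carry "seq" as data. */
	slot[0] = DMAR_IQ_DESCR_WAIT_ID | DMAR_IQ_DESCR_WAIT_SW |
	    DMAR_IQ_DESCR_WAIT_SD(seq);
	/* Word 1: physical address (4-byte aligned) where "seq" lands. */
	slot[1] = status_pa;
}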