Diffstat (limited to 'arch/powerpc')
-rw-r--r--  arch/powerpc/include/asm/dma-mapping.h    |   1
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile   |   2
-rw-r--r--  arch/powerpc/platforms/powernv/pci-dma.c  | 319
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 102
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h      |   7
5 files changed, 381 insertions, 50 deletions
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa3945..354f435 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
 extern struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_nommu_ops;
+extern const struct dma_map_ops dma_pseudo_bypass_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 703a350..2467bda 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
 
 obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-dma.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH) += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
new file mode 100644
index 0000000..1d5409b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -0,0 +1,319 @@
+/*
+ * DMA operations supporting pseudo-bypass for PHB3+
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2018 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/hash.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/tce.h>
+
+#include "pci.h"
+
+/* select and allocate a TCE using the bitmap */
+static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr)
+{
+	int tce;
+	__be64 old, new;
+
+	spin_lock(&pe->tce_alloc_lock);
+	tce = bitmap_find_next_zero_area(pe->tce_bitmap,
+					 pe->tce_count,
+					 0,
+					 1,
+					 0);
+	bitmap_set(pe->tce_bitmap, tce, 1);
+	old = pe->tces[tce];
+	new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+	pe->tces[tce] = new;
+	pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
+		tce, new, old);
+	spin_unlock(&pe->tce_alloc_lock);
+
+	return tce;
+}
+
+/*
+ * The tracking table for assigning TCEs has two entries per TCE.
+ * - @entry1 contains the physical address and the smallest bit indicates
+ *   if it's currently valid.
+ * - @entry2 contains the DMA address returned in the upper 34 bits, and a
+ *   refcount in the lower 30 bits.
+ */
+static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
+						phys_addr_t addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, tce, ret;
+	u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	/* look through the tracking table for a free entry */
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+
+		/* if the address is the same and the entry is valid */
+		if (entry1 == ((addr - offset) | 1)) {
+			/* all we need to do here is increment the refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, entry2 + 1);
+			if (ret != entry2) {
+				/* conflict, start looking again just in case */
+				i--;
+				continue;
+			}
+			return (dma_prefix << phb->ioda.max_tce_order) | offset;
+		/* if the entry is invalid then we want to replace it */
+		} else if (!(entry1 & 1)) {
+			/* set the real address, note that it isn't valid yet */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      entry1, (addr - offset));
+			if (ret != entry1) {
+				/* conflict, start looking again */
+				i--;
+				continue;
+			}
+
+			/* now we can allocate a TCE */
+			tce = dma_pseudo_bypass_select_tce(pe, addr - offset);
+
+			/* set new value, including TCE index and new refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, tce << 34 | 1);
+			if (ret != entry2) {
+				/*
+				 * XXX In this case we need to throw out
+				 * everything, including the TCE we just
+				 * allocated. For now, just leave it.
+				 */
+				i--;
+				continue;
+			}
+
+			/* now set the valid bit */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      (addr - offset), (addr - offset) | 1);
+			if (ret != (addr - offset)) {
+				/*
+				 * XXX Same situation as above. We'd probably
+				 * want to null out entry2 as well.
+				 */
+				i--;
+				continue;
+			}
+			return (tce << phb->ioda.max_tce_order) | offset;
+		/* it's a valid entry but not ours, keep looking */
+		} else {
+			continue;
+		}
+	}
+	/* If we get here, the table must be full, so error out. */
+	return -1ULL;
+}
+
+/*
+ * For the moment, unmapping just decrements the refcount and doesn't actually
+ * remove the TCE. This is because it's very likely that a previously allocated
+ * TCE will be used again, and this saves having to invalidate it.
+ *
+ * TODO implement some kind of garbage collection that clears unused TCE entries
+ * once the table reaches a certain size.
+ */
+static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, refcount;
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+		refcount = entry2 & ((1 << 30) - 1);
+
+		/* look through entry2 until we find our address */
+		if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) {
+			refcount--;
+			cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2,
+				(dma_prefix << 34) | refcount);
+			if (!refcount) {
+				/*
+				 * Here is where we would remove the valid bit
+				 * from entry1, clear the entry in the TCE table
+				 * and invalidate the TCE - but we want to leave
+				 * them until the table fills up (for now).
+				 */
+			}
+			break;
+		}
+	}
+}
+
+static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * Normally dma_supported() checks if the mask is capable of addressing
+	 * all of memory. Since we map physical memory in chunks that the
+	 * device can address, the device will be able to address whatever it
+	 * wants - just not all at once.
+	 */
+	return 1;
+}
+
+static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
+					      size_t size,
+					      dma_addr_t *dma_handle,
+					      gfp_t flag,
+					      unsigned long attrs)
+{
+	void *ret;
+	struct page *page;
+	int node = dev_to_node(dev);
+
+	/* ignore region specifiers */
+	flag &= ~(__GFP_HIGHMEM);
+
+	page = alloc_pages_node(node, flag, get_order(size));
+	if (page == NULL)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+	*dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
+
+	return ret;
+}
+
+static void dma_pseudo_bypass_free_coherent(struct device *dev,
+					    size_t size,
+					    void *vaddr,
+					    dma_addr_t dma_handle,
+					    unsigned long attrs)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
+					   struct vm_area_struct *vma,
+					   void *cpu_addr,
+					   dma_addr_t handle,
+					   size_t size,
+					   unsigned long attrs)
+{
+	unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
+						    struct page *page,
+						    unsigned long offset,
+						    size_t size,
+						    enum dma_data_direction dir,
+						    unsigned long attrs)
+{
+	BUG_ON(dir == DMA_NONE);
+
+	/* XXX I don't know if this is necessary (or even desired) */
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
+	return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
+}
+
+static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
+						dma_addr_t dma_address,
+						size_t size,
+						enum dma_data_direction direction,
+						unsigned long attrs)
+{
+	dma_pseudo_bypass_unmap_address(dev, dma_address);
+}
+
+static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
+				    int nents, enum dma_data_direction direction,
+				    unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
+		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+	}
+
+	return nents;
+}
+
+static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
+				       int nents, enum dma_data_direction direction,
+				       unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
+	}
+}
+
+static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
+{
+	/*
+	 * there's no limitation on our end, the driver should just call
+	 * set_mask() with as many bits as the device can address.
+	 */
+	return -1ULL;
+}
+
+static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == -1ULL;
+}
+
+const struct dma_map_ops dma_pseudo_bypass_ops = {
+	.alloc = dma_pseudo_bypass_alloc_coherent,
+	.free = dma_pseudo_bypass_free_coherent,
+	.mmap = dma_pseudo_bypass_mmap_coherent,
+	.map_sg = dma_pseudo_bypass_map_sg,
+	.unmap_sg = dma_pseudo_bypass_unmap_sg,
+	.dma_supported = dma_pseudo_bypass_dma_supported,
+	.map_page = dma_pseudo_bypass_map_page,
+	.unmap_page = dma_pseudo_bypass_unmap_page,
+	.get_required_mask = dma_pseudo_bypass_get_required_mask,
+	.mapping_error = dma_pseudo_bypass_mapping_error,
+};
+EXPORT_SYMBOL(dma_pseudo_bypass_ops);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bcb3bfc..7ecc186 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/iommu.h>
 #include <linux/rculist.h>
 #include <linux/sizes.h>
+#include <linux/vmalloc.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1088,6 +1089,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	pe->pbus = NULL;
 	pe->mve_number = -1;
 	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->tces = NULL;
+	pe->tce_tracker = NULL;
+	pe->tce_bitmap = NULL;
 
 	pe_info(pe, "Associated device to PE\n");
 
@@ -1569,6 +1573,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->mve_number = -1;
 		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
 			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->tces = NULL;
+		pe->tce_tracker = NULL;
+		pe->tce_bitmap = NULL;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			hose->global_number, pdev->bus->number,
@@ -1774,43 +1781,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
 	return true;
 }
 
-/*
- * Reconfigure TVE#0 to be usable as 64-bit DMA space.
- *
- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
- * Devices can only access more than that if bit 59 of the PCI address is set
- * by hardware, which indicates TVE#1 should be used instead of TVE#0.
- * Many PCI devices are not capable of addressing that many bits, and as a
- * result are limited to the 4GB of virtual memory made available to 32-bit
- * devices in TVE#0.
- *
- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
- * devices by configuring the virtual memory past the first 4GB inaccessible
- * by 64-bit DMAs. This should only be used by devices that want more than
- * 4GB, and only on PEs that have no 32-bit devices.
- *
- * Currently this will only work on PHB3 (POWER8).
- */
-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 {
-	u64 window_size, table_size, tce_count, addr;
+	u64 tce_count, table_size, window_size;
+	struct pnv_phb *p = pe->phb;
 	struct page *table_pages;
-	u64 tce_order = 28; /* 256MB TCEs */
 	__be64 *tces;
-	s64 rc;
+	int rc = -ENOMEM;
+	int bitmap_size, tracker_entries;
+
+	/*
+	 * XXX These are factors for scaling the size of the TCE table, and
+	 * the table that tracks these allocations. These should eventually
+	 * be kernel command line options with defaults above 1, for situations
+	 * where your memory expands after the machine has booted.
+	 */
+	int tce_size_factor = 1;
+	int tracking_table_factor = 1;
 
 	/*
-	 * Window size needs to be a power of two, but needs to account for
-	 * shifting memory by the 4GB offset required to skip 32bit space.
+	 * The window size covers all of memory (and optionally more), with
+	 * enough tracker entries to cover them all being allocated. So we
+	 * create enough TCEs to cover all of memory at once.
 	 */
-	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
-	tce_count = window_size >> tce_order;
+	window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max());
+	tracker_entries = (tracking_table_factor * memory_hotplug_max()) >>
+			  p->ioda.max_tce_order;
+	tce_count = window_size >> p->ioda.max_tce_order;
+	bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long);
 	table_size = tce_count << 3;
 
 	if (table_size < PAGE_SIZE)
 		table_size = PAGE_SIZE;
 
-	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+	table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
 				       get_order(table_size));
 	if (!table_pages)
 		goto err;
@@ -1821,26 +1825,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 	memset(tces, 0, table_size);
 
-	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
-		tces[(addr + (1ULL << 32)) >> tce_order] =
-			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
-	}
+	pe->tces = tces;
+	pe->tce_count = tce_count;
+	pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	/* The tracking table has two u64s per TCE */
+	pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries);
+	spin_lock_init(&pe->tce_alloc_lock);
+
+	/* mark the first 4GB as reserved so this can still be used for 32bit */
+	bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
+
+	pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
+		tracker_entries, bitmap_size, tce_count);
 
 	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
 					pe->pe_number,
-					/* reconfigure window 0 */
 					(pe->pe_number << 1) + 0,
 					1,
 					__pa(tces),
 					table_size,
-					1 << tce_order);
+					1 << p->ioda.max_tce_order);
 	if (rc == OPAL_SUCCESS) {
-		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+		pe_info(pe, "TCE tables configured for pseudo-bypass\n");
 		return 0;
 	}
 
 err:
-	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
-	return -EIO;
+	pe_err(pe, "error configuring pseudo-bypass\n");
+	return rc;
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1851,7 +1862,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
 	bool bypass = false;
-	s64 rc;
 
 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
 		return -ENODEV;
@@ -1868,21 +1878,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	} else {
 		/*
 		 * If the device can't set the TCE bypass bit but still wants
-		 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-		 * bypass the 32-bit region and be usable for 64-bit DMAs.
-		 * The device needs to be able to address all of this space.
+		 * to access 4GB or more, we need to use a different set of DMA
+		 * operations with an indirect mapping.
 		 */
 		if (dma_mask >> 32 &&
-		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
-			/* Configure the bypass mode */
-			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-			if (rc)
-				return rc;
-			/* 4GB offset bypasses 32-bit space */
-			set_dma_offset(&pdev->dev, (1ULL << 32));
-			set_dma_ops(&pdev->dev, &dma_nommu_ops);
+		    phb->model != PNV_PHB_MODEL_P7IOC &&
+		    pnv_pci_ioda_pe_single_vendor(pe)) {
+			if (!pe->tces)
+				pnv_pci_pseudo_bypass_setup(pe);
+			set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
 		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
 			/*
 			 * Fail the request if a DMA mask between 32 and 64 bits
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index c9952de..83492ab 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -70,6 +70,13 @@ struct pnv_ioda_pe {
 	bool			tce_bypass_enabled;
 	uint64_t		tce_bypass_base;
 
+	/* TCE tables for DMA pseudo-bypass */
+	__be64			*tces;
+	u64			tce_count;
+	unsigned long		*tce_bitmap;
+	u64			*tce_tracker;	// 2 u64s per TCE
+	spinlock_t		tce_alloc_lock;
+
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
 	 * PE number)
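
The tracking-table scheme in pci-dma.c is compact but subtle, so the stand-alone sketch below models the two-u64 entry layout and the cmpxchg claim sequence of dma_pseudo_bypass_get_address() in plain C. This is a minimal sketch, not the patch itself: it assumes a fixed 24-bit TCE order, a trivial sequential counter in place of the patch's bitmap allocator and spinlock, and GCC's __sync_val_compare_and_swap() standing in for the kernel's cmpxchg(); all names and sizes here are illustrative.

/*
 * Model of the tracking table: two u64s per TCE.
 * entry1 = physical chunk base, low bit = valid.
 * entry2 = DMA prefix (TCE index) in bits 63:34, refcount in bits 29:0.
 */
#include <stdint.h>
#include <stdio.h>

#define TCE_ORDER	24	/* 16MB chunks, for illustration only */
#define NUM_TCES	8

static uint64_t tracker[NUM_TCES * 2];	/* [i*2] = entry1, [i*2+1] = entry2 */
static int next_tce;			/* stand-in for the bitmap allocator */

static uint64_t claim(uint64_t phys)
{
	uint64_t base = phys & ~((1ULL << TCE_ORDER) - 1);
	uint64_t offset = phys & ((1ULL << TCE_ORDER) - 1);
	uint64_t e1, e2, tce, ret;
	int i;

	for (i = 0; i < NUM_TCES; i++) {
		e1 = tracker[i * 2];
		e2 = tracker[i * 2 + 1];

		if (e1 == (base | 1)) {
			/* chunk already mapped and valid: take a reference */
			ret = __sync_val_compare_and_swap(&tracker[i * 2 + 1],
							  e2, e2 + 1);
			if (ret != e2) {	/* lost a race, retry this slot */
				i--;
				continue;
			}
			return ((e2 >> 34) << TCE_ORDER) | offset;
		} else if (!(e1 & 1)) {
			/* free slot: publish the address, valid bit still clear */
			ret = __sync_val_compare_and_swap(&tracker[i * 2],
							  e1, base);
			if (ret != e1) {
				i--;
				continue;
			}
			tce = (uint64_t)next_tce++;	/* the patch uses the TCE bitmap here */
			/* store the DMA prefix with an initial refcount of 1;
			 * the patch checks these for conflicts too, and leaves
			 * rollback of a half-claimed entry as an XXX */
			__sync_val_compare_and_swap(&tracker[i * 2 + 1],
						    e2, (tce << 34) | 1);
			/* finally set the valid bit so other mappers can share it */
			__sync_val_compare_and_swap(&tracker[i * 2],
						    base, base | 1);
			return (tce << TCE_ORDER) | offset;
		}
		/* valid but someone else's chunk: keep scanning */
	}
	return (uint64_t)-1;	/* table full: the mapping_error case */
}

int main(void)
{
	uint64_t a = claim(0x12345678);	/* new chunk: TCE 0 */
	uint64_t b = claim(0x12345678);	/* same 16MB chunk: refcount bumps */
	uint64_t c = claim(0x87654321);	/* different chunk: TCE 1 */

	printf("a=%#llx b=%#llx c=%#llx\n",
	       (unsigned long long)a, (unsigned long long)b,
	       (unsigned long long)c);
	return 0;
}

Note the publication order the sketch preserves: the physical address is written first with the valid bit clear, so a concurrent mapper either races on the first cmpxchg of a free slot or sees a not-yet-valid entry and keeps scanning; only once entry2 carries the TCE index and refcount does the valid bit go in. This is also why unmapping can get away with a bare refcount decrement, as the patch's dma_pseudo_bypass_unmap_address() does.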