Diffstat (limited to 'arch/powerpc')
-rw-r--r--  arch/powerpc/include/asm/dma-mapping.h    |   1
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile   |   2
-rw-r--r--  arch/powerpc/platforms/powernv/pci-dma.c  | 319
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 102
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h      |   7
5 files changed, 381 insertions, 50 deletions
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa3945..354f435 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
 extern struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_nommu_ops;
+extern const struct dma_map_ops dma_pseudo_bypass_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 703a350..2467bda 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
 
 obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-dma.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH) += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
new file mode 100644
index 0000000..1d5409b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -0,0 +1,319 @@
+/*
+ * DMA operations supporting pseudo-bypass for PHB3+
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2018 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/hash.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/tce.h>
+
+#include "pci.h"
+
+/* select and allocate a TCE using the bitmap */
+static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr)
+{
+	int tce;
+	__be64 old, new;
+
+	spin_lock(&pe->tce_alloc_lock);
+	tce = bitmap_find_next_zero_area(pe->tce_bitmap,
+					 pe->tce_count,
+					 0,
+					 1,
+					 0);
+	bitmap_set(pe->tce_bitmap, tce, 1);
+	old = pe->tces[tce];
+	new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+	pe->tces[tce] = new;
+	pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
+		tce, new, old);
+	spin_unlock(&pe->tce_alloc_lock);
+
+	return tce;
+}
+
+/*
+ * The tracking table for assigning TCEs has two entries per TCE.
+ * - @entry1 contains the physical address and the smallest bit indicates
+ *   if it's currently valid.
+ * - @entry2 contains the DMA address returned in the upper 34 bits, and a
+ *   refcount in the lower 30 bits.
+ */
+static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
+						phys_addr_t addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, tce, ret;
+	u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	/* look through the tracking table for a free entry */
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+
+		/* if the address is the same and the entry is valid */
+		if (entry1 == ((addr - offset) | 1)) {
+			/* all we need to do here is increment the refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, entry2 + 1);
+			if (ret != entry2) {
+				/* conflict, start looking again just in case */
+				i--;
+				continue;
+			}
+			return (dma_prefix << phb->ioda.max_tce_order) | offset;
+		/* if the entry is invalid then we want to replace it */
+		} else if (!(entry1 & 1)) {
+			/* set the real address, note that it isn't valid yet */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      entry1, (addr - offset));
+			if (ret != entry1) {
+				/* conflict, start looking again */
+				i--;
+				continue;
+			}
+
+			/* now we can allocate a TCE */
+			tce = dma_pseudo_bypass_select_tce(pe, addr - offset);
+
+			/* set new value, including TCE index and new refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, tce << 34 | 1);
+			if (ret != entry2) {
+				/*
+				 * XXX In this case we need to throw out
+				 * everything, including the TCE we just
+				 * allocated. For now, just leave it.
+				 */
+				i--;
+				continue;
+			}
+
+			/* now set the valid bit */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      (addr - offset), (addr - offset) | 1);
+			if (ret != (addr - offset)) {
+				/*
+				 * XXX Same situation as above. We'd probably
+				 * want to null out entry2 as well.
+				 */
+				i--;
+				continue;
+			}
+			return (tce << phb->ioda.max_tce_order) | offset;
+		/* it's a valid entry but not ours, keep looking */
+		} else {
+			continue;
+		}
+	}
+	/* If we get here, the table must be full, so error out. */
+	return -1ULL;
+}
+
+/*
+ * For the moment, unmapping just decrements the refcount and doesn't actually
+ * remove the TCE. This is because it's very likely that a previously allocated
+ * TCE will be used again, and this saves having to invalidate it.
+ *
+ * TODO implement some kind of garbage collection that clears unused TCE entries
+ * once the table reaches a certain size.
+ */
+static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, refcount;
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+		refcount = entry2 & ((1 << 30) - 1);
+
+		/* look through entry2 until we find our address */
+		if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) {
+			refcount--;
+			cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2,
+				(dma_prefix << 34) | refcount);
+			if (!refcount) {
+				/*
+				 * Here is where we would remove the valid bit
+				 * from entry1, clear the entry in the TCE table
+				 * and invalidate the TCE - but we want to leave
+				 * them until the table fills up (for now).
+				 */
+			}
+			break;
+		}
+	}
+}
+
+static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * Normally dma_supported() checks if the mask is capable of addressing
+	 * all of memory. Since we map physical memory in chunks that the
+	 * device can address, the device will be able to address whatever it
+	 * wants - just not all at once.
+	 */
+	return 1;
+}
+
+static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
+					      size_t size,
+					      dma_addr_t *dma_handle,
+					      gfp_t flag,
+					      unsigned long attrs)
+{
+	void *ret;
+	struct page *page;
+	int node = dev_to_node(dev);
+
+	/* ignore region specifiers */
+	flag &= ~(__GFP_HIGHMEM);
+
+	page = alloc_pages_node(node, flag, get_order(size));
+	if (page == NULL)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+	*dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
+
+	return ret;
+}
+
+static void dma_pseudo_bypass_free_coherent(struct device *dev,
+					    size_t size,
+					    void *vaddr,
+					    dma_addr_t dma_handle,
+					    unsigned long attrs)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
+					   struct vm_area_struct *vma,
+					   void *cpu_addr,
+					   dma_addr_t handle,
+					   size_t size,
+					   unsigned long attrs)
+{
+	unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
+						    struct page *page,
+						    unsigned long offset,
+						    size_t size,
+						    enum dma_data_direction dir,
+						    unsigned long attrs)
+{
+	BUG_ON(dir == DMA_NONE);
+
+	/* XXX I don't know if this is necessary (or even desired) */
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
+	return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
+}
+
+static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
+						dma_addr_t dma_address,
+						size_t size,
+						enum dma_data_direction direction,
+						unsigned long attrs)
+{
+	dma_pseudo_bypass_unmap_address(dev, dma_address);
+}
+
+static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
+				    int nents, enum dma_data_direction direction,
+				    unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
+		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+	}
+
+	return nents;
+}
+
+static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
+				       int nents, enum dma_data_direction direction,
+				       unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
+	}
+}
+
+static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
+{
+	/*
+	 * there's no limitation on our end, the driver should just call
+	 * set_mask() with as many bits as the device can address.
+	 */
+	return -1ULL;
+}
+
+static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == -1ULL;
+}
+
+const struct dma_map_ops dma_pseudo_bypass_ops = {
+	.alloc = dma_pseudo_bypass_alloc_coherent,
+	.free = dma_pseudo_bypass_free_coherent,
+	.mmap = dma_pseudo_bypass_mmap_coherent,
+	.map_sg = dma_pseudo_bypass_map_sg,
+	.unmap_sg = dma_pseudo_bypass_unmap_sg,
+	.dma_supported = dma_pseudo_bypass_dma_supported,
+	.map_page = dma_pseudo_bypass_map_page,
+	.unmap_page = dma_pseudo_bypass_unmap_page,
+	.get_required_mask = dma_pseudo_bypass_get_required_mask,
+	.mapping_error = dma_pseudo_bypass_mapping_error,
+};
+EXPORT_SYMBOL(dma_pseudo_bypass_ops);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bcb3bfc..7ecc186 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/iommu.h>
 #include <linux/rculist.h>
 #include <linux/sizes.h>
+#include <linux/vmalloc.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1088,6 +1089,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	pe->pbus = NULL;
 	pe->mve_number = -1;
 	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->tces = NULL;
+	pe->tce_tracker = NULL;
+	pe->tce_bitmap = NULL;
 
 	pe_info(pe, "Associated device to PE\n");
 
@@ -1569,6 +1573,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->mve_number = -1;
 		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
 			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->tces = NULL;
+		pe->tce_tracker = NULL;
+		pe->tce_bitmap = NULL;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			hose->global_number, pdev->bus->number,
@@ -1774,43 +1781,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
 	return true;
 }
 
-/*
- * Reconfigure TVE#0 to be usable as 64-bit DMA space.
- *
- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
- * Devices can only access more than that if bit 59 of the PCI address is set
- * by hardware, which indicates TVE#1 should be used instead of TVE#0.
- * Many PCI devices are not capable of addressing that many bits, and as a
- * result are limited to the 4GB of virtual memory made available to 32-bit
- * devices in TVE#0.
- *
- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
- * devices by configuring the virtual memory past the first 4GB inaccessible
- * by 64-bit DMAs. This should only be used by devices that want more than
- * 4GB, and only on PEs that have no 32-bit devices.
- *
- * Currently this will only work on PHB3 (POWER8).
- */
-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 {
-	u64 window_size, table_size, tce_count, addr;
+	u64 tce_count, table_size, window_size;
+	struct pnv_phb *p = pe->phb;
 	struct page *table_pages;
-	u64 tce_order = 28; /* 256MB TCEs */
 	__be64 *tces;
-	s64 rc;
+	int rc = -ENOMEM;
+	int bitmap_size, tracker_entries;
+
+	/*
+	 * XXX These are factors for scaling the size of the TCE table, and
+	 * the table that tracks these allocations. These should eventually
+	 * be kernel command line options with defaults above 1, for situations
+	 * where your memory expands after the machine has booted.
+	 */
+	int tce_size_factor = 1;
+	int tracking_table_factor = 1;
 
 	/*
-	 * Window size needs to be a power of two, but needs to account for
-	 * shifting memory by the 4GB offset required to skip 32bit space.
+	 * The window size covers all of memory (and optionally more), with
+	 * enough tracker entries to cover them all being allocated. So we
+	 * create enough TCEs to cover all of memory at once.
 	 */
-	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
-	tce_count = window_size >> tce_order;
+	window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max());
+	tracker_entries = (tracking_table_factor * memory_hotplug_max()) >>
+			  p->ioda.max_tce_order;
+	tce_count = window_size >> p->ioda.max_tce_order;
+	bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long);
 	table_size = tce_count << 3;
 
 	if (table_size < PAGE_SIZE)
 		table_size = PAGE_SIZE;
 
-	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+	table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
 				       get_order(table_size));
 	if (!table_pages)
 		goto err;
@@ -1821,26 +1825,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 	memset(tces, 0, table_size);
 
-	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
-		tces[(addr + (1ULL << 32)) >> tce_order] =
-			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
-	}
+	pe->tces = tces;
+	pe->tce_count = tce_count;
+	pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	/* The tracking table has two u64s per TCE */
+	pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries);
+	spin_lock_init(&pe->tce_alloc_lock);
+
+	/* mark the first 4GB as reserved so this can still be used for 32bit */
+	bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
+
+	pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
+		tracker_entries, bitmap_size, tce_count);
 
 	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
 					pe->pe_number,
-					/* reconfigure window 0 */
 					(pe->pe_number << 1) + 0,
 					1,
 					__pa(tces),
 					table_size,
-					1 << tce_order);
+					1 << p->ioda.max_tce_order);
 	if (rc == OPAL_SUCCESS) {
-		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+		pe_info(pe, "TCE tables configured for pseudo-bypass\n");
 		return 0;
 	}
 
 err:
-	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
-	return -EIO;
+	pe_err(pe, "error configuring pseudo-bypass\n");
+	return rc;
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1851,7 +1862,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
 	bool bypass = false;
-	s64 rc;
 
 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
 		return -ENODEV;
@@ -1868,21 +1878,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	} else {
 		/*
 		 * If the device can't set the TCE bypass bit but still wants
-		 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-		 * bypass the 32-bit region and be usable for 64-bit DMAs.
-		 * The device needs to be able to address all of this space.
+		 * to access 4GB or more, we need to use a different set of DMA
+		 * operations with an indirect mapping.
 		 */
 		if (dma_mask >> 32 &&
-		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
-			/* Configure the bypass mode */
-			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-			if (rc)
-				return rc;
-			/* 4GB offset bypasses 32-bit space */
-			set_dma_offset(&pdev->dev, (1ULL << 32));
-			set_dma_ops(&pdev->dev, &dma_nommu_ops);
+		    phb->model != PNV_PHB_MODEL_P7IOC &&
+		    pnv_pci_ioda_pe_single_vendor(pe)) {
+			if (!pe->tces)
+				pnv_pci_pseudo_bypass_setup(pe);
+			set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
 		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
 			/*
 			 * Fail the request if a DMA mask between 32 and 64 bits
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index c9952de..83492ab 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -70,6 +70,13 @@ struct pnv_ioda_pe {
 	bool			tce_bypass_enabled;
 	uint64_t		tce_bypass_base;
 
+	/* TCE tables for DMA pseudo-bypass */
+	__be64			*tces;
+	u64			tce_count;
+	unsigned long		*tce_bitmap;
+	u64			*tce_tracker;	// 2 u64s per TCE
+	spinlock_t		tce_alloc_lock;
+
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
 	 * PE number)
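
The tracking-table scheme in pci-dma.c is compact but subtle, so the stand-alone sketch below models the two-u64 entry layout and the cmpxchg claim sequence of dma_pseudo_bypass_get_address() in plain C. This is a minimal sketch, not the patch itself: it assumes a fixed 24-bit TCE order, a trivial sequential counter in place of the patch's bitmap allocator and spinlock, and GCC's __sync_val_compare_and_swap() standing in for the kernel's cmpxchg(); all names and sizes here are illustrative.

/*
 * Model of the tracking table: two u64s per TCE.
 * entry1 = physical chunk base, low bit = valid.
 * entry2 = DMA prefix (TCE index) in bits 63:34, refcount in bits 29:0.
 */
#include <stdint.h>
#include <stdio.h>

#define TCE_ORDER	24	/* 16MB chunks, for illustration only */
#define NUM_TCES	8

static uint64_t tracker[NUM_TCES * 2];	/* [i*2] = entry1, [i*2+1] = entry2 */
static int next_tce;			/* stand-in for the bitmap allocator */

static uint64_t claim(uint64_t phys)
{
	uint64_t base = phys & ~((1ULL << TCE_ORDER) - 1);
	uint64_t offset = phys & ((1ULL << TCE_ORDER) - 1);
	uint64_t e1, e2, tce, ret;
	int i;

	for (i = 0; i < NUM_TCES; i++) {
		e1 = tracker[i * 2];
		e2 = tracker[i * 2 + 1];

		if (e1 == (base | 1)) {
			/* chunk already mapped and valid: take a reference */
			ret = __sync_val_compare_and_swap(&tracker[i * 2 + 1],
							  e2, e2 + 1);
			if (ret != e2) {	/* lost a race, retry this slot */
				i--;
				continue;
			}
			return ((e2 >> 34) << TCE_ORDER) | offset;
		} else if (!(e1 & 1)) {
			/* free slot: publish the address, valid bit still clear */
			ret = __sync_val_compare_and_swap(&tracker[i * 2],
							  e1, base);
			if (ret != e1) {
				i--;
				continue;
			}
			tce = (uint64_t)next_tce++;	/* the patch uses the TCE bitmap here */
			/* store the DMA prefix with an initial refcount of 1;
			 * the patch checks these for conflicts too, and leaves
			 * rollback of a half-claimed entry as an XXX */
			__sync_val_compare_and_swap(&tracker[i * 2 + 1],
						    e2, (tce << 34) | 1);
			/* finally set the valid bit so other mappers can share it */
			__sync_val_compare_and_swap(&tracker[i * 2],
						    base, base | 1);
			return (tce << TCE_ORDER) | offset;
		}
		/* valid but someone else's chunk: keep scanning */
	}
	return (uint64_t)-1;	/* table full: the mapping_error case */
}

int main(void)
{
	uint64_t a = claim(0x12345678);	/* new chunk: TCE 0 */
	uint64_t b = claim(0x12345678);	/* same 16MB chunk: refcount bumps */
	uint64_t c = claim(0x87654321);	/* different chunk: TCE 1 */

	printf("a=%#llx b=%#llx c=%#llx\n",
	       (unsigned long long)a, (unsigned long long)b,
	       (unsigned long long)c);
	return 0;
}

Note the publication order the sketch preserves: the physical address is written first with the valid bit clear, so a concurrent mapper either races on the first cmpxchg of a free slot or sees a not-yet-valid entry and keeps scanning; only once entry2 carries the TCE index and refcount does the valid bit go in. This is also why unmapping can get away with a bare refcount decrement, as the patch's dma_pseudo_bypass_unmap_address() does.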