From 3e7abe2556b583e87dabda3e0e6178a67b20d06f Mon Sep 17 00:00:00 2001
From: Roland Dreier
Date: Wed, 20 Jul 2011 06:22:21 -0700
Subject: intel-iommu: Fix AB-BA lockdep report

When unbinding a device so that I could pass it through to a KVM VM, I
got the lockdep report below.  It looks like a legitimate lock
ordering problem:

 - domain_context_mapping_one() takes iommu->lock and calls
   iommu_support_dev_iotlb(), which takes device_domain_lock (inside
   iommu->lock).

 - domain_remove_one_dev_info() starts by taking device_domain_lock
   then takes iommu->lock inside it (near the end of the function).

So this is the classic AB-BA deadlock.  It looks like a safe fix is to
simply release device_domain_lock a bit earlier, since as far as I can
tell, it doesn't protect any of the stuff accessed at the end of
domain_remove_one_dev_info() anyway.

BTW, the use of device_domain_lock looks a bit unsafe to me... it's at
least not obvious to me why we aren't vulnerable to the race below:

  iommu_support_dev_iotlb()
                                          domain_remove_dev_info()

  lock device_domain_lock
    find info
  unlock device_domain_lock

                                          lock device_domain_lock
                                            find same info
                                          unlock device_domain_lock

                                          free_devinfo_mem(info)

  do stuff with info after it's free

However I don't understand the locking here well enough to know if
this is a real problem, let alone what the best fix is.

Anyway here's the full lockdep output that prompted all of this:

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.39.1+ #1
-------------------------------------------------------
bash/13954 is trying to acquire lock:
 (&(&iommu->lock)->rlock){......}, at: [] domain_remove_one_dev_info+0x121/0x230

but task is already holding lock:
 (device_domain_lock){-.-...}, at: [] domain_remove_one_dev_info+0x208/0x230

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (device_domain_lock){-.-...}:
       [] lock_acquire+0x9d/0x130
       [] _raw_spin_lock_irqsave+0x55/0xa0
       [] domain_context_mapping_one+0x600/0x750
       [] domain_context_mapping+0x3f/0x120
       [] iommu_prepare_identity_map+0x1c5/0x1e0
       [] intel_iommu_init+0x88e/0xb5e
       [] pci_iommu_init+0x16/0x41
       [] do_one_initcall+0x45/0x190
       [] kernel_init+0xe3/0x168
       [] kernel_thread_helper+0x4/0x10

-> #0 (&(&iommu->lock)->rlock){......}:
       [] __lock_acquire+0x195e/0x1e10
       [] lock_acquire+0x9d/0x130
       [] _raw_spin_lock_irqsave+0x55/0xa0
       [] domain_remove_one_dev_info+0x121/0x230
       [] device_notifier+0x72/0x90
       [] notifier_call_chain+0x8c/0xc0
       [] __blocking_notifier_call_chain+0x78/0xb0
       [] blocking_notifier_call_chain+0x16/0x20
       [] __device_release_driver+0xbc/0xe0
       [] device_release_driver+0x2f/0x50
       [] driver_unbind+0xa3/0xc0
       [] drv_attr_store+0x2c/0x30
       [] sysfs_write_file+0xe6/0x170
       [] vfs_write+0xce/0x190
       [] sys_write+0x54/0xa0
       [] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

6 locks held by bash/13954:
 #0:  (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x44/0x170
 #1:  (s_active#3){++++.+}, at: [] sysfs_write_file+0xcd/0x170
 #2:  (&__lockdep_no_validate__){+.+.+.}, at: [] driver_unbind+0x9b/0xc0
 #3:  (&__lockdep_no_validate__){+.+.+.}, at: [] device_release_driver+0x27/0x50
 #4:  (&(&priv->bus_notifier)->rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x5f/0xb0
 #5:  (device_domain_lock){-.-...}, at: [] domain_remove_one_dev_info+0x208/0x230

stack backtrace:
Pid: 13954, comm: bash Not tainted 2.6.39.1+ #1
Call Trace:
 [] print_circular_bug+0xf7/0x100
 [] __lock_acquire+0x195e/0x1e10
 [] ? trace_hardirqs_off+0xd/0x10
 [] ? trace_hardirqs_on_caller+0x13d/0x180
 [] lock_acquire+0x9d/0x130
 [] ? domain_remove_one_dev_info+0x121/0x230
 [] _raw_spin_lock_irqsave+0x55/0xa0
 [] ? domain_remove_one_dev_info+0x121/0x230
 [] ? trace_hardirqs_off+0xd/0x10
 [] domain_remove_one_dev_info+0x121/0x230
 [] device_notifier+0x72/0x90
 [] notifier_call_chain+0x8c/0xc0
 [] __blocking_notifier_call_chain+0x78/0xb0
 [] blocking_notifier_call_chain+0x16/0x20
 [] __device_release_driver+0xbc/0xe0
 [] device_release_driver+0x2f/0x50
 [] driver_unbind+0xa3/0xc0
 [] drv_attr_store+0x2c/0x30
 [] sysfs_write_file+0xe6/0x170
 [] vfs_write+0xce/0x190
 [] sys_write+0x54/0xa0
 [] system_call_fastpath+0x16/0x1b

Signed-off-by: Roland Dreier
Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c621c98..d9514c4 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3568,6 +3568,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
 		found = 1;
 	}
 
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
 	if (found == 0) {
 		unsigned long tmp_flags;
 		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
@@ -3584,8 +3586,6 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
 		spin_unlock_irqrestore(&iommu->lock, tmp_flags);
 	}
 }
-
-	spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
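[Editorial note: the inversion is easier to see with the driver details
stripped away.  A minimal sketch of the two paths, using stand-in locks and
empty bodies rather than the literal driver code:]

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(lock_a);		/* stands in for iommu->lock */
static DEFINE_SPINLOCK(lock_b);		/* stands in for device_domain_lock */

/* like domain_context_mapping_one(): takes A, then B nested inside it */
static void path_one(void)
{
	unsigned long fa, fb;

	spin_lock_irqsave(&lock_a, fa);
	spin_lock_irqsave(&lock_b, fb);
	spin_unlock_irqrestore(&lock_b, fb);
	spin_unlock_irqrestore(&lock_a, fa);
}

/* like domain_remove_one_dev_info() before this fix: takes B, then A --
 * the inverted order that lockdep complains about */
static void path_two(void)
{
	unsigned long fa, fb;

	spin_lock_irqsave(&lock_b, fb);
	spin_lock_irqsave(&lock_a, fa);
	spin_unlock_irqrestore(&lock_a, fa);
	spin_unlock_irqrestore(&lock_b, fb);
}

If one CPU sits in path_one() holding A while another sits in path_two()
holding B, each blocks forever on the other's lock.  Releasing B earlier, as
the patch below does, removes the nesting in path_two() entirely.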
From 6fbcfb3e467adb414e235eeefaeaf51ad12f2461 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sun, 25 Sep 2011 19:11:14 -0700
Subject: intel-iommu: Workaround IOTLB hang on Ironlake GPU

To work around a hardware issue, we have to submit IOTLB flushes while
the graphics engine is idle.  The graphics driver will (we hope) go to
great lengths to ensure that it gets that right on the affected
chipset(s)... so let's not screw it over by deferring the unmap and
doing it later.  That wouldn't be very helpful.

Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index d9514c4..72d68c5 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3950,7 +3950,11 @@ static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
 		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
 		dmar_map_gfx = 0;
-	}
+	} else if (dmar_map_gfx) {
+		/* we have to ensure the gfx device is idle before we flush */
+		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
+		intel_iommu_strict = 1;
+	}
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
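[Editorial note: intel_iommu_strict selects between flushing the IOTLB
synchronously on every unmap and queueing flushes to run in a batch later.
A simplified sketch of the choice the flag controls in the unmap path, as if
inside intel-iommu.c; the helper names follow the driver of this era, but the
function itself is illustrative, not the literal code:]

static void unmap_and_flush(struct dmar_domain *domain,
			    struct intel_iommu *iommu, struct iova *iova,
			    unsigned long start_pfn, unsigned long npages)
{
	if (intel_iommu_strict) {
		/* flush right now -- the behaviour this quirk forces, so
		 * the flush happens while the graphics driver still has
		 * the engine idle */
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, npages, 0);
		__free_iova(&domain->iovad, iova);
	} else {
		/* queue it; flush_unmaps() runs later, from a timer or
		 * when the queue fills -- possibly while the GPU is busy,
		 * which is exactly what Ironlake cannot tolerate */
		add_unmap(domain, iova);
	}
}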
From c0771df8d5297bfb9c4fbe8ada085a49cb22ec4f Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Fri, 14 Oct 2011 20:59:46 +0100
Subject: intel-iommu: Export a flag indicating that the IOMMU is used for iGFX.

We really don't want this to work in the general case; device drivers
*shouldn't* care whether they are behind an IOMMU or not.

But the integrated graphics is a special case, because the IOMMU and the
GTT are all kind of smashed into one and generally horrifically buggy,
so it's reasonable for the graphics driver to want to know when the
IOMMU is active for the graphics hardware.

Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 72d68c5..5565753 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -404,6 +404,9 @@ static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
 
+int intel_iommu_gfx_mapped;
+EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
+
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
 static LIST_HEAD(device_domain_list);
@@ -3226,9 +3229,6 @@ static void __init init_no_remapping_devices(void)
 		}
 	}
 
-	if (dmar_map_gfx)
-		return;
-
 	for_each_drhd_unit(drhd) {
 		int i;
 		if (drhd->ignored || drhd->include_all)
@@ -3236,18 +3236,23 @@ static void __init init_no_remapping_devices(void)
 
 		for (i = 0; i < drhd->devices_cnt; i++)
 			if (drhd->devices[i] &&
-			    !IS_GFX_DEVICE(drhd->devices[i]))
+				!IS_GFX_DEVICE(drhd->devices[i]))
 				break;
 
 		if (i < drhd->devices_cnt)
 			continue;
 
-		/* bypass IOMMU if it is just for gfx devices */
-		drhd->ignored = 1;
-		for (i = 0; i < drhd->devices_cnt; i++) {
-			if (!drhd->devices[i])
-				continue;
-			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+		/* This IOMMU has *only* gfx devices. Either bypass it or
+		   set the gfx_mapped flag, as appropriate */
+		if (dmar_map_gfx) {
+			intel_iommu_gfx_mapped = 1;
+		} else {
+			drhd->ignored = 1;
+			for (i = 0; i < drhd->devices_cnt; i++) {
+				if (!drhd->devices[i])
+					continue;
+				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+			}
 		}
 	}
 }
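[Editorial note: on the consumer side, a graphics driver can simply test the
exported flag.  A hypothetical consumer sketch -- only intel_iommu_gfx_mapped
itself comes from this patch; the wrapper below is illustrative:]

#include <linux/types.h>

#ifdef CONFIG_INTEL_IOMMU
extern int intel_iommu_gfx_mapped;
#endif

/* decide whether VT-d-related graphics workarounds need to be applied */
static bool gfx_behind_active_iommu(void)
{
#ifdef CONFIG_INTEL_IOMMU
	return intel_iommu_gfx_mapped != 0;
#else
	return false;
#endif
}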
From 292827cb164ad00cc7689a21283b1261c0b6daed Mon Sep 17 00:00:00 2001
From: Allen Kay
Date: Fri, 14 Oct 2011 12:31:54 -0700
Subject: intel-iommu: fix return value of iommu_unmap() API

The iommu_unmap() API expects IOMMU drivers to return the actual page
order of the address being unmapped.  The previous code was just
returning the page order passed in from the caller.  This patch fixes
this problem.

Signed-off-by: Allen Kay
Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 5565753..237ef52 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -819,13 +819,14 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 }
 
 /* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
+static int dma_pte_clear_range(struct dmar_domain *domain,
 				unsigned long start_pfn,
 				unsigned long last_pfn)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	unsigned int large_page = 1;
 	struct dma_pte *first_pte, *pte;
+	int order;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -849,6 +850,9 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 			   (void *)pte - (void *)first_pte);
 
 	} while (start_pfn && start_pfn <= last_pfn);
+
+	order = (large_page - 1) * 9;
+	return order;
 }
 
 /* free page table pages. last level pte should already be cleared */
@@ -3869,14 +3873,15 @@ static int intel_iommu_unmap(struct iommu_domain *domain,
 {
 	struct dmar_domain *dmar_domain = domain->priv;
 	size_t size = PAGE_SIZE << gfp_order;
+	int order;
 
-	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
+	order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
 			    (iova + size - 1) >> VTD_PAGE_SHIFT);
 
 	if (dmar_domain->max_addr == iova + size)
 		dmar_domain->max_addr = iova;
 
-	return gfp_order;
+	return order;
 }
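[Editorial note: the order arithmetic is worth spelling out.  large_page
records the page-table level at which the last PTEs were cleared, and each
VT-d page-table level covers 9 bits of address.  A worked illustration of
the mapping -- the helper is hypothetical; only the formula comes from the
patch:]

/* order = (large_page - 1) * 9:
 *   large_page 1 (4KiB leaf)       -> order 0  -> 1 page
 *   large_page 2 (2MiB superpage)  -> order 9  -> 512 pages
 *   large_page 3 (1GiB superpage)  -> order 18 -> 262144 pages
 * so iommu_unmap() now reports the size actually torn down rather than
 * whatever order the caller happened to pass in. */
static inline unsigned long pages_for_large_page_level(unsigned int large_page)
{
	return 1UL << ((large_page - 1) * 9);
}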
From 8140a95d228efbcd64d84150e794761a32463947 Mon Sep 17 00:00:00 2001
From: Allen Kay
Date: Fri, 14 Oct 2011 12:32:17 -0700
Subject: intel-iommu: set iommu_superpage on VM domains to lowest common denominator

Set the dmar->iommu_superpage field to the smallest common denominator
of the super page sizes supported by all active VT-d engines.
Initialize this field in the intel_iommu_domain_init() API so that the
intel_iommu_map() API can use iommu_superpage to determine the
appropriate super page size to use.

Signed-off-by: Allen Kay
Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 237ef52..e588360 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -580,17 +580,18 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
 
 static void domain_update_iommu_superpage(struct dmar_domain *domain)
 {
-	int i, mask = 0xf;
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	int mask = 0xf;
 
 	if (!intel_iommu_superpage) {
 		domain->iommu_superpage = 0;
 		return;
 	}
 
-	domain->iommu_superpage = 4; /* 1TiB */
-
-	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
-		mask |= cap_super_page_val(g_iommus[i]->cap);
+	/* set iommu_superpage to the smallest common denominator */
+	for_each_active_iommu(iommu, drhd) {
+		mask &= cap_super_page_val(iommu->cap);
 		if (!mask) {
 			break;
 		}
@@ -3748,6 +3749,7 @@ static int intel_iommu_domain_init(struct iommu_domain *domain)
 		vm_domain_exit(dmar_domain);
 		return -ENOMEM;
 	}
+	domain_update_iommu_cap(dmar_domain);
 	domain->priv = dmar_domain;
 
 	return 0;

From 4399c8bf2b9093696fa8160d79712e7346989c46 Mon Sep 17 00:00:00 2001
From: Allen Kay
Date: Fri, 14 Oct 2011 12:32:46 -0700
Subject: intel-iommu: fix superpage support in pfn_to_dma_pte()

If target_level == 0, the current code breaks out of the while-loop if
the SUPERPAGE bit is set.  We should also break out if the PTE is not
present.  If we don't do this, KVM calls to iommu_iova_to_phys() will
cause pfn_to_dma_pte() to create mappings for 4KiB pages.

Signed-off-by: Allen Kay
Signed-off-by: David Woodhouse
---
 drivers/iommu/intel-iommu.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e588360..a88f3cb 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -306,6 +306,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
 	return (pte->val & 3) != 0;
 }
 
+static inline bool dma_pte_superpage(struct dma_pte *pte)
+{
+	return (pte->val & (1 << 7));
+}
+
 static inline int first_pte_in_page(struct dma_pte *pte)
 {
 	return !((unsigned long)pte & ~VTD_PAGE_MASK);
@@ -734,29 +739,23 @@ out:
 }
 
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn, int large_level)
+				      unsigned long pfn, int target_level)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	struct dma_pte *parent, *pte = NULL;
 	int level = agaw_to_level(domain->agaw);
-	int offset, target_level;
+	int offset;
 
 	BUG_ON(!domain->pgd);
 	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 	parent = domain->pgd;
 
-	/* Search pte */
-	if (!large_level)
-		target_level = 1;
-	else
-		target_level = large_level;
-
 	while (level > 0) {
 		void *tmp_page;
 
 		offset = pfn_level_offset(pfn, level);
 		pte = &parent[offset];
-		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 			break;
 		if (level == target_level)
 			break;
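[Editorial note: after this change target_level has a clean contract -- zero
means a pure lookup that must never allocate, non-zero means walk down
(allocating as needed) to that level.  A hypothetical lookup-side caller, as
if inside intel-iommu.c, modelled on intel_iommu_iova_to_phys() but not
copied from it:]

static phys_addr_t lookup_phys(struct dmar_domain *domain, u64 iova)
{
	struct dma_pte *pte;

	/* target_level == 0: with this fix the walk stops at a superpage
	 * PTE *or* at the first non-present PTE -- it never allocates
	 * page-table levels, which is exactly what the KVM
	 * iommu_iova_to_phys() path needs */
	pte = pfn_to_dma_pte(domain, iova >> VTD_PAGE_SHIFT, 0);
	if (!pte || !dma_pte_present(pte))
		return 0;		/* nothing mapped at this address */

	return dma_pte_addr(pte);	/* base address of the (super)page */
}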