diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/frontswap.c | 2 | ||||
-rw-r--r-- | mm/huge_memory.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 28 | ||||
-rw-r--r-- | mm/memory.c | 9 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 9 | ||||
-rw-r--r-- | mm/migrate.c | 25 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 79 | ||||
-rw-r--r-- | mm/page_alloc.c | 8 | ||||
-rw-r--r-- | mm/pagewalk.c | 70 | ||||
-rw-r--r-- | mm/slab_common.c | 4 | ||||
-rw-r--r-- | mm/swap_state.c | 18 | ||||
-rw-r--r-- | mm/swapfile.c | 2 |
13 files changed, 151 insertions, 112 deletions
diff --git a/mm/frontswap.c b/mm/frontswap.c index 538367e..1b24bdc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -319,7 +319,7 @@ void __frontswap_invalidate_area(unsigned type) return; frontswap_ops->invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); - memset(sis->frontswap_map, 0, sis->max / sizeof(long)); + bitmap_zero(sis->frontswap_map, sis->max); } clear_bit(type, need_init); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03a89a2..362c329 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(vma->anon_vma); goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8feeec..e2bfbf7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2839,7 +2839,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cb1c9de..1947218 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1199,7 +1199,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - last_visited = iter->last_visited; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; goto out_unlock; @@ -1218,13 +1217,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * is alive. */ dead_count = atomic_read(&root->dead_count); - smp_rmb(); - last_visited = iter->last_visited; - if (last_visited) { - if ((dead_count != iter->last_dead_count) || - !css_tryget(&last_visited->css)) { + if (dead_count == iter->last_dead_count) { + smp_rmb(); + last_visited = iter->last_visited; + if (last_visited && + !css_tryget(&last_visited->css)) last_visited = NULL; - } } } @@ -3141,8 +3139,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return -ENOMEM; } - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); s->memcg_params->is_root_cache = true; /* @@ -4108,8 +4104,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, if (mem_cgroup_disabled()) return NULL; - VM_BUG_ON(PageSwapCache(page)); - if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); @@ -4205,6 +4199,18 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); + /* + * If the page is in swap cache, uncharge should be deferred + * to the swap path, which also properly accounts swap usage + * and handles memcg lifetime. + * + * Note that this check is not stable and reclaim may add the + * page to swap cache at any time after this. However, if the + * page is not in swap cache by the time page->mapcount hits + * 0, there won't be any page table references to the swap + * slot, and reclaim will free it and not actually write the + * page to disk. + */ if (PageSwapCache(page)) return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); diff --git a/mm/memory.c b/mm/memory.c index 6dc1882..61a262b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -220,7 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->start = -1UL; tlb->end = 0; tlb->need_flush = 0; - tlb->fast_mode = (num_possible_cpus() == 1); tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); @@ -244,9 +243,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb_table_flush(tlb); #endif - if (tlb_fast_mode(tlb)) - return; - for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -288,11 +284,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) VM_BUG_ON(!tlb->need_flush); - if (tlb_fast_mode(tlb)) { - free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu() */ - } - batch = tlb->active; batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a221fac..1ad92b4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, start = phys_start_pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) - pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n", - start, start + size - 1, ret); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { diff --git a/mm/migrate.c b/mm/migrate.c index 27ed225..6f0c244 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = arch_make_huge_pte(pte, vma, new, 0); } #endif - flush_cache_page(vma, addr, pte_pfn(pte)); + flush_dcache_page(new); set_pte_at(mm, addr, ptep, pte); if (PageHuge(new)) { @@ -200,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -236,6 +235,20 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index be04122..6725ff1 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm) int id; /* - * srcu_read_lock() here will block synchronize_srcu() in - * mmu_notifier_unregister() until all registered - * ->release() callouts this function makes have - * returned. + * SRCU here will block mmu_notifier_unregister until + * ->release returns. */ id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + /* + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { mn = hlist_entry(mm->mmu_notifier_mm->list.first, struct mmu_notifier, hlist); - /* - * Unlink. This will prevent mmu_notifier_unregister() - * from also making the ->release() callout. + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. */ hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - - /* - * Clear sptes. (see 'release' description in mmu_notifier.h) - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * All callouts to ->release() which we have done are complete. - * Allow synchronize_srcu() in mmu_notifier_unregister() to complete - */ - srcu_read_unlock(&srcu, id); - - /* - * mmu_notifier_unregister() may have unlinked a notifier and may - * still be calling out to it. Additionally, other notifiers - * may have been active via vmtruncate() et. al. Block here - * to ensure that all notifier callouts for this mm have been - * completed and the sptes are really cleaned up before returning - * to exit_mmap(). + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one mm_count + * is held by exit_mmap. */ synchronize_srcu(&srcu); } @@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { + /* + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. + */ int id; + id = srcu_read_lock(&srcu); /* - * Ensure we synchronize up with __mmu_notifier_release(). + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. */ - id = srcu_read_lock(&srcu); - - hlist_del_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - if (mn->ops->release) mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); /* - * Allow __mmu_notifier_release() to complete. + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. */ - srcu_read_unlock(&srcu, id); - } else + hlist_del_init_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* - * Wait for any running method to finish, including ->release() if it - * was run by __mmu_notifier_release() instead of us. + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. */ synchronize_srcu(&srcu); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98cbdf6..c3edb62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1628,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) @@ -1637,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages <= min + lowmem_reserve) + + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -5158,7 +5160,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end, for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { if (poison) memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + free_reserved_page(virt_to_page((void *)pos)); } if (pages && s) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 35aa294..5da2cbc 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - return NULL; -} - static int walk_hugetlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* - * handle hugetlb vma individually because pagetable walk for - * the hugetlb page is dependent on the architecture and - * we can't handled it in the same manner as non-huge pages. + * This function was not intended to be vma based. + * But there are vma special cases to be handled: + * - hugetlb vma's + * - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* + * There are no page structures backing a VM_PFNMAP + * range, so do not allow split_huge_page_pmd(). + */ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* - * Hugepage is very tightly coupled with vma, so - * walk through hugetlb entries within a given vma. + * Handle hugetlb vma individually because pagetable + * walk for the hugetlb page is dependent on the + * architecture and we can't handled it in the same + * manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, + * so walk through hugetlb entries within a + * given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { diff --git a/mm/slab_common.c b/mm/slab_common.c index ff3218a..2d41450 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -373,8 +373,10 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { int index; - if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE)) + if (size > KMALLOC_MAX_SIZE) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; + } if (size <= 192) { if (!size) diff --git a/mm/swap_state.c b/mm/swap_state.c index b3d40dc..f24ab0d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,8 +336,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ + if (err == -EEXIST) { radix_tree_preload_end(); + /* + * We might race against get_swap_page() and stumble + * across a SWAP_HAS_CACHE swap_map entry whose page + * has not been brought into the swapcache yet, while + * the other end is scheduled away waiting on discard + * I/O completion at scan_swap_map(). + * + * In order to avoid turning this transitory state + * into a permanent loop around this -EEXIST case + * if !CONFIG_PREEMPT and the I/O completion happens + * to be waiting on the CPU waitqueue where we are now + * busy looping, we just conditionally invoke the + * scheduler here, if there are some more important + * tasks to run. + */ + cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c340d9..746af55b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2116,7 +2116,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* frontswap enabled? set up bit-per-page map for frontswap */ if (frontswap_enabled) - frontswap_map = vzalloc(maxpages / sizeof(long)); + frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |